From 9d590a52fdad54ecfea06ae9792f0cc49f1ad1ef Mon Sep 17 00:00:00 2001 From: Piyush Sao Date: Tue, 17 Aug 2021 15:47:01 -0400 Subject: [PATCH] factorization with new data structure enbaled with GPU acceleration --- SRC/CMakeLists.txt | 1 + SRC/TRF3dV100/lupanels.cpp | 185 ++++++++-------- SRC/TRF3dV100/lupanels.hpp | 63 +++--- SRC/TRF3dV100/lupanelsComm3dGPU.cpp | 144 ++++++++++++ SRC/TRF3dV100/lupanels_GPU.cpp | 18 ++ SRC/TRF3dV100/lupanels_comm3d.cpp | 4 +- SRC/TRF3dV100/pdgstrf3d_summit.cpp | 331 ++++++++++++++-------------- SRC/TRF3dV100/schurCompUpdate.cu | 2 +- cmake_thimble.sh | 13 ++ 9 files changed, 477 insertions(+), 284 deletions(-) create mode 100644 SRC/TRF3dV100/lupanelsComm3dGPU.cpp create mode 100644 cmake_thimble.sh diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index bc4f657c..e049891c 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -116,6 +116,7 @@ if(enable_double) TRF3dV100/dsparseTreeFactorGPU.cpp TRF3dV100/lupanels.cpp TRF3dV100/lupanels_comm3d.cpp + TRF3dV100/lupanelsComm3dGPU.cpp TRF3dV100/l_panels.cpp TRF3dV100/u_panels.cpp TRF3dV100/lupanels_GPU.cpp diff --git a/SRC/TRF3dV100/lupanels.cpp b/SRC/TRF3dV100/lupanels.cpp index fe987a8b..8d6b0640 100644 --- a/SRC/TRF3dV100/lupanels.cpp +++ b/SRC/TRF3dV100/lupanels.cpp @@ -1,19 +1,18 @@ #include #include -#include +#include #include "lupanels_GPU.cuh" #include "lupanels.hpp" - LUstruct_v100::LUstruct_v100(int_t nsupers_, int_t ldt_, - int_t *isNodeInMyGrid_, int superluAccOffload_, - dLUstruct_t *LUstruct, - gridinfo3d_t *grid3d_in, - SCT_t *SCT_, superlu_dist_options_t *options_, - SuperLUStat_t *stat_) : isNodeInMyGrid(isNodeInMyGrid_), - nsupers(nsupers_), ldt(ldt_), grid3d(grid3d_in), superluAccOffload(superluAccOffload_), - SCT(SCT_), options(options_), stat(stat_) + int_t *isNodeInMyGrid_, int superluAccOffload_, + dLUstruct_t *LUstruct, + gridinfo3d_t *grid3d_in, + SCT_t *SCT_, superlu_dist_options_t *options_, + SuperLUStat_t *stat_) : isNodeInMyGrid(isNodeInMyGrid_), + nsupers(nsupers_), ldt(ldt_), grid3d(grid3d_in), superluAccOffload(superluAccOffload_), + SCT(SCT_), options(options_), stat(stat_) { grid = &(grid3d->grid2d); @@ -31,17 +30,15 @@ LUstruct_v100::LUstruct_v100(int_t nsupers_, int_t ldt_, lPanelVec = new lpanel_t[CEILING(nsupers, Pc)]; uPanelVec = new upanel_t[CEILING(nsupers, Pr)]; // create the lvectors - maxLvalCount =0; - maxLidxCount =0; - maxUvalCount =0; - maxUidxCount =0; - - std::vector localLvalSendCounts(CEILING(nsupers, Pc),0); - std::vector localUvalSendCounts(CEILING(nsupers, Pr),0); - std::vector localLidxSendCounts(CEILING(nsupers, Pc),0); - std::vector localUidxSendCounts(CEILING(nsupers, Pr),0); + maxLvalCount = 0; + maxLidxCount = 0; + maxUvalCount = 0; + maxUidxCount = 0; - + std::vector localLvalSendCounts(CEILING(nsupers, Pc), 0); + std::vector localUvalSendCounts(CEILING(nsupers, Pr), 0); + std::vector localLidxSendCounts(CEILING(nsupers, Pc), 0); + std::vector localUidxSendCounts(CEILING(nsupers, Pr), 0); for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) { @@ -49,15 +46,15 @@ LUstruct_v100::LUstruct_v100(int_t nsupers_, int_t ldt_, if (Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[k0] == 1) { int_t isDiagIncluded = 0; - + if (myrow == krow(k0)) isDiagIncluded = 1; lpanel_t lpanel(k0, Lrowind_bc_ptr[i], Lnzval_bc_ptr[i], xsup, isDiagIncluded); lPanelVec[i] = lpanel; - maxLvalCount = std::max(lPanelVec[i].nzvalSize(),maxLvalCount ); - maxLidxCount = std::max(lPanelVec[i].indexSize(),maxLidxCount ); - localLvalSendCounts[i] =lPanelVec[i].nzvalSize(); - localLidxSendCounts[i] =lPanelVec[i].indexSize(); + maxLvalCount = std::max(lPanelVec[i].nzvalSize(), maxLvalCount); + maxLidxCount = std::max(lPanelVec[i].indexSize(), maxLidxCount); + localLvalSendCounts[i] = lPanelVec[i].nzvalSize(); + localLidxSendCounts[i] = lPanelVec[i].indexSize(); } } @@ -69,62 +66,60 @@ LUstruct_v100::LUstruct_v100(int_t nsupers_, int_t ldt_, int_t globalId = i * Pr + myrow; upanel_t upanel(globalId, Ufstnz_br_ptr[i], Unzval_br_ptr[i], xsup); uPanelVec[i] = upanel; - maxUvalCount = std::max(uPanelVec[i].nzvalSize(),maxUvalCount ); - maxUidxCount = std::max(uPanelVec[i].indexSize(),maxUidxCount ); - localUvalSendCounts[i] =uPanelVec[i].nzvalSize(); - localUidxSendCounts[i] =uPanelVec[i].indexSize(); + maxUvalCount = std::max(uPanelVec[i].nzvalSize(), maxUvalCount); + maxUidxCount = std::max(uPanelVec[i].indexSize(), maxUidxCount); + localUvalSendCounts[i] = uPanelVec[i].nzvalSize(); + localUidxSendCounts[i] = uPanelVec[i].indexSize(); } } // compute the send sizes - // send and recv count for 2d comm + // send and recv count for 2d comm LvalSendCounts.resize(nsupers); UvalSendCounts.resize(nsupers); LidxSendCounts.resize(nsupers); UidxSendCounts.resize(nsupers); - std::vector recvBuf( std::max( CEILING(nsupers, Pr), CEILING(nsupers, Pc) ),0 ); + std::vector recvBuf(std::max(CEILING(nsupers, Pr), CEILING(nsupers, Pc)), 0); - for(int pr=0;prcscp.comm); - for(int i=0; i*Pr + pr< nsupers; i++ ) + MPI_Bcast((void *)recvBuf.data(), npr, mpi_int_t, pr, grid3d->cscp.comm); + for (int i = 0; i * Pr + pr < nsupers; i++) { - UvalSendCounts[i*Pr+pr] = recvBuf[i]; + UvalSendCounts[i * Pr + pr] = recvBuf[i]; } std::copy(localUidxSendCounts.begin(), localUidxSendCounts.end(), recvBuf.begin()); - // send the index count - MPI_Bcast((void *) recvBuf.data(), npr, mpi_int_t, pr, grid3d->cscp.comm); - for(int i=0; i*Pr + pr< nsupers; i++ ) + // send the index count + MPI_Bcast((void *)recvBuf.data(), npr, mpi_int_t, pr, grid3d->cscp.comm); + for (int i = 0; i * Pr + pr < nsupers; i++) { - UidxSendCounts[i*Pr+pr] = recvBuf[i]; + UidxSendCounts[i * Pr + pr] = recvBuf[i]; } - } - for(int pc=0;pcrscp.comm); - for(int i=0; i*Pc + pc< nsupers; i++ ) + MPI_Bcast((void *)recvBuf.data(), npc, mpi_int_t, pc, grid3d->rscp.comm); + for (int i = 0; i * Pc + pc < nsupers; i++) { - LvalSendCounts[i*Pc+pc] = recvBuf[i]; + LvalSendCounts[i * Pc + pc] = recvBuf[i]; } std::copy(localLidxSendCounts.begin(), localLidxSendCounts.end(), recvBuf.begin()); - // send the index count - MPI_Bcast((void *) recvBuf.data(), npc, mpi_int_t, pc, grid3d->rscp.comm); - for(int i=0; i*Pc + pc< nsupers; i++ ) + // send the index count + MPI_Bcast((void *)recvBuf.data(), npc, mpi_int_t, pc, grid3d->rscp.comm); + for (int i = 0; i * Pc + pc < nsupers; i++) { - LidxSendCounts[i*Pc+pc] = recvBuf[i]; + LidxSendCounts[i * Pc + pc] = recvBuf[i]; } - } maxUvalCount = *std::max_element(UvalSendCounts.begin(), UvalSendCounts.end()); @@ -139,26 +134,24 @@ LUstruct_v100::LUstruct_v100(int_t nsupers_, int_t ldt_, indirectRow = (int_t *)SUPERLU_MALLOC(nThreads * ldt * sizeof(int_t)); indirectCol = (int_t *)SUPERLU_MALLOC(nThreads * ldt * sizeof(int_t)); - - // allocating communication buffers + // allocating communication buffers LvalRecvBufs.resize(options->num_lookaheads); UvalRecvBufs.resize(options->num_lookaheads); LidxRecvBufs.resize(options->num_lookaheads); UidxRecvBufs.resize(options->num_lookaheads); - for(int_t i=0; inum_lookaheads; i++) + for (int_t i = 0; i < options->num_lookaheads; i++) { - LvalRecvBufs[i] = (double*) SUPERLU_MALLOC(sizeof(double)*maxLvalCount); - UvalRecvBufs[i] = (double*) SUPERLU_MALLOC(sizeof(double)*maxUvalCount); - LidxRecvBufs[i] = (int_t*) SUPERLU_MALLOC(sizeof(int_t)*maxLidxCount); - UidxRecvBufs[i] = (int_t*) SUPERLU_MALLOC(sizeof(int_t)*maxUidxCount); + LvalRecvBufs[i] = (double *)SUPERLU_MALLOC(sizeof(double) * maxLvalCount); + UvalRecvBufs[i] = (double *)SUPERLU_MALLOC(sizeof(double) * maxUvalCount); + LidxRecvBufs[i] = (int_t *)SUPERLU_MALLOC(sizeof(int_t) * maxLidxCount); + UidxRecvBufs[i] = (int_t *)SUPERLU_MALLOC(sizeof(int_t) * maxUidxCount); } // - - if(superluAccOffload) - setLUstruct_GPU(); + // if (superluAccOffload) + // for(int pc=0;pcnum_lookaheads); + assert(A_gpu.numCudaStreams < options->num_lookaheads); // cudaMalloc(&A_gpu.LvalRecvBufs, sizeof(double*)*A_gpu.numCudaStreams); - for(int stream=0; streamzscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + + int_t sender, receiver; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + receiver = myGrid; + } + else + { + sender = myGrid; + receiver = myGrid - (1 << ilvl); + } + + /*Reduce all the ancestors*/ + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t numNodes = myNodeCount[alvl]; + int_t *nodeList = treePerm[alvl]; + double treduce = SuperLU_timer_(); + + + /*first setting the L blocks to zero*/ + for (int_t node = 0; node < numNodes; ++node) /* for each block column ... */ + { + int_t k0 = nodeList[node]; + + if (myGrid == sender) + { + zSendLPanelGPU(k0, receiver); + zSendUPanelGPU(k0, receiver); + } + else + { + double alpha = 1.0, beta = 1.0; + + zRecvLPanelGPU(k0, sender, alpha, beta); + zRecvUPanelGPU(k0, sender, alpha, beta); + } + } + // return 0; + SCT->ancsReduce += SuperLU_timer_() - treduce; + } + return 0; +} + + +int_t LUstruct_v100::zSendLPanelGPU(int_t k0, int_t receiverGrid) +{ + + if (mycol == kcol(k0)) + { + int_t lk = g2lCol(k0); + if (!lPanelVec[lk].isEmpty()) + { + MPI_Send(lPanelVec[lk].blkPtrGPU(0), lPanelVec[lk].nzvalSize(), + MPI_DOUBLE, receiverGrid, k0, grid3d->zscp.comm); + SCT->commVolRed += lPanelVec[lk].nzvalSize() * sizeof(double); + } + } + return 0; +} + + +int_t LUstruct_v100::zRecvLPanelGPU(int_t k0, int_t senderGrid, double alpha, double beta) +{ + if (mycol == kcol(k0)) + { + int_t lk = g2lCol(k0); + if (!lPanelVec[lk].isEmpty()) + { + + MPI_Status status; + MPI_Recv(LvalRecvBufs[0], lPanelVec[lk].nzvalSize(), MPI_DOUBLE, senderGrid, k0, + grid3d->zscp.comm, &status); + + /*reduce the updates*/ + cublasHandle_t handle=A_gpu.cuHandles[0]; + cublasDscal(handle, lPanelVec[lk].nzvalSize(), &alpha, lPanelVec[lk].blkPtrGPU(0), 1); + cublasDaxpy(handle, lPanelVec[lk].nzvalSize(), &beta, LvalRecvBufs[0], 1, lPanelVec[lk].blkPtrGPU(0), 1); + + // cublasDscal(cublasHandle_t handle, int n, + // const double *alpha, + // double *x, int incx) + // cublasDaxpy(cublasHandle_t handle, int n, + // const double *alpha, + // const double *x, int incx, + // double *y, int incy) + } + } + return 0; +} + + +int_t LUstruct_v100::zSendUPanelGPU(int_t k0, int_t receiverGrid) +{ + + if (myrow == krow(k0)) + { + int_t lk = g2lRow(k0); + if (!uPanelVec[lk].isEmpty()) + { + MPI_Send(uPanelVec[lk].blkPtrGPU(0), uPanelVec[lk].nzvalSize(), + MPI_DOUBLE, receiverGrid, k0, grid3d->zscp.comm); + SCT->commVolRed += uPanelVec[lk].nzvalSize() * sizeof(double); + } + } + return 0; +} + + +int_t LUstruct_v100::zRecvUPanelGPU(int_t k0, int_t senderGrid, double alpha, double beta) +{ + if (myrow == krow(k0)) + { + int_t lk = g2lRow(k0); + if (!uPanelVec[lk].isEmpty()) + { + + MPI_Status status; + MPI_Recv(UvalRecvBufs[0], uPanelVec[lk].nzvalSize(), MPI_DOUBLE, senderGrid, k0, + grid3d->zscp.comm, &status); + + /*reduce the updates*/ + cublasHandle_t handle=A_gpu.cuHandles[0]; + cublasDscal(handle, uPanelVec[lk].nzvalSize(), &alpha, uPanelVec[lk].blkPtrGPU(0), 1); + cublasDaxpy(handle, uPanelVec[lk].nzvalSize(), &beta, UvalRecvBufs[0], 1, uPanelVec[lk].blkPtrGPU(0), 1); + } + } + return 0; +} + + diff --git a/SRC/TRF3dV100/lupanels_GPU.cpp b/SRC/TRF3dV100/lupanels_GPU.cpp index 24ca34a3..8054803b 100644 --- a/SRC/TRF3dV100/lupanels_GPU.cpp +++ b/SRC/TRF3dV100/lupanels_GPU.cpp @@ -36,6 +36,24 @@ lpanelGPU_t lpanel_t::copyToGPU() return gpuPanel; } +int_t lpanel_t::copyFromGPU() +{ + if(isEmpty()) + return 0; + size_t valSize = sizeof(double) * nzvalSize(); + cudaMemcpy(val, gpuPanel.val, valSize, cudaMemcpyDeviceToHost); + +} + +int_t upanel_t::copyFromGPU() +{ + if(isEmpty()) + return 0; + size_t valSize = sizeof(double) * nzvalSize(); + cudaMemcpy(val, gpuPanel.val, valSize, cudaMemcpyDeviceToHost); + +} + upanelGPU_t upanel_t::copyToGPU() { diff --git a/SRC/TRF3dV100/lupanels_comm3d.cpp b/SRC/TRF3dV100/lupanels_comm3d.cpp index 499ba546..b3258733 100644 --- a/SRC/TRF3dV100/lupanels_comm3d.cpp +++ b/SRC/TRF3dV100/lupanels_comm3d.cpp @@ -129,4 +129,6 @@ int_t LUstruct_v100::zRecvUPanel(int_t k0, int_t senderGrid, double alpha, doubl } } return 0; -} \ No newline at end of file +} + + diff --git a/SRC/TRF3dV100/pdgstrf3d_summit.cpp b/SRC/TRF3dV100/pdgstrf3d_summit.cpp index 0cfe6ca2..45715875 100644 --- a/SRC/TRF3dV100/pdgstrf3d_summit.cpp +++ b/SRC/TRF3dV100/pdgstrf3d_summit.cpp @@ -1,7 +1,7 @@ #include "superlu_ddefs.h" #ifdef MAP_PROFILE -#include "mapsampler_api.h" +#include "mapsampler_api.h" #endif #ifdef GPU_ACC @@ -13,202 +13,209 @@ #include "superlu_summit.h" #ifdef __cplusplus - extern "C" { +extern "C" +{ #endif -int_t pdgstrf3d_summit(superlu_dist_options_t *options, int m, int n, double anorm, - trf3Dpartition_t* trf3Dpartition, SCT_t *SCT, - dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, - SuperLUStat_t *stat, int *info) -{ - gridinfo_t* grid = &(grid3d->grid2d); - dLocalLU_t *Llu = LUstruct->Llu; + int_t pdgstrf3d_summit(superlu_dist_options_t *options, int m, int n, double anorm, + trf3Dpartition_t *trf3Dpartition, SCT_t *SCT, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, int *info) + { + gridinfo_t *grid = &(grid3d->grid2d); + dLocalLU_t *Llu = LUstruct->Llu; - // problem specific contants - int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */ - // double s_eps = slamch_ ("Epsilon"); -Sherry - double s_eps = smach_dist("Epsilon"); - double thresh = s_eps * anorm; + // problem specific contants + int_t ldt = sp_ienv_dist(3); /* Size of maximum supernode */ + // double s_eps = slamch_ ("Epsilon"); -Sherry + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC (grid3d->iam, "Enter pdgstrf3d()"); +#if (DEBUGlevel >= 1) + CHECK_MALLOC(grid3d->iam, "Enter pdgstrf3d()"); #endif - // Initilize stat - stat->ops[FACT] = 0; - stat->current_buffer = 0.0; - stat->peak_buffer = 0.0; - stat->gpu_buffer = 0.0; - //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt); + // Initilize stat + stat->ops[FACT] = 0; + stat->current_buffer = 0.0; + stat->peak_buffer = 0.0; + stat->gpu_buffer = 0.0; + //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt); - //getting Nsupers - int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + //getting Nsupers + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); - // Grid related Variables - int_t iam = grid->iam; // in 2D grid - int num_threads = getNumThreads(grid3d->iam); + // Grid related Variables + int_t iam = grid->iam; // in 2D grid + int num_threads = getNumThreads(grid3d->iam); - factStat_t factStat; - initFactStat(nsupers, &factStat); + factStat_t factStat; + initFactStat(nsupers, &factStat); + SCT->tStartup = SuperLU_timer_(); + packLUInfo_t packLUInfo; + initPackLUInfo(nsupers, &packLUInfo); - SCT->tStartup = SuperLU_timer_(); - packLUInfo_t packLUInfo; - initPackLUInfo(nsupers, &packLUInfo); + dscuBufs_t scuBufs; + dinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); - dscuBufs_t scuBufs; - dinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); + factNodelists_t fNlists; + initFactNodelists(ldt, num_threads, nsupers, &fNlists); - factNodelists_t fNlists; - initFactNodelists( ldt, num_threads, nsupers, &fNlists); + // tag_ub initialization + int tag_ub = set_tag_ub(); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; - // tag_ub initialization - int tag_ub = set_tag_ub(); - int_t maxLvl = log2i(grid3d->zscp.Np) + 1; - -#if ( PRNTlevel>=1 ) - if (!iam) { - printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); - } +#if (PRNTlevel >= 1) + if (!iam) + { + printf("MPI tag upper bound = %d\n", tag_ub); + fflush(stdout); + } #endif - // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d); - gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo; - int_t* iperm_c_supno = trf3Dpartition->iperm_c_supno; - int_t* myNodeCount = trf3Dpartition->myNodeCount; - int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; - int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; - sForest_t** sForests = trf3Dpartition->sForests; - int_t** treePerm = trf3Dpartition->treePerm ; - dLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; - - /* Initializing factorization specific buffers */ - - int_t numLA = getNumLookAhead(options); - dLUValSubBuf_t** LUvsbs = dLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct); - msgs_t**msgss = initMsgsArr(numLA); - int_t mxLeafNode = 0; - for (int ilvl = 0; ilvl < maxLvl; ++ilvl) - { - if (sForests[myTreeIdxs[ilvl]] - && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) - mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; - } - ddiagFactBufs_t** dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid); - commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); - - // TODO: remove the following for time being - int_t first_l_block_acc = 0; - int_t first_u_block_acc = 0; - int_t Pc = grid->npcol; - int_t Pr = grid->nprow; - int_t mrb = (nsupers + Pr - 1) / Pr; // Sherry check ... use ceiling - int_t mcb = (nsupers + Pc - 1) / Pc; - HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); - dInit_HyP(HyP, Llu, mcb, mrb); - HyP->first_l_block_acc = first_l_block_acc; - HyP->first_u_block_acc = first_u_block_acc; - - /******************************************* + // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d); + gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo; + int_t *iperm_c_supno = trf3Dpartition->iperm_c_supno; + int_t *myNodeCount = trf3Dpartition->myNodeCount; + int_t *myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t *myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t **sForests = trf3Dpartition->sForests; + int_t **treePerm = trf3Dpartition->treePerm; + dLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; + + /* Initializing factorization specific buffers */ + + int_t numLA = getNumLookAhead(options); + dLUValSubBuf_t **LUvsbs = dLluBufInitArr(SUPERLU_MAX(numLA, grid3d->zscp.Np), LUstruct); + msgs_t **msgss = initMsgsArr(numLA); + int_t mxLeafNode = 0; + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) + { + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + ddiagFactBufs_t **dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid); + commRequests_t **comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); + + // TODO: remove the following for time being + int_t first_l_block_acc = 0; + int_t first_u_block_acc = 0; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t mrb = (nsupers + Pr - 1) / Pr; // Sherry check ... use ceiling + int_t mcb = (nsupers + Pc - 1) / Pc; + HyP_t *HyP = (HyP_t *)SUPERLU_MALLOC(sizeof(HyP_t)); + dInit_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; + HyP->first_u_block_acc = first_u_block_acc; + + /******************************************* * * New code starts * ******************************************/ - // Create the new LU structure - int_t* isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); - int superluAccOffload= get_acc_offload (); - LUstruct_v100 LU_packed(nsupers, ldt, isNodeInMyGrid, superluAccOffload, LUstruct,grid3d, - SCT, options, stat - ); + // Create the new LU structure + int_t *isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); + int superlu_acc_offload = get_acc_offload(); + LUstruct_v100 LU_packed(nsupers, ldt, isNodeInMyGrid, superlu_acc_offload, LUstruct, grid3d, + SCT, options, stat); + if(superlu_acc_offload) + LU_packed.setLUstruct_GPU(); + /*==== starting main factorization loop =====*/ + MPI_Barrier(grid3d->comm); + SCT->tStartup = SuperLU_timer_() - SCT->tStartup; - /*==== starting main factorization loop =====*/ - MPI_Barrier( grid3d->comm); - SCT->tStartup = SuperLU_timer_() - SCT->tStartup; - - SCT->pdgstrfTimer = SuperLU_timer_(); + SCT->pdgstrfTimer = SuperLU_timer_(); - for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) - { - /* if I participate in this level */ - if (!myZeroTrIdxs[ilvl]) + for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) { - - - sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; - - /* main loop over all the supernodes */ - if (sforest) /* 2D factorization at individual subtree */ - { - double tilvl = SuperLU_timer_(); - - LU_packed.dsparseTreeFactor(sforest, comReqss, &scuBufs, &packLUInfo, - msgss, LUvsbs, dFBufs, - &gEtreeInfo, iperm_c_supno, - thresh, tag_ub, info ); - - - /*now reduce the updates*/ - SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl; - sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl]; - } - - if (ilvl < maxLvl - 1) /*then reduce before factorization*/ + /* if I participate in this level */ + if (!myZeroTrIdxs[ilvl]) { - LU_packed.ancestorReduction3d(ilvl, myNodeCount, treePerm); - } - } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/ - - SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer - : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1]; - } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/ - - MPI_Barrier( grid3d->comm); - SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer; - - LU_packed.packedU2skyline(LUstruct); - if(!grid3d->zscp.Iam) - { - SCT_printSummary(grid, SCT); - // if (superlu_acc_offload ) - // printGPUStats(sluGPU->A_gpu, grid); - } - - - + sForest_t *sforest = sForests[myTreeIdxs[ilvl]]; + + /* main loop over all the supernodes */ + if (sforest) /* 2D factorization at individual subtree */ + { + double tilvl = SuperLU_timer_(); + + if (superlu_acc_offload) + LU_packed.dsparseTreeFactorGPU(sforest, comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, + &gEtreeInfo, iperm_c_supno, + thresh, tag_ub, info); + else + LU_packed.dsparseTreeFactor(sforest, comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, + &gEtreeInfo, iperm_c_supno, + thresh, tag_ub, info); + + /*now reduce the updates*/ + SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl; + sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl]; + } + + if (ilvl < maxLvl - 1) /*then reduce before factorization*/ + { + if (superlu_acc_offload) + LU_packed.ancestorReduction3dGPU(ilvl, myNodeCount, treePerm); + else + LU_packed.ancestorReduction3d(ilvl, myNodeCount, treePerm); + } + } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/ + + SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer + : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1]; + } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/ + + MPI_Barrier(grid3d->comm); + SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer; + + if (superlu_acc_offload) + LU_packed.copyLUGPUtoHost(); + LU_packed.packedU2skyline(LUstruct); + + if (!grid3d->zscp.Iam) + { + SCT_printSummary(grid, SCT); + // if (superlu_acc_offload ) + // printGPUStats(sluGPU->A_gpu, grid); + } #ifdef ITAC_PROF - VT_traceoff(); + VT_traceoff(); #endif #ifdef MAP_PROFILE - allinea_stop_sampling(); + allinea_stop_sampling(); #endif - reduceStat(FACT, stat, grid3d); - - // sherry added - /* Deallocate factorization specific buffers */ - freePackLUInfo(&packLUInfo); - dfreeScuBufs(&scuBufs); - freeFactStat(&factStat); - freeFactNodelists(&fNlists); - freeMsgsArr(numLA, msgss); - freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss); - dLluBufFreeArr(numLA, LUvsbs); - dfreeDiagFactBufsArr(mxLeafNode, dFBufs); - Free_HyP(HyP); - // if (superlu_acc_offload ) - // freeSluGPU(sluGPU); - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC (grid3d->iam, "Exit pdgstrf3d()"); + reduceStat(FACT, stat, grid3d); + + // sherry added + /* Deallocate factorization specific buffers */ + freePackLUInfo(&packLUInfo); + dfreeScuBufs(&scuBufs); + freeFactStat(&factStat); + freeFactNodelists(&fNlists); + freeMsgsArr(numLA, msgss); + freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss); + dLluBufFreeArr(numLA, LUvsbs); + dfreeDiagFactBufsArr(mxLeafNode, dFBufs); + Free_HyP(HyP); + // if (superlu_acc_offload ) + // freeSluGPU(sluGPU); + +#if (DEBUGlevel >= 1) + CHECK_MALLOC(grid3d->iam, "Exit pdgstrf3d()"); #endif - return 0; + return 0; -} /* pdgstrf3d */ + } /* pdgstrf3d */ #ifdef __cplusplus - } +} #endif //UrowindPtr_host \ No newline at end of file diff --git a/SRC/TRF3dV100/schurCompUpdate.cu b/SRC/TRF3dV100/schurCompUpdate.cu index c8935231..128c1fda 100644 --- a/SRC/TRF3dV100/schurCompUpdate.cu +++ b/SRC/TRF3dV100/schurCompUpdate.cu @@ -215,7 +215,7 @@ int_t LUstruct_v100::dSchurComplementUpdateGPU( int_t nub = upanel.nblocks(); int iSt =st_lb; - int iEnd =0; + int iEnd =iSt; diff --git a/cmake_thimble.sh b/cmake_thimble.sh new file mode 100644 index 00000000..d95c7d95 --- /dev/null +++ b/cmake_thimble.sh @@ -0,0 +1,13 @@ +cmake .. -DCMAKE_BUILD_TYPE=Debug\ + -DCMAKE_C_COMPILER=mpicc.mpich\ + -DCMAKE_CXX_COMPILER=mpicxx.mpich\ + -DXSDK_INDEX_SIZE=64\ + -DTPL_PARMETIS_INCLUDE_DIRS="/home/7ps/software/int64/include"\ + -DTPL_PARMETIS_LIBRARIES="-L/home/7ps/software/lmpich/int64/lib -lparmetis -lmetis" \ + -DTPL_ENABLE_BLASLIB=OFF\ + -DTPL_ENABLE_LAPACKLIB=OFF\ + -DTPL_ENABLE_COMBLASLIB=OFF \ + -DTPL_BLAS_LIBRARIES="-L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core" \ + -DCMAKE_C_FLAGS=" -std=c99 -g -fPIC -DPRNTlevel=1 -DDEBUGlevel=0 -I${MKLROOT}/include"\ + -DCMAKE_CXX_FLAGS=" -std=c++11 -I${MKLROOT}/include"\ + -DCMAKE_INSTALL_PREFIX="./" \ No newline at end of file