GPU factorization code is compiling
piyush314 committed Aug 16, 2021
1 parent 43f39ab commit 8fb26c5
Showing 6 changed files with 173 additions and 9 deletions.
1 change: 1 addition & 0 deletions SRC/CMakeLists.txt
@@ -113,6 +113,7 @@ if(enable_double)
dtreeFactorization.c
TRF3dV100/pdgstrf3d_summit.cpp
TRF3dV100/dsparseTreeFactor_summit.cpp
TRF3dV100/dsparseTreeFactorGPU.cpp
TRF3dV100/lupanels.cpp
TRF3dV100/lupanels_comm3d.cpp
TRF3dV100/l_panels.cpp
134 changes: 134 additions & 0 deletions SRC/TRF3dV100/dsparseTreeFactorGPU.cpp
@@ -0,0 +1,134 @@
#include "superlu_ddefs.h"
#include "lupanels.hpp"
#include "lupanels_GPU.cuh"

//TODO: needs to be merged as a single factorization function
int_t LUstruct_v100::dsparseTreeFactorGPU(
sForest_t *sforest,
commRequests_t **comReqss, // lists of communication requests // size maxEtree level
dscuBufs_t *scuBufs, // contains buffers for schur complement update
packLUInfo_t *packLUInfo,
msgs_t **msgss, // size=num Look ahead
dLUValSubBuf_t **LUvsbs, // size=num Look ahead
ddiagFactBufs_t **dFBufs, // size maxEtree level
gEtreeInfo_t *gEtreeInfo, // global etree info
int_t *gIperm_c_supno,
double thresh, int tag_ub,
int *info)
{
int_t nnodes = sforest->nNodes; // number of nodes in the tree
if (nnodes < 1)
{
return 1;
}

#if (DEBUGlevel >= 1)
CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactor_ASYNC()");
#endif

int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization
treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo;
int_t *myIperm = treeTopoInfo->myIperm;
int_t maxTopoLevel = treeTopoInfo->numLvl;
int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims;

/*main loop over all the levels*/
int_t numLA = getNumLookAhead(options);


for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
{
/* code */
int_t k_st = eTreeTopLims[topoLvl];
int_t k_end = eTreeTopLims[topoLvl + 1];
for (int_t k0 = k_st; k0 < k_end; ++k0)
{
int_t k = perm_c_supno[k0];
int_t offset = k0 - k_st;
int_t ksupc = SuperSize(k);
cublasHandle_t cubHandle= A_gpu.cuHandles[0];
cudaStream_t cuStream = A_gpu.cuStreams[0];
/*======= Diagonal Factorization ======*/
if (iam == procIJ(k, k))
{
lPanelVec[g2lCol(k)].diagFactorPackDiagBlockGPU( k,
dFBufs[offset]->BlockUFactor, ksupc, // CPU pointers
dFBufs[offset]->BlockLFactor, ksupc, // CPU pointers
thresh, xsup, options, stat, info);
}

/*======= Diagonal Broadcast ======*/
if (myrow == krow(k))
MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc,
MPI_DOUBLE, kcol(k), (grid->rscp).comm);
if (mycol == kcol(k))
MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc,
MPI_DOUBLE, krow(k), (grid->cscp).comm);

/*======= Panel Update ======*/
if (myrow == krow(k))
uPanelVec[g2lRow(k)].panelSolveGPU(
cubHandle, cuStream,
ksupc, dFBufs[offset]->BlockLFactor, ksupc);

if (mycol == kcol(k))
lPanelVec[g2lCol(k)].panelSolveGPU(
cubHandle, cuStream,
ksupc, dFBufs[offset]->BlockUFactor, ksupc);

/*======= Panel Broadcast ======*/
upanel_t k_upanel(UidxRecvBufs[0], UvalRecvBufs[0],
A_gpu.UidxRecvBufs[0], A_gpu.UvalRecvBufs[0]) ;
lpanel_t k_lpanel(LidxRecvBufs[0], LvalRecvBufs[0],
A_gpu.LidxRecvBufs[0], A_gpu.LvalRecvBufs[0]);
if (myrow == krow(k))
{
k_upanel= uPanelVec[g2lRow(k)];
}
if (mycol == kcol(k))
k_lpanel = lPanelVec[g2lCol(k)];

if(UidxSendCounts[k]>0)
{
// assuming GPU direct is available
MPI_Bcast(k_upanel.gpuPanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm);
MPI_Bcast(k_upanel.gpuPanel.val, UvalSendCounts[k], MPI_DOUBLE, krow(k), grid3d->cscp.comm);
// copy the index to cpu
cudaMemcpy(k_upanel.index, k_upanel.gpuPanel.index,
sizeof(int_t)*UidxSendCounts[k], cudaMemcpyDeviceToHost);
}

if(LidxSendCounts[k]>0)
{
MPI_Bcast(k_lpanel.gpuPanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm);
MPI_Bcast(k_lpanel.gpuPanel.val, LvalSendCounts[k], MPI_DOUBLE, kcol(k), grid3d->rscp.comm);
cudaMemcpy(k_lpanel.index, k_lpanel.gpuPanel.index,
sizeof(int_t)*LidxSendCounts[k], cudaMemcpyDeviceToHost);
}


/*======= Schur complement Update ======*/
#warning single node only
// dSchurComplementUpdate(k, lPanelVec[g2lCol(k)], uPanelVec[g2lRow(k)]);
// dSchurComplementUpdate(k, lPanelVec[g2lCol(k)], k_upanel);
if(UidxSendCounts[k]>0 && LidxSendCounts[k]>0)
{
// k_upanel.checkCorrectness();
int streamId =0;
dSchurComplementUpdateGPU(
streamId,
k, k_lpanel, k_upanel);

}
// MPI_Barrier(grid3d->comm);

} /*for k0= k_st:k_end */

} /*for topoLvl = 0:maxTopoLevel*/

#if (DEBUGlevel >= 1)
CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactor_ASYNC()");
#endif

return 0;
} /* dsparseTreeFactorGPU */
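
Note on the panel broadcast above: the factored panels are broadcast directly from device memory (the in-code comment assumes GPU-direct MPI), and only the index array is mirrored back to the host. Below is a minimal standalone sketch of that pattern, assuming a CUDA-aware MPI build; the function and buffer names (bcastPanelGPUDirect, d_index, d_val, h_index) are hypothetical and not part of the SuperLU_DIST API.

#include <mpi.h>
#include <cuda_runtime.h>

// Hypothetical helper: broadcast a panel whose index/value arrays live on the GPU.
// With a CUDA-aware MPI, device pointers can be handed to MPI_Bcast directly;
// only the index array is copied back so CPU-side bookkeeping can walk the blocks.
void bcastPanelGPUDirect(long long *d_index, double *d_val, long long *h_index,
                         int idxCount, int valCount, int root, MPI_Comm comm)
{
    if (idxCount <= 0)
        return;                                               // no blocks in this panel
    MPI_Bcast(d_index, idxCount, MPI_LONG_LONG, root, comm);  // device-resident index
    MPI_Bcast(d_val, valCount, MPI_DOUBLE, root, comm);       // device-resident values
    cudaMemcpy(h_index, d_index, sizeof(long long) * idxCount,
               cudaMemcpyDeviceToHost);                       // host mirror of the index
}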
12 changes: 9 additions & 3 deletions SRC/TRF3dV100/lupanels.cpp
@@ -1,6 +1,7 @@
#include <algorithm>
#include <iostream>
#include <cassert>

#include "lupanels_GPU.cuh"
#include "lupanels.hpp"

@@ -156,6 +157,8 @@ LUstruct_v100::LUstruct_v100(int_t nsupers_, int_t ldt_,
//


if(superluAccOffload)
setLUstruct_GPU();

// for(int pc=0;pc<Pc; pc++)
// {
@@ -164,7 +167,8 @@ LUstruct_v100::LUstruct_v100(int_t nsupers_, int_t ldt_,
// }
}

int_t LUstruct_v100::dSchurComplementUpdate(int_t k, lpanel_t &lpanel, upanel_t &upanel)
int_t LUstruct_v100::dSchurComplementUpdate(
int_t k, lpanel_t &lpanel, upanel_t &upanel)
{
if (lpanel.isEmpty() || upanel.isEmpty())
return 0;
@@ -346,19 +350,21 @@ int_t LUstruct_v100::setLUstruct_GPU()


// set up streams;
//TODO: setup multiple cuda streams
//TODO: setup multiple cuda streams,
// make cuda streams consistent with look_aheads
// numCudaStreams is related to num_look_aheads
A_gpu.numCudaStreams = getnCudaStreams(); // this always returns 1
A_gpu.gemmBufferSize = get_max_buffer_size();

// TODO: make cuda streams consistent with look_aheads

assert(A_gpu.numCudaStreams< options->num_lookaheads);

// cudaMalloc(&A_gpu.LvalRecvBufs, sizeof(double*)*A_gpu.numCudaStreams);
for(int stream=0; stream<A_gpu.numCudaStreams; stream++ )
{

cudaStreamCreate ( &A_gpu.cuStreams[stream]);
cublasCreate(&A_gpu.cuHandles[stream]);
cudaMalloc(&A_gpu.LvalRecvBufs[stream], sizeof(double)*maxLvalCount);
cudaMalloc(&A_gpu.UvalRecvBufs[stream], sizeof(double)*maxUvalCount);
cudaMalloc(&A_gpu.LidxRecvBufs[stream], sizeof(int_t)*maxLidxCount);
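
For reference, the stream setup in setLUstruct_GPU above follows a one-stream / one-cuBLAS-handle / one-set-of-device-receive-buffers-per-slot pattern. A minimal sketch of that pattern with hypothetical names and a matching teardown follows; the buffer sizes and the single-stream count are assumptions, not the actual SuperLU_DIST structures.

#include <cuda_runtime.h>
#include <cublas_v2.h>

const int kNumStreams = 1;               // assumed: getnCudaStreams() currently returns 1

cudaStream_t   streams[kNumStreams];
cublasHandle_t handles[kNumStreams];
double        *d_valRecv[kNumStreams];   // device value receive buffer per stream
long long     *d_idxRecv[kNumStreams];   // device index receive buffer per stream

void setupPerStreamResources(size_t maxValCount, size_t maxIdxCount)
{
    for (int s = 0; s < kNumStreams; ++s)
    {
        cudaStreamCreate(&streams[s]);            // one CUDA stream per look-ahead slot
        cublasCreate(&handles[s]);                // one cuBLAS handle per stream
        cublasSetStream(handles[s], streams[s]);  // bind the handle to its stream
        cudaMalloc(&d_valRecv[s], sizeof(double) * maxValCount);
        cudaMalloc(&d_idxRecv[s], sizeof(long long) * maxIdxCount);
    }
}

void freePerStreamResources()
{
    for (int s = 0; s < kNumStreams; ++s)
    {
        cudaFree(d_idxRecv[s]);
        cudaFree(d_valRecv[s]);
        cublasDestroy(handles[s]);
        cudaStreamDestroy(streams[s]);
    }
}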
29 changes: 25 additions & 4 deletions SRC/TRF3dV100/lupanels.hpp
@@ -124,6 +124,10 @@ class lpanel_t
{
return &gpuPanel.val[blkPtrOffset(k)];
}

lpanel_t(int_t *index_, double *val_, int_t *indexGPU, double *valGPU) :
index(index_), val(val_), gpuPanel(indexGPU, valGPU)
{ return; };
};

class upanel_t
@@ -246,6 +250,10 @@ class upanel_t
{
return &gpuPanel.val[blkPtrOffset(k)];
}

upanel_t(int_t *index_, double *val_, int_t *indexGPU, double *valGPU) :
index(index_), val(val_), gpuPanel(indexGPU, valGPU)
{ return; };
};

// Defining GPU data types
@@ -359,8 +367,21 @@ struct LUstruct_v100

// GPU related functions
int_t setLUstruct_GPU();

int_t dSchurComplementUpdate(
cublasHandle_t handle, int streamId,
int_t k, lpanel_t &lpanel, upanel_t &upanel);
int_t dsparseTreeFactorGPU(
sForest_t *sforest,
commRequests_t **comReqss, // lists of communication requests // size maxEtree level
dscuBufs_t *scuBufs, // contains buffers for schur complement update
packLUInfo_t *packLUInfo,
msgs_t **msgss, // size=num Look ahead
dLUValSubBuf_t **LUvsbs, // size=num Look ahead
ddiagFactBufs_t **dFBufs, // size maxEtree level
gEtreeInfo_t *gEtreeInfo, // global etree info
int_t *gIperm_c_supno,
double thresh, int tag_ub,
int *info);


int_t dSchurComplementUpdateGPU(
int streamId,
int_t k, lpanel_t &lpanel, upanel_t &upanel);
};
1 change: 1 addition & 0 deletions SRC/TRF3dV100/lupanels_GPU.cuh
@@ -248,6 +248,7 @@ struct LUstructGPU_t
int maxSuperSize;
// double arrays are problematic
cudaStream_t cuStreams[MAX_CUDA_STREAMS];
cublasHandle_t cuHandles[MAX_CUDA_STREAMS];
double* gpuGemmBuffs[MAX_CUDA_STREAMS];
double* LvalRecvBufs[MAX_CUDA_STREAMS];
double* UvalRecvBufs[MAX_CUDA_STREAMS];
5 changes: 3 additions & 2 deletions SRC/TRF3dV100/schurCompUpdate.cu
@@ -199,8 +199,8 @@ void scatterGPU(
}


int_t LUstruct_v100::dSchurComplementUpdate(
cublasHandle_t handle, int streamId,
int_t LUstruct_v100::dSchurComplementUpdateGPU(
int streamId,
int_t k, lpanel_t &lpanel, upanel_t &upanel)
{
// TODO: redefine isEmpty so this works out
@@ -245,6 +245,7 @@ int_t LUstruct_v100::dSchurComplementUpdate(
{
jSt = jEnd;
jEnd = upanel.getEndBlock(jSt, maxGemmCols);
cublasHandle_t handle = A_gpu.cuHandles[streamId];
cudaStream_t cuStream = A_gpu.cuStreams[streamId];
cublasSetStream(handle, cuStream);
int gemm_m = lpanel.stRow(iEnd) - lpanel.stRow(iSt);
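
The hunk above looks up the cuBLAS handle for the chosen stream and binds it with cublasSetStream before each GEMM tile of the Schur-complement update. A minimal sketch of one such tile is below, with hypothetical dimensions and pointers; alpha = 1, beta = 0 writes the product into a scratch buffer that a later scatter step subtracts from the trailing matrix.

#include <cuda_runtime.h>
#include <cublas_v2.h>

// Hypothetical: one GEMM tile V = L_blk * U_blk of the Schur-complement update,
// issued on the CUDA stream that owns this cuBLAS handle.
void schurGemmTile(cublasHandle_t handle, cudaStream_t stream,
                   int m, int n, int k,
                   const double *d_L, int ldl,
                   const double *d_U, int ldu,
                   double *d_V, int ldv)
{
    const double alpha = 1.0, beta = 0.0;
    cublasSetStream(handle, stream);  // subsequent cuBLAS calls run on this stream
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                m, n, k,
                &alpha, d_L, ldl,
                        d_U, ldu,
                &beta,  d_V, ldv);
    // a scatter kernel launched on the same stream (see scatterGPU above) would
    // then subtract d_V from the corresponding trailing-matrix blocks
}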
