From fe88f03d1f9d9ba67c9431c07b40b2b7c961961d Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Wed, 29 May 2024 17:24:39 -0700 Subject: [PATCH] Initial implementation of ILU0 and SolveOnly; Remove the unused batch code; small fix related to LSUB/USUB in psymbfact.c --- Doxygen_Config.txt | 1 + SRC/CMakeLists.txt | 1 + SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp | 1119 --------- .../dsparseTreeFactorGPU_impl.hpp | 122 +- SRC/CplusplusFactor/lupanels_GPU.cuh | 81 +- SRC/CplusplusFactor/schurCompUpdate.cu | 2106 ----------------- SRC/CplusplusFactor/schurCompUpdate_impl.cuh | 12 +- SRC/CplusplusFactor/xlupanels_GPU.cuh | 14 +- SRC/complex16/pzgssvx3d.c | 259 +- SRC/complex16/zssvx3dAux.c | 19 +- SRC/complex16/zutil_dist.c | 197 ++ SRC/include/superlu_defs.h | 6 +- SRC/include/superlu_dist_config.h | 8 +- SRC/include/superlu_zdefs.h | 9 + SRC/prec-independent/get_perm_c.c | 2 +- SRC/prec-independent/ilu_level_symbfact.c | 110 + SRC/prec-independent/psymbfact.c | 8 +- SRC/prec-independent/trfAux.c | 13 +- SRC/prec-independent/util.c | 2 + 19 files changed, 394 insertions(+), 3695 deletions(-) delete mode 100644 SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp delete mode 100644 SRC/CplusplusFactor/schurCompUpdate.cu create mode 100644 SRC/prec-independent/ilu_level_symbfact.c diff --git a/Doxygen_Config.txt b/Doxygen_Config.txt index 091a1c56..8dbe9902 100644 --- a/Doxygen_Config.txt +++ b/Doxygen_Config.txt @@ -1,3 +1,4 @@ + # Doxyfile 1.8.15 # This file describes the settings to be used by the documentation system diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 58a1fb02..d383873d 100755 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -63,6 +63,7 @@ set(sources prec-independent/pxerr_dist.c prec-independent/superlu_timer.c prec-independent/symbfact.c + prec-independent/ilu_level_symbfact.c prec-independent/psymbfact.c prec-independent/psymbfact_util.c prec-independent/get_perm_c_parmetis.c diff --git a/SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp b/SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp deleted file mode 100644 index f3029db3..00000000 --- a/SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp +++ /dev/null @@ -1,1119 +0,0 @@ -#include -#include "superlu_ddefs.h" -#include "lupanels.hpp" -#ifdef HAVE_CUDA -#include "lupanels_GPU.cuh" -#include "batch_block_copy.h" - -// [0, 1, 2, 3] independent -// [0, 1] schur complement update , [2, 3] panel bcast -// [2,3] schur complement update, [ 4,5] panel bcast - - -int getBufferOffset(int k0, int k1, int winSize, int winParity, int halfWin) -{ - int_t offset = (k0-k1)%winSize; - if(winParity%2) - offset+= halfWin; - - return offset; -} - -#if 0 // Sherry: old batch code -////////////////////////////////////////////////////////////////////////////////////////////////////////// - -LUMarshallData::LUMarshallData() -{ - dev_diag_ptrs = dev_panel_ptrs = NULL; - dev_diag_ld_array = dev_diag_dim_array = dev_info_array = NULL; - dev_panel_ld_array = dev_panel_dim_array = NULL; -} - -LUMarshallData::~LUMarshallData() -{ - gpuErrchk(cudaFree(dev_diag_ptrs)); - gpuErrchk(cudaFree(dev_panel_ptrs)); - gpuErrchk(cudaFree(dev_diag_ld_array)); - gpuErrchk(cudaFree(dev_diag_dim_array)); - gpuErrchk(cudaFree(dev_info_array)); - gpuErrchk(cudaFree(dev_panel_ld_array)); - gpuErrchk(cudaFree(dev_panel_dim_array)); -} - -void LUMarshallData::setBatchSize(int batch_size) -{ - gpuErrchk(cudaMalloc(&dev_diag_ptrs, batch_size * sizeof(double*))); - gpuErrchk(cudaMalloc(&dev_panel_ptrs, batch_size * sizeof(double*))); - - gpuErrchk(cudaMalloc(&dev_diag_ld_array, 
batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_diag_dim_array, (batch_size + 1) * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_info_array, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_panel_ld_array, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_panel_dim_array, (batch_size + 1) * sizeof(int))); - - host_diag_ptrs.resize(batch_size); - host_diag_ld_array.resize(batch_size); - host_diag_dim_array.resize(batch_size); - host_panel_ptrs.resize(batch_size); - host_panel_ld_array.resize(batch_size); - host_panel_dim_array.resize(batch_size); -} - -void LUMarshallData::setMaxDiag() -{ - max_diag = 0; - for(int i = 0; i < batchsize; i++) - max_diag = SUPERLU_MAX(max_diag, host_diag_dim_array[i]); -} - -void LUMarshallData::setMaxPanel() -{ - max_panel = 0; - for(int i = 0; i < batchsize; i++) - max_panel = SUPERLU_MAX(max_panel, host_panel_dim_array[i]); -} - -SCUMarshallData::SCUMarshallData() -{ - dev_A_ptrs = dev_B_ptrs = dev_C_ptrs = NULL; - dev_lda_array = dev_ldb_array = dev_ldc_array = NULL; - dev_m_array = dev_n_array = dev_k_array = NULL; - dev_gpu_lpanels = NULL; - dev_gpu_upanels = NULL; - dev_ist = dev_iend = dev_jst = dev_jend = NULL; - dev_maxGemmCols = dev_maxGemmRows = NULL; -} - -SCUMarshallData::~SCUMarshallData() -{ - gpuErrchk(cudaFree(dev_A_ptrs)); - gpuErrchk(cudaFree(dev_B_ptrs)); - gpuErrchk(cudaFree(dev_C_ptrs)); - gpuErrchk(cudaFree(dev_lda_array)); - gpuErrchk(cudaFree(dev_ldb_array)); - gpuErrchk(cudaFree(dev_ldc_array)); - gpuErrchk(cudaFree(dev_m_array)); - gpuErrchk(cudaFree(dev_n_array)); - gpuErrchk(cudaFree(dev_k_array)); - gpuErrchk(cudaFree(dev_gpu_lpanels)); - gpuErrchk(cudaFree(dev_gpu_upanels)); - gpuErrchk(cudaFree(dev_ist)); - gpuErrchk(cudaFree(dev_iend)); - gpuErrchk(cudaFree(dev_jst)); - gpuErrchk(cudaFree(dev_jend)); - gpuErrchk(cudaFree(dev_maxGemmCols)); - gpuErrchk(cudaFree(dev_maxGemmRows)); -} - -void SCUMarshallData::setBatchSize(int batch_size) -{ - gpuErrchk(cudaMalloc(&dev_A_ptrs, batch_size * sizeof(double*))); - gpuErrchk(cudaMalloc(&dev_B_ptrs, batch_size * sizeof(double*))); - gpuErrchk(cudaMalloc(&dev_C_ptrs, batch_size * sizeof(double*))); - - gpuErrchk(cudaMalloc(&dev_lda_array, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_ldb_array, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_ldc_array, batch_size * sizeof(int))); - - gpuErrchk(cudaMalloc(&dev_m_array, (batch_size + 1) * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_n_array, (batch_size + 1) * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_k_array, (batch_size + 1) * sizeof(int))); - - gpuErrchk(cudaMalloc(&dev_ist, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_iend, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_jst, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_jend, batch_size * sizeof(int))); - - gpuErrchk(cudaMalloc(&dev_maxGemmCols, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_maxGemmRows, batch_size * sizeof(int))); - - gpuErrchk(cudaMalloc(&dev_gpu_lpanels, batch_size * sizeof(lpanelGPU_t))); - gpuErrchk(cudaMalloc(&dev_gpu_upanels, batch_size * sizeof(upanelGPU_t))); - - host_A_ptrs.resize(batch_size); - host_B_ptrs.resize(batch_size); - host_C_ptrs.resize(batch_size); - host_lda_array.resize(batch_size); - host_ldb_array.resize(batch_size); - host_ldc_array.resize(batch_size); - host_m_array.resize(batch_size); - host_n_array.resize(batch_size); - host_k_array.resize(batch_size); - upanels.resize(batch_size); - lpanels.resize(batch_size); - host_gpu_upanels.resize(batch_size); - 
host_gpu_lpanels.resize(batch_size); - ist.resize(batch_size); - iend.resize(batch_size); - jst.resize(batch_size); - jend.resize(batch_size); - maxGemmRows.resize(batch_size); - maxGemmCols.resize(batch_size); -} - -void SCUMarshallData::setMaxDims() -{ - max_n = max_k = max_m = max_ilen = max_jlen = 0; - for(int i = 0; i < batchsize; i++) - { - max_m = SUPERLU_MAX(max_m, host_m_array[i]); - max_n = SUPERLU_MAX(max_n, host_n_array[i]); - max_k = SUPERLU_MAX(max_k, host_k_array[i]); - max_ilen = SUPERLU_MAX(max_ilen, iend[i] - ist[i]); - max_jlen = SUPERLU_MAX(max_jlen, jend[i] - jst[i]); - } -} - -void SCUMarshallData::copyToGPU() -{ - gpuErrchk(cudaMemcpy(dev_A_ptrs, host_A_ptrs.data(), batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_B_ptrs, host_B_ptrs.data(), batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_C_ptrs, host_C_ptrs.data(), batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - - gpuErrchk(cudaMemcpy(dev_lda_array, host_lda_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_ldb_array, host_ldb_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_ldc_array, host_ldc_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - - gpuErrchk(cudaMemcpy(dev_m_array, host_m_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_n_array, host_n_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_k_array, host_k_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - - gpuErrchk(cudaMemcpy(dev_ist, ist.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_iend, iend.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_jst, jst.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_jend, jend.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); -} - -void SCUMarshallData::copyPanelDataToGPU() -{ - gpuErrchk(cudaMemcpy(dev_gpu_lpanels, host_gpu_lpanels.data(), batchsize * sizeof(lpanelGPU_t), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_gpu_upanels, host_gpu_upanels.data(), batchsize * sizeof(upanelGPU_t), cudaMemcpyHostToDevice)); -} - -void LUstruct_v100::marshallBatchedBufferCopyData(int k_st, int k_end, int_t *perm_c_supno) -{ - // First gather up all the pointer and meta data on the host - LUMarshallData& mdata = A_gpu.marshall_data; - double **panel_ptrs = mdata.host_panel_ptrs.data(); - int *panel_ld_batch = mdata.host_panel_ld_array.data(); - int *panel_dim_batch = mdata.host_panel_dim_array.data(); - double **diag_ptrs = mdata.host_diag_ptrs.data(); - int *diag_ld_batch = mdata.host_diag_ld_array.data(); - int *diag_dim_batch = mdata.host_diag_dim_array.data(); - - mdata.batchsize = 0; - - for (int_t k0 = k_st; k0 < k_end; k0++) - { - int_t k = perm_c_supno[k0]; - int_t buffer_offset = k0 - k_st; - int ksupc = SuperSize(k); - - if (iam == procIJ(k, k)) - { - lpanel_t &lpanel = lPanelVec[g2lCol(k)]; - - assert(mdata.batchsize < mdata.host_diag_ptrs.size()); - - panel_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0); - panel_ld_batch[mdata.batchsize] = lpanel.LDA(); - panel_dim_batch[mdata.batchsize] = ksupc; - - diag_ptrs[mdata.batchsize] = A_gpu.dFBufs[buffer_offset]; - diag_ld_batch[mdata.batchsize] = ksupc; - diag_dim_batch[mdata.batchsize] = ksupc; - - mdata.batchsize++; - - } - } - - mdata.setMaxDiag(); - mdata.setMaxPanel(); - - 
// Then copy the marshalled data over to the GPU - gpuErrchk(cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_diag_ld_array, diag_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_diag_dim_array, diag_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_panel_ptrs, panel_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_panel_ld_array, panel_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_panel_dim_array, panel_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); -} - -#endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - -int_t LUstruct_v100::dDFactPSolveGPU(int_t k, int_t offset, ddiagFactBufs_t **dFBufs) -{ - // this is new version with diagonal factor being performed on GPU - // different from dDiagFactorPanelSolveGPU (it performs diag factor in CPU) - - /* Sherry: argument dFBufs[] is on CPU, not used in this routine */ - - double t0 = SuperLU_timer_(); - int ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[offset]; - cusolverDnHandle_t cusolverH = A_gpu.cuSolveHandles[offset]; - cudaStream_t cuStream = A_gpu.cuStreams[offset]; - - /*======= Diagonal Factorization ======*/ - if (iam == procIJ(k, k)) - { - lPanelVec[g2lCol(k)].diagFactorCuSolver(k, - cusolverH, cuStream, - A_gpu.diagFactWork[offset], A_gpu.diagFactInfo[offset], // CPU pointers - A_gpu.dFBufs[offset], ksupc, // CPU pointers - thresh, xsup, options, stat, info); - - } - - //CHECK_MALLOC(iam, "after diagFactorCuSolver()"); - - //TODO: need to synchronize the cuda stream - /*======= Diagonal Broadcast ======*/ - if (myrow == krow(k)) - MPI_Bcast((void *)A_gpu.dFBufs[offset], ksupc * ksupc, - MPI_DOUBLE, kcol(k), (grid->rscp).comm); - - //CHECK_MALLOC(iam, "after row Bcast"); - - if (mycol == kcol(k)) - MPI_Bcast((void *)A_gpu.dFBufs[offset], ksupc * ksupc, - MPI_DOUBLE, krow(k), (grid->cscp).comm); - - // do the panels solver - if (myrow == krow(k)) - { - uPanelVec[g2lRow(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[offset], ksupc); - cudaStreamSynchronize(cuStream); // synchronize befpre broadcast - } - - if (mycol == kcol(k)) - { - lPanelVec[g2lCol(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[offset], ksupc); - cudaStreamSynchronize(cuStream); - } - SCT->tDiagFactorPanelSolve += (SuperLU_timer_() - t0); - - return 0; -} /* dDFactPSolveGPU */ - - -int_t LUstruct_v100::dDFactPSolveGPU(int_t k, int_t handle_offset, int buffer_offset, ddiagFactBufs_t **dFBufs) -{ - // this is new version with diagonal factor being performed on GPU - // different from dDiagFactorPanelSolveGPU (it performs diag factor in CPU) - - /* Sherry: argument dFBufs[] is on CPU, not used in this routine */ - - double t0 = SuperLU_timer_(); - int ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[handle_offset]; - cusolverDnHandle_t cusolverH = A_gpu.cuSolveHandles[handle_offset]; - cudaStream_t cuStream = A_gpu.cuStreams[handle_offset]; - - /*======= Diagonal Factorization ======*/ - if (iam == procIJ(k, k)) - { - lPanelVec[g2lCol(k)].diagFactorCuSolver(k, - cusolverH, cuStream, - A_gpu.diagFactWork[handle_offset], A_gpu.diagFactInfo[handle_offset], // CPU pointers - A_gpu.dFBufs[buffer_offset], ksupc, // 
CPU pointers - thresh, xsup, options, stat, info); - - } - - //CHECK_MALLOC(iam, "after diagFactorCuSolver()"); - - //TODO: need to synchronize the cuda stream - /*======= Diagonal Broadcast ======*/ - if (myrow == krow(k)) - MPI_Bcast((void *)A_gpu.dFBufs[buffer_offset], ksupc * ksupc, - MPI_DOUBLE, kcol(k), (grid->rscp).comm); - - //CHECK_MALLOC(iam, "after row Bcast"); - - if (mycol == kcol(k)) - MPI_Bcast((void *)A_gpu.dFBufs[buffer_offset], ksupc * ksupc, - MPI_DOUBLE, krow(k), (grid->cscp).comm); - - // do the panels solver - if (myrow == krow(k)) - { - uPanelVec[g2lRow(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[buffer_offset], ksupc); - cudaStreamSynchronize(cuStream); // synchronize befpre broadcast - } - - if (mycol == kcol(k)) - { - lPanelVec[g2lCol(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[buffer_offset], ksupc); - cudaStreamSynchronize(cuStream); - } - SCT->tDiagFactorPanelSolve += (SuperLU_timer_() - t0); - - return 0; -} /* dDFactPSolveGPU */ - -/* This performs diag factor on CPU */ -int_t LUstruct_v100::dDiagFactorPanelSolveGPU(int_t k, int_t offset, ddiagFactBufs_t **dFBufs) -{ - double t0 = SuperLU_timer_(); - int_t ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[offset]; - cudaStream_t cuStream = A_gpu.cuStreams[offset]; - if (iam == procIJ(k, k)) - { - - lPanelVec[g2lCol(k)].diagFactorPackDiagBlockGPU(k, - dFBufs[offset]->BlockUFactor, ksupc, // CPU pointers - dFBufs[offset]->BlockLFactor, ksupc, // CPU pointers - thresh, xsup, options, stat, info); - } - - /*======= Diagonal Broadcast ======*/ - if (myrow == krow(k)) - MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc, - MPI_DOUBLE, kcol(k), (grid->rscp).comm); - if (mycol == kcol(k)) - MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc, - MPI_DOUBLE, krow(k), (grid->cscp).comm); - - /*======= Panel Update ======*/ - if (myrow == krow(k)) - { - - cudaMemcpy(A_gpu.dFBufs[offset], dFBufs[offset]->BlockLFactor, - ksupc * ksupc * sizeof(double), cudaMemcpyHostToDevice); - uPanelVec[g2lRow(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[offset], ksupc); - cudaStreamSynchronize(cuStream); // synchronize befpre broadcast - } - - if (mycol == kcol(k)) - { - cudaMemcpy(A_gpu.dFBufs[offset], dFBufs[offset]->BlockUFactor, - ksupc * ksupc * sizeof(double), cudaMemcpyHostToDevice); - lPanelVec[g2lCol(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[offset], ksupc); - cudaStreamSynchronize(cuStream); - } - SCT->tDiagFactorPanelSolve += (SuperLU_timer_() - t0); - - return 0; -} - -int_t LUstruct_v100::dPanelBcastGPU(int_t k, int_t offset) -{ - double t0 = SuperLU_timer_(); - /*======= Panel Broadcast ======*/ - // upanel_t k_upanel(UidxRecvBufs[offset], UvalRecvBufs[offset], - // A_gpu.UidxRecvBufs[offset], A_gpu.UvalRecvBufs[offset]); - // lpanel_t k_lpanel(LidxRecvBufs[offset], LvalRecvBufs[offset], - // A_gpu.LidxRecvBufs[offset], A_gpu.LvalRecvBufs[offset]); - // if (myrow == krow(k)) - // { - // k_upanel = uPanelVec[g2lRow(k)]; - // } - // if (mycol == kcol(k)) - // k_lpanel = lPanelVec[g2lCol(k)]; - upanel_t k_upanel = getKUpanel(k,offset); - lpanel_t k_lpanel = getKLpanel(k,offset); - - - if (UidxSendCounts[k] > 0) - { - // assuming GPU direct is available - MPI_Bcast(k_upanel.gpuPanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm); - MPI_Bcast(k_upanel.gpuPanel.val, UvalSendCounts[k], MPI_DOUBLE, krow(k), grid3d->cscp.comm); - // copy the index to cpu - gpuErrchk(cudaMemcpy(k_upanel.index, 
k_upanel.gpuPanel.index, - sizeof(int_t) * UidxSendCounts[k], cudaMemcpyDeviceToHost)); - } - - if (LidxSendCounts[k] > 0) - { - MPI_Bcast(k_lpanel.gpuPanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm); - MPI_Bcast(k_lpanel.gpuPanel.val, LvalSendCounts[k], MPI_DOUBLE, kcol(k), grid3d->rscp.comm); - gpuErrchk(cudaMemcpy(k_lpanel.index, k_lpanel.gpuPanel.index, - sizeof(int_t) * LidxSendCounts[k], cudaMemcpyDeviceToHost)); - } - SCT->tPanelBcast += (SuperLU_timer_() - t0); - return 0; -} - -int_t LUstruct_v100::dsparseTreeFactorGPU( - sForest_t *sforest, - ddiagFactBufs_t **dFBufs, // size maxEtree level - gEtreeInfo_t *gEtreeInfo, // global etree info - - int tag_ub) -{ - int_t nnodes = sforest->nNodes; // number of nodes in the tree - if (nnodes < 1) - { - return 1; - } - - int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization - treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; - int_t *myIperm = treeTopoInfo->myIperm; - int_t maxTopoLevel = treeTopoInfo->numLvl; - int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; - - /*main loop over all the levels*/ - int_t numLA = SUPERLU_MIN(A_gpu.numCudaStreams, getNumLookAhead(options)); - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorGPU()"); -#endif - printf("Using New C++ code with GPU acceleration\n"); fflush(stdout); - printf(". lookahead numLA %d\n", numLA); fflush(stdout); - - // start the pipeline. Sherry: need to free these 3 arrays - int *donePanelBcast = int32Malloc_dist(nnodes); - int *donePanelSolve = int32Malloc_dist(nnodes); - int *localNumChildrenLeft = int32Malloc_dist(nnodes); - - //TODO: not needed, remove after testing - for (int_t i = 0; i < nnodes; i++) - { - donePanelBcast[i] = 0; - donePanelSolve[i] = 0; - localNumChildrenLeft[i] = 0; - } - - for (int_t k0 = 0; k0 < nnodes; k0++) - { - int_t k = perm_c_supno[k0]; - int_t k_parent = gEtreeInfo->setree[k]; - int_t ik = myIperm[k_parent]; - if (ik > -1 && ik < nnodes) - localNumChildrenLeft[ik]++; - } - - // start the pipeline - int_t topoLvl = 0; - int_t k_st = eTreeTopLims[topoLvl]; - int_t k_end = eTreeTopLims[topoLvl + 1]; - - //TODO: make this asynchronous - for (int_t k0 = k_st; k0 < k_end; k0++) - { - int_t k = perm_c_supno[k0]; - int_t offset = 0; - // dDiagFactorPanelSolveGPU(k, offset, dFBufs); - dDFactPSolveGPU(k, offset, dFBufs); - donePanelSolve[k0]=1; - } - - //TODO: its really the panels that needs to be doubled - // everything else can remain as it is - int_t winSize = SUPERLU_MIN(numLA/2, eTreeTopLims[1]); - - printf(". 
lookahead winSize %d\n", winSize); fflush(stdout); - - for (int k0 = k_st; k0 < winSize; ++k0) - { - int_t k = perm_c_supno[k0]; - int_t offset = k0%numLA; - if(!donePanelBcast[k0]) - { - dPanelBcastGPU(k, offset); - donePanelBcast[k0] =1; - } - }/*for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)*/ - - int_t k1 =0; - int_t winParity=0; - int_t halfWin = numLA/2; - while(k1setree[k]; - /* L o o k A h e a d P a n e l U p d a t e */ - if(UidxSendCounts[k]>0 && LidxSendCounts[k]>0) - lookAheadUpdateGPU(offset, k,k_parent, k_lpanel,k_upanel); - } - - for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1+winSize); ++k0) - { - int_t k = perm_c_supno[k0]; - int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin); - SyncLookAheadUpdate(offset); - } - - for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1+winSize); ++k0) - { - int_t k = perm_c_supno[k0]; - int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin); - upanel_t k_upanel = getKUpanel(k,offset); - lpanel_t k_lpanel = getKLpanel(k,offset); - int_t k_parent = gEtreeInfo->setree[k]; - /* Look Ahead Panel Solve */ - if(k_parent < nsupers) - { - int_t k0_parent = myIperm[k_parent]; - if (k0_parent > 0 && k0_parent= 2) - printf("parent %d of node %d during second phase\n", k0_parent, k0); -#endif - int_t dOffset = 0; // this is wrong - // dDiagFactorPanelSolveGPU(k_parent, dOffset,dFBufs); - dDFactPSolveGPU(k_parent, dOffset,dFBufs); - donePanelSolve[k0_parent]=1; - } - } - } - - /*proceed with remaining SchurComplement update */ - if(UidxSendCounts[k]>0 && LidxSendCounts[k]>0) - dSchurCompUpdateExcludeOneGPU(offset, k,k_parent, k_lpanel,k_upanel); - - } - - int_t k1_next = k1+winSize; - int_t oldWinSize = winSize; - for (int_t k0_next = k1_next; k0_next < SUPERLU_MIN(nnodes, k1_next+winSize); ++k0_next) - { - int k_next = perm_c_supno[k0_next]; - if (!localNumChildrenLeft[k0_next]) - { - // int offset_next = (k0_next-k1_next)%winSize; - // if(!(winParity%2)) - // offset_next += halfWin; - - int_t offset_next = getBufferOffset(k0_next, k1_next, winSize, winParity+1, halfWin); - dPanelBcastGPU(k_next, offset_next); - donePanelBcast[k0_next] =1; - // printf("Trying %d on offset %d\n", k0_next, offset_next); - } - else - { - winSize = k0_next - k1_next; - break; - } - } - - for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1+oldWinSize); ++k0) - { - int_t k = perm_c_supno[k0]; - // int_t offset = (k0-k1)%oldWinSize; - // if(winParity%2) - // offset+= halfWin; - int_t offset = getBufferOffset(k0, k1, oldWinSize, winParity, halfWin); - // printf("Syncing stream %d on offset %d\n", k0, offset); - if(UidxSendCounts[k]>0 && LidxSendCounts[k]>0) - gpuErrchk(cudaStreamSynchronize(A_gpu.cuStreams[offset])); - } - - k1=k1_next; - winParity++; - } - -#if 0 - - for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) - { - /* code */ - int_t k_st = eTreeTopLims[topoLvl]; - int_t k_end = eTreeTopLims[topoLvl + 1]; - for (int_t k0 = k_st; k0 < k_end; ++k0) - { - int_t k = perm_c_supno[k0]; - - int_t ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[0]; - cudaStream_t cuStream = A_gpu.cuStreams[0]; - dDiagFactorPanelSolveGPU(k, 0, dFBufs); - /*======= Panel Broadcast ======*/ - // panelBcastGPU(k, 0); - int_t offset = k0%numLA; - dPanelBcastGPU(k, offset); - - /*======= Schurcomplement Update ======*/ - upanel_t k_upanel(UidxRecvBufs[offset], UvalRecvBufs[offset], - A_gpu.UidxRecvBufs[offset], A_gpu.UvalRecvBufs[offset]); - lpanel_t k_lpanel(LidxRecvBufs[offset], LvalRecvBufs[offset], - A_gpu.LidxRecvBufs[offset], 
A_gpu.LvalRecvBufs[offset]); - if (myrow == krow(k)) - { - k_upanel = uPanelVec[g2lRow(k)]; - } - if (mycol == kcol(k)) - k_lpanel = lPanelVec[g2lCol(k)]; - - if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) - { - int streamId = 0; - - -#if 0 - - dSchurComplementUpdateGPU( - streamId, - k, k_lpanel, k_upanel); -#else - int_t k_parent = gEtreeInfo->setree[k]; - lookAheadUpdateGPU( - streamId, - k, k_parent, k_lpanel, k_upanel); - dSchurCompUpdateExcludeOneGPU( - streamId, - k, k_parent, k_lpanel, k_upanel); - -#endif - - } - } /*for k0= k_st:k_end */ - } /*for topoLvl = 0:maxTopoLevel*/ - -#endif /* match #if 0 at line 562 before "for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)" */ - - - /* Sherry added 2/1/23 */ - SUPERLU_FREE(donePanelBcast); - SUPERLU_FREE(donePanelSolve); - SUPERLU_FREE(localNumChildrenLeft); - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorGPU()"); -#endif - - return 0; -} /* dsparseTreeFactorGPU */ - - -#if 0 // Sherry: old batch code -////////////////////////////////////////////////////////////////////////////////////////////////////////// - -void LUstruct_v100::dFactBatchSolve(int k_st, int k_end, int_t *perm_c_supno) -{ -#ifdef HAVE_MAGMA - // Marshall the data for the leaf level batched LU decomposition - LUMarshallData& mdata = A_gpu.marshall_data; - cudaStream_t stream = A_gpu.cuStreams[0]; - marshallBatchedLUData(k_st, k_end, perm_c_supno); - - int info = magma_dgetrf_vbatched( - mdata.dev_diag_dim_array, mdata.dev_diag_dim_array, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - NULL, mdata.dev_info_array, mdata.batchsize, - A_gpu.magma_queue - ); - - // Hackathon change: assuming individual matrices in a local batch are not distributed - // so we don't do the buffer copies anymore - // // Copy the diagonal block data over to the bcast buffers - // marshallBatchedBufferCopyData(k_st, k_end, perm_c_supno); - - // copyBlock_vbatch( - // stream, mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, mdata.max_diag, mdata.max_panel, - // mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - // mdata.batchsize - // ); - - // Upper panel triangular solves - marshallBatchedTRSMUData(k_st, k_end, perm_c_supno); - - magmablas_dtrsm_vbatched( - MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, - mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, 1.0, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - mdata.batchsize, A_gpu.magma_queue - ); - - // Lower panel triangular solves - marshallBatchedTRSMLData(k_st, k_end, perm_c_supno); - - magmablas_dtrsm_vbatched( - MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, - mdata.dev_panel_dim_array, mdata.dev_diag_dim_array, 1.0, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - mdata.batchsize, A_gpu.magma_queue - ); - - // Wajih: This should be converted to a single bcast if possible - // Hackathon change: assuming individual matrices in a local batch are not distributed - // for (int k0 = k_st; k0 < k_end; k0++) - // { - // int k = perm_c_supno[k0]; - // int offset = 0; - // /*======= Panel Broadcast ======*/ - // dPanelBcastGPU(k, offset); - // } - - // Initialize the schur complement update marshall data - initSCUMarshallData(k_st, k_end, perm_c_supno); - SCUMarshallData& sc_mdata = A_gpu.sc_marshall_data; - - // Keep marshalling while there are batches to be processed - int done_i = 0; - int total_batches = 0; - while(done_i == 0) - { - done_i = 
marshallSCUBatchedDataOuter(k_st, k_end, perm_c_supno); - int done_j = 0; - while(done_i == 0 && done_j == 0) - { - done_j = marshallSCUBatchedDataInner(k_st, k_end, perm_c_supno); - if(done_j != 1) - { - total_batches++; - magmablas_dgemm_vbatched_max_nocheck ( - MagmaNoTrans, MagmaNoTrans, sc_mdata.dev_m_array, sc_mdata.dev_n_array, sc_mdata.dev_k_array, - 1.0, sc_mdata.dev_A_ptrs, sc_mdata.dev_lda_array, sc_mdata.dev_B_ptrs, sc_mdata.dev_ldb_array, - 0.0, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, sc_mdata.batchsize, - sc_mdata.max_m, sc_mdata.max_n, sc_mdata.max_k, A_gpu.magma_queue - ); - - scatterGPU_batchDriver( - sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend, - sc_mdata.max_ilen, sc_mdata.max_jlen, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, - A_gpu.maxSuperSize, ldt, sc_mdata.dev_gpu_lpanels, sc_mdata.dev_gpu_upanels, - dA_gpu, sc_mdata.batchsize, A_gpu.cuStreams[0] - ); - } - } - } - // printf("SCU batches = %d\n", total_batches); -#endif -} - -int LUstruct_v100::dsparseTreeFactorBatchGPU( - sForest_t *sforest, - ddiagFactBufs_t **dFBufs, // size maxEtree level - gEtreeInfo_t *gEtreeInfo, // global etree info - int tag_ub) -{ -#ifdef HAVE_MAGMA - int nnodes = sforest->nNodes; // number of nodes in the tree - int topoLvl, k_st, k_end, k0, k, offset, ksupc; - if (nnodes < 1) - { - return 1; - } - - int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization - treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; - int_t *myIperm = treeTopoInfo->myIperm; - int_t maxTopoLevel = treeTopoInfo->numLvl; - int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; - - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorBatchGPU()"); -#endif - printf("Using level-based scheduling on GPU\n"); fflush(stdout); - - /* For all the leaves at level 0 */ - topoLvl = 0; - k_st = eTreeTopLims[topoLvl]; - k_end = eTreeTopLims[topoLvl + 1]; - printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end); fflush(stdout); - -#if 0 - //ToDo: make this batched - for (k0 = k_st; k0 < k_end; k0++) - { - k = perm_c_supno[k0]; - offset = k0 - k_st; - // dDiagFactorPanelSolveGPU(k, offset, dFBufs); - dDFactPSolveGPU(k, 0, offset, dFBufs); - - /*======= Panel Broadcast ======*/ - dPanelBcastGPU(k, offset); // does this only if (UidxSendCounts[k] > 0) - //donePanelSolve[k0]=1; - - /*======= Schurcomplement Update ======*/ - /* UidxSendCounts are computed in LUstruct_v100 constructor in LUpanels.cpp */ - if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) { - // k_upanel.checkCorrectness(); - int streamId = 0; - upanel_t k_upanel = getKUpanel(k,offset); - lpanel_t k_lpanel = getKLpanel(k,offset); - dSchurComplementUpdateGPU( streamId, - k, k_lpanel, k_upanel); - // cudaStreamSynchronize(cuStream); // there is sync inside the kernel - } - } - - /* Main loop over all the internal levels */ - for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl) { - - k_st = eTreeTopLims[topoLvl]; - k_end = eTreeTopLims[topoLvl + 1]; - - printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end); fflush(stdout); - - /* loop over all the nodes at level topoLvl */ - for (k0 = k_st; k0 < k_end; ++k0) { /* ToDo: batch this */ - k = perm_c_supno[k0]; - offset = k0 - k_st; - // offset = getBufferOffset(k0, k1, winSize, winParity, halfWin); - //ksupc = SuperSize(k); - - dDFactPSolveGPU(k, 0, offset, dFBufs); - - /*======= Panel Broadcast ======*/ - dPanelBcastGPU(k, offset); // does this only if (UidxSendCounts[k] > 0) - - /*======= Schurcomplement Update ======*/ - if 
(UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) - { - // k_upanel.checkCorrectness(); - int streamId = 0; - - //#define NDEBUG -#ifndef NDEBUG - checkGPU(); // ?? -#endif - upanel_t k_upanel = getKUpanel(k,offset); - lpanel_t k_lpanel = getKLpanel(k,offset); - dSchurComplementUpdateGPU(streamId, - k, k_lpanel, k_upanel); -// cudaStreamSynchronize(cuStream); // there is sync inside the kernel -#if 0 // Sherry commented out 7/4/23 -#ifndef NDEBUG - dSchurComplementUpdate(k, k_lpanel, k_upanel); // ?? why do this on CPU ? - cudaStreamSynchronize(cuStream); - checkGPU(); -#endif -#endif - } - // MPI_Barrier(grid3d->comm); - } /* end for k0= k_st:k_end */ - } /* end for topoLvl = 0:maxTopoLevel */ -#else - printf("Using batched code\n"); - double t0 = SuperLU_timer_(); - - // Copy the nodelist to the GPU - gpuErrchk(cudaMemcpy(A_gpu.dperm_c_supno, perm_c_supno, sizeof(int) * sforest->nNodes, cudaMemcpyHostToDevice)); - - // Solve level by level - dFactBatchSolve(k_st, k_end, perm_c_supno); - for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl) { - - k_st = eTreeTopLims[topoLvl]; - k_end = eTreeTopLims[topoLvl + 1]; - dFactBatchSolve(k_st, k_end, perm_c_supno); - } - printf("Batch time = %.3f\n", SuperLU_timer_() - t0); -#endif - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorBatchGPU()"); -#endif - -#else - printf("MAGMA is required for batched execution!\n"); - fflush(stdout); - exit(0); - -#endif /* match ifdef have_magma */ - - return 0; -} /* end dsparseTreeFactorBatchGPU */ - -#endif // Sherry: old batch code commented out -////////////////////////////////////////////////////////////////////////////////////////////////////////// - -//TODO: needs to be merged as a single factorization function -int_t LUstruct_v100::dsparseTreeFactorGPUBaseline( - sForest_t *sforest, - ddiagFactBufs_t **dFBufs, // size maxEtree level - gEtreeInfo_t *gEtreeInfo, // global etree info - - int tag_ub) -{ - int_t nnodes = sforest->nNodes; // number of nodes in the tree - if (nnodes < 1) - { - return 1; - } - - printf("Using New C++ code with GPU acceleration\n"); -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorGPUBaseline()"); -#endif - - int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization - treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; - int_t *myIperm = treeTopoInfo->myIperm; - int_t maxTopoLevel = treeTopoInfo->numLvl; - int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; - - /*main loop over all the levels*/ - int_t numLA = getNumLookAhead(options); - - // start the pipeline - - for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) - { - /* code */ - int_t k_st = eTreeTopLims[topoLvl]; - int_t k_end = eTreeTopLims[topoLvl + 1]; - for (int_t k0 = k_st; k0 < k_end; ++k0) - { - int_t k = perm_c_supno[k0]; - int_t offset = k0 - k_st; - int_t ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[0]; - cudaStream_t cuStream = A_gpu.cuStreams[0]; - /*======= Diagonal Factorization ======*/ - if (iam == procIJ(k, k)) - { -//#define NDEBUG -#ifndef NDEBUG - lPanelVec[g2lCol(k)].checkGPU(); - lPanelVec[g2lCol(k)].diagFactor(k, dFBufs[offset]->BlockUFactor, ksupc, - thresh, xsup, options, stat, info); - lPanelVec[g2lCol(k)].packDiagBlock(dFBufs[offset]->BlockLFactor, ksupc); -#endif - lPanelVec[g2lCol(k)].diagFactorPackDiagBlockGPU(k, - dFBufs[offset]->BlockUFactor, ksupc, // CPU pointers - dFBufs[offset]->BlockLFactor, ksupc, // CPU pointers - thresh, xsup, options, stat, info); -// 
cudaStreamSynchronize(cuStream); -#ifndef NDEBUG - cudaStreamSynchronize(cuStream); - lPanelVec[g2lCol(k)].checkGPU(); -#endif - } - - /*======= Diagonal Broadcast ======*/ - if (myrow == krow(k)) - MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc, - MPI_DOUBLE, kcol(k), (grid->rscp).comm); - if (mycol == kcol(k)) - MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc, - MPI_DOUBLE, krow(k), (grid->cscp).comm); - - /*======= Panel Update ======*/ - if (myrow == krow(k)) - { -#ifndef NDEBUG - uPanelVec[g2lRow(k)].checkGPU(); -#endif - cudaMemcpy(A_gpu.dFBufs[0], dFBufs[offset]->BlockLFactor, - ksupc * ksupc * sizeof(double), cudaMemcpyHostToDevice); - uPanelVec[g2lRow(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[0], ksupc); - cudaStreamSynchronize(cuStream); // synchronize befpre broadcast -#ifndef NDEBUG - uPanelVec[g2lRow(k)].panelSolve(ksupc, dFBufs[offset]->BlockLFactor, ksupc); - cudaStreamSynchronize(cuStream); - uPanelVec[g2lRow(k)].checkGPU(); -#endif - } - - if (mycol == kcol(k)) - { - cudaMemcpy(A_gpu.dFBufs[0], dFBufs[offset]->BlockUFactor, - ksupc * ksupc * sizeof(double), cudaMemcpyHostToDevice); - lPanelVec[g2lCol(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[0], ksupc); - cudaStreamSynchronize(cuStream); -#ifndef NDEBUG - lPanelVec[g2lCol(k)].panelSolve(ksupc, dFBufs[offset]->BlockUFactor, ksupc); - cudaStreamSynchronize(cuStream); - lPanelVec[g2lCol(k)].checkGPU(); -#endif - } - - /*======= Panel Broadcast ======*/ - upanel_t k_upanel(UidxRecvBufs[0], UvalRecvBufs[0], - A_gpu.UidxRecvBufs[0], A_gpu.UvalRecvBufs[0]); - lpanel_t k_lpanel(LidxRecvBufs[0], LvalRecvBufs[0], - A_gpu.LidxRecvBufs[0], A_gpu.LvalRecvBufs[0]); - if (myrow == krow(k)) - { - k_upanel = uPanelVec[g2lRow(k)]; - } - if (mycol == kcol(k)) - k_lpanel = lPanelVec[g2lCol(k)]; - - if (UidxSendCounts[k] > 0) - { - // assuming GPU direct is available - MPI_Bcast(k_upanel.gpuPanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm); - MPI_Bcast(k_upanel.gpuPanel.val, UvalSendCounts[k], MPI_DOUBLE, krow(k), grid3d->cscp.comm); - // copy the index to cpu - cudaMemcpy(k_upanel.index, k_upanel.gpuPanel.index, - sizeof(int_t) * UidxSendCounts[k], cudaMemcpyDeviceToHost); - -#ifndef NDEBUG - MPI_Bcast(k_upanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm); - MPI_Bcast(k_upanel.val, UvalSendCounts[k], MPI_DOUBLE, krow(k), grid3d->cscp.comm); -#endif - } - - if (LidxSendCounts[k] > 0) - { - MPI_Bcast(k_lpanel.gpuPanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm); - MPI_Bcast(k_lpanel.gpuPanel.val, LvalSendCounts[k], MPI_DOUBLE, kcol(k), grid3d->rscp.comm); - cudaMemcpy(k_lpanel.index, k_lpanel.gpuPanel.index, - sizeof(int_t) * LidxSendCounts[k], cudaMemcpyDeviceToHost); - -#ifndef NDEBUG - MPI_Bcast(k_lpanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm); - MPI_Bcast(k_lpanel.val, LvalSendCounts[k], MPI_DOUBLE, kcol(k), grid3d->rscp.comm); -#endif - } - - /*======= Schurcomplement Update ======*/ - - if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) - { - // k_upanel.checkCorrectness(); - int streamId = 0; -#ifndef NDEBUG - checkGPU(); -#endif - dSchurComplementUpdateGPU( - streamId, - k, k_lpanel, k_upanel); -// cudaStreamSynchronize(cuStream); // there is sync inside the kernel -#ifndef NDEBUG - dSchurComplementUpdate(k, k_lpanel, k_upanel); - cudaStreamSynchronize(cuStream); - checkGPU(); -#endif - } - // MPI_Barrier(grid3d->comm); - - } /*for k0= k_st:k_end */ - - } /*for topoLvl = 
0:maxTopoLevel*/ - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorGPUBaseline()"); -#endif - - return 0; -} /* dsparseTreeFactorGPUBaseline */ -#endif diff --git a/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp b/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp index 0a9a7b9b..acc738d7 100644 --- a/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp +++ b/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp @@ -720,97 +720,6 @@ void xLUstruct_t::marshallBatchedBufferCopyData(int k_st, int k_end, int_ #endif //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template -void xLUstruct_t::dFactBatchSolve(int k_st, int k_end, int_t *perm_c_supno) -{ -#if 0//def HAVE_MAGMA - //#ifdef HAVE_MAGMA - // Marshall the data for the leaf level batched LU decomposition - LUMarshallData &mdata = A_gpu.marshall_data; - cudaStream_t stream = A_gpu.cuStreams[0]; - marshallBatchedLUData(k_st, k_end, perm_c_supno); - - int info = magma_dgetrf_vbatched( - mdata.dev_diag_dim_array, mdata.dev_diag_dim_array, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - NULL, mdata.dev_info_array, mdata.batchsize, - A_gpu.magma_queue); - - // Hackathon change: assuming individual matrices in a local batch are not distributed - // so we don't do the buffer copies anymore - // // Copy the diagonal block data over to the bcast buffers - // marshallBatchedBufferCopyData(k_st, k_end, perm_c_supno); - - // copyBlock_vbatch( - // stream, mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, mdata.max_diag, mdata.max_panel, - // mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - // mdata.batchsize - // ); - - // Upper panel triangular solves - marshallBatchedTRSMUData(k_st, k_end, perm_c_supno); - - magmablas_dtrsm_vbatched( - MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, - mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, 1.0, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - mdata.batchsize, A_gpu.magma_queue); - - // Lower panel triangular solves - marshallBatchedTRSMLData(k_st, k_end, perm_c_supno); - - magmablas_dtrsm_vbatched( - MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, - mdata.dev_panel_dim_array, mdata.dev_diag_dim_array, 1.0, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - mdata.batchsize, A_gpu.magma_queue); - - // Wajih: This should be converted to a single bcast if possible - // Hackathon change: assuming individual matrices in a local batch are not distributed - // for (int k0 = k_st; k0 < k_end; k0++) - // { - // int k = perm_c_supno[k0]; - // int offset = 0; - // /*======= Panel Broadcast ======*/ - // dPanelBcastGPU(k, offset); - // } - - // Initialize the schur complement update marshall data - initSCUMarshallData(k_st, k_end, perm_c_supno); - xSCUMarshallData &sc_mdata = A_gpu.sc_marshall_data; - - // Keep marshalling while there are batches to be processed - int done_i = 0; - int total_batches = 0; - while (done_i == 0) - { - done_i = marshallSCUBatchedDataOuter(k_st, k_end, perm_c_supno); - int done_j = 0; - while (done_i == 0 && done_j == 0) - { - done_j = marshallSCUBatchedDataInner(k_st, k_end, perm_c_supno); - if (done_j != 1) - { - total_batches++; - magmablas_dgemm_vbatched_max_nocheck( - MagmaNoTrans, MagmaNoTrans, sc_mdata.dev_m_array, sc_mdata.dev_n_array, sc_mdata.dev_k_array, - 1.0, sc_mdata.dev_A_ptrs, sc_mdata.dev_lda_array, sc_mdata.dev_B_ptrs, 
sc_mdata.dev_ldb_array, - 0.0, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, sc_mdata.batchsize, - sc_mdata.max_m, sc_mdata.max_n, sc_mdata.max_k, A_gpu.magma_queue); - - scatterGPU_batchDriver( - sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend, - sc_mdata.max_ilen, sc_mdata.max_jlen, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, - A_gpu.maxSuperSize, ldt, sc_mdata.dev_gpu_lpanels, sc_mdata.dev_gpu_upanels, - dA_gpu, sc_mdata.batchsize, A_gpu.cuStreams[0]); - } - } - } - // printf("SCU batches = %d\n", total_batches); -#endif -} #if 0 //// This is not used anymore ////// @@ -821,7 +730,6 @@ int xLUstruct_t::dsparseTreeFactorBatchGPU( gEtreeInfo_t *gEtreeInfo, // global etree info int tag_ub) { -#if 0// def HAVE_MAGMA int nnodes = sforest->nNodes; // number of nodes in the tree int topoLvl, k_st, k_end, k0, k, offset, ksupc; if (nnodes < 1) @@ -848,8 +756,7 @@ int xLUstruct_t::dsparseTreeFactorBatchGPU( printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end); fflush(stdout); -#if 0 - //ToDo: make this batched + //ToDo: make this batched -- may use this when MAGMA is not available for (k0 = k_st; k0 < k_end; k0++) { k = perm_c_supno[k0]; @@ -920,40 +827,17 @@ n cudaStreamSynchronize(cuStream); // MPI_Barrier(grid3d->comm); } /* end for k0= k_st:k_end */ } /* end for topoLvl = 0:maxTopoLevel */ -#else - printf("Using batched code\n"); - double t0 = SuperLU_timer_(); - - // Copy the nodelist to the GPU - gpuErrchk(cudaMemcpy(A_gpu.dperm_c_supno, perm_c_supno, sizeof(int) * sforest->nNodes, cudaMemcpyHostToDevice)); - - // Solve level by level - dFactBatchSolve(k_st, k_end, perm_c_supno); - for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl) - { - - k_st = eTreeTopLims[topoLvl]; - k_end = eTreeTopLims[topoLvl + 1]; - dFactBatchSolve(k_st, k_end, perm_c_supno); - } - printf("Batch time = %.3f\n", SuperLU_timer_() - t0); -#endif #if (DEBUGlevel >= 1) CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorBatchGPU()"); #endif -#else - printf("MAGMA is required for batched execution!\n"); - fflush(stdout); - exit(0); - -#endif /* match ifdef have_magma */ - return 0; } /* end dsparseTreeFactorBatchGPU */ #endif /* match #if 0 */ + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/SRC/CplusplusFactor/lupanels_GPU.cuh b/SRC/CplusplusFactor/lupanels_GPU.cuh index 7b650f4b..2da76488 100644 --- a/SRC/CplusplusFactor/lupanels_GPU.cuh +++ b/SRC/CplusplusFactor/lupanels_GPU.cuh @@ -308,86 +308,7 @@ public: } }; -#if 0 -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// Wajih: Device and host memory used to store marshalled batch data for LU and TRSM -struct LUMarshallData -{ - LUMarshallData(); - ~LUMarshallData(); - - // Diagonal device pointer data - double **dev_diag_ptrs; - int *dev_diag_ld_array, *dev_diag_dim_array, *dev_info_array; - - // TRSM panel device pointer data - double **dev_panel_ptrs; - int *dev_panel_ld_array, *dev_panel_dim_array; - - // Max of marshalled device data - int max_panel, max_diag; - - // Number of marshalled operations - int batchsize; - // Data accumulated on the host - std::vector host_diag_ptrs; - std::vector host_diag_ld_array, host_diag_dim_array; - - std::vector host_panel_ptrs; - std::vector host_panel_ld_array, host_panel_dim_array; - - void setBatchSize(int batch_size); - void setMaxDiag(); - void setMaxPanel(); -}; - -// Wajih: Device and 
host memory used to store marshalled batch data for Schur complement update -struct SCUMarshallData -{ - SCUMarshallData(); - ~SCUMarshallData(); - - // GEMM device pointer data - double **dev_A_ptrs, **dev_B_ptrs, **dev_C_ptrs; - int *dev_lda_array, *dev_ldb_array, *dev_ldc_array; - int *dev_m_array, *dev_n_array, *dev_k_array; - - // Panel device pointer data and scu loop limits - lpanelGPU_t* dev_gpu_lpanels; - upanelGPU_t* dev_gpu_upanels; - int* dev_ist, *dev_iend, *dev_jst, *dev_jend; - int *dev_maxGemmRows, *dev_maxGemmCols; - - // Max of marshalled gemm device data - int max_m, max_n, max_k; - - // Max of marshalled loop limits - int max_ilen, max_jlen; - - // Number of marshalled operations - int batchsize; - - // Data accumulated on the host - std::vector host_A_ptrs, host_B_ptrs, host_C_ptrs; - std::vector host_lda_array, host_ldb_array, host_ldc_array; - std::vector host_m_array, host_n_array, host_k_array; - - // Host data initialized once per level - std::vector upanels; - std::vector lpanels; - std::vector host_gpu_upanels; - std::vector host_gpu_lpanels; - std::vector ist, iend, jst, jend, maxGemmRows, maxGemmCols; - - void setBatchSize(int batch_size); - void setMaxDims(); - void copyPanelDataToGPU(); - void copyToGPU(); -}; - -#endif ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #define MAX_CUDA_STREAMS 64 @@ -469,4 +390,4 @@ void scatterGPU_batchDriver( int max_ilen, int max_jlen, double **gemmBuff_ptrs, int *LDgemmBuff_batch, int maxSuperSize, int ldt, lpanelGPU_t *lpanels, upanelGPU_t *upanels, LUstructGPU_t *dA, int batchCount, cudaStream_t cuStream -); \ No newline at end of file +); diff --git a/SRC/CplusplusFactor/schurCompUpdate.cu b/SRC/CplusplusFactor/schurCompUpdate.cu deleted file mode 100644 index 8c01e15d..00000000 --- a/SRC/CplusplusFactor/schurCompUpdate.cu +++ /dev/null @@ -1,2106 +0,0 @@ -#if 1//def HAVE_CUDA -#include -#include -#include -#include -#include -#include -#include -#include "superlu_ddefs.h" -#include "lupanels_GPU.cuh" -#include "lupanels.hpp" - -cudaError_t checkCudaLocal(cudaError_t result) -{ - // #if defined(DEBUG) || defined(_DEBUG) - // printf("Checking cuda\n"); - if (result != cudaSuccess) - { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); - } - // #endif - return result; -} - -__global__ void indirectCopy(double *dest, double *src, int_t *idx, int n) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) - dest[idx[i]] = src[i]; -} - -/** - * @file schurCompUpdate.cu - * @brief This function copies the packed buffers to GPU and performs the sparse - initialization on GPU call indirectCopy, this is the kernel - * @param gpuValBasePtr is the base pointer of the GPU matrix - * @param valBufferPacked is the packed buffer of the matrix - * @param valIdx is the index of the packed buffer - */ -void copyToGPU(double *gpuValBasePtr, std::vector &valBufferPacked, - std::vector &valIdx) -{ - int nnzCount = valBufferPacked.size(); - // calculate the size of the packed buffers - int_t gpuLvalSizePacked = nnzCount * sizeof(double); - int_t gpuLidxSizePacked = nnzCount * sizeof(int_t); - // allocate the memory for the packed buffers on GPU - double *dlvalPacked; - int_t *dlidxPacked; - gpuErrchk(cudaMalloc(&dlvalPacked, gpuLvalSizePacked)); - gpuErrchk(cudaMalloc(&dlidxPacked, gpuLidxSizePacked)); - // copy the packed buffers from CPU to GPU - 
gpuErrchk(cudaMemcpy(dlvalPacked, valBufferPacked.data(), gpuLvalSizePacked, cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dlidxPacked, valIdx.data(), gpuLidxSizePacked, cudaMemcpyHostToDevice)); - // perform the sparse initialization on GPU call indirectCopy - const int ThreadblockSize = 256; - int nThreadBlocks = (nnzCount + ThreadblockSize - 1) / ThreadblockSize; - indirectCopy<<>>( - gpuValBasePtr, dlvalPacked, dlidxPacked, nnzCount); - // wait for it to finish and free dlvalPacked and dlidxPacked - gpuErrchk(cudaDeviceSynchronize()); - gpuErrchk(cudaFree(dlvalPacked)); - gpuErrchk(cudaFree(dlidxPacked)); -} - -// copy the panel to GPU -void copyToGPU_Sparse(double *gpuValBasePtr, double *valBuffer, int_t gpuLvalSize) -{ - // sparse Initialization for GPU, this is the experimental code - // find non-zero elements in the panel, their location and values and copy to GPU - int numDoubles = gpuLvalSize / sizeof(double); - std::vector valBufferPacked; - std::vector valIdx; - for (int_t i = 0; i < numDoubles; i++) - { - if (valBuffer[i] != 0) - { - valBufferPacked.push_back(valBuffer[i]); - valIdx.push_back(i); - } - } - printf("%d non-zero elements in the panel, wrt original=%d\n", valBufferPacked.size(), numDoubles); - // get the size of the packed buffers and allocate memory on GPU - copyToGPU(gpuValBasePtr, valBufferPacked, valIdx); -} - -//#define NDEBUG -__device__ - int_t - lpanelGPU_t::find(int_t k) -{ - int threadId = threadIdx.x; - __shared__ int idx; - __shared__ int found; - if (!threadId) - { - idx = -1; - found = 0; - } - - int nThreads = blockDim.x; - int blocksPerThreads = CEILING(nblocks(), nThreads); - __syncthreads(); - for (int blk = blocksPerThreads * threadIdx.x; - blk < blocksPerThreads * (threadIdx.x + 1); - blk++) - { - // if(found) break; - - if (blk < nblocks()) - { - if (k == gid(blk)) - { - idx = blk; - found = 1; - } - } - } - __syncthreads(); - return idx; -} - -__device__ - int_t - upanelGPU_t::find(int_t k) -{ - int threadId = threadIdx.x; - __shared__ int idx; - __shared__ int found; - if (!threadId) - { - idx = -1; - found = 0; - } - __syncthreads(); - - int nThreads = blockDim.x; - int blocksPerThreads = CEILING(nblocks(), nThreads); - - for (int blk = blocksPerThreads * threadIdx.x; - blk < blocksPerThreads * (threadIdx.x + 1); - blk++) - { - // if(found) break; - - if (blk < nblocks()) - { - if (k == gid(blk)) - { - idx = blk; - found = 1; - } - } - } - __syncthreads(); - return idx; -} - -__device__ int computeIndirectMapGPU(int *rcS2D, int_t srcLen, int_t *srcVec, - int_t dstLen, int_t *dstVec, - int *dstIdx) -{ - int threadId = threadIdx.x; - if (dstVec == NULL) /*uncompressed dimension*/ - { - if (threadId < srcLen) - rcS2D[threadId] = srcVec[threadId]; - __syncthreads(); - return 0; - } - - if (threadId < dstLen) - dstIdx[dstVec[threadId]] = threadId; - __syncthreads(); - - if (threadId < srcLen) - rcS2D[threadId] = dstIdx[srcVec[threadId]]; - __syncthreads(); - - return 0; -} - -__device__ void scatterGPU_dev( - int iSt, int jSt, - double *gemmBuff, int LDgemmBuff, - lpanelGPU_t& lpanel, upanelGPU_t& upanel, - LUstructGPU_t *dA -) -{ - // calculate gi,gj - int ii = iSt + blockIdx.x; - int jj = jSt + blockIdx.y; - int threadId = threadIdx.x; - - int gi = lpanel.gid(ii); - int gj = upanel.gid(jj); -#ifndef NDEBUG - // if (!threadId) - // printf("Scattering to (%d, %d) \n", gi, gj); -#endif - double *Dst; - int_t lddst; - int_t dstRowLen, dstColLen; - int_t *dstRowList; - int_t *dstColList; - int li, lj; - if (gj > gi) // its in upanel - { - li = 
dA->g2lRow(gi); - lj = dA->uPanelVec[li].find(gj); - Dst = dA->uPanelVec[li].blkPtr(lj); - lddst = dA->supersize(gi); - dstRowLen = dA->supersize(gi); - dstRowList = NULL; - dstColLen = dA->uPanelVec[li].nbcol(lj); - dstColList = dA->uPanelVec[li].colList(lj); - } - else - { - lj = dA->g2lCol(gj); - li = dA->lPanelVec[lj].find(gi); - Dst = dA->lPanelVec[lj].blkPtr(li); - lddst = dA->lPanelVec[lj].LDA(); - dstRowLen = dA->lPanelVec[lj].nbrow(li); - dstRowList = dA->lPanelVec[lj].rowList(li); - // if(!threadId ) - // printf("Scattering to (%d, %d) by %d li=%d\n",gi, gj,threadId,li); - dstColLen = dA->supersize(gj); - dstColList = NULL; - } - - // compute source row to dest row mapping - int maxSuperSize = dA->maxSuperSize; - extern __shared__ int baseSharedPtr[]; - int *rowS2D = baseSharedPtr; - int *colS2D = &rowS2D[maxSuperSize]; - int *dstIdx = &colS2D[maxSuperSize]; - - int nrows = lpanel.nbrow(ii); - int ncols = upanel.nbcol(jj); - // lpanel.rowList(ii), upanel.colList(jj) - - computeIndirectMapGPU(rowS2D, nrows, lpanel.rowList(ii), - dstRowLen, dstRowList, dstIdx); - - // compute source col to dest col mapping - computeIndirectMapGPU(colS2D, ncols, upanel.colList(jj), - dstColLen, dstColList, dstIdx); - - int nThreads = blockDim.x; - int colsPerThreadBlock = nThreads / nrows; - - int rowOff = lpanel.stRow(ii) - lpanel.stRow(iSt); - int colOff = upanel.stCol(jj) - upanel.stCol(jSt); - double *Src = &gemmBuff[rowOff + colOff * LDgemmBuff]; - int ldsrc = LDgemmBuff; - // TODO: this seems inefficient - if (threadId < nrows * colsPerThreadBlock) - { - /* 1D threads are logically arranged in 2D shape. */ - int i = threadId % nrows; - int j = threadId / nrows; - -#pragma unroll 4 - while (j < ncols) - { - -#define ATOMIC_SCATTER -// Atomic Scatter is need if I want to perform multiple Schur Complement -// update concurrently -#ifdef ATOMIC_SCATTER - atomicAdd(&Dst[rowS2D[i] + lddst * colS2D[j]], -Src[i + ldsrc * j]); -#else - Dst[rowS2D[i] + lddst * colS2D[j]] -= Src[i + ldsrc * j]; -#endif - j += colsPerThreadBlock; - } - } - - __syncthreads(); -} - -__global__ void scatterGPU( - int iSt, int jSt, - double *gemmBuff, int LDgemmBuff, - lpanelGPU_t lpanel, upanelGPU_t upanel, - LUstructGPU_t *dA) -{ - scatterGPU_dev(iSt, jSt, gemmBuff, LDgemmBuff, lpanel, upanel, dA); -} - -__global__ void scatterGPU_batch( - int* iSt_batch, int *iEnd_batch, int *jSt_batch, int *jEnd_batch, - double **gemmBuff_ptrs, int *LDgemmBuff_batch, lpanelGPU_t *lpanels, - upanelGPU_t *upanels, LUstructGPU_t *dA -) -{ - int batch_index = blockIdx.z; - int iSt = iSt_batch[batch_index], iEnd = iEnd_batch[batch_index]; - int jSt = jSt_batch[batch_index], jEnd = jEnd_batch[batch_index]; - - int ii = iSt + blockIdx.x; - int jj = jSt + blockIdx.y; - if(ii >= iEnd || jj >= jEnd) - return; - - double* gemmBuff = gemmBuff_ptrs[batch_index]; - if(gemmBuff == NULL) - return; - - int LDgemmBuff = LDgemmBuff_batch[batch_index]; - lpanelGPU_t& lpanel = lpanels[batch_index]; - upanelGPU_t& upanel = upanels[batch_index]; - scatterGPU_dev(iSt, jSt, gemmBuff, LDgemmBuff, lpanel, upanel, dA); -} - -void scatterGPU_driver( - int iSt, int iEnd, int jSt, int jEnd, double *gemmBuff, int LDgemmBuff, - int maxSuperSize, int ldt, lpanelGPU_t lpanel, upanelGPU_t upanel, - LUstructGPU_t *dA, cudaStream_t cuStream -) -{ - dim3 dimBlock(ldt); // 1d thread - dim3 dimGrid(iEnd - iSt, jEnd - jSt); - size_t sharedMemorySize = 3 * maxSuperSize * sizeof(int_t); - - scatterGPU<<>>( - iSt, jSt, gemmBuff, LDgemmBuff, lpanel, upanel, dA - ); - - 
gpuErrchk(cudaGetLastError()); -} - -void scatterGPU_batchDriver( - int* iSt_batch, int *iEnd_batch, int *jSt_batch, int *jEnd_batch, - int max_ilen, int max_jlen, double **gemmBuff_ptrs, int *LDgemmBuff_batch, - int maxSuperSize, int ldt, lpanelGPU_t *lpanels, upanelGPU_t *upanels, - LUstructGPU_t *dA, int batchCount, cudaStream_t cuStream -) -{ - const int op_increment = 65535; - - for(int op_start = 0; op_start < batchCount; op_start += op_increment) - { - int batch_size = std::min(op_increment, batchCount - op_start); - - dim3 dimBlock(ldt); // 1d thread - dim3 dimGrid(max_ilen, max_jlen, batch_size); - size_t sharedMemorySize = 3 * maxSuperSize * sizeof(int_t); - - scatterGPU_batch<<>>( - iSt_batch + op_start, iEnd_batch + op_start, jSt_batch + op_start, - jEnd_batch + op_start, gemmBuff_ptrs + op_start, LDgemmBuff_batch + op_start, - lpanels + op_start, upanels + op_start, dA - ); - } -} - -int_t LUstruct_v100::dSchurComplementUpdateGPU( - int streamId, - int_t k, // the k-th panel or supernode - lpanel_t &lpanel, upanel_t &upanel) -{ - - if (lpanel.isEmpty() || upanel.isEmpty()) - return 0; - - int_t st_lb = 0; - if (myrow == krow(k)) - st_lb = 1; - - int_t nlb = lpanel.nblocks(); - int_t nub = upanel.nblocks(); - - int iSt = st_lb; - int iEnd = iSt; - - int nrows = lpanel.stRow(nlb) - lpanel.stRow(st_lb); - int ncols = upanel.nzcols(); - - int maxGemmRows = nrows; - int maxGemmCols = ncols; - // entire gemm doesn't fit in gemm buffer - if (nrows * ncols > A_gpu.gemmBufferSize) - { - int maxGemmOpSize = (int)sqrt(A_gpu.gemmBufferSize); - int numberofRowChunks = (nrows + maxGemmOpSize - 1) / maxGemmOpSize; - maxGemmRows = nrows / numberofRowChunks; - maxGemmCols = A_gpu.gemmBufferSize / maxGemmRows; - /* printf("buffer exceeded! k = %d, st_lb = %d, nlb = %d, nrowsXncols %d, maxGemRows %d, maxGemmCols %d\n", - k, st_lb, nlb, nrows*ncols, maxGemmRows, maxGemmCols);*/ - } - - while (iEnd < nlb) - { - iSt = iEnd; - iEnd = lpanel.getEndBlock(iSt, maxGemmRows); - - assert(iEnd > iSt); - int jSt = 0; - int jEnd = 0; - while (jEnd < nub) - { - jSt = jEnd; - jEnd = upanel.getEndBlock(jSt, maxGemmCols); - assert(jEnd > jSt); - cublasHandle_t handle = A_gpu.cuHandles[streamId]; - cudaStream_t cuStream = A_gpu.cuStreams[streamId]; - cublasSetStream(handle, cuStream); - int gemm_m = lpanel.stRow(iEnd) - lpanel.stRow(iSt); - int gemm_n = upanel.stCol(jEnd) - upanel.stCol(jSt); - int gemm_k = supersize(k); -#if 0 - printf("k = %d, iSt = %d, iEnd = %d, jst = %d, jend = %d\n", k, iSt, iEnd, jSt, jEnd); - printf("m=%d, n=%d, k=%d\n", gemm_m, gemm_n, gemm_k); - fflush(stdout); -#endif - - double alpha = 1.0; - double beta = 0.0; - cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, - gemm_m, gemm_n, gemm_k, &alpha, - lpanel.blkPtrGPU(iSt), lpanel.LDA(), - upanel.blkPtrGPU(jSt), upanel.LDA(), &beta, - A_gpu.gpuGemmBuffs[streamId], gemm_m); - - scatterGPU_driver( - iSt, iEnd, jSt, jEnd, A_gpu.gpuGemmBuffs[streamId], gemm_m, - A_gpu.maxSuperSize, ldt, lpanel.gpuPanel, upanel.gpuPanel, - dA_gpu, cuStream - ); - } - } - gpuErrchk(cudaStreamSynchronize(A_gpu.cuStreams[streamId])); - return 0; -} /* end dSchurComplementUpdateGPU */ - -int_t LUstruct_v100::lookAheadUpdateGPU( - int streamId, - int_t k, int_t laIdx, lpanel_t &lpanel, upanel_t &upanel) -{ - if (lpanel.isEmpty() || upanel.isEmpty()) - return 0; - - int_t st_lb = 0; - if (myrow == krow(k)) - st_lb = 1; - - int_t nlb = lpanel.nblocks(); - int_t nub = upanel.nblocks(); - - int_t laILoc = lpanel.find(laIdx); - int_t laJLoc = upanel.find(laIdx); - - int iSt 
= st_lb; - int jSt = 0; - - /* call look ahead update on Lpanel*/ - if (laJLoc != GLOBAL_BLOCK_NOT_FOUND) - dSchurCompUpdatePartGPU( - iSt, nlb, laJLoc, laJLoc + 1, - k, lpanel, upanel, - A_gpu.lookAheadLHandle[streamId], A_gpu.lookAheadLStream[streamId], - A_gpu.lookAheadLGemmBuffer[streamId]); - - /* call look ahead update on Upanel*/ - if (laILoc != GLOBAL_BLOCK_NOT_FOUND) - { - dSchurCompUpdatePartGPU( - laILoc, laILoc + 1, jSt, laJLoc, - k, lpanel, upanel, - A_gpu.lookAheadUHandle[streamId], A_gpu.lookAheadUStream[streamId], - A_gpu.lookAheadUGemmBuffer[streamId]); - dSchurCompUpdatePartGPU( - laILoc, laILoc + 1, laJLoc + 1, nub, - k, lpanel, upanel, - A_gpu.lookAheadUHandle[streamId], A_gpu.lookAheadUStream[streamId], - A_gpu.lookAheadUGemmBuffer[streamId]); - } - - // checkCudaLocal(cudaStreamSynchronize(A_gpu.lookAheadLStream[streamId])); - // checkCudaLocal(cudaStreamSynchronize(A_gpu.lookAheadUStream[streamId])); - - return 0; -} - -int_t LUstruct_v100::SyncLookAheadUpdate(int streamId) -{ - gpuErrchk(cudaStreamSynchronize(A_gpu.lookAheadLStream[streamId])); - gpuErrchk(cudaStreamSynchronize(A_gpu.lookAheadUStream[streamId])); - - return 0; -} - -int_t LUstruct_v100::dSchurCompUpdateExcludeOneGPU( - int streamId, - int_t k, int_t ex, // suypernodes to be excluded - lpanel_t &lpanel, upanel_t &upanel) -{ - if (lpanel.isEmpty() || upanel.isEmpty()) - return 0; - - int_t st_lb = 0; - if (myrow == krow(k)) - st_lb = 1; - - int_t nlb = lpanel.nblocks(); - int_t nub = upanel.nblocks(); - - int_t exILoc = lpanel.find(ex); - int_t exJLoc = upanel.find(ex); - - dSchurCompUpLimitedMem( - streamId, - st_lb, exILoc, 0, exJLoc, - k, lpanel, upanel); - - dSchurCompUpLimitedMem( - streamId, - st_lb, exILoc, exJLoc + 1, nub, - k, lpanel, upanel); - - int_t nextStI = exILoc + 1; - if (exILoc == GLOBAL_BLOCK_NOT_FOUND) - nextStI = st_lb; - /* - for j we don't need to change since, if exJLoc == GLOBAL_BLOCK_NOT_FOUND =-1 - then exJLoc+1 =0 will work out correctly as starting j - */ - dSchurCompUpLimitedMem( - streamId, - nextStI, nlb, 0, exJLoc, - k, lpanel, upanel); - - dSchurCompUpLimitedMem( - streamId, - nextStI, nlb, exJLoc + 1, nub, - k, lpanel, upanel); - - // checkCudaLocal(cudaStreamSynchronize(A_gpu.cuStreams[streamId])); - return 0; -} - -int_t LUstruct_v100::dSchurCompUpdatePartGPU( - int_t iSt, int_t iEnd, int_t jSt, int_t jEnd, - int_t k, lpanel_t &lpanel, upanel_t &upanel, - cublasHandle_t handle, cudaStream_t cuStream, - double *gemmBuff) -{ - if (iSt >= iEnd || jSt >= jEnd) - return 0; - - cublasSetStream(handle, cuStream); - int gemm_m = lpanel.stRow(iEnd) - lpanel.stRow(iSt); - int gemm_n = upanel.stCol(jEnd) - upanel.stCol(jSt); - int gemm_k = supersize(k); - double alpha = 1.0; - double beta = 0.0; -#ifndef NDEBUG - // printf("m=%d, n=%d, k=%d\n", gemm_m, gemm_n, gemm_k); -#endif - cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, - gemm_m, gemm_n, gemm_k, &alpha, - lpanel.blkPtrGPU(iSt), lpanel.LDA(), - upanel.blkPtrGPU(jSt), upanel.LDA(), &beta, - gemmBuff, gemm_m); - - // setting up scatter - dim3 dimBlock(ldt); // 1d thread - dim3 dimGrid(iEnd - iSt, jEnd - jSt); - size_t sharedMemorySize = 3 * A_gpu.maxSuperSize * sizeof(int_t); - - scatterGPU<<>>( - iSt, jSt, - gemmBuff, gemm_m, - lpanel.gpuPanel, upanel.gpuPanel, dA_gpu); - - return 0; -} - -int_t LUstruct_v100::dSchurCompUpLimitedMem( - int streamId, - int_t lStart, int_t lEnd, - int_t uStart, int_t uEnd, - int_t k, lpanel_t &lpanel, upanel_t &upanel) -{ - - if (lStart >= lEnd || uStart >= uEnd) - return 0; - int iSt = 
lStart; - int iEnd = iSt; - int nrows = lpanel.stRow(lEnd) - lpanel.stRow(lStart); - int ncols = upanel.stCol(uEnd) - upanel.stCol(uStart); - - int maxGemmRows = nrows; - int maxGemmCols = ncols; - // entire gemm doesn't fit in gemm buffer - if (nrows * ncols > A_gpu.gemmBufferSize) - { - int maxGemmOpSize = (int)sqrt(A_gpu.gemmBufferSize); - int numberofRowChunks = (nrows + maxGemmOpSize - 1) / maxGemmOpSize; - maxGemmRows = nrows / numberofRowChunks; - maxGemmCols = A_gpu.gemmBufferSize / maxGemmRows; - } - - while (iEnd < lEnd) - { - iSt = iEnd; - iEnd = lpanel.getEndBlock(iSt, maxGemmRows); - if (iEnd > lEnd) - iEnd = lEnd; - - assert(iEnd > iSt); - int jSt = uStart; - int jEnd = uStart; - while (jEnd < uEnd) - { - jSt = jEnd; - jEnd = upanel.getEndBlock(jSt, maxGemmCols); - if (jEnd > uEnd) - jEnd = uEnd; - - cublasHandle_t handle = A_gpu.cuHandles[streamId]; - cudaStream_t cuStream = A_gpu.cuStreams[streamId]; - dSchurCompUpdatePartGPU(iSt, iEnd, jSt, jEnd, - k, lpanel, upanel, handle, cuStream, A_gpu.gpuGemmBuffs[streamId]); - } - } - - return 0; -} - -int getMPIProcsPerGPU() -{ - if (!(getenv("MPI_PROCESS_PER_GPU"))) - { - return 1; - } - else - { - int devCount; - cudaGetDeviceCount(&devCount); - int envCount = atoi(getenv("MPI_PROCESS_PER_GPU")); - envCount = SUPERLU_MAX(envCount, 1); - printf("MPI_PROCESS_PER_GPU=%d, devCount=%d\n", envCount, devCount); - return SUPERLU_MIN(envCount, devCount); - } -} - -#define USABLE_GPU_MEM_FRACTION 0.9 - -size_t getGPUMemPerProcs(MPI_Comm baseCommunicator) -{ - - size_t mfree, mtotal; - // TODO: shared memory communicator should be part of - // LU struct - // MPI_Comm sharedComm; - // MPI_Comm_split_type(baseCommunicator, MPI_COMM_TYPE_SHARED, - // 0, MPI_INFO_NULL, &sharedComm); - // MPI_Barrier(sharedComm); - cudaMemGetInfo(&mfree, &mtotal); - // MPI_Barrier(sharedComm); - // MPI_Comm_free(&sharedComm); -#if 0 - printf("Total memory %zu & free memory %zu\n", mtotal, mfree); -#endif - return (size_t)(USABLE_GPU_MEM_FRACTION * (double)mfree) / getMPIProcsPerGPU(); -} - -int_t LUstruct_v100::setLUstruct_GPU() -{ - int i, stream; - -#if (DEBUGlevel >= 1) - int iam = 0; - CHECK_MALLOC(iam, "Enter setLUstruct_GPU()"); fflush(stdout); -#endif - - A_gpu.Pr = Pr; - A_gpu.Pc = Pc; - A_gpu.maxSuperSize = ldt; - - /* Sherry: this mapping may be inefficient on Frontier */ - /*Mapping to device*/ - int deviceCount; - cudaGetDeviceCount(&deviceCount); // How many GPUs? 
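/* Editor's aside -- an illustrative sketch, not code from the original file: the
 * MPI-rank-to-GPU mapping computed just below assigns consecutive ranks to the same
 * device.  For example, with MPI_PROCESS_PER_GPU=2 and 4 visible devices,
 * ranks 0,1 -> device 0, ranks 2,3 -> device 1, ranks 4,5 -> device 2. */
static int map_rank_to_device(int iam, int procs_per_gpu)
{
    /* mirrors device_id = iam / get_mpi_process_per_gpu() in setLUstruct_GPU below */
    return iam / procs_per_gpu;   /* e.g. iam = 5, procs_per_gpu = 2 -> device 2 */
}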
- int device_id = (int)(grid3d->iam/get_mpi_process_per_gpu ()); //YL: allow multiple MPIs per GPU - - char *ttemp; - ttemp = getenv ("SUPERLU_BIND_MPI_GPU"); - if (ttemp) { - cudaSetDevice(device_id); - } - - double tRegion[5]; - size_t useableGPUMem = getGPUMemPerProcs(grid3d->comm); - /** - * Memory is divided into two parts data memory and buffer memory - * data memory is used for useful data - * bufferMemory is used for buffers - * */ - size_t memReqData = 0; - - /*Memory for XSUP*/ - memReqData += (nsupers + 1) * sizeof(int_t); - - tRegion[0] = SuperLU_timer_(); - - size_t totalNzvalSize = 0; /* too big for gemmBufferSize */ - size_t max_gemmCsize = 0; /* Sherry added 2/20/2023 */ - size_t max_nzrow = 0; /* Yang added 10/20/2023 */ - size_t max_nzcol = 0; - - /*Memory for lpapenl and upanel Data*/ - for (i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - { - memReqData += lPanelVec[i].totalSize(); - totalNzvalSize += lPanelVec[i].nzvalSize(); - if(lPanelVec[i].nzvalSize()>0) - max_nzrow = SUPERLU_MAX(lPanelVec[i].nzrows(),max_nzrow); - //max_gemmCsize = SUPERoLU_MAX(max_gemmCsize, ???); - } - } - for (i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - memReqData += uPanelVec[i].totalSize(); - totalNzvalSize += uPanelVec[i].nzvalSize(); - if(uPanelVec[i].nzvalSize()>0) - max_nzcol = SUPERLU_MAX(uPanelVec[i].nzcols(),max_nzcol); - } - } - max_gemmCsize = max_nzcol*max_nzrow; - - memReqData += CEILING(nsupers, Pc) * sizeof(lpanelGPU_t); - memReqData += CEILING(nsupers, Pr) * sizeof(upanelGPU_t); - - memReqData += sizeof(LUstructGPU_t); - - // Per stream data - // TODO: estimate based on ancestor size - int_t maxBuffSize = sp_ienv_dist (8, options); - int maxsup = sp_ienv_dist(3, options); // max. supernode size - maxBuffSize = SUPERLU_MAX(maxsup * maxsup, maxBuffSize); // Sherry added 7/10/23 - #if 0 - A_gpu.gemmBufferSize = SUPERLU_MIN(maxBuffSize, totalNzvalSize); - #else - A_gpu.gemmBufferSize = SUPERLU_MIN(maxBuffSize, SUPERLU_MAX(max_gemmCsize,totalNzvalSize)); /* Yang added 10/20/2023 */ - #endif - size_t dataPerStream = 3 * sizeof(double) * maxLvalCount + 3 * sizeof(double) * maxUvalCount + 2 * sizeof(int_t) * maxLidxCount + 2 * sizeof(int_t) * maxUidxCount + A_gpu.gemmBufferSize * sizeof(double) + ldt * ldt * sizeof(double); - if (memReqData + 2 * dataPerStream > useableGPUMem) - { - printf("Not enough memory on GPU: available = %zu, required for 2 streams =%zu, exiting\n", useableGPUMem, memReqData + 2 * dataPerStream); - exit(-1); - } - - tRegion[0] = SuperLU_timer_() - tRegion[0]; -#if ( PRNTlevel>=1 ) - // print the time taken to estimate memory on GPU - if (grid3d->iam == 0) - { - printf("GPU deviceCount=%d\n", deviceCount); - printf("Time taken to estimate memory on GPU: %f\n", tRegion[0]); - printf("\t.. 
totalNzvalSize %ld, gemmBufferSize %ld\n", - (long) totalNzvalSize, (long) A_gpu.gemmBufferSize); - } -#endif - - /*Memory for lapenlPanel Data*/ - tRegion[1] = SuperLU_timer_(); - - int_t maxNumberOfStream = (useableGPUMem - memReqData) / dataPerStream; - - int numberOfStreams = SUPERLU_MIN(getNumLookAhead(options), maxNumberOfStream); - numberOfStreams = SUPERLU_MIN(numberOfStreams, MAX_CUDA_STREAMS); - int rNumberOfStreams; - MPI_Allreduce(&numberOfStreams, &rNumberOfStreams, 1, - MPI_INT, MPI_MIN, grid3d->comm); - A_gpu.numCudaStreams = rNumberOfStreams; - - if (!grid3d->iam) - printf("Using %d CUDA LookAhead streams\n", rNumberOfStreams); - size_t totalMemoryRequired = memReqData + numberOfStreams * dataPerStream; - -#if 0 /**** Old code ****/ - upanelGPU_t *uPanelVec_GPU = new upanelGPU_t[CEILING(nsupers, Pr)]; - lpanelGPU_t *lPanelVec_GPU = new lpanelGPU_t[CEILING(nsupers, Pc)]; - void *gpuBasePtr, *gpuCurrentPtr; - cudaMalloc(&gpuBasePtr, totalMemoryRequired); - gpuCurrentPtr = gpuBasePtr; - - A_gpu.xsup = (int_t *)gpuCurrentPtr; - gpuCurrentPtr = (int_t *)gpuCurrentPtr + (nsupers + 1); - cudaMemcpy(A_gpu.xsup, xsup, (nsupers + 1) * sizeof(int_t), cudaMemcpyHostToDevice); - - for (int i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - { - lPanelVec_GPU[i] = lPanelVec[i].copyToGPU(gpuCurrentPtr); - gpuCurrentPtr = (char *)gpuCurrentPtr + lPanelVec[i].totalSize(); - } - } - A_gpu.lPanelVec = (lpanelGPU_t *)gpuCurrentPtr; - gpuCurrentPtr = (char *)gpuCurrentPtr + CEILING(nsupers, Pc) * sizeof(lpanelGPU_t); - cudaMemcpy(A_gpu.lPanelVec, lPanelVec_GPU, - CEILING(nsupers, Pc) * sizeof(lpanelGPU_t), cudaMemcpyHostToDevice); - - for (int i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - uPanelVec_GPU[i] = uPanelVec[i].copyToGPU(gpuCurrentPtr); - gpuCurrentPtr = (char *)gpuCurrentPtr + uPanelVec[i].totalSize(); - } - } - A_gpu.uPanelVec = (upanelGPU_t *)gpuCurrentPtr; - gpuCurrentPtr = (char *)gpuCurrentPtr + CEILING(nsupers, Pr) * sizeof(upanelGPU_t); - cudaMemcpy(A_gpu.uPanelVec, uPanelVec_GPU, - CEILING(nsupers, Pr) * sizeof(upanelGPU_t), cudaMemcpyHostToDevice); - - for (int stream = 0; stream < A_gpu.numCudaStreams; stream++) - { - - cudaStreamCreate(&A_gpu.cuStreams[stream]); - cublasCreate(&A_gpu.cuHandles[stream]); - A_gpu.LvalRecvBufs[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + maxLvalCount; - A_gpu.UvalRecvBufs[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + maxUvalCount; - A_gpu.LidxRecvBufs[stream] = (int_t *)gpuCurrentPtr; - gpuCurrentPtr = (int_t *)gpuCurrentPtr + maxLidxCount; - A_gpu.UidxRecvBufs[stream] = (int_t *)gpuCurrentPtr; - gpuCurrentPtr = (int_t *)gpuCurrentPtr + maxUidxCount; - - A_gpu.gpuGemmBuffs[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + A_gpu.gemmBufferSize; - A_gpu.dFBufs[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + ldt * ldt; - - /*lookAhead buffers and stream*/ - cublasCreate(&A_gpu.lookAheadLHandle[stream]); - cudaStreamCreate(&A_gpu.lookAheadLStream[stream]); - A_gpu.lookAheadLGemmBuffer[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + maxLvalCount; - cublasCreate(&A_gpu.lookAheadUHandle[stream]); - cudaStreamCreate(&A_gpu.lookAheadUStream[stream]); - A_gpu.lookAheadUGemmBuffer[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + 
maxUvalCount; - } - // cudaCheckError(); - // allocate - dA_gpu = (LUstructGPU_t *)gpuCurrentPtr; - - cudaMemcpy(dA_gpu, &A_gpu, sizeof(LUstructGPU_t), cudaMemcpyHostToDevice); - gpuCurrentPtr = (LUstructGPU_t *)gpuCurrentPtr + 1; - -#else /* else of #if 0 ----> this is the current active code - Sherry */ - gpuErrchk(cudaMalloc(&A_gpu.xsup, (nsupers + 1) * sizeof(int_t))); - gpuErrchk(cudaMemcpy(A_gpu.xsup, xsup, (nsupers + 1) * sizeof(int_t), cudaMemcpyHostToDevice)); - - double tLsend, tUsend; -#if 0 - tLsend = SuperLU_timer_(); - upanelGPU_t *uPanelVec_GPU = copyUpanelsToGPU(); - tLsend = SuperLU_timer_() - tLsend; - tUsend = SuperLU_timer_(); - lpanelGPU_t *lPanelVec_GPU = copyLpanelsToGPU(); - tUsend = SuperLU_timer_() - tUsend; -#else - upanelGPU_t *uPanelVec_GPU = new upanelGPU_t[CEILING(nsupers, Pr)]; - lpanelGPU_t *lPanelVec_GPU = new lpanelGPU_t[CEILING(nsupers, Pc)]; - tLsend = SuperLU_timer_(); - for (i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - lPanelVec_GPU[i] = lPanelVec[i].copyToGPU(); - } - tLsend = SuperLU_timer_() - tLsend; - tUsend = SuperLU_timer_(); - // cudaCheckError(); - for (i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - uPanelVec_GPU[i] = uPanelVec[i].copyToGPU(); - } - tUsend = SuperLU_timer_() - tUsend; -#endif - tRegion[1] = SuperLU_timer_() - tRegion[1]; - printf("TRegion L,U send: \t %g\n", tRegion[1]); - printf("Time to send Lpanel=%g and U panels =%g \n", tLsend, tUsend); - fflush(stdout); - - gpuErrchk(cudaMalloc(&A_gpu.lPanelVec, CEILING(nsupers, Pc) * sizeof(lpanelGPU_t))); - gpuErrchk(cudaMemcpy(A_gpu.lPanelVec, lPanelVec_GPU, - CEILING(nsupers, Pc) * sizeof(lpanelGPU_t), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMalloc(&A_gpu.uPanelVec, CEILING(nsupers, Pr) * sizeof(upanelGPU_t))); - gpuErrchk(cudaMemcpy(A_gpu.uPanelVec, uPanelVec_GPU, - CEILING(nsupers, Pr) * sizeof(upanelGPU_t), cudaMemcpyHostToDevice)); - - delete [] uPanelVec_GPU; - delete [] lPanelVec_GPU; - - tRegion[2] = SuperLU_timer_(); - int dfactBufSize = 0; - // TODO: does it work with NULL pointer? - cusolverDnHandle_t cusolverH = NULL; - cusolverDnCreate(&cusolverH); - - cusolverDnDgetrf_bufferSize(cusolverH, ldt, ldt, NULL, ldt, &dfactBufSize); - - cusolverDnDestroy(cusolverH); - printf("Size of dfactBuf is %d\n", dfactBufSize); - tRegion[2] = SuperLU_timer_() - tRegion[2]; - printf("TRegion dfactBuf: \t %g\n", tRegion[2]); - fflush(stdout); - - tRegion[3] = SuperLU_timer_(); - - double tcuMalloc=SuperLU_timer_(); - - /* Sherry: where are these freed ?? 
*/ - for (stream = 0; stream < A_gpu.numCudaStreams; stream++) - { - gpuErrchk(cudaMalloc(&A_gpu.LvalRecvBufs[stream], sizeof(double) * maxLvalCount)); - gpuErrchk(cudaMalloc(&A_gpu.UvalRecvBufs[stream], sizeof(double) * maxUvalCount)); - gpuErrchk(cudaMalloc(&A_gpu.LidxRecvBufs[stream], sizeof(int_t) * maxLidxCount)); - gpuErrchk(cudaMalloc(&A_gpu.UidxRecvBufs[stream], sizeof(int_t) * maxUidxCount)); - // allocate the space for diagonal factor on GPU - gpuErrchk(cudaMalloc(&A_gpu.diagFactWork[stream], sizeof(double) * dfactBufSize)); - gpuErrchk(cudaMalloc(&A_gpu.diagFactInfo[stream], sizeof(int))); - - /*lookAhead buffers and stream*/ - gpuErrchk(cudaMalloc(&A_gpu.lookAheadLGemmBuffer[stream], sizeof(double) * maxLvalCount)); - - gpuErrchk(cudaMalloc(&A_gpu.lookAheadUGemmBuffer[stream], sizeof(double) * maxUvalCount)); - // Sherry: replace this by new code - //cudaMalloc(&A_gpu.dFBufs[stream], ldt * ldt * sizeof(double)); - //cudaMalloc(&A_gpu.gpuGemmBuffs[stream], A_gpu.gemmBufferSize * sizeof(double)); - } - - /* Sherry: dfBufs[] changed to double pointer **, max(batch, numCudaStreams) */ - int mxLeafNode = trf3Dpartition->mxLeafNode, mx_fsize = 0; - - /* Compute gemmCsize[] for batch operations - !!!!!!! WARNING: this only works for 1 MPI !!!!!! */ - if ( options->batchCount > 0 ) { - trf3Dpartition->gemmCsizes = int32Calloc_dist(mxLeafNode); - int k, k0, k_st, k_end, offset, Csize; - - for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { /* Loop through the Pz tree levels */ - int treeId = trf3Dpartition->myTreeIdxs[ilvl]; - sForest_t* sforest = trf3Dpartition->sForests[treeId]; - if (sforest){ - int_t *perm_c_supno = sforest->nodeList ; - mx_fsize = max((int_t)mx_fsize, sforest->nNodes); - - int maxTopoLevel = sforest->topoInfo.numLvl;/* number of levels at each outer-tree node */ - for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) { - k_st = sforest->topoInfo.eTreeTopLims[topoLvl]; - k_end = sforest->topoInfo.eTreeTopLims[topoLvl + 1]; - - for (k0 = k_st; k0 < k_end; ++k0) { - offset = k0 - k_st; - k = perm_c_supno[k0]; - Csize = lPanelVec[k].nzrows() * uPanelVec[k].nzcols(); - trf3Dpartition->gemmCsizes[offset] = - SUPERLU_MAX(trf3Dpartition->gemmCsizes[offset], Csize); - } - } - } - } - } - - int num_dfbufs; /* number of diagonal buffers */ - if ( options->batchCount > 0 ) { /* use batch code */ - num_dfbufs = mxLeafNode; - } else { /* use pipelined code */ - // num_dfbufs = MAX_CUDA_STREAMS; // - num_dfbufs = A_gpu.numCudaStreams; - } - int num_gemmbufs = num_dfbufs; - printf(".. 
setLUstrut_GPU: num_dfbufs %d, num_gemmbufs %d\n", num_dfbufs, num_gemmbufs); fflush(stdout); - - A_gpu.dFBufs = (double **) SUPERLU_MALLOC(num_dfbufs * sizeof(double *)); - A_gpu.gpuGemmBuffs = (double **) SUPERLU_MALLOC(num_gemmbufs * sizeof(double *)); - - int l, sum_diag_size = 0, sum_gemmC_size = 0; - - if ( options->batchCount > 0 ) { /* set up variable-size buffers for batch code */ - for (i = 0; i < num_dfbufs; ++i) { - l = trf3Dpartition->diagDims[i]; - gpuErrchk(cudaMalloc(&(A_gpu.dFBufs[i]), l * l * sizeof(double))); - //printf("\t diagDims[%d] %d\n", i, l); - gpuErrchk(cudaMalloc(&(A_gpu.gpuGemmBuffs[i]), trf3Dpartition->gemmCsizes[i] * sizeof(double))); - sum_diag_size += l * l; - sum_gemmC_size += trf3Dpartition->gemmCsizes[i]; - } - } else { /* uniform-size buffers */ - l = ldt * ldt; - for (i = 0; i < num_dfbufs; ++i) { - gpuErrchk(cudaMalloc(&(A_gpu.dFBufs[i]), l * sizeof(double))); - gpuErrchk(cudaMalloc(&(A_gpu.gpuGemmBuffs[i]), A_gpu.gemmBufferSize * sizeof(double))); - } - } - - // Wajih: Adding allocation for batched LU and SCU marshalled data - // TODO: these are serialized workspaces, so the allocations can be shared - -#if 0 // old batch code - A_gpu.marshall_data.setBatchSize(num_dfbufs); - A_gpu.sc_marshall_data.setBatchSize(num_dfbufs); -#endif - - // TODO: where should these be freed? - // Allocate GPU copy for the node list - gpuErrchk(cudaMalloc(&(A_gpu.dperm_c_supno), sizeof(int) * mx_fsize)); - // Allocate GPU copy of all the gemm buffer pointers and copy the host array to the GPU - gpuErrchk(cudaMalloc(&(A_gpu.dgpuGemmBuffs), sizeof(double*) * num_gemmbufs)); - gpuErrchk(cudaMemcpy(A_gpu.dgpuGemmBuffs, A_gpu.gpuGemmBuffs, sizeof(double*) * num_gemmbufs, cudaMemcpyHostToDevice)); - - tcuMalloc = SuperLU_timer_() - tcuMalloc; -#if ( PRNTlevel>=1 ) - printf("Time to allocate GPU memory: %g\n", tcuMalloc); - printf("\t.. 
sum_diag_size %d\t sum_gemmC_size %d\n", sum_diag_size, sum_gemmC_size); - fflush(stdout); -#endif - - double tcuStream=SuperLU_timer_(); - - for (stream = 0; stream < A_gpu.numCudaStreams; stream++) - { - // cublasCreate(&A_gpu.cuHandles[stream]); - cusolverDnCreate(&A_gpu.cuSolveHandles[stream]); - } - tcuStream = SuperLU_timer_() - tcuStream; - printf("Time to create cublas streams: %g\n", tcuStream); - - double tcuStreamCreate=SuperLU_timer_(); - for (stream = 0; stream < A_gpu.numCudaStreams; stream++) - { - cudaStreamCreate(&A_gpu.cuStreams[stream]); - cublasCreate(&A_gpu.cuHandles[stream]); - /*lookAhead buffers and stream*/ - cublasCreate(&A_gpu.lookAheadLHandle[stream]); - cudaStreamCreate(&A_gpu.lookAheadLStream[stream]); - cublasCreate(&A_gpu.lookAheadUHandle[stream]); - cudaStreamCreate(&A_gpu.lookAheadUStream[stream]); - - // Wajih: Need to create at least one magma queue -#ifdef HAVE_MAGMA - if(stream == 0) - { - magma_queue_create_from_cuda( - device_id, A_gpu.cuStreams[stream], A_gpu.cuHandles[stream], - NULL, &A_gpu.magma_queue - ); - } -#endif - } - tcuStreamCreate = SuperLU_timer_() - tcuStreamCreate; - printf("Time to create CUDA streams: %g\n", tcuStreamCreate); - - tRegion[3] = SuperLU_timer_() - tRegion[3]; - printf("TRegion stream: \t %g\n", tRegion[3]); - // allocate - gpuErrchk(cudaMalloc(&dA_gpu, sizeof(LUstructGPU_t))); - gpuErrchk(cudaMemcpy(dA_gpu, &A_gpu, sizeof(LUstructGPU_t), cudaMemcpyHostToDevice)); - -#endif - - // cudaCheckError(); - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Exit setLUstruct_GPU()"); -#endif - return 0; -} /* setLUstruct_GPU */ - -int_t LUstruct_v100::copyLUGPUtoHost() -{ - - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - lPanelVec[i].copyFromGPU(); - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - uPanelVec[i].copyFromGPU(); - return 0; -} - -int_t LUstruct_v100::copyLUHosttoGPU() -{ - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - lPanelVec[i].copyBackToGPU(); - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - uPanelVec[i].copyBackToGPU(); - return 0; -} - -int_t LUstruct_v100::checkGPU() -{ - - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - lPanelVec[i].checkGPU(); - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - uPanelVec[i].checkGPU(); - - std::cout << "Checking LU struct completed succesfully" - << "\n"; - return 0; -} - - -#if 0 // Sherry: old batch code -////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// Marshall Functors for batched execution -struct MarshallLUFunc { - int k_st, *ld_batch, *dim_batch; - double** diag_ptrs; - LUstructGPU_t *A_gpu; - - MarshallLUFunc(int k_st, double** diag_ptrs, int *ld_batch, int *dim_batch, LUstructGPU_t *A_gpu) - { - this->k_st = k_st; - this->ld_batch = ld_batch; - this->dim_batch = dim_batch; - this->diag_ptrs = diag_ptrs; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - int_t* xsup = A_gpu->xsup; - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - - diag_ptrs[i] = lpanel.blkPtr(0); - ld_batch[i] = lpanel.LDA(); - dim_batch[i] = SuperSize(k); - } -}; - -struct MarshallTRSMUFunc { - int k_st, *diag_ld_batch, *diag_dim_batch, 
*panel_ld_batch, *panel_dim_batch; - double** diag_ptrs, **panel_ptrs; - LUstructGPU_t *A_gpu; - - MarshallTRSMUFunc( - int k_st, double** diag_ptrs, int *diag_ld_batch, int *diag_dim_batch, double** panel_ptrs, - int *panel_ld_batch, int *panel_dim_batch, LUstructGPU_t *A_gpu - ) - { - this->k_st = k_st; - this->diag_ptrs = diag_ptrs; - this->diag_ld_batch = diag_ld_batch; - this->diag_dim_batch = diag_dim_batch; - this->panel_ptrs = panel_ptrs; - this->panel_ld_batch = panel_ld_batch; - this->panel_dim_batch = panel_dim_batch; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - int_t* xsup = A_gpu->xsup; - int ksupc = SuperSize(k); - - upanelGPU_t &upanel = A_gpu->uPanelVec[A_gpu->g2lRow(k)]; - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - - if(!upanel.isEmpty()) - { - panel_ptrs[i] = upanel.blkPtr(0); - panel_ld_batch[i] = upanel.LDA(); - panel_dim_batch[i] = upanel.nzcols(); - diag_ptrs[i] = lpanel.blkPtr(0); - diag_ld_batch[i] = lpanel.LDA(); - diag_dim_batch[i] = ksupc; - } - else - { - panel_ptrs[i] = diag_ptrs[i] = NULL; - panel_ld_batch[i] = diag_ld_batch[i] = 1; - panel_dim_batch[i] = diag_dim_batch[i] = 0; - } - } -}; - -struct MarshallTRSMLFunc { - int k_st, *diag_ld_batch, *diag_dim_batch, *panel_ld_batch, *panel_dim_batch; - double** diag_ptrs, **panel_ptrs; - LUstructGPU_t *A_gpu; - - MarshallTRSMLFunc( - int k_st, double** diag_ptrs, int *diag_ld_batch, int *diag_dim_batch, double** panel_ptrs, - int *panel_ld_batch, int *panel_dim_batch, LUstructGPU_t *A_gpu - ) - { - this->k_st = k_st; - this->diag_ptrs = diag_ptrs; - this->diag_ld_batch = diag_ld_batch; - this->diag_dim_batch = diag_dim_batch; - this->panel_ptrs = panel_ptrs; - this->panel_ld_batch = panel_ld_batch; - this->panel_dim_batch = panel_dim_batch; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - int_t* xsup = A_gpu->xsup; - int ksupc = SuperSize(k); - - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - - if(!lpanel.isEmpty()) - { - double *lPanelStPtr = lpanel.blkPtr(0); - int_t len = lpanel.nzrows(); - if(lpanel.haveDiag()) - { - lPanelStPtr = lpanel.blkPtr(1); - len -= lpanel.nbrow(0); - } - panel_ptrs[i] = lPanelStPtr; - panel_ld_batch[i] = lpanel.LDA(); - panel_dim_batch[i] = len; - diag_ptrs[i] = lpanel.blkPtr(0); - diag_ld_batch[i] = lpanel.LDA(); - diag_dim_batch[i] = ksupc; - } - else - { - panel_ptrs[i] = diag_ptrs[i] = NULL; - panel_ld_batch[i] = diag_ld_batch[i] = 1; - panel_dim_batch[i] = diag_dim_batch[i] = 0; - } - } -}; - -struct MarshallInitSCUFunc { - int k_st, *ist, *iend, *maxGemmRows, *maxGemmCols; - lpanelGPU_t* lpanels; - upanelGPU_t* upanels; - LUstructGPU_t *A_gpu; - - MarshallInitSCUFunc( - int k_st, int *ist, int *iend, int *maxGemmRows, int *maxGemmCols, - lpanelGPU_t* lpanels, upanelGPU_t* upanels, LUstructGPU_t *A_gpu - ) - { - this->k_st = k_st; - this->ist = ist; - this->iend = iend; - this->maxGemmRows = maxGemmRows; - this->maxGemmCols = maxGemmCols; - this->lpanels = lpanels; - this->upanels = upanels; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - size_t gemmBufferSize = A_gpu->gemmBufferSize; - - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - upanelGPU_t &upanel = A_gpu->uPanelVec[A_gpu->g2lCol(k)]; - - lpanels[i] = lpanel; - upanels[i] = upanel; - - if(!upanel.isEmpty() && 
!lpanel.isEmpty()) - { - int_t st_lb = 1; - int_t nlb = lpanel.nblocks(); - int_t nub = upanel.nblocks(); - - ist[i] = iend[i] = st_lb; - int nrows = lpanel.stRow(nlb) - lpanel.stRow(st_lb); - int ncols = upanel.nzcols(); - - int max_rows = nrows; - int max_cols = ncols; - // entire gemm doesn't fit in gemm buffer - if (nrows * ncols > gemmBufferSize) - { - int maxGemmOpSize = (int)sqrt((double)gemmBufferSize); - int numberofRowChunks = (nrows + maxGemmOpSize - 1) / maxGemmOpSize; - max_rows = nrows / numberofRowChunks; - max_cols = gemmBufferSize / max_rows; - } - - maxGemmRows[i] = max_rows; - maxGemmCols[i] = max_cols; - } - else - { - ist[i] = iend[i] = 0; - maxGemmRows[i] = maxGemmCols[i] = 0; - } - } -}; - -struct MarshallSCUOuter_Predicate -{ - __host__ __device__ bool operator()(const int &x) - { - return x == 1; - } -}; - -struct MarshallSCUOuterFunc { - int k_st, *ist, *iend, *jst, *jend, *maxGemmRows, *done_flags; - LUstructGPU_t *A_gpu; - - MarshallSCUOuterFunc(int k_st, int *ist, int *iend, int *jst, int *jend, int *maxGemmRows, int* done_flags, LUstructGPU_t *A_gpu) - { - this->k_st = k_st; - this->ist = ist; - this->iend = iend; - this->jst = jst; - this->jend = jend; - this->maxGemmRows = maxGemmRows; - this->done_flags = done_flags; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - upanelGPU_t &upanel = A_gpu->uPanelVec[A_gpu->g2lCol(k)]; - int& iEnd = iend[i]; - - // Not done if even one operation still has work to do - if(!lpanel.isEmpty() && !upanel.isEmpty() && iEnd < lpanel.nblocks()) - { - int& iSt = ist[i]; - iSt = iEnd; - iEnd = lpanel.getEndBlock(iSt, maxGemmRows[i]); - assert(iEnd > iSt); - jst[i] = jend[i] = 0; - done_flags[i] = 0; - } - else - { - done_flags[i] = 1; - } - } -}; - -struct MarshallSCUInner_Predicate -{ - __host__ __device__ bool operator()(const int &x) - { - return x == 0; - } -}; - -template -struct element_diff : public thrust::unary_function -{ - T* st, *end; - element_diff(T* st, T *end) - { - this->st = st; - this->end = end; - } - - __device__ T operator()(const T &x) const - { - return end[x] - st[x]; - } -}; - - -struct MarshallSCUInnerFunc { - int k_st, *ist, *iend, *jst, *jend, *maxGemmCols; - LUstructGPU_t *A_gpu; - double** A_ptrs, **B_ptrs, **C_ptrs; - int* lda_array, *ldb_array, *ldc_array, *m_array, *n_array, *k_array; - - MarshallSCUInnerFunc( - int k_st, int *ist, int *iend, int *jst, int *jend, int *maxGemmCols, - double** A_ptrs, int* lda_array, double** B_ptrs, int* ldb_array, double **C_ptrs, int *ldc_array, - int *m_array, int *n_array, int *k_array, LUstructGPU_t *A_gpu - ) - { - this->k_st = k_st; - this->ist = ist; - this->iend = iend; - this->jst = jst; - this->jend = jend; - this->maxGemmCols = maxGemmCols; - this->A_ptrs = A_ptrs; - this->B_ptrs = B_ptrs; - this->C_ptrs = C_ptrs; - this->lda_array = lda_array; - this->ldb_array = ldb_array; - this->ldc_array = ldc_array; - this->m_array = m_array; - this->n_array = n_array; - this->k_array = k_array; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - int_t* xsup = A_gpu->xsup; - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - upanelGPU_t &upanel = A_gpu->uPanelVec[A_gpu->g2lCol(k)]; - - int iSt = ist[i], iEnd = iend[i]; - int& jSt = jst[i], &jEnd = jend[i]; - - // Not done if even one operation still has work to do - 
if(!lpanel.isEmpty() && !upanel.isEmpty() && jEnd < upanel.nblocks()) - { - jSt = jEnd; - jEnd = upanel.getEndBlock(jSt, maxGemmCols[i]); - assert(jEnd > jSt); - - A_ptrs[i] = lpanel.blkPtr(iSt); - B_ptrs[i] = upanel.blkPtr(jSt); - C_ptrs[i] = A_gpu->dgpuGemmBuffs[i]; - - lda_array[i] = lpanel.LDA(); - ldb_array[i] = upanel.LDA(); - ldc_array[i] = lpanel.stRow(iEnd) - lpanel.stRow(iSt); - - m_array[i] = ldc_array[i]; - n_array[i] = upanel.stCol(jEnd) - upanel.stCol(jSt); - k_array[i] = SuperSize(k); - } - else - { - A_ptrs[i] = B_ptrs[i] = C_ptrs[i] = NULL; - lda_array[i] = ldb_array[i] = ldc_array[i] = 1; - m_array[i] = n_array[i] = k_array[i] = 0; - } - } -}; - -// Marshalling routines for batched execution -void LUstruct_v100::marshallBatchedLUData(int k_st, int k_end, int_t *perm_c_supno) -{ - // First gather up all the pointer and meta data on the host - LUMarshallData& mdata = A_gpu.marshall_data; - mdata.batchsize = k_end - k_st; - - MarshallLUFunc func(k_st, mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_diag_dim_array, dA_gpu); - - thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(mdata.batchsize), func - ); - - // double **diag_ptrs = mdata.host_diag_ptrs.data(); - // int *ld_batch = mdata.host_diag_ld_array.data(); - // int *dim_batch = mdata.host_diag_dim_array.data(); - - // mdata.batchsize = 0; - - // for (int_t k0 = k_st; k0 < k_end; k0++) - // { - // int_t k = perm_c_supno[k0]; - - // if (iam == procIJ(k, k)) - // { - // assert(mdata.batchsize < mdata.host_diag_ptrs.size()); - - // lpanel_t &lpanel = lPanelVec[g2lCol(k)]; - // diag_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0); - // ld_batch[mdata.batchsize] = lpanel.LDA(); - // dim_batch[mdata.batchsize] = SuperSize(k); - // mdata.batchsize++; - // } - // } - - // mdata.setMaxDiag(); - // // Then copy the marshalled data over to the GPU - // gpuErrchk(cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_diag_ld_array, ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_diag_dim_array, dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); -} - -void LUstruct_v100::marshallBatchedTRSMUData(int k_st, int k_end, int_t *perm_c_supno) -{ - // First gather up all the pointer and meta data on the host - LUMarshallData& mdata = A_gpu.marshall_data; - mdata.batchsize = k_end - k_st; - - MarshallTRSMUFunc func( - k_st, mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_diag_dim_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, mdata.dev_panel_dim_array, dA_gpu - ); - - thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(mdata.batchsize), func - ); - - // double **panel_ptrs = mdata.host_panel_ptrs.data(); - // int *panel_ld_batch = mdata.host_panel_ld_array.data(); - // int *panel_dim_batch = mdata.host_panel_dim_array.data(); - // double **diag_ptrs = mdata.host_diag_ptrs.data(); - // int *diag_ld_batch = mdata.host_diag_ld_array.data(); - // int *diag_dim_batch = mdata.host_diag_dim_array.data(); - - // mdata.batchsize = 0; - - // for (int_t k0 = k_st; k0 < k_end; k0++) - // { - // int_t k = perm_c_supno[k0]; - // int_t buffer_offset = k0 - k_st; - // int ksupc = SuperSize(k); - - // if (myrow == krow(k)) - // { - // upanel_t& upanel = uPanelVec[g2lRow(k)]; - // lpanel_t &lpanel = lPanelVec[g2lCol(k)]; - // if(!upanel.isEmpty()) - // { - // 
assert(mdata.batchsize < mdata.host_diag_ptrs.size()); - - // panel_ptrs[mdata.batchsize] = upanel.blkPtrGPU(0); - // panel_ld_batch[mdata.batchsize] = upanel.LDA(); - // panel_dim_batch[mdata.batchsize] = upanel.nzcols(); - - // // Hackathon change: using the original diagonal block instead of the bcast buffer - // // diag_ptrs[mdata.batchsize] = A_gpu.dFBufs[buffer_offset]; - // // diag_ld_batch[mdata.batchsize] = ksupc; - // // diag_dim_batch[mdata.batchsize] = ksupc; - // diag_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0); - // diag_ld_batch[mdata.batchsize] = lpanel.LDA(); - // diag_dim_batch[mdata.batchsize] = ksupc; - - // mdata.batchsize++; - // } - // } - // } - - // mdata.setMaxDiag(); - // mdata.setMaxPanel(); - - // // Then copy the marshalled data over to the GPU - // gpuErrchk(cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_diag_ld_array, diag_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_diag_dim_array, diag_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_panel_ptrs, panel_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_panel_ld_array, panel_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_panel_dim_array, panel_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); -} - -void LUstruct_v100::marshallBatchedTRSMLData(int k_st, int k_end, int_t *perm_c_supno) -{ - // First gather up all the pointer and meta data on the host - LUMarshallData& mdata = A_gpu.marshall_data; - mdata.batchsize = k_end - k_st; - - MarshallTRSMLFunc func( - k_st, mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_diag_dim_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, mdata.dev_panel_dim_array, dA_gpu - ); - - thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(mdata.batchsize), func - ); - - // double **panel_ptrs = mdata.host_panel_ptrs.data(); - // int *panel_ld_batch = mdata.host_panel_ld_array.data(); - // int *panel_dim_batch = mdata.host_panel_dim_array.data(); - // double **diag_ptrs = mdata.host_diag_ptrs.data(); - // int *diag_ld_batch = mdata.host_diag_ld_array.data(); - // int *diag_dim_batch = mdata.host_diag_dim_array.data(); - - // mdata.batchsize = 0; - - // for (int_t k0 = k_st; k0 < k_end; k0++) - // { - // int_t k = perm_c_supno[k0]; - // int_t buffer_offset = k0 - k_st; - // int ksupc = SuperSize(k); - - // if (mycol == kcol(k)) - // { - // lpanel_t &lpanel = lPanelVec[g2lCol(k)]; - // if(!lpanel.isEmpty()) - // { - // assert(mdata.batchsize < mdata.host_diag_ptrs.size()); - - // double *lPanelStPtr = lpanel.blkPtrGPU(0); - // int_t len = lpanel.nzrows(); - // if(lpanel.haveDiag()) - // { - // /* code */ - // lPanelStPtr = lpanel.blkPtrGPU(1); - // len -= lpanel.nbrow(0); - // } - // panel_ptrs[mdata.batchsize] = lPanelStPtr; - // panel_ld_batch[mdata.batchsize] = lpanel.LDA(); - // panel_dim_batch[mdata.batchsize] = len; - - // // Hackathon change: using the original diagonal block instead of the bcast buffer - // // diag_ptrs[mdata.batchsize] = A_gpu.dFBufs[buffer_offset]; - // // diag_ld_batch[mdata.batchsize] = ksupc; - // // diag_dim_batch[mdata.batchsize] = ksupc; - // diag_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0); - // diag_ld_batch[mdata.batchsize] = lpanel.LDA(); - // 
diag_dim_batch[mdata.batchsize] = ksupc; - - // mdata.batchsize++; - // } - // } - // } - - // mdata.setMaxDiag(); - // mdata.setMaxPanel(); - - // // Then copy the marshalled data over to the GPU - // cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_diag_ld_array, diag_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_diag_dim_array, diag_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_panel_ptrs, panel_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_panel_ld_array, panel_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_panel_dim_array, panel_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice); -} - -void LUstruct_v100::initSCUMarshallData(int k_st, int k_end, int_t *perm_c_supno) -{ - SCUMarshallData& sc_mdata = A_gpu.sc_marshall_data; - sc_mdata.batchsize = k_end - k_st; - - MarshallInitSCUFunc func( - k_st, sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_maxGemmRows, sc_mdata.dev_maxGemmCols, - sc_mdata.dev_gpu_lpanels, sc_mdata.dev_gpu_upanels, dA_gpu - ); - - thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(sc_mdata.batchsize), func - ); - - // for (int_t k0 = k_st; k0 < k_end; k0++) - // { - // int_t k = perm_c_supno[k0]; - // int_t buffer_offset = k0 - k_st; - - // assert(buffer_offset < sc_mdata.upanels.size()); - - // // Wajih: TODO: figure out what this offset does - // int offset = 0; - // if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) - // { - // sc_mdata.upanels[buffer_offset] = getKUpanel(k, offset); - // sc_mdata.lpanels[buffer_offset] = getKLpanel(k, offset); - - // // Set gemm loop parameters for the panels - // upanel_t& upanel = sc_mdata.upanels[buffer_offset]; - // lpanel_t& lpanel = sc_mdata.lpanels[buffer_offset]; - - // sc_mdata.host_gpu_upanels[buffer_offset] = upanel.gpuPanel; - // sc_mdata.host_gpu_lpanels[buffer_offset] = lpanel.gpuPanel; - - // if(!upanel.isEmpty() && !lpanel.isEmpty()) - // { - // int_t st_lb = 0; - // if (myrow == krow(k)) - // st_lb = 1; - - // int_t nlb = lpanel.nblocks(); - // int_t nub = upanel.nblocks(); - - // sc_mdata.ist[buffer_offset] = st_lb; - // sc_mdata.iend[buffer_offset] = sc_mdata.ist[buffer_offset]; - - // int nrows = lpanel.stRow(nlb) - lpanel.stRow(st_lb); - // int ncols = upanel.nzcols(); - - // int maxGemmRows = nrows; - // int maxGemmCols = ncols; - // // entire gemm doesn't fit in gemm buffer - // if (nrows * ncols > A_gpu.gemmBufferSize) - // { - // int maxGemmOpSize = (int)sqrt(A_gpu.gemmBufferSize); - // int numberofRowChunks = (nrows + maxGemmOpSize - 1) / maxGemmOpSize; - // maxGemmRows = nrows / numberofRowChunks; - // maxGemmCols = A_gpu.gemmBufferSize / maxGemmRows; - // } - - // sc_mdata.maxGemmRows[buffer_offset] = maxGemmRows; - // sc_mdata.maxGemmCols[buffer_offset] = maxGemmCols; - // } - // } - // } - - // sc_mdata.copyPanelDataToGPU(); -} - -int LUstruct_v100::marshallSCUBatchedDataOuter(int k_st, int k_end, int_t *perm_c_supno) -{ - SCUMarshallData& sc_mdata = A_gpu.sc_marshall_data; - sc_mdata.batchsize = k_end - k_st; - - // Temporarily use the m array for the done flags - int *done_flags = sc_mdata.dev_m_array; - MarshallSCUOuterFunc func( - k_st, sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend, - sc_mdata.dev_maxGemmRows, done_flags, dA_gpu - ); - - 
thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(sc_mdata.batchsize), func - ); - - bool done = thrust::all_of( - thrust::system::cuda::par, done_flags, done_flags + sc_mdata.batchsize, - MarshallSCUOuter_Predicate() - ); - - return done; - - // int done_i = 1; - // for (int k0 = k_st; k0 < k_end; k0++) - // { - // int k = perm_c_supno[k0]; - // int buffer_index = k0 - k_st; - // lpanel_t& lpanel = sc_mdata.lpanels[buffer_index]; - // upanel_t& upanel = sc_mdata.upanels[buffer_index]; - // if(lpanel.isEmpty() || upanel.isEmpty()) - // continue; - - // int& iEnd = sc_mdata.iend[buffer_index]; - // // Not done if even one operation still has work to do - // if(iEnd < lpanel.nblocks()) - // { - // done_i = 0; - // int& iSt = sc_mdata.ist[buffer_index]; - // iSt = iEnd; - // iEnd = lpanel.getEndBlock(iSt, sc_mdata.maxGemmRows[buffer_index]); - // assert(iEnd > iSt); - // sc_mdata.jst[buffer_index] = sc_mdata.jend[buffer_index] = 0; - // } - // } - - // return done_i; -} - -int LUstruct_v100::marshallSCUBatchedDataInner(int k_st, int k_end, int_t *perm_c_supno) -{ - SCUMarshallData& sc_mdata = A_gpu.sc_marshall_data; - int knum = k_end - k_st; - sc_mdata.batchsize = knum; - - MarshallSCUInnerFunc func( - k_st, sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend, sc_mdata.dev_maxGemmCols, - sc_mdata.dev_A_ptrs, sc_mdata.dev_lda_array, sc_mdata.dev_B_ptrs, sc_mdata.dev_ldb_array, sc_mdata.dev_C_ptrs, - sc_mdata.dev_ldc_array, sc_mdata.dev_m_array, sc_mdata.dev_n_array, sc_mdata.dev_k_array, dA_gpu - ); - - thrust::counting_iterator start(0), end(knum); - thrust::for_each(thrust::system::cuda::par, start, end, func); - - // Set the max dims in the marshalled data - sc_mdata.max_m = thrust::reduce(thrust::system::cuda::par, sc_mdata.dev_m_array, sc_mdata.dev_m_array + knum, 0, thrust::maximum()); - sc_mdata.max_n = thrust::reduce(thrust::system::cuda::par, sc_mdata.dev_n_array, sc_mdata.dev_n_array + knum, 0, thrust::maximum()); - sc_mdata.max_k = thrust::reduce(thrust::system::cuda::par, sc_mdata.dev_k_array, sc_mdata.dev_k_array + knum, 0, thrust::maximum()); - sc_mdata.max_ilen = thrust::transform_reduce(thrust::system::cuda::par, start, end, element_diff(sc_mdata.dev_ist, sc_mdata.dev_iend), 0, thrust::maximum()); - sc_mdata.max_jlen = thrust::transform_reduce(thrust::system::cuda::par, start, end, element_diff(sc_mdata.dev_jst, sc_mdata.dev_jend), 0, thrust::maximum()); - - printf("SCU %d -> %d: %d %d %d %d %d\n", k_st, k_end, sc_mdata.max_m, sc_mdata.max_n, sc_mdata.max_k, sc_mdata.max_ilen, sc_mdata.max_jlen); - - return thrust::all_of( - thrust::system::cuda::par, sc_mdata.dev_m_array, sc_mdata.dev_m_array + sc_mdata.batchsize, - MarshallSCUInner_Predicate() - ); - // int done_j = 1; - // for(int k0 = k_st; k0 < k_end; k0++) - // { - // int k = perm_c_supno[k0]; - // int buffer_index = k0 - k_st; - // lpanel_t& lpanel = sc_mdata.lpanels[buffer_index]; - // upanel_t& upanel = sc_mdata.upanels[buffer_index]; - - // if(lpanel.isEmpty() || upanel.isEmpty()) - // continue; - - // int iSt = sc_mdata.ist[buffer_index]; - // int iEnd = sc_mdata.iend[buffer_index]; - - // int& jSt = sc_mdata.jst[buffer_index]; - // int& jEnd = sc_mdata.jend[buffer_index]; - - // // Not done if even one operation still has work to do - // if(jEnd < upanel.nblocks()) - // { - // jSt = jEnd; - // jEnd = upanel.getEndBlock(jSt, sc_mdata.maxGemmCols[buffer_index]); - - // assert(jEnd > jSt); - // done_j = 0; - // // printf("k = %d, ist = %d, 
iend = %d, jst = %d, jend = %d\n", k, iSt, iEnd, jSt, jEnd); - - // sc_mdata.host_m_array[buffer_index] = lpanel.stRow(iEnd) - lpanel.stRow(iSt); - // sc_mdata.host_n_array[buffer_index] = upanel.stCol(jEnd) - upanel.stCol(jSt); - // sc_mdata.host_k_array[buffer_index] = supersize(k); - - // sc_mdata.host_A_ptrs[buffer_index] = lpanel.blkPtrGPU(iSt); - // sc_mdata.host_B_ptrs[buffer_index] = upanel.blkPtrGPU(jSt); - // sc_mdata.host_C_ptrs[buffer_index] = A_gpu.gpuGemmBuffs[buffer_index]; - - // sc_mdata.host_lda_array[buffer_index] = lpanel.LDA(); - // sc_mdata.host_ldb_array[buffer_index] = upanel.LDA(); - // sc_mdata.host_ldc_array[buffer_index] = sc_mdata.host_m_array[buffer_index]; - // } - // else - // { - // sc_mdata.host_A_ptrs[buffer_index] = NULL; - // sc_mdata.host_B_ptrs[buffer_index] = NULL; - // sc_mdata.host_C_ptrs[buffer_index] = NULL; - - // sc_mdata.host_m_array[buffer_index] = 0; - // sc_mdata.host_n_array[buffer_index] = 0; - // sc_mdata.host_k_array[buffer_index] = 0; - - // sc_mdata.host_lda_array[buffer_index] = 1; - // sc_mdata.host_ldb_array[buffer_index] = 1; - // sc_mdata.host_ldc_array[buffer_index] = 1; - // } - // } - - // if(done_j == 0) - // { - // // Upload the buffers to the gpu - // sc_mdata.setMaxDims(); - // sc_mdata.copyToGPU(); - // } - - // return done_j; -} - -#endif // Sherry: old batch code commented out -////////////////////////////////////////////////////////////////////////////////////////////////////////// - - -/** - * @brief Pack non-zero values into a vector. - * - * @param spNzvalArray The array of non-zero values. - * @param nzvalSize The size of the array of non-zero values. - * @param valOffset The offset of the non-zero values. - * @param packedNzvals The vector to store the non-zero values. - * @param packedNzvalsIndices The vector to store the indices of the non-zero values. 
- */ -void packNzvals(std::vector &packedNzvals, std::vector &packedNzvalsIndices, - double *spNzvalArray, int_t nzvalSize, int_t valOffset) -{ - for (int k = 0; k < nzvalSize; k++) - { - if (spNzvalArray[k] != 0) - { - packedNzvals.push_back(spNzvalArray[k]); - packedNzvalsIndices.push_back(valOffset + k); - } - } -} - -const int AVOID_CPU_NZVAL = 1; -lpanelGPU_t *LUstruct_v100::copyLpanelsToGPU() -{ - lpanelGPU_t *lPanelVec_GPU = new lpanelGPU_t[CEILING(nsupers, Pc)]; - - // TODO: set gpuLvalSize, gpuLidxSize - gpuLvalSize = 0; - gpuLidxSize = 0; - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - { - gpuLvalSize += sizeof(double) * lPanelVec[i].nzvalSize(); - gpuLidxSize += sizeof(int_t) * lPanelVec[i].indexSize(); - } - } - - double *valBuffer = (double *)SUPERLU_MALLOC(gpuLvalSize); - int_t *idxBuffer = (int_t *)SUPERLU_MALLOC(gpuLidxSize); - - // allocate memory buffer on GPU - gpuErrchk(cudaMalloc(&gpuLvalBasePtr, gpuLvalSize)); - gpuErrchk(cudaMalloc(&gpuLidxBasePtr, gpuLidxSize)); - - size_t valOffset = 0; - size_t idxOffset = 0; - double tCopyToCPU = SuperLU_timer_(); - - std::vector packedNzvals; - std::vector packedNzvalsIndices; - - // do a memcpy to CPU buffer - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - { - if (lPanelVec[i].isEmpty()) - { - lpanelGPU_t ithLpanel(NULL, NULL); - lPanelVec[i].gpuPanel = ithLpanel; - lPanelVec_GPU[i] = ithLpanel; - } - else - { - lpanelGPU_t ithLpanel(&gpuLidxBasePtr[idxOffset], &gpuLvalBasePtr[valOffset]); - lPanelVec[i].gpuPanel = ithLpanel; - lPanelVec_GPU[i] = ithLpanel; - if (AVOID_CPU_NZVAL) - packNzvals(packedNzvals, packedNzvalsIndices, lPanelVec[i].val, lPanelVec[i].nzvalSize(), valOffset); - else - memcpy(&valBuffer[valOffset], lPanelVec[i].val, sizeof(double) * lPanelVec[i].nzvalSize()); - - memcpy(&idxBuffer[idxOffset], lPanelVec[i].index, sizeof(int_t) * lPanelVec[i].indexSize()); - - valOffset += lPanelVec[i].nzvalSize(); - idxOffset += lPanelVec[i].indexSize(); - } - } - } - tCopyToCPU = SuperLU_timer_() - tCopyToCPU; - std::cout << "Time to copy-L to CPU: " << tCopyToCPU << "\n"; - // do a cudaMemcpy to GPU - double tLsend = SuperLU_timer_(); - if (AVOID_CPU_NZVAL) - copyToGPU(gpuLvalBasePtr, packedNzvals, packedNzvalsIndices); - else - { -#if 0 - cudaMemcpy(gpuLvalBasePtr, valBuffer, gpuLvalSize, cudaMemcpyHostToDevice); -#else - copyToGPU_Sparse(gpuLvalBasePtr, valBuffer, gpuLvalSize); -#endif - } - // find - gpuErrchk(cudaMemcpy(gpuLidxBasePtr, idxBuffer, gpuLidxSize, cudaMemcpyHostToDevice)); - tLsend = SuperLU_timer_() - tLsend; - printf("cudaMemcpy time L =%g \n", tLsend); - - SUPERLU_FREE(valBuffer); - SUPERLU_FREE(idxBuffer); - return lPanelVec_GPU; -} /* copyLpanelsToGPU */ - - -upanelGPU_t *LUstruct_v100::copyUpanelsToGPU() -{ -#if (DEBUGlevel >= 1) - int iam = 0; - CHECK_MALLOC(iam, "Enter copyUpanelsToGPU()"); -#endif - - upanelGPU_t *uPanelVec_GPU = new upanelGPU_t[CEILING(nsupers, Pr)]; - - gpuUvalSize = 0; - gpuUidxSize = 0; - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - gpuUvalSize += sizeof(double) * uPanelVec[i].nzvalSize(); - gpuUidxSize += sizeof(int_t) * uPanelVec[i].indexSize(); - } - } - - // TODO: set gpuUvalSize, gpuUidxSize - - // allocate memory buffer on GPU - gpuErrchk(cudaMalloc(&gpuUvalBasePtr, gpuUvalSize)); - gpuErrchk(cudaMalloc(&gpuUidxBasePtr, 
gpuUidxSize)); - - size_t valOffset = 0; - size_t idxOffset = 0; - - double tCopyToCPU = SuperLU_timer_(); - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - if (uPanelVec[i].isEmpty()) - { - upanelGPU_t ithupanel(NULL, NULL); - uPanelVec[i].gpuPanel = ithupanel; - uPanelVec_GPU[i] = ithupanel; - } - } - } - - int_t *idxBuffer = NULL; - if ( gpuUidxSize>0 ) /* Sherry fix: gpuUidxSize can be 0 */ - idxBuffer = (int_t *)SUPERLU_MALLOC(gpuUidxSize); - - if (AVOID_CPU_NZVAL) - { - printf("AVOID_CPU_NZVAL is set\n"); - std::vector packedNzvals; - std::vector packedNzvalsIndices; - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - if (!uPanelVec[i].isEmpty()) - { - - upanelGPU_t ithupanel(&gpuUidxBasePtr[idxOffset], &gpuUvalBasePtr[valOffset]); - uPanelVec[i].gpuPanel = ithupanel; - uPanelVec_GPU[i] = ithupanel; - packNzvals(packedNzvals, packedNzvalsIndices, uPanelVec[i].val, - uPanelVec[i].nzvalSize(), valOffset); - memcpy(&idxBuffer[idxOffset], uPanelVec[i].index, sizeof(int_t) * uPanelVec[i].indexSize()); - - valOffset += uPanelVec[i].nzvalSize(); - idxOffset += uPanelVec[i].indexSize(); - } - } - } - tCopyToCPU = SuperLU_timer_() - tCopyToCPU; - printf("copyU to CPU-buff time = %g\n", tCopyToCPU); - - // do a cudaMemcpy to GPU - double tLsend = SuperLU_timer_(); - copyToGPU(gpuUvalBasePtr, packedNzvals, packedNzvalsIndices); - gpuErrchk(cudaMemcpy(gpuUidxBasePtr, idxBuffer, gpuUidxSize, cudaMemcpyHostToDevice)); - tLsend = SuperLU_timer_() - tLsend; - printf("cudaMemcpy time U =%g \n", tLsend); - // SUPERLU_FREE(valBuffer); - } - else /* AVOID_CPU_NZVAL not set */ - { - // do a memcpy to CPU buffer - double *valBuffer = (double *)SUPERLU_MALLOC(gpuUvalSize); - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - if (!uPanelVec[i].isEmpty()) - { - - upanelGPU_t ithupanel(&gpuUidxBasePtr[idxOffset], &gpuUvalBasePtr[valOffset]); - uPanelVec[i].gpuPanel = ithupanel; - uPanelVec_GPU[i] = ithupanel; - memcpy(&valBuffer[valOffset], uPanelVec[i].val, sizeof(double) * uPanelVec[i].nzvalSize()); - memcpy(&idxBuffer[idxOffset], uPanelVec[i].index, sizeof(int_t) * uPanelVec[i].indexSize()); - - valOffset += uPanelVec[i].nzvalSize(); - idxOffset += uPanelVec[i].indexSize(); - } - } - } - tCopyToCPU = SuperLU_timer_() - tCopyToCPU; - printf("copyU to CPU-buff time = %g\n", tCopyToCPU); - - // do a cudaMemcpy to GPU - double tLsend = SuperLU_timer_(); - const int USE_GPU_COPY = 1; - if (USE_GPU_COPY) - { - gpuErrchk(cudaMemcpy(gpuUvalBasePtr, valBuffer, gpuUvalSize, cudaMemcpyHostToDevice)); - } - else - copyToGPU_Sparse(gpuUvalBasePtr, valBuffer, gpuUvalSize); - - gpuErrchk(cudaMemcpy(gpuUidxBasePtr, idxBuffer, gpuUidxSize, cudaMemcpyHostToDevice)); - tLsend = SuperLU_timer_() - tLsend; - printf("cudaMemcpy time U =%g \n", tLsend); - - SUPERLU_FREE(valBuffer); - } /* end else AVOID_CPU_NZVAL not set */ - - if ( gpuUidxSize>0 ) /* Sherry fix: gpuUidxSize can be 0 */ - SUPERLU_FREE(idxBuffer); - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Exit copyUpanelsToGPU()"); -#endif - - return uPanelVec_GPU; - -} /* copyUpanelsToGPU */ - -#endif diff --git a/SRC/CplusplusFactor/schurCompUpdate_impl.cuh b/SRC/CplusplusFactor/schurCompUpdate_impl.cuh index 5b958e40..7085f85d 100644 --- a/SRC/CplusplusFactor/schurCompUpdate_impl.cuh +++ b/SRC/CplusplusFactor/schurCompUpdate_impl.cuh 
@@ -1,4 +1,4 @@ -#pragma once +#pragma once #include #include #include @@ -1063,16 +1063,6 @@ int_t xLUstruct_t::setLUstruct_GPU() cublasCreate(&A_gpu.lookAheadUHandle[stream]); cudaStreamCreate(&A_gpu.lookAheadUStream[stream]); - // Wajih: Need to create at least one magma queue -#ifdef HAVE_MAGMA - if(stream == 0) - { - magma_queue_create_from_cuda( - device_id, A_gpu.cuStreams[stream], A_gpu.cuHandles[stream], - NULL, &A_gpu.magma_queue - ); - } -#endif } tcuStreamCreate = SuperLU_timer_() - tcuStreamCreate; tRegion[3] = SuperLU_timer_() - tRegion[3]; diff --git a/SRC/CplusplusFactor/xlupanels_GPU.cuh b/SRC/CplusplusFactor/xlupanels_GPU.cuh index a048b9a6..f4ce8e9c 100644 --- a/SRC/CplusplusFactor/xlupanels_GPU.cuh +++ b/SRC/CplusplusFactor/xlupanels_GPU.cuh @@ -7,9 +7,6 @@ #ifdef HAVE_CUDA #include #include -#ifdef HAVE_MAGMA - #include "magma.h" -#endif #endif #include "lu_common.hpp" @@ -412,15 +409,6 @@ struct xLUstructGPU_t cudaStream_t cuStreams[MAX_CUDA_STREAMS]; cublasHandle_t cuHandles[MAX_CUDA_STREAMS]; - // Magma is needed for non-uniform batched execution -#ifdef HAVE_MAGMA - magma_queue_t magma_queue; -#endif -#if 0 //////////////////////////// - LUMarshallData marshall_data; - xSCUMarshallData sc_marshall_data; -#endif - int* dperm_c_supno; /* Sherry: Allocate an array of buffers for the diagonal blocks @@ -475,4 +463,4 @@ void scatterGPU_batchDriver( int max_ilen, int max_jlen, Ftype **gemmBuff_ptrs, int *LDgemmBuff_batch, int maxSuperSize, int ldt, xlpanelGPU_t *lpanels, xupanelGPU_t *upanels, xLUstructGPU_t *dA, int batchCount, cudaStream_t cuStream -); \ No newline at end of file +); diff --git a/SRC/complex16/pzgssvx3d.c b/SRC/complex16/pzgssvx3d.c index 833392be..b81f6012 100755 --- a/SRC/complex16/pzgssvx3d.c +++ b/SRC/complex16/pzgssvx3d.c @@ -1,3 +1,10 @@ +// +// Solve-only setup +// turn off: equil, rowperm, colperm, +// options->ilu_level = 0; +// 1. DOFACT -> distribution +// 2. FACTORED -> solve +// /*! \file Copyright (c) 2003, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required @@ -507,207 +514,6 @@ at the top-level directory. * */ -// Sherry: TODO - this should be moved to a utility file.
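/* Editor's aside -- a hypothetical caller-side sketch for the new "Solve-only setup"
 * noted in the pzgssvx3d.c header comment above; only the option names come from this
 * patch, the rest is illustrative:
 *
 *     superlu_dist_options_t options;
 *     set_default_options_dist(&options);
 *     options.SolveOnly = YES;
 *
 * With SolveOnly == YES, pzgssvx3d() itself forces Fact = DOFACT, Equil = NO,
 * RowPerm = NOROWPERM, ColPerm = NATURAL and ILU_level = 0; per the header comment,
 * a DOFACT pass performs the distribution and a subsequent FACTORED pass performs
 * only the triangular solve. */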
-int zwriteLUtoDisk(int nsupers, int_t *xsup, zLUstruct_t *LUstruct) -{ - - if (getenv("LUFILE")) - { - FILE *fp = fopen(getenv("LUFILE"), "w"); - printf("writing to %s", getenv("LUFILE")); - for (int i = 0; i < nsupers; i++) - { - if (LUstruct->Llu->Lrowind_bc_ptr[i]) - { - int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; - doublecomplex *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; - - int_t len = lsub[1]; /* LDA of the nzval[] */ - int_t len2 = SuperSize(i) * len; - fwrite(nzval, sizeof(doublecomplex), len2, fp); // assume fp will be incremented - } - - if (LUstruct->Llu->Ufstnz_br_ptr[i]) - { - int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; - doublecomplex *nzval = LUstruct->Llu->Unzval_br_ptr[i]; - int_t lenv = usub[1]; - - fwrite(nzval, sizeof(doublecomplex), lenv, fp); // assume fp will be incremented - } - } - - fclose(fp); - } - else - { - printf("Please set environment variable LUFILE to write\n..bye bye"); - exit(0); - } - - return 0; -} - -#define EPSILON 1e-3 - -// Sherry: TODO - this is used for debugging -static int zcheckArr(doublecomplex *A, doublecomplex *B, int n) -{ - for (int i = 0; i < n; i++) - { - doublecomplex temp; - z_sub(&temp, &A[i], &B[i]); - assert(slud_z_abs(&temp) <= EPSILON * SUPERLU_MIN(slud_z_abs(&A[i]), slud_z_abs(&B[i]))); - } - - return 0; -} - -// Sherry: TODO - this should be moved to a utility file. -int zcheckLUFromDisk(int nsupers, int_t *xsup, zLUstruct_t *LUstruct) -{ - zLocalLU_t *Llu = LUstruct->Llu; - - doublecomplex *Lval_buf = doublecomplexMalloc_dist(Llu->bufmax[1]); // DOUBLE_ALLOC(Llu->bufmax[1]); - doublecomplex *Uval_buf = doublecomplexMalloc_dist(Llu->bufmax[3]); // DOUBLE_ALLOC(Llu->bufmax[3]); - - if (getenv("LUFILE")) - { - FILE *fp = fopen(getenv("LUFILE"), "r"); - printf("reading from %s", getenv("LUFILE")); - for (int i = 0; i < nsupers; i++) - { - if (LUstruct->Llu->Lrowind_bc_ptr[i]) - { - int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; - doublecomplex *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; - - int_t len = lsub[1]; /* LDA of the nzval[] */ - int_t len2 = SuperSize(i) * len; - fread(Lval_buf, sizeof(doublecomplex), len2, fp); // assume fp will be incremented - zcheckArr(nzval, Lval_buf, len2); - } - - if (LUstruct->Llu->Ufstnz_br_ptr[i]) - { - int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; - doublecomplex *nzval = LUstruct->Llu->Unzval_br_ptr[i]; - int_t lenv = usub[1]; - - fread(Uval_buf, sizeof(doublecomplex), lenv, fp); // assume fp will be incremented - zcheckArr(nzval, Uval_buf, lenv); - } - } - printf("CHecking LU from %s is succesful ", getenv("LUFILE")); - fclose(fp); - } - else - { - printf("Please set environment variable LUFILE to read\n..bye bye"); - exit(0); - } - - return 0; -} - -// Sherry: TODO - this should be moved to a utility file. -/*! 
\brief Dump the factored matrix L using matlab triple-let format - */ -void zDumpLblocks3D(int_t nsupers, gridinfo3d_t *grid3d, - Glu_persist_t *Glu_persist, zLocalLU_t *Llu) -{ - register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb; - int k, mycol, r, n, nmax; - int_t nnzL; - int_t *xsup = Glu_persist->xsup; - int_t *index; - doublecomplex *nzval; - char filename[256]; - FILE *fp, *fopen(); - gridinfo_t *grid = &(grid3d->grid2d); - int iam = grid->iam; - int iam3d = grid3d->iam; - - // assert(grid->npcol*grid->nprow==1); - - // count nonzeros in the first pass - nnzL = 0; - n = 0; - ncb = nsupers / grid->npcol; - extra = nsupers % grid->npcol; - mycol = MYCOL( iam, grid ); - if ( mycol < extra ) ++ncb; - for (lb = 0; lb < ncb; ++lb) { - index = Llu->Lrowind_bc_ptr[lb]; - if ( index ) { /* Not an empty column */ - nzval = Llu->Lnzval_bc_ptr[lb]; - nb = index[0]; - nsupr = index[1]; - gb = lb * grid->npcol + mycol; - nsupc = SuperSize( gb ); - for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { - len = index[k+1]; - - for (j = 0; j < nsupc; ++j) { - for (i=0; i=xsup[gb]+j+1){ - nnzL ++; - nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1); - n = nmax; - } - - } - } - k += LB_DESCRIPTOR + len; - r += len; - } - } - } - MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm); - MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm); - - snprintf(filename, sizeof(filename), "%s-%d", "L", iam3d); - printf("Dumping L factor to --> %s\n", filename); - if ( !(fp = fopen(filename, "w")) ) { - ABORT("File open failed"); - } - - if(grid->iam==0){ - fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); - } - - ncb = nsupers / grid->npcol; - extra = nsupers % grid->npcol; - mycol = MYCOL( iam, grid ); - if ( mycol < extra ) ++ncb; - for (lb = 0; lb < ncb; ++lb) { - index = Llu->Lrowind_bc_ptr[lb]; - if ( index ) { /* Not an empty column */ - nzval = Llu->Lnzval_bc_ptr[lb]; - nb = index[0]; - nsupr = index[1]; - gb = lb * grid->npcol + mycol; - nsupc = SuperSize( gb ); - for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { - len = index[k+1]; - - for (j = 0; j < nsupc; ++j) { - for (i=0; iSolveOnly == YES ) { + options->Fact = DOFACT; + options->Equil = NO; + options->RowPerm = NOROWPERM; + options->ColPerm = NATURAL; + options->ILU_level = 0; + } Fact = options->Fact; validateInput_pzgssvx3d(options, A, ldb, nrhs, grid3d, info); @@ -887,8 +701,10 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (Equil) { zscaleMatrixDiagonally(Fact, ScalePermstruct, A, stat, grid, &rowequ, &colequ, &iinfo); - if (iinfo < 0) - return; // Sherry: TOO - return a number in INFO + if (iinfo < 0) { + *info = -20 - iinfo; + return; + } } /* end if Equil ... LAPACK style, not involving MC64 */ @@ -897,12 +713,12 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, * Gather A from the distributed compressed row format to * global A in compressed column format. * Numerical values are gathered only when a row permutation - ** for large diagonal is sought after. + * for large diagonal is sought after. 
*/ if (Fact != SamePattern_SameRowPerm && (parSymbFact == NO || options->RowPerm != NO)) { - int_t need_value = (options->RowPerm == LargeDiag_MC64); + int need_value = (options->RowPerm == LargeDiag_MC64); pzCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); GAstore = (NCformat *)GA.Store; nnz = GAstore->nnz; @@ -1045,15 +861,15 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (Glu_freeable == NULL) { if (!(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) ABORT("Malloc fails for Glu_freeable."); } zbcastPermutedSparseA(A, ScalePermstruct, Glu_freeable, - LUstruct, grid3d); - } else { - //TODO: need a parmetis version of zbcastPermutedSparseA broadcasting Pslu_freeable - } + LUstruct, grid3d); + } else { + ; /*TODO: need a parmetis version of zbcastPermutedSparseA broadcasting Pslu_freeable*/ } + } perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; @@ -1102,7 +918,7 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, t = SuperLU_timer_(); dist_mem_use = pzdistribute3d_Yang(options, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid3d); + Glu_freeable, LUstruct, grid3d); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factorization. */ @@ -1174,9 +990,6 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, #endif - - - SCT_t *SCT = (SCT_t *)SUPERLU_MALLOC(sizeof(SCT_t)); slu_SCT_init(SCT); @@ -1189,9 +1002,6 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, #endif - - - t = SuperLU_timer_(); /*factorize in grid 1*/ @@ -1427,18 +1237,18 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, } /* end printing stats */ } /* end if not Factored */ - } + } /* end if grid-0 */ if(Solve3D){ if ( options->Fact == DOFACT || options->Fact == SamePattern ) { - /* Need to reset the solve's communication pattern, - because perm_r[] and/or perm_c[] is changed. */ - if ( options->SolveInitialized == YES ) { /* Initialized before */ + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. 
*/ + if ( options->SolveInitialized == YES ) { /* Initialized before */ zSolveFinalize(options, SOLVEstruct); /* Clean up structure */ pzgstrs_delete_device_lsum_x(SOLVEstruct); options->SolveInitialized = NO; /* Reset the solve state */ - } + } } if (get_new3dsolve()){ @@ -2011,9 +1821,8 @@ if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ { if (colequ) { - b_col = B; - for (j = 0; j < nrhs; ++j) - { + b_col = B; + for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { diff --git a/SRC/complex16/zssvx3dAux.c b/SRC/complex16/zssvx3dAux.c index 1bab02b5..b8655419 100644 --- a/SRC/complex16/zssvx3dAux.c +++ b/SRC/complex16/zssvx3dAux.c @@ -406,11 +406,9 @@ void zperform_LargeDiag_MC64( } // int iinfo; - zfindRowPerm_MC64(grid, job, m, n, - nnz, - colptr, - rowind, - a_GA, Equil, perm_r, R1, C1, iinfo); + zfindRowPerm_MC64(grid, job, m, n, nnz, + colptr, rowind, + a_GA, Equil, perm_r, R1, C1, iinfo); if (*iinfo && job == 5) { SUPERLU_FREE(R1); @@ -508,12 +506,11 @@ void zperform_row_permutation( else if (options->RowPerm == LargeDiag_MC64) { - zperform_LargeDiag_MC64( - options, Fact, - ScalePermstruct, LUstruct, - m, n, grid, - A, GA, stat, job, - Equil, rowequ, colequ, iinfo); + zperform_LargeDiag_MC64(options, Fact, + ScalePermstruct, LUstruct, + m, n, grid, + A, GA, stat, job, + Equil, rowequ, colequ, iinfo); } else // LargeDiag_HWPM { diff --git a/SRC/complex16/zutil_dist.c b/SRC/complex16/zutil_dist.c index cdd415b4..b2452cbb 100755 --- a/SRC/complex16/zutil_dist.c +++ b/SRC/complex16/zutil_dist.c @@ -995,6 +995,203 @@ zprint_gsmv_comm(FILE *fp, int_t m_loc, pzgsmv_comm_t *gsmv_comm, return 0; } +int zwriteLUtoDisk(int nsupers, int_t *xsup, zLUstruct_t *LUstruct) +{ + + if (getenv("LUFILE")) + { + FILE *fp = fopen(getenv("LUFILE"), "w"); + printf("writing to %s", getenv("LUFILE")); + for (int i = 0; i < nsupers; i++) + { + if (LUstruct->Llu->Lrowind_bc_ptr[i]) + { + int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; + doublecomplex *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; + + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(i) * len; + fwrite(nzval, sizeof(doublecomplex), len2, fp); // assume fp will be incremented + } + + if (LUstruct->Llu->Ufstnz_br_ptr[i]) + { + int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; + doublecomplex *nzval = LUstruct->Llu->Unzval_br_ptr[i]; + int_t lenv = usub[1]; + + fwrite(nzval, sizeof(doublecomplex), lenv, fp); // assume fp will be incremented + } + } + + fclose(fp); + } + else + { + printf("Please set environment variable LUFILE to write\n..bye bye"); + exit(0); + } + + return 0; +} + +#define EPSILON 1e-3 + +int zcheckArr(doublecomplex *A, doublecomplex *B, int n) +{ + for (int i = 0; i < n; i++) + { + doublecomplex temp; + z_sub(&temp, &A[i], &B[i]); + assert(slud_z_abs(&temp) <= EPSILON * SUPERLU_MIN(slud_z_abs(&A[i]), slud_z_abs(&B[i]))); + } + + return 0; +} + +// Sherry: TODO - this should be moved to a utility file. 
+int zcheckLUFromDisk(int nsupers, int_t *xsup, zLUstruct_t *LUstruct) +{ + zLocalLU_t *Llu = LUstruct->Llu; + + doublecomplex *Lval_buf = doublecomplexMalloc_dist(Llu->bufmax[1]); // DOUBLE_ALLOC(Llu->bufmax[1]); + doublecomplex *Uval_buf = doublecomplexMalloc_dist(Llu->bufmax[3]); // DOUBLE_ALLOC(Llu->bufmax[3]); + + if (getenv("LUFILE")) + { + FILE *fp = fopen(getenv("LUFILE"), "r"); + printf("reading from %s", getenv("LUFILE")); + for (int i = 0; i < nsupers; i++) + { + if (LUstruct->Llu->Lrowind_bc_ptr[i]) + { + int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; + doublecomplex *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; + + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(i) * len; + fread(Lval_buf, sizeof(doublecomplex), len2, fp); // assume fp will be incremented + zcheckArr(nzval, Lval_buf, len2); + } + + if (LUstruct->Llu->Ufstnz_br_ptr[i]) + { + int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; + doublecomplex *nzval = LUstruct->Llu->Unzval_br_ptr[i]; + int_t lenv = usub[1]; + + fread(Uval_buf, sizeof(doublecomplex), lenv, fp); // assume fp will be incremented + zcheckArr(nzval, Uval_buf, lenv); + } + } + printf("CHecking LU from %s is succesful ", getenv("LUFILE")); + fclose(fp); + } + else + { + printf("Please set environment variable LUFILE to read\n..bye bye"); + exit(0); + } + + return 0; +} + +// Sherry: TODO - this should be moved to a utility file. +/*! \brief Dump the factored matrix L using matlab triple-let format + */ +void zDumpLblocks3D(int_t nsupers, gridinfo3d_t *grid3d, + Glu_persist_t *Glu_persist, zLocalLU_t *Llu) +{ + register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb; + int k, mycol, r, n, nmax; + int_t nnzL; + int_t *xsup = Glu_persist->xsup; + int_t *index; + doublecomplex *nzval; + char filename[256]; + FILE *fp, *fopen(); + gridinfo_t *grid = &(grid3d->grid2d); + int iam = grid->iam; + int iam3d = grid3d->iam; + + // assert(grid->npcol*grid->nprow==1); + + // count nonzeros in the first pass + nnzL = 0; + n = 0; + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; i=xsup[gb]+j+1){ + nnzL ++; + nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1); + n = nmax; + } + + } + } + k += LB_DESCRIPTOR + len; + r += len; + } + } + } + MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm); + MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm); + + snprintf(filename, sizeof(filename), "%s-%d", "L", iam3d); + printf("Dumping L factor to --> %s\n", filename); + if ( !(fp = fopen(filename, "w")) ) { + ABORT("File open failed"); + } + + if(grid->iam==0){ + fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); + } + + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; i + * -- Distributed 
SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab, Univ. of California Berkeley. + * May 20, 2024 + * + * Modified: + */ + +#include "superlu_defs.h" + +/** + * @brief This function performs the level-based ILU symbolic factorization on + * matrix Pc*Pr*A*Pc' and sets up the nonzero data structures for L & U matrices. + * In the process, the matrix is also ordered and its memory usage information is gathered. + * + * @param options Input parameters to control how the ILU decomposition will be performed. + * @param A Pointer to the global supermatrix A, permuted by columns in NCPfomat. + * @param perm_c The column permutation vector. + * @param Glu_persist Pointer to the structure which tracks the symbolic factorization information. + * @param Glu_freeable Pointer to the structure which tracks the space used to store L/U data structures. + * @param stat Information on program execution. + */ + +int ilu_level_symbfact +/************************************************************************/ +( + superlu_dist_options_t *options, /* input options */ + SuperMatrix *A, /* original matrix A permuted by columns, GAC from caller (input) */ + int_t *perm_c, /* column permutation vector (input) */ + int_t *etree, /* column elimination tree (input) */ + Glu_persist_t *Glu_persist, /* output */ + Glu_freeable_t *Glu_freeable /* output */ + ) +{ + + if ( options->ILU_level != 0 ) { + printf("ERROR: ILU(k>0) is not implemented yet\n"); + return (-1); + } + + /* Now, do ILU(0) */ + + int_t iinfo; + int i, n = A->ncol; + double t; + + /* Set up supernode partition */ + Glu_persist->supno = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); + Glu_persist->xsup = (int_t *) SUPERLU_MALLOC((n+1) * sizeof(int_t)); + for (i = 0; i < n; i++) { + Glu_persist->supno[i] = i; + Glu_persist->xsup[i] = i; + } + Glu_persist->xsup[n] = n; + + + NCPformat *GACstore = A->Store; + int_t *colbeg, *colend, *rowind, irow; + rowind = GACstore->rowind; + colbeg = GACstore->colbeg; + colend = GACstore->colend; + + Glu_freeable->xlsub = (int_t *) intCalloc_dist(n+1); + Glu_freeable->xusub = (int_t *) intCalloc_dist(n+1); + int_t nnzL = 0, nnzU = 0; + + /* Count nonzeros per column for L & U */ + for (int j = 0; j < n; ++j) { +#if 0 + fscanf(fpU, "%d %d", &col_num, &Glu_freeable.usub[i]); + Glu_freeable.xusub[col_num] += 1; +#endif + for (i = colbeg[j]; i < colend[j]; ++i) { // (j,j) is diagonal + irow = rowind[i]; + if ( irow < j ) { // in U + nnzU++; + Glu_freeable->xusub[j] += 1; + } else { // in L, including diagonal of U + nnzL++; + Glu_freeable->xlsub[j] += 1; + } + } + } + + Glu_freeable->nnzLU = nnzU + nnzL; + Glu_freeable->lsub = (int_t *) SUPERLU_MALLOC(nnzL * sizeof(int_t)); + Glu_freeable->usub = (int_t *) SUPERLU_MALLOC(nnzU * sizeof(int_t)); + + /* Do prefix sum to set up column pointers */ + for(i = 1; i <= n; i++) { + Glu_freeable->xlsub[i] += Glu_freeable->xlsub[i-1]; + Glu_freeable->xusub[i] += Glu_freeable->xusub[i-1]; + } + + return 0; +} + diff --git a/SRC/prec-independent/psymbfact.c b/SRC/prec-independent/psymbfact.c index 8c0b98c3..ff4b617e 100755 --- a/SRC/prec-independent/psymbfact.c +++ b/SRC/prec-independent/psymbfact.c @@ -2157,9 +2157,11 @@ symbfact_vtx /* TEST available memory */ if (next >= x_aind_end) { if (domain_symb) { + int mem_type = (computeL ? 
LSUB : USUB); // Sherry fix 5/26/24 if ( (mem_error = psymbfact_LUXpandMem (iam, n, vtx, next, 0, - computeL, DOMAIN_SYMB, 1, + //computeL, DOMAIN_SYMB, 1, + mem_type, DOMAIN_SYMB, 1, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); } else if ( (mem_error = @@ -4009,9 +4011,11 @@ dnsCurSep_symbfact j = x_newelts[vtx_lid_x+1] + lstVtx - vtx; if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) || (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) { + int mem_type = (computeL ? LSUB : USUB); // Sherry fix 5/26/24 if ( (mem_error = psymbfact_LUXpandMem (iam, n, vtx, next, next + j, - computeL, DNS_CURSEP, 1, + //computeL, DNS_CURSEP, 1, + mem_type, DNS_CURSEP, 1, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); if (computeL) sub = Llu_symbfact->lsub; diff --git a/SRC/prec-independent/trfAux.c b/SRC/prec-independent/trfAux.c index 20fed84c..b2d46d38 100755 --- a/SRC/prec-independent/trfAux.c +++ b/SRC/prec-independent/trfAux.c @@ -2534,15 +2534,16 @@ void applyRowPerm(int_t* colptr, int_t* rowind, int_t* perm_r, int_t n) { * its memory usage information is fetched. * * @param options The options for the SuperLU distribution. + * @param n Dimension of the global matrix A. * @param GA A pointer to the global matrix A. * @param perm_c The column permutation vector. * @param etree The elimination tree of Pc*Pr*A*Pc'. - * @param iam The processor number (0 <= iam < Pr*Pc). * @param Glu_persist Pointer to the structure which tracks the symbolic factorization information. * @param Glu_freeable Pointer to the structure which tracks the space used to store L/U data structures. * @param stat Information on program execution. + * @param grid3d The 3D process grid. */ -void permCol_SymbolicFact3d(superlu_dist_options_t *options, int_t n, SuperMatrix *GA, int_t *perm_c, int_t *etree, +void permCol_SymbolicFact3d(superlu_dist_options_t *options, int n, SuperMatrix *GA, int_t *perm_c, int_t *etree, Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable, SuperLUStat_t *stat, superlu_dist_mem_usage_t*symb_mem_usage, gridinfo3d_t* grid3d) @@ -2577,7 +2578,13 @@ void permCol_SymbolicFact3d(superlu_dist_options_t *options, int_t n, SuperMatri #endif t = SuperLU_timer_(); - iinfo = symbfact(options, iam, &GAC, perm_c, etree, Glu_persist, Glu_freeable); + + if ( options->ILU_level != SLU_EMPTY ) { + iinfo = ilu_level_symbfact(options, &GAC, perm_c, etree, Glu_persist, Glu_freeable); + } else { + iinfo = symbfact(options, iam, &GAC, perm_c, etree, Glu_persist, Glu_freeable); + } + stat->utime[SYMBFAC] = SuperLU_timer_() - t; if (iinfo < 0) diff --git a/SRC/prec-independent/util.c b/SRC/prec-independent/util.c index 91227306..8e7cdd90 100755 --- a/SRC/prec-independent/util.c +++ b/SRC/prec-independent/util.c @@ -1,3 +1,4 @@ + /*! \file Copyright (c) 2003, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required @@ -209,6 +210,7 @@ void set_default_options_dist(superlu_dist_options_t *options) { options->Fact = DOFACT; options->Equil = YES; + options->ILU_level = SLU_EMPTY; options->ParSymbFact = NO; #ifdef HAVE_PARMETIS options->ColPerm = METIS_AT_PLUS_A;
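
The new ilu_level_symbfact() builds the ILU(0) pattern directly from the column-permuted matrix: every nonzero (i, j) with i < j is counted into U, everything else (the diagonal included) into L, and a prefix sum over the per-column counts produces the xlsub/xusub column pointers, exactly the path that permCol_SymbolicFact3d now takes when options->ILU_level != SLU_EMPTY. The standalone sketch below reproduces only that counting and prefix-sum step on a small 4x4 example, using plain int arrays in place of the library's int_t buffers and SuperMatrix storage; it is illustrative, not the library routine.

/* Standalone sketch of the ILU(0) split used in ilu_level_symbfact():
   nonzeros above the diagonal go to U, the rest (diagonal included) to L,
   and per-column counts are turned into column pointers by a prefix sum. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* 4x4 example in compressed-column form (colbeg/colend style). */
    int n = 4;
    int colbeg[] = {0, 3, 5, 7};
    int colend[] = {3, 5, 7, 9};
    int rowind[] = {0, 1, 3,   /* column 0 */
                    1, 2,      /* column 1 */
                    0, 2,      /* column 2 */
                    2, 3};     /* column 3 */

    int *xlsub = calloc(n + 1, sizeof(int));
    int *xusub = calloc(n + 1, sizeof(int));
    int nnzL = 0, nnzU = 0;

    /* Count nonzeros per column for L and U. */
    for (int j = 0; j < n; ++j) {
        for (int p = colbeg[j]; p < colend[j]; ++p) {
            int i = rowind[p];
            if (i < j) { ++nnzU; ++xusub[j]; }   /* strictly upper -> U   */
            else       { ++nnzL; ++xlsub[j]; }   /* diagonal + lower -> L */
        }
    }

    /* Prefix sums turn the counts into cumulative column offsets,
       mirroring the loop at the end of ilu_level_symbfact(). */
    for (int j = 1; j <= n; ++j) {
        xlsub[j] += xlsub[j - 1];
        xusub[j] += xusub[j - 1];
    }

    printf("nnzL = %d, nnzU = %d\n", nnzL, nnzU);
    printf("xlsub:");
    for (int j = 0; j <= n; ++j) printf(" %d", xlsub[j]);
    printf("\nxusub:");
    for (int j = 0; j <= n; ++j) printf(" %d", xusub[j]);
    printf("\n");

    free(xlsub);
    free(xusub);
    return 0;
}

For the matrix above this prints nnzL = 7, nnzU = 2, matching the nine stored nonzeros; filling lsub/usub with the actual row indices is the part the new file still leaves open.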
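
The solve-only path is driven entirely through the options structure: when options->SolveOnly == YES, pzgssvx3d forces Fact = DOFACT, Equil = NO, RowPerm = NOROWPERM, ColPerm = NATURAL and ILU_level = 0, and the comment block added at the top of pzgssvx3d.c describes a two-pass usage (DOFACT for distribution, then FACTORED for the solve). The caller-side sketch below is an assumption built on that comment: the helper name is hypothetical, the pzgssvx3d argument list follows its usual interface, and the numerical L/U factors are assumed to be supplied outside this fragment (e.g. loaded from disk via the relocated zwriteLUtoDisk/zcheckLUFromDisk utilities).

/* Hypothetical caller-side sketch of the new solve-only path; the
   two-pass DOFACT/FACTORED sequence is inferred from the comment block
   at the top of pzgssvx3d.c, not code contained in this patch. */
#include "superlu_zdefs.h"

void zsolve_only_sketch(SuperMatrix *A, doublecomplex *B, int ldb, int nrhs,
                        gridinfo3d_t *grid3d,
                        zScalePermstruct_t *ScalePermstruct,
                        zLUstruct_t *LUstruct, zSOLVEstruct_t *SOLVEstruct,
                        SuperLUStat_t *stat, double *berr, int *info)
{
    superlu_dist_options_t options;
    set_default_options_dist(&options);

    /* Request the solve-only setup; pzgssvx3d then disables equilibration
       and row/column permutations and sets ILU_level = 0. */
    options.SolveOnly = YES;

    /* Pass 1: DOFACT -> symbolic setup and distribution of the L/U
       data structures (numerical factors assumed available elsewhere). */
    options.Fact = DOFACT;
    pzgssvx3d(&options, A, ScalePermstruct, B, ldb, nrhs, grid3d,
              LUstruct, SOLVEstruct, berr, stat, info);

    /* Pass 2: FACTORED -> triangular solves reusing the distributed
       structures; can be repeated for new right-hand sides in B. */
    options.Fact = FACTORED;
    pzgssvx3d(&options, A, ScalePermstruct, B, ldb, nrhs, grid3d,
              LUstruct, SOLVEstruct, berr, stat, info);
}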