From fe88f03d1f9d9ba67c9431c07b40b2b7c961961d Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Wed, 29 May 2024 17:24:39 -0700 Subject: [PATCH] Initial implementation of ILU0 and SolveOnly; Remove the unused batch code; small fix related to LSUB/USUB in psymbfact.c --- Doxygen_Config.txt | 1 + SRC/CMakeLists.txt | 1 + SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp | 1119 --------- .../dsparseTreeFactorGPU_impl.hpp | 122 +- SRC/CplusplusFactor/lupanels_GPU.cuh | 81 +- SRC/CplusplusFactor/schurCompUpdate.cu | 2106 ----------------- SRC/CplusplusFactor/schurCompUpdate_impl.cuh | 12 +- SRC/CplusplusFactor/xlupanels_GPU.cuh | 14 +- SRC/complex16/pzgssvx3d.c | 259 +- SRC/complex16/zssvx3dAux.c | 19 +- SRC/complex16/zutil_dist.c | 197 ++ SRC/include/superlu_defs.h | 6 +- SRC/include/superlu_dist_config.h | 8 +- SRC/include/superlu_zdefs.h | 9 + SRC/prec-independent/get_perm_c.c | 2 +- SRC/prec-independent/ilu_level_symbfact.c | 110 + SRC/prec-independent/psymbfact.c | 8 +- SRC/prec-independent/trfAux.c | 13 +- SRC/prec-independent/util.c | 2 + 19 files changed, 394 insertions(+), 3695 deletions(-) delete mode 100644 SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp delete mode 100644 SRC/CplusplusFactor/schurCompUpdate.cu create mode 100644 SRC/prec-independent/ilu_level_symbfact.c diff --git a/Doxygen_Config.txt b/Doxygen_Config.txt index 091a1c56..8dbe9902 100644 --- a/Doxygen_Config.txt +++ b/Doxygen_Config.txt @@ -1,3 +1,4 @@ + # Doxyfile 1.8.15 # This file describes the settings to be used by the documentation system diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 58a1fb02..d383873d 100755 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -63,6 +63,7 @@ set(sources prec-independent/pxerr_dist.c prec-independent/superlu_timer.c prec-independent/symbfact.c + prec-independent/ilu_level_symbfact.c prec-independent/psymbfact.c prec-independent/psymbfact_util.c prec-independent/get_perm_c_parmetis.c diff --git a/SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp b/SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp deleted file mode 100644 index f3029db3..00000000 --- a/SRC/CplusplusFactor/dsparseTreeFactorGPU.cpp +++ /dev/null @@ -1,1119 +0,0 @@ -#include -#include "superlu_ddefs.h" -#include "lupanels.hpp" -#ifdef HAVE_CUDA -#include "lupanels_GPU.cuh" -#include "batch_block_copy.h" - -// [0, 1, 2, 3] independent -// [0, 1] schur complement update , [2, 3] panel bcast -// [2,3] schur complement update, [ 4,5] panel bcast - - -int getBufferOffset(int k0, int k1, int winSize, int winParity, int halfWin) -{ - int_t offset = (k0-k1)%winSize; - if(winParity%2) - offset+= halfWin; - - return offset; -} - -#if 0 // Sherry: old batch code -////////////////////////////////////////////////////////////////////////////////////////////////////////// - -LUMarshallData::LUMarshallData() -{ - dev_diag_ptrs = dev_panel_ptrs = NULL; - dev_diag_ld_array = dev_diag_dim_array = dev_info_array = NULL; - dev_panel_ld_array = dev_panel_dim_array = NULL; -} - -LUMarshallData::~LUMarshallData() -{ - gpuErrchk(cudaFree(dev_diag_ptrs)); - gpuErrchk(cudaFree(dev_panel_ptrs)); - gpuErrchk(cudaFree(dev_diag_ld_array)); - gpuErrchk(cudaFree(dev_diag_dim_array)); - gpuErrchk(cudaFree(dev_info_array)); - gpuErrchk(cudaFree(dev_panel_ld_array)); - gpuErrchk(cudaFree(dev_panel_dim_array)); -} - -void LUMarshallData::setBatchSize(int batch_size) -{ - gpuErrchk(cudaMalloc(&dev_diag_ptrs, batch_size * sizeof(double*))); - gpuErrchk(cudaMalloc(&dev_panel_ptrs, batch_size * sizeof(double*))); - - gpuErrchk(cudaMalloc(&dev_diag_ld_array, 
batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_diag_dim_array, (batch_size + 1) * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_info_array, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_panel_ld_array, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_panel_dim_array, (batch_size + 1) * sizeof(int))); - - host_diag_ptrs.resize(batch_size); - host_diag_ld_array.resize(batch_size); - host_diag_dim_array.resize(batch_size); - host_panel_ptrs.resize(batch_size); - host_panel_ld_array.resize(batch_size); - host_panel_dim_array.resize(batch_size); -} - -void LUMarshallData::setMaxDiag() -{ - max_diag = 0; - for(int i = 0; i < batchsize; i++) - max_diag = SUPERLU_MAX(max_diag, host_diag_dim_array[i]); -} - -void LUMarshallData::setMaxPanel() -{ - max_panel = 0; - for(int i = 0; i < batchsize; i++) - max_panel = SUPERLU_MAX(max_panel, host_panel_dim_array[i]); -} - -SCUMarshallData::SCUMarshallData() -{ - dev_A_ptrs = dev_B_ptrs = dev_C_ptrs = NULL; - dev_lda_array = dev_ldb_array = dev_ldc_array = NULL; - dev_m_array = dev_n_array = dev_k_array = NULL; - dev_gpu_lpanels = NULL; - dev_gpu_upanels = NULL; - dev_ist = dev_iend = dev_jst = dev_jend = NULL; - dev_maxGemmCols = dev_maxGemmRows = NULL; -} - -SCUMarshallData::~SCUMarshallData() -{ - gpuErrchk(cudaFree(dev_A_ptrs)); - gpuErrchk(cudaFree(dev_B_ptrs)); - gpuErrchk(cudaFree(dev_C_ptrs)); - gpuErrchk(cudaFree(dev_lda_array)); - gpuErrchk(cudaFree(dev_ldb_array)); - gpuErrchk(cudaFree(dev_ldc_array)); - gpuErrchk(cudaFree(dev_m_array)); - gpuErrchk(cudaFree(dev_n_array)); - gpuErrchk(cudaFree(dev_k_array)); - gpuErrchk(cudaFree(dev_gpu_lpanels)); - gpuErrchk(cudaFree(dev_gpu_upanels)); - gpuErrchk(cudaFree(dev_ist)); - gpuErrchk(cudaFree(dev_iend)); - gpuErrchk(cudaFree(dev_jst)); - gpuErrchk(cudaFree(dev_jend)); - gpuErrchk(cudaFree(dev_maxGemmCols)); - gpuErrchk(cudaFree(dev_maxGemmRows)); -} - -void SCUMarshallData::setBatchSize(int batch_size) -{ - gpuErrchk(cudaMalloc(&dev_A_ptrs, batch_size * sizeof(double*))); - gpuErrchk(cudaMalloc(&dev_B_ptrs, batch_size * sizeof(double*))); - gpuErrchk(cudaMalloc(&dev_C_ptrs, batch_size * sizeof(double*))); - - gpuErrchk(cudaMalloc(&dev_lda_array, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_ldb_array, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_ldc_array, batch_size * sizeof(int))); - - gpuErrchk(cudaMalloc(&dev_m_array, (batch_size + 1) * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_n_array, (batch_size + 1) * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_k_array, (batch_size + 1) * sizeof(int))); - - gpuErrchk(cudaMalloc(&dev_ist, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_iend, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_jst, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_jend, batch_size * sizeof(int))); - - gpuErrchk(cudaMalloc(&dev_maxGemmCols, batch_size * sizeof(int))); - gpuErrchk(cudaMalloc(&dev_maxGemmRows, batch_size * sizeof(int))); - - gpuErrchk(cudaMalloc(&dev_gpu_lpanels, batch_size * sizeof(lpanelGPU_t))); - gpuErrchk(cudaMalloc(&dev_gpu_upanels, batch_size * sizeof(upanelGPU_t))); - - host_A_ptrs.resize(batch_size); - host_B_ptrs.resize(batch_size); - host_C_ptrs.resize(batch_size); - host_lda_array.resize(batch_size); - host_ldb_array.resize(batch_size); - host_ldc_array.resize(batch_size); - host_m_array.resize(batch_size); - host_n_array.resize(batch_size); - host_k_array.resize(batch_size); - upanels.resize(batch_size); - lpanels.resize(batch_size); - host_gpu_upanels.resize(batch_size); - 
host_gpu_lpanels.resize(batch_size); - ist.resize(batch_size); - iend.resize(batch_size); - jst.resize(batch_size); - jend.resize(batch_size); - maxGemmRows.resize(batch_size); - maxGemmCols.resize(batch_size); -} - -void SCUMarshallData::setMaxDims() -{ - max_n = max_k = max_m = max_ilen = max_jlen = 0; - for(int i = 0; i < batchsize; i++) - { - max_m = SUPERLU_MAX(max_m, host_m_array[i]); - max_n = SUPERLU_MAX(max_n, host_n_array[i]); - max_k = SUPERLU_MAX(max_k, host_k_array[i]); - max_ilen = SUPERLU_MAX(max_ilen, iend[i] - ist[i]); - max_jlen = SUPERLU_MAX(max_jlen, jend[i] - jst[i]); - } -} - -void SCUMarshallData::copyToGPU() -{ - gpuErrchk(cudaMemcpy(dev_A_ptrs, host_A_ptrs.data(), batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_B_ptrs, host_B_ptrs.data(), batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_C_ptrs, host_C_ptrs.data(), batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - - gpuErrchk(cudaMemcpy(dev_lda_array, host_lda_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_ldb_array, host_ldb_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_ldc_array, host_ldc_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - - gpuErrchk(cudaMemcpy(dev_m_array, host_m_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_n_array, host_n_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_k_array, host_k_array.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - - gpuErrchk(cudaMemcpy(dev_ist, ist.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_iend, iend.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_jst, jst.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_jend, jend.data(), batchsize * sizeof(int), cudaMemcpyHostToDevice)); -} - -void SCUMarshallData::copyPanelDataToGPU() -{ - gpuErrchk(cudaMemcpy(dev_gpu_lpanels, host_gpu_lpanels.data(), batchsize * sizeof(lpanelGPU_t), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dev_gpu_upanels, host_gpu_upanels.data(), batchsize * sizeof(upanelGPU_t), cudaMemcpyHostToDevice)); -} - -void LUstruct_v100::marshallBatchedBufferCopyData(int k_st, int k_end, int_t *perm_c_supno) -{ - // First gather up all the pointer and meta data on the host - LUMarshallData& mdata = A_gpu.marshall_data; - double **panel_ptrs = mdata.host_panel_ptrs.data(); - int *panel_ld_batch = mdata.host_panel_ld_array.data(); - int *panel_dim_batch = mdata.host_panel_dim_array.data(); - double **diag_ptrs = mdata.host_diag_ptrs.data(); - int *diag_ld_batch = mdata.host_diag_ld_array.data(); - int *diag_dim_batch = mdata.host_diag_dim_array.data(); - - mdata.batchsize = 0; - - for (int_t k0 = k_st; k0 < k_end; k0++) - { - int_t k = perm_c_supno[k0]; - int_t buffer_offset = k0 - k_st; - int ksupc = SuperSize(k); - - if (iam == procIJ(k, k)) - { - lpanel_t &lpanel = lPanelVec[g2lCol(k)]; - - assert(mdata.batchsize < mdata.host_diag_ptrs.size()); - - panel_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0); - panel_ld_batch[mdata.batchsize] = lpanel.LDA(); - panel_dim_batch[mdata.batchsize] = ksupc; - - diag_ptrs[mdata.batchsize] = A_gpu.dFBufs[buffer_offset]; - diag_ld_batch[mdata.batchsize] = ksupc; - diag_dim_batch[mdata.batchsize] = ksupc; - - mdata.batchsize++; - - } - } - - mdata.setMaxDiag(); - mdata.setMaxPanel(); - - 
// Then copy the marshalled data over to the GPU - gpuErrchk(cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_diag_ld_array, diag_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_diag_dim_array, diag_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_panel_ptrs, panel_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_panel_ld_array, panel_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(mdata.dev_panel_dim_array, panel_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); -} - -#endif -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - -int_t LUstruct_v100::dDFactPSolveGPU(int_t k, int_t offset, ddiagFactBufs_t **dFBufs) -{ - // this is new version with diagonal factor being performed on GPU - // different from dDiagFactorPanelSolveGPU (it performs diag factor in CPU) - - /* Sherry: argument dFBufs[] is on CPU, not used in this routine */ - - double t0 = SuperLU_timer_(); - int ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[offset]; - cusolverDnHandle_t cusolverH = A_gpu.cuSolveHandles[offset]; - cudaStream_t cuStream = A_gpu.cuStreams[offset]; - - /*======= Diagonal Factorization ======*/ - if (iam == procIJ(k, k)) - { - lPanelVec[g2lCol(k)].diagFactorCuSolver(k, - cusolverH, cuStream, - A_gpu.diagFactWork[offset], A_gpu.diagFactInfo[offset], // CPU pointers - A_gpu.dFBufs[offset], ksupc, // CPU pointers - thresh, xsup, options, stat, info); - - } - - //CHECK_MALLOC(iam, "after diagFactorCuSolver()"); - - //TODO: need to synchronize the cuda stream - /*======= Diagonal Broadcast ======*/ - if (myrow == krow(k)) - MPI_Bcast((void *)A_gpu.dFBufs[offset], ksupc * ksupc, - MPI_DOUBLE, kcol(k), (grid->rscp).comm); - - //CHECK_MALLOC(iam, "after row Bcast"); - - if (mycol == kcol(k)) - MPI_Bcast((void *)A_gpu.dFBufs[offset], ksupc * ksupc, - MPI_DOUBLE, krow(k), (grid->cscp).comm); - - // do the panels solver - if (myrow == krow(k)) - { - uPanelVec[g2lRow(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[offset], ksupc); - cudaStreamSynchronize(cuStream); // synchronize befpre broadcast - } - - if (mycol == kcol(k)) - { - lPanelVec[g2lCol(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[offset], ksupc); - cudaStreamSynchronize(cuStream); - } - SCT->tDiagFactorPanelSolve += (SuperLU_timer_() - t0); - - return 0; -} /* dDFactPSolveGPU */ - - -int_t LUstruct_v100::dDFactPSolveGPU(int_t k, int_t handle_offset, int buffer_offset, ddiagFactBufs_t **dFBufs) -{ - // this is new version with diagonal factor being performed on GPU - // different from dDiagFactorPanelSolveGPU (it performs diag factor in CPU) - - /* Sherry: argument dFBufs[] is on CPU, not used in this routine */ - - double t0 = SuperLU_timer_(); - int ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[handle_offset]; - cusolverDnHandle_t cusolverH = A_gpu.cuSolveHandles[handle_offset]; - cudaStream_t cuStream = A_gpu.cuStreams[handle_offset]; - - /*======= Diagonal Factorization ======*/ - if (iam == procIJ(k, k)) - { - lPanelVec[g2lCol(k)].diagFactorCuSolver(k, - cusolverH, cuStream, - A_gpu.diagFactWork[handle_offset], A_gpu.diagFactInfo[handle_offset], // CPU pointers - A_gpu.dFBufs[buffer_offset], ksupc, // 
CPU pointers - thresh, xsup, options, stat, info); - - } - - //CHECK_MALLOC(iam, "after diagFactorCuSolver()"); - - //TODO: need to synchronize the cuda stream - /*======= Diagonal Broadcast ======*/ - if (myrow == krow(k)) - MPI_Bcast((void *)A_gpu.dFBufs[buffer_offset], ksupc * ksupc, - MPI_DOUBLE, kcol(k), (grid->rscp).comm); - - //CHECK_MALLOC(iam, "after row Bcast"); - - if (mycol == kcol(k)) - MPI_Bcast((void *)A_gpu.dFBufs[buffer_offset], ksupc * ksupc, - MPI_DOUBLE, krow(k), (grid->cscp).comm); - - // do the panels solver - if (myrow == krow(k)) - { - uPanelVec[g2lRow(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[buffer_offset], ksupc); - cudaStreamSynchronize(cuStream); // synchronize befpre broadcast - } - - if (mycol == kcol(k)) - { - lPanelVec[g2lCol(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[buffer_offset], ksupc); - cudaStreamSynchronize(cuStream); - } - SCT->tDiagFactorPanelSolve += (SuperLU_timer_() - t0); - - return 0; -} /* dDFactPSolveGPU */ - -/* This performs diag factor on CPU */ -int_t LUstruct_v100::dDiagFactorPanelSolveGPU(int_t k, int_t offset, ddiagFactBufs_t **dFBufs) -{ - double t0 = SuperLU_timer_(); - int_t ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[offset]; - cudaStream_t cuStream = A_gpu.cuStreams[offset]; - if (iam == procIJ(k, k)) - { - - lPanelVec[g2lCol(k)].diagFactorPackDiagBlockGPU(k, - dFBufs[offset]->BlockUFactor, ksupc, // CPU pointers - dFBufs[offset]->BlockLFactor, ksupc, // CPU pointers - thresh, xsup, options, stat, info); - } - - /*======= Diagonal Broadcast ======*/ - if (myrow == krow(k)) - MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc, - MPI_DOUBLE, kcol(k), (grid->rscp).comm); - if (mycol == kcol(k)) - MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc, - MPI_DOUBLE, krow(k), (grid->cscp).comm); - - /*======= Panel Update ======*/ - if (myrow == krow(k)) - { - - cudaMemcpy(A_gpu.dFBufs[offset], dFBufs[offset]->BlockLFactor, - ksupc * ksupc * sizeof(double), cudaMemcpyHostToDevice); - uPanelVec[g2lRow(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[offset], ksupc); - cudaStreamSynchronize(cuStream); // synchronize befpre broadcast - } - - if (mycol == kcol(k)) - { - cudaMemcpy(A_gpu.dFBufs[offset], dFBufs[offset]->BlockUFactor, - ksupc * ksupc * sizeof(double), cudaMemcpyHostToDevice); - lPanelVec[g2lCol(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[offset], ksupc); - cudaStreamSynchronize(cuStream); - } - SCT->tDiagFactorPanelSolve += (SuperLU_timer_() - t0); - - return 0; -} - -int_t LUstruct_v100::dPanelBcastGPU(int_t k, int_t offset) -{ - double t0 = SuperLU_timer_(); - /*======= Panel Broadcast ======*/ - // upanel_t k_upanel(UidxRecvBufs[offset], UvalRecvBufs[offset], - // A_gpu.UidxRecvBufs[offset], A_gpu.UvalRecvBufs[offset]); - // lpanel_t k_lpanel(LidxRecvBufs[offset], LvalRecvBufs[offset], - // A_gpu.LidxRecvBufs[offset], A_gpu.LvalRecvBufs[offset]); - // if (myrow == krow(k)) - // { - // k_upanel = uPanelVec[g2lRow(k)]; - // } - // if (mycol == kcol(k)) - // k_lpanel = lPanelVec[g2lCol(k)]; - upanel_t k_upanel = getKUpanel(k,offset); - lpanel_t k_lpanel = getKLpanel(k,offset); - - - if (UidxSendCounts[k] > 0) - { - // assuming GPU direct is available - MPI_Bcast(k_upanel.gpuPanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm); - MPI_Bcast(k_upanel.gpuPanel.val, UvalSendCounts[k], MPI_DOUBLE, krow(k), grid3d->cscp.comm); - // copy the index to cpu - gpuErrchk(cudaMemcpy(k_upanel.index, 
k_upanel.gpuPanel.index, - sizeof(int_t) * UidxSendCounts[k], cudaMemcpyDeviceToHost)); - } - - if (LidxSendCounts[k] > 0) - { - MPI_Bcast(k_lpanel.gpuPanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm); - MPI_Bcast(k_lpanel.gpuPanel.val, LvalSendCounts[k], MPI_DOUBLE, kcol(k), grid3d->rscp.comm); - gpuErrchk(cudaMemcpy(k_lpanel.index, k_lpanel.gpuPanel.index, - sizeof(int_t) * LidxSendCounts[k], cudaMemcpyDeviceToHost)); - } - SCT->tPanelBcast += (SuperLU_timer_() - t0); - return 0; -} - -int_t LUstruct_v100::dsparseTreeFactorGPU( - sForest_t *sforest, - ddiagFactBufs_t **dFBufs, // size maxEtree level - gEtreeInfo_t *gEtreeInfo, // global etree info - - int tag_ub) -{ - int_t nnodes = sforest->nNodes; // number of nodes in the tree - if (nnodes < 1) - { - return 1; - } - - int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization - treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; - int_t *myIperm = treeTopoInfo->myIperm; - int_t maxTopoLevel = treeTopoInfo->numLvl; - int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; - - /*main loop over all the levels*/ - int_t numLA = SUPERLU_MIN(A_gpu.numCudaStreams, getNumLookAhead(options)); - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorGPU()"); -#endif - printf("Using New C++ code with GPU acceleration\n"); fflush(stdout); - printf(". lookahead numLA %d\n", numLA); fflush(stdout); - - // start the pipeline. Sherry: need to free these 3 arrays - int *donePanelBcast = int32Malloc_dist(nnodes); - int *donePanelSolve = int32Malloc_dist(nnodes); - int *localNumChildrenLeft = int32Malloc_dist(nnodes); - - //TODO: not needed, remove after testing - for (int_t i = 0; i < nnodes; i++) - { - donePanelBcast[i] = 0; - donePanelSolve[i] = 0; - localNumChildrenLeft[i] = 0; - } - - for (int_t k0 = 0; k0 < nnodes; k0++) - { - int_t k = perm_c_supno[k0]; - int_t k_parent = gEtreeInfo->setree[k]; - int_t ik = myIperm[k_parent]; - if (ik > -1 && ik < nnodes) - localNumChildrenLeft[ik]++; - } - - // start the pipeline - int_t topoLvl = 0; - int_t k_st = eTreeTopLims[topoLvl]; - int_t k_end = eTreeTopLims[topoLvl + 1]; - - //TODO: make this asynchronous - for (int_t k0 = k_st; k0 < k_end; k0++) - { - int_t k = perm_c_supno[k0]; - int_t offset = 0; - // dDiagFactorPanelSolveGPU(k, offset, dFBufs); - dDFactPSolveGPU(k, offset, dFBufs); - donePanelSolve[k0]=1; - } - - //TODO: its really the panels that needs to be doubled - // everything else can remain as it is - int_t winSize = SUPERLU_MIN(numLA/2, eTreeTopLims[1]); - - printf(". 
lookahead winSize %d\n", winSize); fflush(stdout); - - for (int k0 = k_st; k0 < winSize; ++k0) - { - int_t k = perm_c_supno[k0]; - int_t offset = k0%numLA; - if(!donePanelBcast[k0]) - { - dPanelBcastGPU(k, offset); - donePanelBcast[k0] =1; - } - }/*for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)*/ - - int_t k1 =0; - int_t winParity=0; - int_t halfWin = numLA/2; - while(k1setree[k]; - /* L o o k A h e a d P a n e l U p d a t e */ - if(UidxSendCounts[k]>0 && LidxSendCounts[k]>0) - lookAheadUpdateGPU(offset, k,k_parent, k_lpanel,k_upanel); - } - - for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1+winSize); ++k0) - { - int_t k = perm_c_supno[k0]; - int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin); - SyncLookAheadUpdate(offset); - } - - for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1+winSize); ++k0) - { - int_t k = perm_c_supno[k0]; - int_t offset = getBufferOffset(k0, k1, winSize, winParity, halfWin); - upanel_t k_upanel = getKUpanel(k,offset); - lpanel_t k_lpanel = getKLpanel(k,offset); - int_t k_parent = gEtreeInfo->setree[k]; - /* Look Ahead Panel Solve */ - if(k_parent < nsupers) - { - int_t k0_parent = myIperm[k_parent]; - if (k0_parent > 0 && k0_parent= 2) - printf("parent %d of node %d during second phase\n", k0_parent, k0); -#endif - int_t dOffset = 0; // this is wrong - // dDiagFactorPanelSolveGPU(k_parent, dOffset,dFBufs); - dDFactPSolveGPU(k_parent, dOffset,dFBufs); - donePanelSolve[k0_parent]=1; - } - } - } - - /*proceed with remaining SchurComplement update */ - if(UidxSendCounts[k]>0 && LidxSendCounts[k]>0) - dSchurCompUpdateExcludeOneGPU(offset, k,k_parent, k_lpanel,k_upanel); - - } - - int_t k1_next = k1+winSize; - int_t oldWinSize = winSize; - for (int_t k0_next = k1_next; k0_next < SUPERLU_MIN(nnodes, k1_next+winSize); ++k0_next) - { - int k_next = perm_c_supno[k0_next]; - if (!localNumChildrenLeft[k0_next]) - { - // int offset_next = (k0_next-k1_next)%winSize; - // if(!(winParity%2)) - // offset_next += halfWin; - - int_t offset_next = getBufferOffset(k0_next, k1_next, winSize, winParity+1, halfWin); - dPanelBcastGPU(k_next, offset_next); - donePanelBcast[k0_next] =1; - // printf("Trying %d on offset %d\n", k0_next, offset_next); - } - else - { - winSize = k0_next - k1_next; - break; - } - } - - for (int_t k0 = k1; k0 < SUPERLU_MIN(nnodes, k1+oldWinSize); ++k0) - { - int_t k = perm_c_supno[k0]; - // int_t offset = (k0-k1)%oldWinSize; - // if(winParity%2) - // offset+= halfWin; - int_t offset = getBufferOffset(k0, k1, oldWinSize, winParity, halfWin); - // printf("Syncing stream %d on offset %d\n", k0, offset); - if(UidxSendCounts[k]>0 && LidxSendCounts[k]>0) - gpuErrchk(cudaStreamSynchronize(A_gpu.cuStreams[offset])); - } - - k1=k1_next; - winParity++; - } - -#if 0 - - for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) - { - /* code */ - int_t k_st = eTreeTopLims[topoLvl]; - int_t k_end = eTreeTopLims[topoLvl + 1]; - for (int_t k0 = k_st; k0 < k_end; ++k0) - { - int_t k = perm_c_supno[k0]; - - int_t ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[0]; - cudaStream_t cuStream = A_gpu.cuStreams[0]; - dDiagFactorPanelSolveGPU(k, 0, dFBufs); - /*======= Panel Broadcast ======*/ - // panelBcastGPU(k, 0); - int_t offset = k0%numLA; - dPanelBcastGPU(k, offset); - - /*======= Schurcomplement Update ======*/ - upanel_t k_upanel(UidxRecvBufs[offset], UvalRecvBufs[offset], - A_gpu.UidxRecvBufs[offset], A_gpu.UvalRecvBufs[offset]); - lpanel_t k_lpanel(LidxRecvBufs[offset], LvalRecvBufs[offset], - A_gpu.LidxRecvBufs[offset], 
A_gpu.LvalRecvBufs[offset]); - if (myrow == krow(k)) - { - k_upanel = uPanelVec[g2lRow(k)]; - } - if (mycol == kcol(k)) - k_lpanel = lPanelVec[g2lCol(k)]; - - if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) - { - int streamId = 0; - - -#if 0 - - dSchurComplementUpdateGPU( - streamId, - k, k_lpanel, k_upanel); -#else - int_t k_parent = gEtreeInfo->setree[k]; - lookAheadUpdateGPU( - streamId, - k, k_parent, k_lpanel, k_upanel); - dSchurCompUpdateExcludeOneGPU( - streamId, - k, k_parent, k_lpanel, k_upanel); - -#endif - - } - } /*for k0= k_st:k_end */ - } /*for topoLvl = 0:maxTopoLevel*/ - -#endif /* match #if 0 at line 562 before "for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)" */ - - - /* Sherry added 2/1/23 */ - SUPERLU_FREE(donePanelBcast); - SUPERLU_FREE(donePanelSolve); - SUPERLU_FREE(localNumChildrenLeft); - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorGPU()"); -#endif - - return 0; -} /* dsparseTreeFactorGPU */ - - -#if 0 // Sherry: old batch code -////////////////////////////////////////////////////////////////////////////////////////////////////////// - -void LUstruct_v100::dFactBatchSolve(int k_st, int k_end, int_t *perm_c_supno) -{ -#ifdef HAVE_MAGMA - // Marshall the data for the leaf level batched LU decomposition - LUMarshallData& mdata = A_gpu.marshall_data; - cudaStream_t stream = A_gpu.cuStreams[0]; - marshallBatchedLUData(k_st, k_end, perm_c_supno); - - int info = magma_dgetrf_vbatched( - mdata.dev_diag_dim_array, mdata.dev_diag_dim_array, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - NULL, mdata.dev_info_array, mdata.batchsize, - A_gpu.magma_queue - ); - - // Hackathon change: assuming individual matrices in a local batch are not distributed - // so we don't do the buffer copies anymore - // // Copy the diagonal block data over to the bcast buffers - // marshallBatchedBufferCopyData(k_st, k_end, perm_c_supno); - - // copyBlock_vbatch( - // stream, mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, mdata.max_diag, mdata.max_panel, - // mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - // mdata.batchsize - // ); - - // Upper panel triangular solves - marshallBatchedTRSMUData(k_st, k_end, perm_c_supno); - - magmablas_dtrsm_vbatched( - MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, - mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, 1.0, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - mdata.batchsize, A_gpu.magma_queue - ); - - // Lower panel triangular solves - marshallBatchedTRSMLData(k_st, k_end, perm_c_supno); - - magmablas_dtrsm_vbatched( - MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, - mdata.dev_panel_dim_array, mdata.dev_diag_dim_array, 1.0, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - mdata.batchsize, A_gpu.magma_queue - ); - - // Wajih: This should be converted to a single bcast if possible - // Hackathon change: assuming individual matrices in a local batch are not distributed - // for (int k0 = k_st; k0 < k_end; k0++) - // { - // int k = perm_c_supno[k0]; - // int offset = 0; - // /*======= Panel Broadcast ======*/ - // dPanelBcastGPU(k, offset); - // } - - // Initialize the schur complement update marshall data - initSCUMarshallData(k_st, k_end, perm_c_supno); - SCUMarshallData& sc_mdata = A_gpu.sc_marshall_data; - - // Keep marshalling while there are batches to be processed - int done_i = 0; - int total_batches = 0; - while(done_i == 0) - { - done_i = 
marshallSCUBatchedDataOuter(k_st, k_end, perm_c_supno); - int done_j = 0; - while(done_i == 0 && done_j == 0) - { - done_j = marshallSCUBatchedDataInner(k_st, k_end, perm_c_supno); - if(done_j != 1) - { - total_batches++; - magmablas_dgemm_vbatched_max_nocheck ( - MagmaNoTrans, MagmaNoTrans, sc_mdata.dev_m_array, sc_mdata.dev_n_array, sc_mdata.dev_k_array, - 1.0, sc_mdata.dev_A_ptrs, sc_mdata.dev_lda_array, sc_mdata.dev_B_ptrs, sc_mdata.dev_ldb_array, - 0.0, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, sc_mdata.batchsize, - sc_mdata.max_m, sc_mdata.max_n, sc_mdata.max_k, A_gpu.magma_queue - ); - - scatterGPU_batchDriver( - sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend, - sc_mdata.max_ilen, sc_mdata.max_jlen, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, - A_gpu.maxSuperSize, ldt, sc_mdata.dev_gpu_lpanels, sc_mdata.dev_gpu_upanels, - dA_gpu, sc_mdata.batchsize, A_gpu.cuStreams[0] - ); - } - } - } - // printf("SCU batches = %d\n", total_batches); -#endif -} - -int LUstruct_v100::dsparseTreeFactorBatchGPU( - sForest_t *sforest, - ddiagFactBufs_t **dFBufs, // size maxEtree level - gEtreeInfo_t *gEtreeInfo, // global etree info - int tag_ub) -{ -#ifdef HAVE_MAGMA - int nnodes = sforest->nNodes; // number of nodes in the tree - int topoLvl, k_st, k_end, k0, k, offset, ksupc; - if (nnodes < 1) - { - return 1; - } - - int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization - treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; - int_t *myIperm = treeTopoInfo->myIperm; - int_t maxTopoLevel = treeTopoInfo->numLvl; - int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; - - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorBatchGPU()"); -#endif - printf("Using level-based scheduling on GPU\n"); fflush(stdout); - - /* For all the leaves at level 0 */ - topoLvl = 0; - k_st = eTreeTopLims[topoLvl]; - k_end = eTreeTopLims[topoLvl + 1]; - printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end); fflush(stdout); - -#if 0 - //ToDo: make this batched - for (k0 = k_st; k0 < k_end; k0++) - { - k = perm_c_supno[k0]; - offset = k0 - k_st; - // dDiagFactorPanelSolveGPU(k, offset, dFBufs); - dDFactPSolveGPU(k, 0, offset, dFBufs); - - /*======= Panel Broadcast ======*/ - dPanelBcastGPU(k, offset); // does this only if (UidxSendCounts[k] > 0) - //donePanelSolve[k0]=1; - - /*======= Schurcomplement Update ======*/ - /* UidxSendCounts are computed in LUstruct_v100 constructor in LUpanels.cpp */ - if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) { - // k_upanel.checkCorrectness(); - int streamId = 0; - upanel_t k_upanel = getKUpanel(k,offset); - lpanel_t k_lpanel = getKLpanel(k,offset); - dSchurComplementUpdateGPU( streamId, - k, k_lpanel, k_upanel); - // cudaStreamSynchronize(cuStream); // there is sync inside the kernel - } - } - - /* Main loop over all the internal levels */ - for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl) { - - k_st = eTreeTopLims[topoLvl]; - k_end = eTreeTopLims[topoLvl + 1]; - - printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end); fflush(stdout); - - /* loop over all the nodes at level topoLvl */ - for (k0 = k_st; k0 < k_end; ++k0) { /* ToDo: batch this */ - k = perm_c_supno[k0]; - offset = k0 - k_st; - // offset = getBufferOffset(k0, k1, winSize, winParity, halfWin); - //ksupc = SuperSize(k); - - dDFactPSolveGPU(k, 0, offset, dFBufs); - - /*======= Panel Broadcast ======*/ - dPanelBcastGPU(k, offset); // does this only if (UidxSendCounts[k] > 0) - - /*======= Schurcomplement Update ======*/ - if 
(UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) - { - // k_upanel.checkCorrectness(); - int streamId = 0; - - //#define NDEBUG -#ifndef NDEBUG - checkGPU(); // ?? -#endif - upanel_t k_upanel = getKUpanel(k,offset); - lpanel_t k_lpanel = getKLpanel(k,offset); - dSchurComplementUpdateGPU(streamId, - k, k_lpanel, k_upanel); -// cudaStreamSynchronize(cuStream); // there is sync inside the kernel -#if 0 // Sherry commented out 7/4/23 -#ifndef NDEBUG - dSchurComplementUpdate(k, k_lpanel, k_upanel); // ?? why do this on CPU ? - cudaStreamSynchronize(cuStream); - checkGPU(); -#endif -#endif - } - // MPI_Barrier(grid3d->comm); - } /* end for k0= k_st:k_end */ - } /* end for topoLvl = 0:maxTopoLevel */ -#else - printf("Using batched code\n"); - double t0 = SuperLU_timer_(); - - // Copy the nodelist to the GPU - gpuErrchk(cudaMemcpy(A_gpu.dperm_c_supno, perm_c_supno, sizeof(int) * sforest->nNodes, cudaMemcpyHostToDevice)); - - // Solve level by level - dFactBatchSolve(k_st, k_end, perm_c_supno); - for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl) { - - k_st = eTreeTopLims[topoLvl]; - k_end = eTreeTopLims[topoLvl + 1]; - dFactBatchSolve(k_st, k_end, perm_c_supno); - } - printf("Batch time = %.3f\n", SuperLU_timer_() - t0); -#endif - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorBatchGPU()"); -#endif - -#else - printf("MAGMA is required for batched execution!\n"); - fflush(stdout); - exit(0); - -#endif /* match ifdef have_magma */ - - return 0; -} /* end dsparseTreeFactorBatchGPU */ - -#endif // Sherry: old batch code commented out -////////////////////////////////////////////////////////////////////////////////////////////////////////// - -//TODO: needs to be merged as a single factorization function -int_t LUstruct_v100::dsparseTreeFactorGPUBaseline( - sForest_t *sforest, - ddiagFactBufs_t **dFBufs, // size maxEtree level - gEtreeInfo_t *gEtreeInfo, // global etree info - - int tag_ub) -{ - int_t nnodes = sforest->nNodes; // number of nodes in the tree - if (nnodes < 1) - { - return 1; - } - - printf("Using New C++ code with GPU acceleration\n"); -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Enter dsparseTreeFactorGPUBaseline()"); -#endif - - int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization - treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; - int_t *myIperm = treeTopoInfo->myIperm; - int_t maxTopoLevel = treeTopoInfo->numLvl; - int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; - - /*main loop over all the levels*/ - int_t numLA = getNumLookAhead(options); - - // start the pipeline - - for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) - { - /* code */ - int_t k_st = eTreeTopLims[topoLvl]; - int_t k_end = eTreeTopLims[topoLvl + 1]; - for (int_t k0 = k_st; k0 < k_end; ++k0) - { - int_t k = perm_c_supno[k0]; - int_t offset = k0 - k_st; - int_t ksupc = SuperSize(k); - cublasHandle_t cubHandle = A_gpu.cuHandles[0]; - cudaStream_t cuStream = A_gpu.cuStreams[0]; - /*======= Diagonal Factorization ======*/ - if (iam == procIJ(k, k)) - { -//#define NDEBUG -#ifndef NDEBUG - lPanelVec[g2lCol(k)].checkGPU(); - lPanelVec[g2lCol(k)].diagFactor(k, dFBufs[offset]->BlockUFactor, ksupc, - thresh, xsup, options, stat, info); - lPanelVec[g2lCol(k)].packDiagBlock(dFBufs[offset]->BlockLFactor, ksupc); -#endif - lPanelVec[g2lCol(k)].diagFactorPackDiagBlockGPU(k, - dFBufs[offset]->BlockUFactor, ksupc, // CPU pointers - dFBufs[offset]->BlockLFactor, ksupc, // CPU pointers - thresh, xsup, options, stat, info); -// 
cudaStreamSynchronize(cuStream); -#ifndef NDEBUG - cudaStreamSynchronize(cuStream); - lPanelVec[g2lCol(k)].checkGPU(); -#endif - } - - /*======= Diagonal Broadcast ======*/ - if (myrow == krow(k)) - MPI_Bcast((void *)dFBufs[offset]->BlockLFactor, ksupc * ksupc, - MPI_DOUBLE, kcol(k), (grid->rscp).comm); - if (mycol == kcol(k)) - MPI_Bcast((void *)dFBufs[offset]->BlockUFactor, ksupc * ksupc, - MPI_DOUBLE, krow(k), (grid->cscp).comm); - - /*======= Panel Update ======*/ - if (myrow == krow(k)) - { -#ifndef NDEBUG - uPanelVec[g2lRow(k)].checkGPU(); -#endif - cudaMemcpy(A_gpu.dFBufs[0], dFBufs[offset]->BlockLFactor, - ksupc * ksupc * sizeof(double), cudaMemcpyHostToDevice); - uPanelVec[g2lRow(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[0], ksupc); - cudaStreamSynchronize(cuStream); // synchronize befpre broadcast -#ifndef NDEBUG - uPanelVec[g2lRow(k)].panelSolve(ksupc, dFBufs[offset]->BlockLFactor, ksupc); - cudaStreamSynchronize(cuStream); - uPanelVec[g2lRow(k)].checkGPU(); -#endif - } - - if (mycol == kcol(k)) - { - cudaMemcpy(A_gpu.dFBufs[0], dFBufs[offset]->BlockUFactor, - ksupc * ksupc * sizeof(double), cudaMemcpyHostToDevice); - lPanelVec[g2lCol(k)].panelSolveGPU( - cubHandle, cuStream, - ksupc, A_gpu.dFBufs[0], ksupc); - cudaStreamSynchronize(cuStream); -#ifndef NDEBUG - lPanelVec[g2lCol(k)].panelSolve(ksupc, dFBufs[offset]->BlockUFactor, ksupc); - cudaStreamSynchronize(cuStream); - lPanelVec[g2lCol(k)].checkGPU(); -#endif - } - - /*======= Panel Broadcast ======*/ - upanel_t k_upanel(UidxRecvBufs[0], UvalRecvBufs[0], - A_gpu.UidxRecvBufs[0], A_gpu.UvalRecvBufs[0]); - lpanel_t k_lpanel(LidxRecvBufs[0], LvalRecvBufs[0], - A_gpu.LidxRecvBufs[0], A_gpu.LvalRecvBufs[0]); - if (myrow == krow(k)) - { - k_upanel = uPanelVec[g2lRow(k)]; - } - if (mycol == kcol(k)) - k_lpanel = lPanelVec[g2lCol(k)]; - - if (UidxSendCounts[k] > 0) - { - // assuming GPU direct is available - MPI_Bcast(k_upanel.gpuPanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm); - MPI_Bcast(k_upanel.gpuPanel.val, UvalSendCounts[k], MPI_DOUBLE, krow(k), grid3d->cscp.comm); - // copy the index to cpu - cudaMemcpy(k_upanel.index, k_upanel.gpuPanel.index, - sizeof(int_t) * UidxSendCounts[k], cudaMemcpyDeviceToHost); - -#ifndef NDEBUG - MPI_Bcast(k_upanel.index, UidxSendCounts[k], mpi_int_t, krow(k), grid3d->cscp.comm); - MPI_Bcast(k_upanel.val, UvalSendCounts[k], MPI_DOUBLE, krow(k), grid3d->cscp.comm); -#endif - } - - if (LidxSendCounts[k] > 0) - { - MPI_Bcast(k_lpanel.gpuPanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm); - MPI_Bcast(k_lpanel.gpuPanel.val, LvalSendCounts[k], MPI_DOUBLE, kcol(k), grid3d->rscp.comm); - cudaMemcpy(k_lpanel.index, k_lpanel.gpuPanel.index, - sizeof(int_t) * LidxSendCounts[k], cudaMemcpyDeviceToHost); - -#ifndef NDEBUG - MPI_Bcast(k_lpanel.index, LidxSendCounts[k], mpi_int_t, kcol(k), grid3d->rscp.comm); - MPI_Bcast(k_lpanel.val, LvalSendCounts[k], MPI_DOUBLE, kcol(k), grid3d->rscp.comm); -#endif - } - - /*======= Schurcomplement Update ======*/ - - if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) - { - // k_upanel.checkCorrectness(); - int streamId = 0; -#ifndef NDEBUG - checkGPU(); -#endif - dSchurComplementUpdateGPU( - streamId, - k, k_lpanel, k_upanel); -// cudaStreamSynchronize(cuStream); // there is sync inside the kernel -#ifndef NDEBUG - dSchurComplementUpdate(k, k_lpanel, k_upanel); - cudaStreamSynchronize(cuStream); - checkGPU(); -#endif - } - // MPI_Barrier(grid3d->comm); - - } /*for k0= k_st:k_end */ - - } /*for topoLvl = 
0:maxTopoLevel*/ - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorGPUBaseline()"); -#endif - - return 0; -} /* dsparseTreeFactorGPUBaseline */ -#endif diff --git a/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp b/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp index 0a9a7b9b..acc738d7 100644 --- a/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp +++ b/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp @@ -720,97 +720,6 @@ void xLUstruct_t::marshallBatchedBufferCopyData(int k_st, int k_end, int_ #endif //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -template -void xLUstruct_t::dFactBatchSolve(int k_st, int k_end, int_t *perm_c_supno) -{ -#if 0//def HAVE_MAGMA - //#ifdef HAVE_MAGMA - // Marshall the data for the leaf level batched LU decomposition - LUMarshallData &mdata = A_gpu.marshall_data; - cudaStream_t stream = A_gpu.cuStreams[0]; - marshallBatchedLUData(k_st, k_end, perm_c_supno); - - int info = magma_dgetrf_vbatched( - mdata.dev_diag_dim_array, mdata.dev_diag_dim_array, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - NULL, mdata.dev_info_array, mdata.batchsize, - A_gpu.magma_queue); - - // Hackathon change: assuming individual matrices in a local batch are not distributed - // so we don't do the buffer copies anymore - // // Copy the diagonal block data over to the bcast buffers - // marshallBatchedBufferCopyData(k_st, k_end, perm_c_supno); - - // copyBlock_vbatch( - // stream, mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, mdata.max_diag, mdata.max_panel, - // mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - // mdata.batchsize - // ); - - // Upper panel triangular solves - marshallBatchedTRSMUData(k_st, k_end, perm_c_supno); - - magmablas_dtrsm_vbatched( - MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, - mdata.dev_diag_dim_array, mdata.dev_panel_dim_array, 1.0, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - mdata.batchsize, A_gpu.magma_queue); - - // Lower panel triangular solves - marshallBatchedTRSMLData(k_st, k_end, perm_c_supno); - - magmablas_dtrsm_vbatched( - MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, - mdata.dev_panel_dim_array, mdata.dev_diag_dim_array, 1.0, - mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, - mdata.batchsize, A_gpu.magma_queue); - - // Wajih: This should be converted to a single bcast if possible - // Hackathon change: assuming individual matrices in a local batch are not distributed - // for (int k0 = k_st; k0 < k_end; k0++) - // { - // int k = perm_c_supno[k0]; - // int offset = 0; - // /*======= Panel Broadcast ======*/ - // dPanelBcastGPU(k, offset); - // } - - // Initialize the schur complement update marshall data - initSCUMarshallData(k_st, k_end, perm_c_supno); - xSCUMarshallData &sc_mdata = A_gpu.sc_marshall_data; - - // Keep marshalling while there are batches to be processed - int done_i = 0; - int total_batches = 0; - while (done_i == 0) - { - done_i = marshallSCUBatchedDataOuter(k_st, k_end, perm_c_supno); - int done_j = 0; - while (done_i == 0 && done_j == 0) - { - done_j = marshallSCUBatchedDataInner(k_st, k_end, perm_c_supno); - if (done_j != 1) - { - total_batches++; - magmablas_dgemm_vbatched_max_nocheck( - MagmaNoTrans, MagmaNoTrans, sc_mdata.dev_m_array, sc_mdata.dev_n_array, sc_mdata.dev_k_array, - 1.0, sc_mdata.dev_A_ptrs, sc_mdata.dev_lda_array, sc_mdata.dev_B_ptrs, 
sc_mdata.dev_ldb_array, - 0.0, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, sc_mdata.batchsize, - sc_mdata.max_m, sc_mdata.max_n, sc_mdata.max_k, A_gpu.magma_queue); - - scatterGPU_batchDriver( - sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend, - sc_mdata.max_ilen, sc_mdata.max_jlen, sc_mdata.dev_C_ptrs, sc_mdata.dev_ldc_array, - A_gpu.maxSuperSize, ldt, sc_mdata.dev_gpu_lpanels, sc_mdata.dev_gpu_upanels, - dA_gpu, sc_mdata.batchsize, A_gpu.cuStreams[0]); - } - } - } - // printf("SCU batches = %d\n", total_batches); -#endif -} #if 0 //// This is not used anymore ////// @@ -821,7 +730,6 @@ int xLUstruct_t::dsparseTreeFactorBatchGPU( gEtreeInfo_t *gEtreeInfo, // global etree info int tag_ub) { -#if 0// def HAVE_MAGMA int nnodes = sforest->nNodes; // number of nodes in the tree int topoLvl, k_st, k_end, k0, k, offset, ksupc; if (nnodes < 1) @@ -848,8 +756,7 @@ int xLUstruct_t::dsparseTreeFactorBatchGPU( printf("level %d: k_st %d, k_end %d\n", topoLvl, k_st, k_end); fflush(stdout); -#if 0 - //ToDo: make this batched + //ToDo: make this batched -- may use this when MAGMA is not available for (k0 = k_st; k0 < k_end; k0++) { k = perm_c_supno[k0]; @@ -920,40 +827,17 @@ n cudaStreamSynchronize(cuStream); // MPI_Barrier(grid3d->comm); } /* end for k0= k_st:k_end */ } /* end for topoLvl = 0:maxTopoLevel */ -#else - printf("Using batched code\n"); - double t0 = SuperLU_timer_(); - - // Copy the nodelist to the GPU - gpuErrchk(cudaMemcpy(A_gpu.dperm_c_supno, perm_c_supno, sizeof(int) * sforest->nNodes, cudaMemcpyHostToDevice)); - - // Solve level by level - dFactBatchSolve(k_st, k_end, perm_c_supno); - for (topoLvl = 1; topoLvl < maxTopoLevel; ++topoLvl) - { - - k_st = eTreeTopLims[topoLvl]; - k_end = eTreeTopLims[topoLvl + 1]; - dFactBatchSolve(k_st, k_end, perm_c_supno); - } - printf("Batch time = %.3f\n", SuperLU_timer_() - t0); -#endif #if (DEBUGlevel >= 1) CHECK_MALLOC(grid3d->iam, "Exit dsparseTreeFactorBatchGPU()"); #endif -#else - printf("MAGMA is required for batched execution!\n"); - fflush(stdout); - exit(0); - -#endif /* match ifdef have_magma */ - return 0; } /* end dsparseTreeFactorBatchGPU */ #endif /* match #if 0 */ + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/SRC/CplusplusFactor/lupanels_GPU.cuh b/SRC/CplusplusFactor/lupanels_GPU.cuh index 7b650f4b..2da76488 100644 --- a/SRC/CplusplusFactor/lupanels_GPU.cuh +++ b/SRC/CplusplusFactor/lupanels_GPU.cuh @@ -308,86 +308,7 @@ public: } }; -#if 0 -///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// Wajih: Device and host memory used to store marshalled batch data for LU and TRSM -struct LUMarshallData -{ - LUMarshallData(); - ~LUMarshallData(); - - // Diagonal device pointer data - double **dev_diag_ptrs; - int *dev_diag_ld_array, *dev_diag_dim_array, *dev_info_array; - - // TRSM panel device pointer data - double **dev_panel_ptrs; - int *dev_panel_ld_array, *dev_panel_dim_array; - - // Max of marshalled device data - int max_panel, max_diag; - - // Number of marshalled operations - int batchsize; - // Data accumulated on the host - std::vector host_diag_ptrs; - std::vector host_diag_ld_array, host_diag_dim_array; - - std::vector host_panel_ptrs; - std::vector host_panel_ld_array, host_panel_dim_array; - - void setBatchSize(int batch_size); - void setMaxDiag(); - void setMaxPanel(); -}; - -// Wajih: Device and 
host memory used to store marshalled batch data for Schur complement update -struct SCUMarshallData -{ - SCUMarshallData(); - ~SCUMarshallData(); - - // GEMM device pointer data - double **dev_A_ptrs, **dev_B_ptrs, **dev_C_ptrs; - int *dev_lda_array, *dev_ldb_array, *dev_ldc_array; - int *dev_m_array, *dev_n_array, *dev_k_array; - - // Panel device pointer data and scu loop limits - lpanelGPU_t* dev_gpu_lpanels; - upanelGPU_t* dev_gpu_upanels; - int* dev_ist, *dev_iend, *dev_jst, *dev_jend; - int *dev_maxGemmRows, *dev_maxGemmCols; - - // Max of marshalled gemm device data - int max_m, max_n, max_k; - - // Max of marshalled loop limits - int max_ilen, max_jlen; - - // Number of marshalled operations - int batchsize; - - // Data accumulated on the host - std::vector host_A_ptrs, host_B_ptrs, host_C_ptrs; - std::vector host_lda_array, host_ldb_array, host_ldc_array; - std::vector host_m_array, host_n_array, host_k_array; - - // Host data initialized once per level - std::vector upanels; - std::vector lpanels; - std::vector host_gpu_upanels; - std::vector host_gpu_lpanels; - std::vector ist, iend, jst, jend, maxGemmRows, maxGemmCols; - - void setBatchSize(int batch_size); - void setMaxDims(); - void copyPanelDataToGPU(); - void copyToGPU(); -}; - -#endif ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #define MAX_CUDA_STREAMS 64 @@ -469,4 +390,4 @@ void scatterGPU_batchDriver( int max_ilen, int max_jlen, double **gemmBuff_ptrs, int *LDgemmBuff_batch, int maxSuperSize, int ldt, lpanelGPU_t *lpanels, upanelGPU_t *upanels, LUstructGPU_t *dA, int batchCount, cudaStream_t cuStream -); \ No newline at end of file +); diff --git a/SRC/CplusplusFactor/schurCompUpdate.cu b/SRC/CplusplusFactor/schurCompUpdate.cu deleted file mode 100644 index 8c01e15d..00000000 --- a/SRC/CplusplusFactor/schurCompUpdate.cu +++ /dev/null @@ -1,2106 +0,0 @@ -#if 1//def HAVE_CUDA -#include -#include -#include -#include -#include -#include -#include -#include "superlu_ddefs.h" -#include "lupanels_GPU.cuh" -#include "lupanels.hpp" - -cudaError_t checkCudaLocal(cudaError_t result) -{ - // #if defined(DEBUG) || defined(_DEBUG) - // printf("Checking cuda\n"); - if (result != cudaSuccess) - { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); - } - // #endif - return result; -} - -__global__ void indirectCopy(double *dest, double *src, int_t *idx, int n) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) - dest[idx[i]] = src[i]; -} - -/** - * @file schurCompUpdate.cu - * @brief This function copies the packed buffers to GPU and performs the sparse - initialization on GPU call indirectCopy, this is the kernel - * @param gpuValBasePtr is the base pointer of the GPU matrix - * @param valBufferPacked is the packed buffer of the matrix - * @param valIdx is the index of the packed buffer - */ -void copyToGPU(double *gpuValBasePtr, std::vector &valBufferPacked, - std::vector &valIdx) -{ - int nnzCount = valBufferPacked.size(); - // calculate the size of the packed buffers - int_t gpuLvalSizePacked = nnzCount * sizeof(double); - int_t gpuLidxSizePacked = nnzCount * sizeof(int_t); - // allocate the memory for the packed buffers on GPU - double *dlvalPacked; - int_t *dlidxPacked; - gpuErrchk(cudaMalloc(&dlvalPacked, gpuLvalSizePacked)); - gpuErrchk(cudaMalloc(&dlidxPacked, gpuLidxSizePacked)); - // copy the packed buffers from CPU to GPU - 
gpuErrchk(cudaMemcpy(dlvalPacked, valBufferPacked.data(), gpuLvalSizePacked, cudaMemcpyHostToDevice)); - gpuErrchk(cudaMemcpy(dlidxPacked, valIdx.data(), gpuLidxSizePacked, cudaMemcpyHostToDevice)); - // perform the sparse initialization on GPU call indirectCopy - const int ThreadblockSize = 256; - int nThreadBlocks = (nnzCount + ThreadblockSize - 1) / ThreadblockSize; - indirectCopy<<>>( - gpuValBasePtr, dlvalPacked, dlidxPacked, nnzCount); - // wait for it to finish and free dlvalPacked and dlidxPacked - gpuErrchk(cudaDeviceSynchronize()); - gpuErrchk(cudaFree(dlvalPacked)); - gpuErrchk(cudaFree(dlidxPacked)); -} - -// copy the panel to GPU -void copyToGPU_Sparse(double *gpuValBasePtr, double *valBuffer, int_t gpuLvalSize) -{ - // sparse Initialization for GPU, this is the experimental code - // find non-zero elements in the panel, their location and values and copy to GPU - int numDoubles = gpuLvalSize / sizeof(double); - std::vector valBufferPacked; - std::vector valIdx; - for (int_t i = 0; i < numDoubles; i++) - { - if (valBuffer[i] != 0) - { - valBufferPacked.push_back(valBuffer[i]); - valIdx.push_back(i); - } - } - printf("%d non-zero elements in the panel, wrt original=%d\n", valBufferPacked.size(), numDoubles); - // get the size of the packed buffers and allocate memory on GPU - copyToGPU(gpuValBasePtr, valBufferPacked, valIdx); -} - -//#define NDEBUG -__device__ - int_t - lpanelGPU_t::find(int_t k) -{ - int threadId = threadIdx.x; - __shared__ int idx; - __shared__ int found; - if (!threadId) - { - idx = -1; - found = 0; - } - - int nThreads = blockDim.x; - int blocksPerThreads = CEILING(nblocks(), nThreads); - __syncthreads(); - for (int blk = blocksPerThreads * threadIdx.x; - blk < blocksPerThreads * (threadIdx.x + 1); - blk++) - { - // if(found) break; - - if (blk < nblocks()) - { - if (k == gid(blk)) - { - idx = blk; - found = 1; - } - } - } - __syncthreads(); - return idx; -} - -__device__ - int_t - upanelGPU_t::find(int_t k) -{ - int threadId = threadIdx.x; - __shared__ int idx; - __shared__ int found; - if (!threadId) - { - idx = -1; - found = 0; - } - __syncthreads(); - - int nThreads = blockDim.x; - int blocksPerThreads = CEILING(nblocks(), nThreads); - - for (int blk = blocksPerThreads * threadIdx.x; - blk < blocksPerThreads * (threadIdx.x + 1); - blk++) - { - // if(found) break; - - if (blk < nblocks()) - { - if (k == gid(blk)) - { - idx = blk; - found = 1; - } - } - } - __syncthreads(); - return idx; -} - -__device__ int computeIndirectMapGPU(int *rcS2D, int_t srcLen, int_t *srcVec, - int_t dstLen, int_t *dstVec, - int *dstIdx) -{ - int threadId = threadIdx.x; - if (dstVec == NULL) /*uncompressed dimension*/ - { - if (threadId < srcLen) - rcS2D[threadId] = srcVec[threadId]; - __syncthreads(); - return 0; - } - - if (threadId < dstLen) - dstIdx[dstVec[threadId]] = threadId; - __syncthreads(); - - if (threadId < srcLen) - rcS2D[threadId] = dstIdx[srcVec[threadId]]; - __syncthreads(); - - return 0; -} - -__device__ void scatterGPU_dev( - int iSt, int jSt, - double *gemmBuff, int LDgemmBuff, - lpanelGPU_t& lpanel, upanelGPU_t& upanel, - LUstructGPU_t *dA -) -{ - // calculate gi,gj - int ii = iSt + blockIdx.x; - int jj = jSt + blockIdx.y; - int threadId = threadIdx.x; - - int gi = lpanel.gid(ii); - int gj = upanel.gid(jj); -#ifndef NDEBUG - // if (!threadId) - // printf("Scattering to (%d, %d) \n", gi, gj); -#endif - double *Dst; - int_t lddst; - int_t dstRowLen, dstColLen; - int_t *dstRowList; - int_t *dstColList; - int li, lj; - if (gj > gi) // its in upanel - { - li = 
dA->g2lRow(gi); - lj = dA->uPanelVec[li].find(gj); - Dst = dA->uPanelVec[li].blkPtr(lj); - lddst = dA->supersize(gi); - dstRowLen = dA->supersize(gi); - dstRowList = NULL; - dstColLen = dA->uPanelVec[li].nbcol(lj); - dstColList = dA->uPanelVec[li].colList(lj); - } - else - { - lj = dA->g2lCol(gj); - li = dA->lPanelVec[lj].find(gi); - Dst = dA->lPanelVec[lj].blkPtr(li); - lddst = dA->lPanelVec[lj].LDA(); - dstRowLen = dA->lPanelVec[lj].nbrow(li); - dstRowList = dA->lPanelVec[lj].rowList(li); - // if(!threadId ) - // printf("Scattering to (%d, %d) by %d li=%d\n",gi, gj,threadId,li); - dstColLen = dA->supersize(gj); - dstColList = NULL; - } - - // compute source row to dest row mapping - int maxSuperSize = dA->maxSuperSize; - extern __shared__ int baseSharedPtr[]; - int *rowS2D = baseSharedPtr; - int *colS2D = &rowS2D[maxSuperSize]; - int *dstIdx = &colS2D[maxSuperSize]; - - int nrows = lpanel.nbrow(ii); - int ncols = upanel.nbcol(jj); - // lpanel.rowList(ii), upanel.colList(jj) - - computeIndirectMapGPU(rowS2D, nrows, lpanel.rowList(ii), - dstRowLen, dstRowList, dstIdx); - - // compute source col to dest col mapping - computeIndirectMapGPU(colS2D, ncols, upanel.colList(jj), - dstColLen, dstColList, dstIdx); - - int nThreads = blockDim.x; - int colsPerThreadBlock = nThreads / nrows; - - int rowOff = lpanel.stRow(ii) - lpanel.stRow(iSt); - int colOff = upanel.stCol(jj) - upanel.stCol(jSt); - double *Src = &gemmBuff[rowOff + colOff * LDgemmBuff]; - int ldsrc = LDgemmBuff; - // TODO: this seems inefficient - if (threadId < nrows * colsPerThreadBlock) - { - /* 1D threads are logically arranged in 2D shape. */ - int i = threadId % nrows; - int j = threadId / nrows; - -#pragma unroll 4 - while (j < ncols) - { - -#define ATOMIC_SCATTER -// Atomic Scatter is need if I want to perform multiple Schur Complement -// update concurrently -#ifdef ATOMIC_SCATTER - atomicAdd(&Dst[rowS2D[i] + lddst * colS2D[j]], -Src[i + ldsrc * j]); -#else - Dst[rowS2D[i] + lddst * colS2D[j]] -= Src[i + ldsrc * j]; -#endif - j += colsPerThreadBlock; - } - } - - __syncthreads(); -} - -__global__ void scatterGPU( - int iSt, int jSt, - double *gemmBuff, int LDgemmBuff, - lpanelGPU_t lpanel, upanelGPU_t upanel, - LUstructGPU_t *dA) -{ - scatterGPU_dev(iSt, jSt, gemmBuff, LDgemmBuff, lpanel, upanel, dA); -} - -__global__ void scatterGPU_batch( - int* iSt_batch, int *iEnd_batch, int *jSt_batch, int *jEnd_batch, - double **gemmBuff_ptrs, int *LDgemmBuff_batch, lpanelGPU_t *lpanels, - upanelGPU_t *upanels, LUstructGPU_t *dA -) -{ - int batch_index = blockIdx.z; - int iSt = iSt_batch[batch_index], iEnd = iEnd_batch[batch_index]; - int jSt = jSt_batch[batch_index], jEnd = jEnd_batch[batch_index]; - - int ii = iSt + blockIdx.x; - int jj = jSt + blockIdx.y; - if(ii >= iEnd || jj >= jEnd) - return; - - double* gemmBuff = gemmBuff_ptrs[batch_index]; - if(gemmBuff == NULL) - return; - - int LDgemmBuff = LDgemmBuff_batch[batch_index]; - lpanelGPU_t& lpanel = lpanels[batch_index]; - upanelGPU_t& upanel = upanels[batch_index]; - scatterGPU_dev(iSt, jSt, gemmBuff, LDgemmBuff, lpanel, upanel, dA); -} - -void scatterGPU_driver( - int iSt, int iEnd, int jSt, int jEnd, double *gemmBuff, int LDgemmBuff, - int maxSuperSize, int ldt, lpanelGPU_t lpanel, upanelGPU_t upanel, - LUstructGPU_t *dA, cudaStream_t cuStream -) -{ - dim3 dimBlock(ldt); // 1d thread - dim3 dimGrid(iEnd - iSt, jEnd - jSt); - size_t sharedMemorySize = 3 * maxSuperSize * sizeof(int_t); - - scatterGPU<<>>( - iSt, jSt, gemmBuff, LDgemmBuff, lpanel, upanel, dA - ); - - 
gpuErrchk(cudaGetLastError()); -} - -void scatterGPU_batchDriver( - int* iSt_batch, int *iEnd_batch, int *jSt_batch, int *jEnd_batch, - int max_ilen, int max_jlen, double **gemmBuff_ptrs, int *LDgemmBuff_batch, - int maxSuperSize, int ldt, lpanelGPU_t *lpanels, upanelGPU_t *upanels, - LUstructGPU_t *dA, int batchCount, cudaStream_t cuStream -) -{ - const int op_increment = 65535; - - for(int op_start = 0; op_start < batchCount; op_start += op_increment) - { - int batch_size = std::min(op_increment, batchCount - op_start); - - dim3 dimBlock(ldt); // 1d thread - dim3 dimGrid(max_ilen, max_jlen, batch_size); - size_t sharedMemorySize = 3 * maxSuperSize * sizeof(int_t); - - scatterGPU_batch<<>>( - iSt_batch + op_start, iEnd_batch + op_start, jSt_batch + op_start, - jEnd_batch + op_start, gemmBuff_ptrs + op_start, LDgemmBuff_batch + op_start, - lpanels + op_start, upanels + op_start, dA - ); - } -} - -int_t LUstruct_v100::dSchurComplementUpdateGPU( - int streamId, - int_t k, // the k-th panel or supernode - lpanel_t &lpanel, upanel_t &upanel) -{ - - if (lpanel.isEmpty() || upanel.isEmpty()) - return 0; - - int_t st_lb = 0; - if (myrow == krow(k)) - st_lb = 1; - - int_t nlb = lpanel.nblocks(); - int_t nub = upanel.nblocks(); - - int iSt = st_lb; - int iEnd = iSt; - - int nrows = lpanel.stRow(nlb) - lpanel.stRow(st_lb); - int ncols = upanel.nzcols(); - - int maxGemmRows = nrows; - int maxGemmCols = ncols; - // entire gemm doesn't fit in gemm buffer - if (nrows * ncols > A_gpu.gemmBufferSize) - { - int maxGemmOpSize = (int)sqrt(A_gpu.gemmBufferSize); - int numberofRowChunks = (nrows + maxGemmOpSize - 1) / maxGemmOpSize; - maxGemmRows = nrows / numberofRowChunks; - maxGemmCols = A_gpu.gemmBufferSize / maxGemmRows; - /* printf("buffer exceeded! k = %d, st_lb = %d, nlb = %d, nrowsXncols %d, maxGemRows %d, maxGemmCols %d\n", - k, st_lb, nlb, nrows*ncols, maxGemmRows, maxGemmCols);*/ - } - - while (iEnd < nlb) - { - iSt = iEnd; - iEnd = lpanel.getEndBlock(iSt, maxGemmRows); - - assert(iEnd > iSt); - int jSt = 0; - int jEnd = 0; - while (jEnd < nub) - { - jSt = jEnd; - jEnd = upanel.getEndBlock(jSt, maxGemmCols); - assert(jEnd > jSt); - cublasHandle_t handle = A_gpu.cuHandles[streamId]; - cudaStream_t cuStream = A_gpu.cuStreams[streamId]; - cublasSetStream(handle, cuStream); - int gemm_m = lpanel.stRow(iEnd) - lpanel.stRow(iSt); - int gemm_n = upanel.stCol(jEnd) - upanel.stCol(jSt); - int gemm_k = supersize(k); -#if 0 - printf("k = %d, iSt = %d, iEnd = %d, jst = %d, jend = %d\n", k, iSt, iEnd, jSt, jEnd); - printf("m=%d, n=%d, k=%d\n", gemm_m, gemm_n, gemm_k); - fflush(stdout); -#endif - - double alpha = 1.0; - double beta = 0.0; - cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, - gemm_m, gemm_n, gemm_k, &alpha, - lpanel.blkPtrGPU(iSt), lpanel.LDA(), - upanel.blkPtrGPU(jSt), upanel.LDA(), &beta, - A_gpu.gpuGemmBuffs[streamId], gemm_m); - - scatterGPU_driver( - iSt, iEnd, jSt, jEnd, A_gpu.gpuGemmBuffs[streamId], gemm_m, - A_gpu.maxSuperSize, ldt, lpanel.gpuPanel, upanel.gpuPanel, - dA_gpu, cuStream - ); - } - } - gpuErrchk(cudaStreamSynchronize(A_gpu.cuStreams[streamId])); - return 0; -} /* end dSchurComplementUpdateGPU */ - -int_t LUstruct_v100::lookAheadUpdateGPU( - int streamId, - int_t k, int_t laIdx, lpanel_t &lpanel, upanel_t &upanel) -{ - if (lpanel.isEmpty() || upanel.isEmpty()) - return 0; - - int_t st_lb = 0; - if (myrow == krow(k)) - st_lb = 1; - - int_t nlb = lpanel.nblocks(); - int_t nub = upanel.nblocks(); - - int_t laILoc = lpanel.find(laIdx); - int_t laJLoc = upanel.find(laIdx); - - int iSt 
= st_lb; - int jSt = 0; - - /* call look ahead update on Lpanel*/ - if (laJLoc != GLOBAL_BLOCK_NOT_FOUND) - dSchurCompUpdatePartGPU( - iSt, nlb, laJLoc, laJLoc + 1, - k, lpanel, upanel, - A_gpu.lookAheadLHandle[streamId], A_gpu.lookAheadLStream[streamId], - A_gpu.lookAheadLGemmBuffer[streamId]); - - /* call look ahead update on Upanel*/ - if (laILoc != GLOBAL_BLOCK_NOT_FOUND) - { - dSchurCompUpdatePartGPU( - laILoc, laILoc + 1, jSt, laJLoc, - k, lpanel, upanel, - A_gpu.lookAheadUHandle[streamId], A_gpu.lookAheadUStream[streamId], - A_gpu.lookAheadUGemmBuffer[streamId]); - dSchurCompUpdatePartGPU( - laILoc, laILoc + 1, laJLoc + 1, nub, - k, lpanel, upanel, - A_gpu.lookAheadUHandle[streamId], A_gpu.lookAheadUStream[streamId], - A_gpu.lookAheadUGemmBuffer[streamId]); - } - - // checkCudaLocal(cudaStreamSynchronize(A_gpu.lookAheadLStream[streamId])); - // checkCudaLocal(cudaStreamSynchronize(A_gpu.lookAheadUStream[streamId])); - - return 0; -} - -int_t LUstruct_v100::SyncLookAheadUpdate(int streamId) -{ - gpuErrchk(cudaStreamSynchronize(A_gpu.lookAheadLStream[streamId])); - gpuErrchk(cudaStreamSynchronize(A_gpu.lookAheadUStream[streamId])); - - return 0; -} - -int_t LUstruct_v100::dSchurCompUpdateExcludeOneGPU( - int streamId, - int_t k, int_t ex, // suypernodes to be excluded - lpanel_t &lpanel, upanel_t &upanel) -{ - if (lpanel.isEmpty() || upanel.isEmpty()) - return 0; - - int_t st_lb = 0; - if (myrow == krow(k)) - st_lb = 1; - - int_t nlb = lpanel.nblocks(); - int_t nub = upanel.nblocks(); - - int_t exILoc = lpanel.find(ex); - int_t exJLoc = upanel.find(ex); - - dSchurCompUpLimitedMem( - streamId, - st_lb, exILoc, 0, exJLoc, - k, lpanel, upanel); - - dSchurCompUpLimitedMem( - streamId, - st_lb, exILoc, exJLoc + 1, nub, - k, lpanel, upanel); - - int_t nextStI = exILoc + 1; - if (exILoc == GLOBAL_BLOCK_NOT_FOUND) - nextStI = st_lb; - /* - for j we don't need to change since, if exJLoc == GLOBAL_BLOCK_NOT_FOUND =-1 - then exJLoc+1 =0 will work out correctly as starting j - */ - dSchurCompUpLimitedMem( - streamId, - nextStI, nlb, 0, exJLoc, - k, lpanel, upanel); - - dSchurCompUpLimitedMem( - streamId, - nextStI, nlb, exJLoc + 1, nub, - k, lpanel, upanel); - - // checkCudaLocal(cudaStreamSynchronize(A_gpu.cuStreams[streamId])); - return 0; -} - -int_t LUstruct_v100::dSchurCompUpdatePartGPU( - int_t iSt, int_t iEnd, int_t jSt, int_t jEnd, - int_t k, lpanel_t &lpanel, upanel_t &upanel, - cublasHandle_t handle, cudaStream_t cuStream, - double *gemmBuff) -{ - if (iSt >= iEnd || jSt >= jEnd) - return 0; - - cublasSetStream(handle, cuStream); - int gemm_m = lpanel.stRow(iEnd) - lpanel.stRow(iSt); - int gemm_n = upanel.stCol(jEnd) - upanel.stCol(jSt); - int gemm_k = supersize(k); - double alpha = 1.0; - double beta = 0.0; -#ifndef NDEBUG - // printf("m=%d, n=%d, k=%d\n", gemm_m, gemm_n, gemm_k); -#endif - cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, - gemm_m, gemm_n, gemm_k, &alpha, - lpanel.blkPtrGPU(iSt), lpanel.LDA(), - upanel.blkPtrGPU(jSt), upanel.LDA(), &beta, - gemmBuff, gemm_m); - - // setting up scatter - dim3 dimBlock(ldt); // 1d thread - dim3 dimGrid(iEnd - iSt, jEnd - jSt); - size_t sharedMemorySize = 3 * A_gpu.maxSuperSize * sizeof(int_t); - - scatterGPU<<>>( - iSt, jSt, - gemmBuff, gemm_m, - lpanel.gpuPanel, upanel.gpuPanel, dA_gpu); - - return 0; -} - -int_t LUstruct_v100::dSchurCompUpLimitedMem( - int streamId, - int_t lStart, int_t lEnd, - int_t uStart, int_t uEnd, - int_t k, lpanel_t &lpanel, upanel_t &upanel) -{ - - if (lStart >= lEnd || uStart >= uEnd) - return 0; - int iSt = 
lStart; - int iEnd = iSt; - int nrows = lpanel.stRow(lEnd) - lpanel.stRow(lStart); - int ncols = upanel.stCol(uEnd) - upanel.stCol(uStart); - - int maxGemmRows = nrows; - int maxGemmCols = ncols; - // entire gemm doesn't fit in gemm buffer - if (nrows * ncols > A_gpu.gemmBufferSize) - { - int maxGemmOpSize = (int)sqrt(A_gpu.gemmBufferSize); - int numberofRowChunks = (nrows + maxGemmOpSize - 1) / maxGemmOpSize; - maxGemmRows = nrows / numberofRowChunks; - maxGemmCols = A_gpu.gemmBufferSize / maxGemmRows; - } - - while (iEnd < lEnd) - { - iSt = iEnd; - iEnd = lpanel.getEndBlock(iSt, maxGemmRows); - if (iEnd > lEnd) - iEnd = lEnd; - - assert(iEnd > iSt); - int jSt = uStart; - int jEnd = uStart; - while (jEnd < uEnd) - { - jSt = jEnd; - jEnd = upanel.getEndBlock(jSt, maxGemmCols); - if (jEnd > uEnd) - jEnd = uEnd; - - cublasHandle_t handle = A_gpu.cuHandles[streamId]; - cudaStream_t cuStream = A_gpu.cuStreams[streamId]; - dSchurCompUpdatePartGPU(iSt, iEnd, jSt, jEnd, - k, lpanel, upanel, handle, cuStream, A_gpu.gpuGemmBuffs[streamId]); - } - } - - return 0; -} - -int getMPIProcsPerGPU() -{ - if (!(getenv("MPI_PROCESS_PER_GPU"))) - { - return 1; - } - else - { - int devCount; - cudaGetDeviceCount(&devCount); - int envCount = atoi(getenv("MPI_PROCESS_PER_GPU")); - envCount = SUPERLU_MAX(envCount, 1); - printf("MPI_PROCESS_PER_GPU=%d, devCount=%d\n", envCount, devCount); - return SUPERLU_MIN(envCount, devCount); - } -} - -#define USABLE_GPU_MEM_FRACTION 0.9 - -size_t getGPUMemPerProcs(MPI_Comm baseCommunicator) -{ - - size_t mfree, mtotal; - // TODO: shared memory communicator should be part of - // LU struct - // MPI_Comm sharedComm; - // MPI_Comm_split_type(baseCommunicator, MPI_COMM_TYPE_SHARED, - // 0, MPI_INFO_NULL, &sharedComm); - // MPI_Barrier(sharedComm); - cudaMemGetInfo(&mfree, &mtotal); - // MPI_Barrier(sharedComm); - // MPI_Comm_free(&sharedComm); -#if 0 - printf("Total memory %zu & free memory %zu\n", mtotal, mfree); -#endif - return (size_t)(USABLE_GPU_MEM_FRACTION * (double)mfree) / getMPIProcsPerGPU(); -} - -int_t LUstruct_v100::setLUstruct_GPU() -{ - int i, stream; - -#if (DEBUGlevel >= 1) - int iam = 0; - CHECK_MALLOC(iam, "Enter setLUstruct_GPU()"); fflush(stdout); -#endif - - A_gpu.Pr = Pr; - A_gpu.Pc = Pc; - A_gpu.maxSuperSize = ldt; - - /* Sherry: this mapping may be inefficient on Frontier */ - /*Mapping to device*/ - int deviceCount; - cudaGetDeviceCount(&deviceCount); // How many GPUs? 
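/* Editor's aside -- an illustrative sketch, not code from the original file: the
 * MPI-rank-to-GPU mapping computed just below assigns consecutive ranks to the same
 * device.  For example, with MPI_PROCESS_PER_GPU=2 and 4 visible devices,
 * ranks 0,1 -> device 0, ranks 2,3 -> device 1, ranks 4,5 -> device 2. */
static int map_rank_to_device(int iam, int procs_per_gpu)
{
    /* mirrors device_id = iam / get_mpi_process_per_gpu() in setLUstruct_GPU below */
    return iam / procs_per_gpu;   /* e.g. iam = 5, procs_per_gpu = 2 -> device 2 */
}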
- int device_id = (int)(grid3d->iam/get_mpi_process_per_gpu ()); //YL: allow multiple MPIs per GPU - - char *ttemp; - ttemp = getenv ("SUPERLU_BIND_MPI_GPU"); - if (ttemp) { - cudaSetDevice(device_id); - } - - double tRegion[5]; - size_t useableGPUMem = getGPUMemPerProcs(grid3d->comm); - /** - * Memory is divided into two parts data memory and buffer memory - * data memory is used for useful data - * bufferMemory is used for buffers - * */ - size_t memReqData = 0; - - /*Memory for XSUP*/ - memReqData += (nsupers + 1) * sizeof(int_t); - - tRegion[0] = SuperLU_timer_(); - - size_t totalNzvalSize = 0; /* too big for gemmBufferSize */ - size_t max_gemmCsize = 0; /* Sherry added 2/20/2023 */ - size_t max_nzrow = 0; /* Yang added 10/20/2023 */ - size_t max_nzcol = 0; - - /*Memory for lpapenl and upanel Data*/ - for (i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - { - memReqData += lPanelVec[i].totalSize(); - totalNzvalSize += lPanelVec[i].nzvalSize(); - if(lPanelVec[i].nzvalSize()>0) - max_nzrow = SUPERLU_MAX(lPanelVec[i].nzrows(),max_nzrow); - //max_gemmCsize = SUPERoLU_MAX(max_gemmCsize, ???); - } - } - for (i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - memReqData += uPanelVec[i].totalSize(); - totalNzvalSize += uPanelVec[i].nzvalSize(); - if(uPanelVec[i].nzvalSize()>0) - max_nzcol = SUPERLU_MAX(uPanelVec[i].nzcols(),max_nzcol); - } - } - max_gemmCsize = max_nzcol*max_nzrow; - - memReqData += CEILING(nsupers, Pc) * sizeof(lpanelGPU_t); - memReqData += CEILING(nsupers, Pr) * sizeof(upanelGPU_t); - - memReqData += sizeof(LUstructGPU_t); - - // Per stream data - // TODO: estimate based on ancestor size - int_t maxBuffSize = sp_ienv_dist (8, options); - int maxsup = sp_ienv_dist(3, options); // max. supernode size - maxBuffSize = SUPERLU_MAX(maxsup * maxsup, maxBuffSize); // Sherry added 7/10/23 - #if 0 - A_gpu.gemmBufferSize = SUPERLU_MIN(maxBuffSize, totalNzvalSize); - #else - A_gpu.gemmBufferSize = SUPERLU_MIN(maxBuffSize, SUPERLU_MAX(max_gemmCsize,totalNzvalSize)); /* Yang added 10/20/2023 */ - #endif - size_t dataPerStream = 3 * sizeof(double) * maxLvalCount + 3 * sizeof(double) * maxUvalCount + 2 * sizeof(int_t) * maxLidxCount + 2 * sizeof(int_t) * maxUidxCount + A_gpu.gemmBufferSize * sizeof(double) + ldt * ldt * sizeof(double); - if (memReqData + 2 * dataPerStream > useableGPUMem) - { - printf("Not enough memory on GPU: available = %zu, required for 2 streams =%zu, exiting\n", useableGPUMem, memReqData + 2 * dataPerStream); - exit(-1); - } - - tRegion[0] = SuperLU_timer_() - tRegion[0]; -#if ( PRNTlevel>=1 ) - // print the time taken to estimate memory on GPU - if (grid3d->iam == 0) - { - printf("GPU deviceCount=%d\n", deviceCount); - printf("Time taken to estimate memory on GPU: %f\n", tRegion[0]); - printf("\t.. 
totalNzvalSize %ld, gemmBufferSize %ld\n", - (long) totalNzvalSize, (long) A_gpu.gemmBufferSize); - } -#endif - - /*Memory for lapenlPanel Data*/ - tRegion[1] = SuperLU_timer_(); - - int_t maxNumberOfStream = (useableGPUMem - memReqData) / dataPerStream; - - int numberOfStreams = SUPERLU_MIN(getNumLookAhead(options), maxNumberOfStream); - numberOfStreams = SUPERLU_MIN(numberOfStreams, MAX_CUDA_STREAMS); - int rNumberOfStreams; - MPI_Allreduce(&numberOfStreams, &rNumberOfStreams, 1, - MPI_INT, MPI_MIN, grid3d->comm); - A_gpu.numCudaStreams = rNumberOfStreams; - - if (!grid3d->iam) - printf("Using %d CUDA LookAhead streams\n", rNumberOfStreams); - size_t totalMemoryRequired = memReqData + numberOfStreams * dataPerStream; - -#if 0 /**** Old code ****/ - upanelGPU_t *uPanelVec_GPU = new upanelGPU_t[CEILING(nsupers, Pr)]; - lpanelGPU_t *lPanelVec_GPU = new lpanelGPU_t[CEILING(nsupers, Pc)]; - void *gpuBasePtr, *gpuCurrentPtr; - cudaMalloc(&gpuBasePtr, totalMemoryRequired); - gpuCurrentPtr = gpuBasePtr; - - A_gpu.xsup = (int_t *)gpuCurrentPtr; - gpuCurrentPtr = (int_t *)gpuCurrentPtr + (nsupers + 1); - cudaMemcpy(A_gpu.xsup, xsup, (nsupers + 1) * sizeof(int_t), cudaMemcpyHostToDevice); - - for (int i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - { - lPanelVec_GPU[i] = lPanelVec[i].copyToGPU(gpuCurrentPtr); - gpuCurrentPtr = (char *)gpuCurrentPtr + lPanelVec[i].totalSize(); - } - } - A_gpu.lPanelVec = (lpanelGPU_t *)gpuCurrentPtr; - gpuCurrentPtr = (char *)gpuCurrentPtr + CEILING(nsupers, Pc) * sizeof(lpanelGPU_t); - cudaMemcpy(A_gpu.lPanelVec, lPanelVec_GPU, - CEILING(nsupers, Pc) * sizeof(lpanelGPU_t), cudaMemcpyHostToDevice); - - for (int i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - uPanelVec_GPU[i] = uPanelVec[i].copyToGPU(gpuCurrentPtr); - gpuCurrentPtr = (char *)gpuCurrentPtr + uPanelVec[i].totalSize(); - } - } - A_gpu.uPanelVec = (upanelGPU_t *)gpuCurrentPtr; - gpuCurrentPtr = (char *)gpuCurrentPtr + CEILING(nsupers, Pr) * sizeof(upanelGPU_t); - cudaMemcpy(A_gpu.uPanelVec, uPanelVec_GPU, - CEILING(nsupers, Pr) * sizeof(upanelGPU_t), cudaMemcpyHostToDevice); - - for (int stream = 0; stream < A_gpu.numCudaStreams; stream++) - { - - cudaStreamCreate(&A_gpu.cuStreams[stream]); - cublasCreate(&A_gpu.cuHandles[stream]); - A_gpu.LvalRecvBufs[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + maxLvalCount; - A_gpu.UvalRecvBufs[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + maxUvalCount; - A_gpu.LidxRecvBufs[stream] = (int_t *)gpuCurrentPtr; - gpuCurrentPtr = (int_t *)gpuCurrentPtr + maxLidxCount; - A_gpu.UidxRecvBufs[stream] = (int_t *)gpuCurrentPtr; - gpuCurrentPtr = (int_t *)gpuCurrentPtr + maxUidxCount; - - A_gpu.gpuGemmBuffs[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + A_gpu.gemmBufferSize; - A_gpu.dFBufs[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + ldt * ldt; - - /*lookAhead buffers and stream*/ - cublasCreate(&A_gpu.lookAheadLHandle[stream]); - cudaStreamCreate(&A_gpu.lookAheadLStream[stream]); - A_gpu.lookAheadLGemmBuffer[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + maxLvalCount; - cublasCreate(&A_gpu.lookAheadUHandle[stream]); - cudaStreamCreate(&A_gpu.lookAheadUStream[stream]); - A_gpu.lookAheadUGemmBuffer[stream] = (double *)gpuCurrentPtr; - gpuCurrentPtr = (double *)gpuCurrentPtr + 
maxUvalCount; - } - // cudaCheckError(); - // allocate - dA_gpu = (LUstructGPU_t *)gpuCurrentPtr; - - cudaMemcpy(dA_gpu, &A_gpu, sizeof(LUstructGPU_t), cudaMemcpyHostToDevice); - gpuCurrentPtr = (LUstructGPU_t *)gpuCurrentPtr + 1; - -#else /* else of #if 0 ----> this is the current active code - Sherry */ - gpuErrchk(cudaMalloc(&A_gpu.xsup, (nsupers + 1) * sizeof(int_t))); - gpuErrchk(cudaMemcpy(A_gpu.xsup, xsup, (nsupers + 1) * sizeof(int_t), cudaMemcpyHostToDevice)); - - double tLsend, tUsend; -#if 0 - tLsend = SuperLU_timer_(); - upanelGPU_t *uPanelVec_GPU = copyUpanelsToGPU(); - tLsend = SuperLU_timer_() - tLsend; - tUsend = SuperLU_timer_(); - lpanelGPU_t *lPanelVec_GPU = copyLpanelsToGPU(); - tUsend = SuperLU_timer_() - tUsend; -#else - upanelGPU_t *uPanelVec_GPU = new upanelGPU_t[CEILING(nsupers, Pr)]; - lpanelGPU_t *lPanelVec_GPU = new lpanelGPU_t[CEILING(nsupers, Pc)]; - tLsend = SuperLU_timer_(); - for (i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - lPanelVec_GPU[i] = lPanelVec[i].copyToGPU(); - } - tLsend = SuperLU_timer_() - tLsend; - tUsend = SuperLU_timer_(); - // cudaCheckError(); - for (i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - uPanelVec_GPU[i] = uPanelVec[i].copyToGPU(); - } - tUsend = SuperLU_timer_() - tUsend; -#endif - tRegion[1] = SuperLU_timer_() - tRegion[1]; - printf("TRegion L,U send: \t %g\n", tRegion[1]); - printf("Time to send Lpanel=%g and U panels =%g \n", tLsend, tUsend); - fflush(stdout); - - gpuErrchk(cudaMalloc(&A_gpu.lPanelVec, CEILING(nsupers, Pc) * sizeof(lpanelGPU_t))); - gpuErrchk(cudaMemcpy(A_gpu.lPanelVec, lPanelVec_GPU, - CEILING(nsupers, Pc) * sizeof(lpanelGPU_t), cudaMemcpyHostToDevice)); - gpuErrchk(cudaMalloc(&A_gpu.uPanelVec, CEILING(nsupers, Pr) * sizeof(upanelGPU_t))); - gpuErrchk(cudaMemcpy(A_gpu.uPanelVec, uPanelVec_GPU, - CEILING(nsupers, Pr) * sizeof(upanelGPU_t), cudaMemcpyHostToDevice)); - - delete [] uPanelVec_GPU; - delete [] lPanelVec_GPU; - - tRegion[2] = SuperLU_timer_(); - int dfactBufSize = 0; - // TODO: does it work with NULL pointer? - cusolverDnHandle_t cusolverH = NULL; - cusolverDnCreate(&cusolverH); - - cusolverDnDgetrf_bufferSize(cusolverH, ldt, ldt, NULL, ldt, &dfactBufSize); - - cusolverDnDestroy(cusolverH); - printf("Size of dfactBuf is %d\n", dfactBufSize); - tRegion[2] = SuperLU_timer_() - tRegion[2]; - printf("TRegion dfactBuf: \t %g\n", tRegion[2]); - fflush(stdout); - - tRegion[3] = SuperLU_timer_(); - - double tcuMalloc=SuperLU_timer_(); - - /* Sherry: where are these freed ?? 
*/ - for (stream = 0; stream < A_gpu.numCudaStreams; stream++) - { - gpuErrchk(cudaMalloc(&A_gpu.LvalRecvBufs[stream], sizeof(double) * maxLvalCount)); - gpuErrchk(cudaMalloc(&A_gpu.UvalRecvBufs[stream], sizeof(double) * maxUvalCount)); - gpuErrchk(cudaMalloc(&A_gpu.LidxRecvBufs[stream], sizeof(int_t) * maxLidxCount)); - gpuErrchk(cudaMalloc(&A_gpu.UidxRecvBufs[stream], sizeof(int_t) * maxUidxCount)); - // allocate the space for diagonal factor on GPU - gpuErrchk(cudaMalloc(&A_gpu.diagFactWork[stream], sizeof(double) * dfactBufSize)); - gpuErrchk(cudaMalloc(&A_gpu.diagFactInfo[stream], sizeof(int))); - - /*lookAhead buffers and stream*/ - gpuErrchk(cudaMalloc(&A_gpu.lookAheadLGemmBuffer[stream], sizeof(double) * maxLvalCount)); - - gpuErrchk(cudaMalloc(&A_gpu.lookAheadUGemmBuffer[stream], sizeof(double) * maxUvalCount)); - // Sherry: replace this by new code - //cudaMalloc(&A_gpu.dFBufs[stream], ldt * ldt * sizeof(double)); - //cudaMalloc(&A_gpu.gpuGemmBuffs[stream], A_gpu.gemmBufferSize * sizeof(double)); - } - - /* Sherry: dfBufs[] changed to double pointer **, max(batch, numCudaStreams) */ - int mxLeafNode = trf3Dpartition->mxLeafNode, mx_fsize = 0; - - /* Compute gemmCsize[] for batch operations - !!!!!!! WARNING: this only works for 1 MPI !!!!!! */ - if ( options->batchCount > 0 ) { - trf3Dpartition->gemmCsizes = int32Calloc_dist(mxLeafNode); - int k, k0, k_st, k_end, offset, Csize; - - for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { /* Loop through the Pz tree levels */ - int treeId = trf3Dpartition->myTreeIdxs[ilvl]; - sForest_t* sforest = trf3Dpartition->sForests[treeId]; - if (sforest){ - int_t *perm_c_supno = sforest->nodeList ; - mx_fsize = max((int_t)mx_fsize, sforest->nNodes); - - int maxTopoLevel = sforest->topoInfo.numLvl;/* number of levels at each outer-tree node */ - for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) { - k_st = sforest->topoInfo.eTreeTopLims[topoLvl]; - k_end = sforest->topoInfo.eTreeTopLims[topoLvl + 1]; - - for (k0 = k_st; k0 < k_end; ++k0) { - offset = k0 - k_st; - k = perm_c_supno[k0]; - Csize = lPanelVec[k].nzrows() * uPanelVec[k].nzcols(); - trf3Dpartition->gemmCsizes[offset] = - SUPERLU_MAX(trf3Dpartition->gemmCsizes[offset], Csize); - } - } - } - } - } - - int num_dfbufs; /* number of diagonal buffers */ - if ( options->batchCount > 0 ) { /* use batch code */ - num_dfbufs = mxLeafNode; - } else { /* use pipelined code */ - // num_dfbufs = MAX_CUDA_STREAMS; // - num_dfbufs = A_gpu.numCudaStreams; - } - int num_gemmbufs = num_dfbufs; - printf(".. 
setLUstrut_GPU: num_dfbufs %d, num_gemmbufs %d\n", num_dfbufs, num_gemmbufs); fflush(stdout); - - A_gpu.dFBufs = (double **) SUPERLU_MALLOC(num_dfbufs * sizeof(double *)); - A_gpu.gpuGemmBuffs = (double **) SUPERLU_MALLOC(num_gemmbufs * sizeof(double *)); - - int l, sum_diag_size = 0, sum_gemmC_size = 0; - - if ( options->batchCount > 0 ) { /* set up variable-size buffers for batch code */ - for (i = 0; i < num_dfbufs; ++i) { - l = trf3Dpartition->diagDims[i]; - gpuErrchk(cudaMalloc(&(A_gpu.dFBufs[i]), l * l * sizeof(double))); - //printf("\t diagDims[%d] %d\n", i, l); - gpuErrchk(cudaMalloc(&(A_gpu.gpuGemmBuffs[i]), trf3Dpartition->gemmCsizes[i] * sizeof(double))); - sum_diag_size += l * l; - sum_gemmC_size += trf3Dpartition->gemmCsizes[i]; - } - } else { /* uniform-size buffers */ - l = ldt * ldt; - for (i = 0; i < num_dfbufs; ++i) { - gpuErrchk(cudaMalloc(&(A_gpu.dFBufs[i]), l * sizeof(double))); - gpuErrchk(cudaMalloc(&(A_gpu.gpuGemmBuffs[i]), A_gpu.gemmBufferSize * sizeof(double))); - } - } - - // Wajih: Adding allocation for batched LU and SCU marshalled data - // TODO: these are serialized workspaces, so the allocations can be shared - -#if 0 // old batch code - A_gpu.marshall_data.setBatchSize(num_dfbufs); - A_gpu.sc_marshall_data.setBatchSize(num_dfbufs); -#endif - - // TODO: where should these be freed? - // Allocate GPU copy for the node list - gpuErrchk(cudaMalloc(&(A_gpu.dperm_c_supno), sizeof(int) * mx_fsize)); - // Allocate GPU copy of all the gemm buffer pointers and copy the host array to the GPU - gpuErrchk(cudaMalloc(&(A_gpu.dgpuGemmBuffs), sizeof(double*) * num_gemmbufs)); - gpuErrchk(cudaMemcpy(A_gpu.dgpuGemmBuffs, A_gpu.gpuGemmBuffs, sizeof(double*) * num_gemmbufs, cudaMemcpyHostToDevice)); - - tcuMalloc = SuperLU_timer_() - tcuMalloc; -#if ( PRNTlevel>=1 ) - printf("Time to allocate GPU memory: %g\n", tcuMalloc); - printf("\t.. 
sum_diag_size %d\t sum_gemmC_size %d\n", sum_diag_size, sum_gemmC_size); - fflush(stdout); -#endif - - double tcuStream=SuperLU_timer_(); - - for (stream = 0; stream < A_gpu.numCudaStreams; stream++) - { - // cublasCreate(&A_gpu.cuHandles[stream]); - cusolverDnCreate(&A_gpu.cuSolveHandles[stream]); - } - tcuStream = SuperLU_timer_() - tcuStream; - printf("Time to create cublas streams: %g\n", tcuStream); - - double tcuStreamCreate=SuperLU_timer_(); - for (stream = 0; stream < A_gpu.numCudaStreams; stream++) - { - cudaStreamCreate(&A_gpu.cuStreams[stream]); - cublasCreate(&A_gpu.cuHandles[stream]); - /*lookAhead buffers and stream*/ - cublasCreate(&A_gpu.lookAheadLHandle[stream]); - cudaStreamCreate(&A_gpu.lookAheadLStream[stream]); - cublasCreate(&A_gpu.lookAheadUHandle[stream]); - cudaStreamCreate(&A_gpu.lookAheadUStream[stream]); - - // Wajih: Need to create at least one magma queue -#ifdef HAVE_MAGMA - if(stream == 0) - { - magma_queue_create_from_cuda( - device_id, A_gpu.cuStreams[stream], A_gpu.cuHandles[stream], - NULL, &A_gpu.magma_queue - ); - } -#endif - } - tcuStreamCreate = SuperLU_timer_() - tcuStreamCreate; - printf("Time to create CUDA streams: %g\n", tcuStreamCreate); - - tRegion[3] = SuperLU_timer_() - tRegion[3]; - printf("TRegion stream: \t %g\n", tRegion[3]); - // allocate - gpuErrchk(cudaMalloc(&dA_gpu, sizeof(LUstructGPU_t))); - gpuErrchk(cudaMemcpy(dA_gpu, &A_gpu, sizeof(LUstructGPU_t), cudaMemcpyHostToDevice)); - -#endif - - // cudaCheckError(); - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Exit setLUstruct_GPU()"); -#endif - return 0; -} /* setLUstruct_GPU */ - -int_t LUstruct_v100::copyLUGPUtoHost() -{ - - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - lPanelVec[i].copyFromGPU(); - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - uPanelVec[i].copyFromGPU(); - return 0; -} - -int_t LUstruct_v100::copyLUHosttoGPU() -{ - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - lPanelVec[i].copyBackToGPU(); - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - uPanelVec[i].copyBackToGPU(); - return 0; -} - -int_t LUstruct_v100::checkGPU() -{ - - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - lPanelVec[i].checkGPU(); - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - uPanelVec[i].checkGPU(); - - std::cout << "Checking LU struct completed succesfully" - << "\n"; - return 0; -} - - -#if 0 // Sherry: old batch code -////////////////////////////////////////////////////////////////////////////////////////////////////////// - -// Marshall Functors for batched execution -struct MarshallLUFunc { - int k_st, *ld_batch, *dim_batch; - double** diag_ptrs; - LUstructGPU_t *A_gpu; - - MarshallLUFunc(int k_st, double** diag_ptrs, int *ld_batch, int *dim_batch, LUstructGPU_t *A_gpu) - { - this->k_st = k_st; - this->ld_batch = ld_batch; - this->dim_batch = dim_batch; - this->diag_ptrs = diag_ptrs; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - int_t* xsup = A_gpu->xsup; - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - - diag_ptrs[i] = lpanel.blkPtr(0); - ld_batch[i] = lpanel.LDA(); - dim_batch[i] = SuperSize(k); - } -}; - -struct MarshallTRSMUFunc { - int k_st, *diag_ld_batch, *diag_dim_batch, 
*panel_ld_batch, *panel_dim_batch; - double** diag_ptrs, **panel_ptrs; - LUstructGPU_t *A_gpu; - - MarshallTRSMUFunc( - int k_st, double** diag_ptrs, int *diag_ld_batch, int *diag_dim_batch, double** panel_ptrs, - int *panel_ld_batch, int *panel_dim_batch, LUstructGPU_t *A_gpu - ) - { - this->k_st = k_st; - this->diag_ptrs = diag_ptrs; - this->diag_ld_batch = diag_ld_batch; - this->diag_dim_batch = diag_dim_batch; - this->panel_ptrs = panel_ptrs; - this->panel_ld_batch = panel_ld_batch; - this->panel_dim_batch = panel_dim_batch; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - int_t* xsup = A_gpu->xsup; - int ksupc = SuperSize(k); - - upanelGPU_t &upanel = A_gpu->uPanelVec[A_gpu->g2lRow(k)]; - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - - if(!upanel.isEmpty()) - { - panel_ptrs[i] = upanel.blkPtr(0); - panel_ld_batch[i] = upanel.LDA(); - panel_dim_batch[i] = upanel.nzcols(); - diag_ptrs[i] = lpanel.blkPtr(0); - diag_ld_batch[i] = lpanel.LDA(); - diag_dim_batch[i] = ksupc; - } - else - { - panel_ptrs[i] = diag_ptrs[i] = NULL; - panel_ld_batch[i] = diag_ld_batch[i] = 1; - panel_dim_batch[i] = diag_dim_batch[i] = 0; - } - } -}; - -struct MarshallTRSMLFunc { - int k_st, *diag_ld_batch, *diag_dim_batch, *panel_ld_batch, *panel_dim_batch; - double** diag_ptrs, **panel_ptrs; - LUstructGPU_t *A_gpu; - - MarshallTRSMLFunc( - int k_st, double** diag_ptrs, int *diag_ld_batch, int *diag_dim_batch, double** panel_ptrs, - int *panel_ld_batch, int *panel_dim_batch, LUstructGPU_t *A_gpu - ) - { - this->k_st = k_st; - this->diag_ptrs = diag_ptrs; - this->diag_ld_batch = diag_ld_batch; - this->diag_dim_batch = diag_dim_batch; - this->panel_ptrs = panel_ptrs; - this->panel_ld_batch = panel_ld_batch; - this->panel_dim_batch = panel_dim_batch; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - int_t* xsup = A_gpu->xsup; - int ksupc = SuperSize(k); - - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - - if(!lpanel.isEmpty()) - { - double *lPanelStPtr = lpanel.blkPtr(0); - int_t len = lpanel.nzrows(); - if(lpanel.haveDiag()) - { - lPanelStPtr = lpanel.blkPtr(1); - len -= lpanel.nbrow(0); - } - panel_ptrs[i] = lPanelStPtr; - panel_ld_batch[i] = lpanel.LDA(); - panel_dim_batch[i] = len; - diag_ptrs[i] = lpanel.blkPtr(0); - diag_ld_batch[i] = lpanel.LDA(); - diag_dim_batch[i] = ksupc; - } - else - { - panel_ptrs[i] = diag_ptrs[i] = NULL; - panel_ld_batch[i] = diag_ld_batch[i] = 1; - panel_dim_batch[i] = diag_dim_batch[i] = 0; - } - } -}; - -struct MarshallInitSCUFunc { - int k_st, *ist, *iend, *maxGemmRows, *maxGemmCols; - lpanelGPU_t* lpanels; - upanelGPU_t* upanels; - LUstructGPU_t *A_gpu; - - MarshallInitSCUFunc( - int k_st, int *ist, int *iend, int *maxGemmRows, int *maxGemmCols, - lpanelGPU_t* lpanels, upanelGPU_t* upanels, LUstructGPU_t *A_gpu - ) - { - this->k_st = k_st; - this->ist = ist; - this->iend = iend; - this->maxGemmRows = maxGemmRows; - this->maxGemmCols = maxGemmCols; - this->lpanels = lpanels; - this->upanels = upanels; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - size_t gemmBufferSize = A_gpu->gemmBufferSize; - - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - upanelGPU_t &upanel = A_gpu->uPanelVec[A_gpu->g2lCol(k)]; - - lpanels[i] = lpanel; - upanels[i] = upanel; - - if(!upanel.isEmpty() && 
!lpanel.isEmpty()) - { - int_t st_lb = 1; - int_t nlb = lpanel.nblocks(); - int_t nub = upanel.nblocks(); - - ist[i] = iend[i] = st_lb; - int nrows = lpanel.stRow(nlb) - lpanel.stRow(st_lb); - int ncols = upanel.nzcols(); - - int max_rows = nrows; - int max_cols = ncols; - // entire gemm doesn't fit in gemm buffer - if (nrows * ncols > gemmBufferSize) - { - int maxGemmOpSize = (int)sqrt((double)gemmBufferSize); - int numberofRowChunks = (nrows + maxGemmOpSize - 1) / maxGemmOpSize; - max_rows = nrows / numberofRowChunks; - max_cols = gemmBufferSize / max_rows; - } - - maxGemmRows[i] = max_rows; - maxGemmCols[i] = max_cols; - } - else - { - ist[i] = iend[i] = 0; - maxGemmRows[i] = maxGemmCols[i] = 0; - } - } -}; - -struct MarshallSCUOuter_Predicate -{ - __host__ __device__ bool operator()(const int &x) - { - return x == 1; - } -}; - -struct MarshallSCUOuterFunc { - int k_st, *ist, *iend, *jst, *jend, *maxGemmRows, *done_flags; - LUstructGPU_t *A_gpu; - - MarshallSCUOuterFunc(int k_st, int *ist, int *iend, int *jst, int *jend, int *maxGemmRows, int* done_flags, LUstructGPU_t *A_gpu) - { - this->k_st = k_st; - this->ist = ist; - this->iend = iend; - this->jst = jst; - this->jend = jend; - this->maxGemmRows = maxGemmRows; - this->done_flags = done_flags; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - upanelGPU_t &upanel = A_gpu->uPanelVec[A_gpu->g2lCol(k)]; - int& iEnd = iend[i]; - - // Not done if even one operation still has work to do - if(!lpanel.isEmpty() && !upanel.isEmpty() && iEnd < lpanel.nblocks()) - { - int& iSt = ist[i]; - iSt = iEnd; - iEnd = lpanel.getEndBlock(iSt, maxGemmRows[i]); - assert(iEnd > iSt); - jst[i] = jend[i] = 0; - done_flags[i] = 0; - } - else - { - done_flags[i] = 1; - } - } -}; - -struct MarshallSCUInner_Predicate -{ - __host__ __device__ bool operator()(const int &x) - { - return x == 0; - } -}; - -template -struct element_diff : public thrust::unary_function -{ - T* st, *end; - element_diff(T* st, T *end) - { - this->st = st; - this->end = end; - } - - __device__ T operator()(const T &x) const - { - return end[x] - st[x]; - } -}; - - -struct MarshallSCUInnerFunc { - int k_st, *ist, *iend, *jst, *jend, *maxGemmCols; - LUstructGPU_t *A_gpu; - double** A_ptrs, **B_ptrs, **C_ptrs; - int* lda_array, *ldb_array, *ldc_array, *m_array, *n_array, *k_array; - - MarshallSCUInnerFunc( - int k_st, int *ist, int *iend, int *jst, int *jend, int *maxGemmCols, - double** A_ptrs, int* lda_array, double** B_ptrs, int* ldb_array, double **C_ptrs, int *ldc_array, - int *m_array, int *n_array, int *k_array, LUstructGPU_t *A_gpu - ) - { - this->k_st = k_st; - this->ist = ist; - this->iend = iend; - this->jst = jst; - this->jend = jend; - this->maxGemmCols = maxGemmCols; - this->A_ptrs = A_ptrs; - this->B_ptrs = B_ptrs; - this->C_ptrs = C_ptrs; - this->lda_array = lda_array; - this->ldb_array = ldb_array; - this->ldc_array = ldc_array; - this->m_array = m_array; - this->n_array = n_array; - this->k_array = k_array; - this->A_gpu = A_gpu; - } - - __device__ void operator()(const unsigned int &i) const - { - int k = A_gpu->dperm_c_supno[k_st + i]; - int_t* xsup = A_gpu->xsup; - lpanelGPU_t &lpanel = A_gpu->lPanelVec[A_gpu->g2lCol(k)]; - upanelGPU_t &upanel = A_gpu->uPanelVec[A_gpu->g2lCol(k)]; - - int iSt = ist[i], iEnd = iend[i]; - int& jSt = jst[i], &jEnd = jend[i]; - - // Not done if even one operation still has work to do - 
if(!lpanel.isEmpty() && !upanel.isEmpty() && jEnd < upanel.nblocks()) - { - jSt = jEnd; - jEnd = upanel.getEndBlock(jSt, maxGemmCols[i]); - assert(jEnd > jSt); - - A_ptrs[i] = lpanel.blkPtr(iSt); - B_ptrs[i] = upanel.blkPtr(jSt); - C_ptrs[i] = A_gpu->dgpuGemmBuffs[i]; - - lda_array[i] = lpanel.LDA(); - ldb_array[i] = upanel.LDA(); - ldc_array[i] = lpanel.stRow(iEnd) - lpanel.stRow(iSt); - - m_array[i] = ldc_array[i]; - n_array[i] = upanel.stCol(jEnd) - upanel.stCol(jSt); - k_array[i] = SuperSize(k); - } - else - { - A_ptrs[i] = B_ptrs[i] = C_ptrs[i] = NULL; - lda_array[i] = ldb_array[i] = ldc_array[i] = 1; - m_array[i] = n_array[i] = k_array[i] = 0; - } - } -}; - -// Marshalling routines for batched execution -void LUstruct_v100::marshallBatchedLUData(int k_st, int k_end, int_t *perm_c_supno) -{ - // First gather up all the pointer and meta data on the host - LUMarshallData& mdata = A_gpu.marshall_data; - mdata.batchsize = k_end - k_st; - - MarshallLUFunc func(k_st, mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_diag_dim_array, dA_gpu); - - thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(mdata.batchsize), func - ); - - // double **diag_ptrs = mdata.host_diag_ptrs.data(); - // int *ld_batch = mdata.host_diag_ld_array.data(); - // int *dim_batch = mdata.host_diag_dim_array.data(); - - // mdata.batchsize = 0; - - // for (int_t k0 = k_st; k0 < k_end; k0++) - // { - // int_t k = perm_c_supno[k0]; - - // if (iam == procIJ(k, k)) - // { - // assert(mdata.batchsize < mdata.host_diag_ptrs.size()); - - // lpanel_t &lpanel = lPanelVec[g2lCol(k)]; - // diag_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0); - // ld_batch[mdata.batchsize] = lpanel.LDA(); - // dim_batch[mdata.batchsize] = SuperSize(k); - // mdata.batchsize++; - // } - // } - - // mdata.setMaxDiag(); - // // Then copy the marshalled data over to the GPU - // gpuErrchk(cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_diag_ld_array, ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_diag_dim_array, dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); -} - -void LUstruct_v100::marshallBatchedTRSMUData(int k_st, int k_end, int_t *perm_c_supno) -{ - // First gather up all the pointer and meta data on the host - LUMarshallData& mdata = A_gpu.marshall_data; - mdata.batchsize = k_end - k_st; - - MarshallTRSMUFunc func( - k_st, mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_diag_dim_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, mdata.dev_panel_dim_array, dA_gpu - ); - - thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(mdata.batchsize), func - ); - - // double **panel_ptrs = mdata.host_panel_ptrs.data(); - // int *panel_ld_batch = mdata.host_panel_ld_array.data(); - // int *panel_dim_batch = mdata.host_panel_dim_array.data(); - // double **diag_ptrs = mdata.host_diag_ptrs.data(); - // int *diag_ld_batch = mdata.host_diag_ld_array.data(); - // int *diag_dim_batch = mdata.host_diag_dim_array.data(); - - // mdata.batchsize = 0; - - // for (int_t k0 = k_st; k0 < k_end; k0++) - // { - // int_t k = perm_c_supno[k0]; - // int_t buffer_offset = k0 - k_st; - // int ksupc = SuperSize(k); - - // if (myrow == krow(k)) - // { - // upanel_t& upanel = uPanelVec[g2lRow(k)]; - // lpanel_t &lpanel = lPanelVec[g2lCol(k)]; - // if(!upanel.isEmpty()) - // { - // 
assert(mdata.batchsize < mdata.host_diag_ptrs.size()); - - // panel_ptrs[mdata.batchsize] = upanel.blkPtrGPU(0); - // panel_ld_batch[mdata.batchsize] = upanel.LDA(); - // panel_dim_batch[mdata.batchsize] = upanel.nzcols(); - - // // Hackathon change: using the original diagonal block instead of the bcast buffer - // // diag_ptrs[mdata.batchsize] = A_gpu.dFBufs[buffer_offset]; - // // diag_ld_batch[mdata.batchsize] = ksupc; - // // diag_dim_batch[mdata.batchsize] = ksupc; - // diag_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0); - // diag_ld_batch[mdata.batchsize] = lpanel.LDA(); - // diag_dim_batch[mdata.batchsize] = ksupc; - - // mdata.batchsize++; - // } - // } - // } - - // mdata.setMaxDiag(); - // mdata.setMaxPanel(); - - // // Then copy the marshalled data over to the GPU - // gpuErrchk(cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_diag_ld_array, diag_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_diag_dim_array, diag_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_panel_ptrs, panel_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_panel_ld_array, panel_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); - // gpuErrchk(cudaMemcpy(mdata.dev_panel_dim_array, panel_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice)); -} - -void LUstruct_v100::marshallBatchedTRSMLData(int k_st, int k_end, int_t *perm_c_supno) -{ - // First gather up all the pointer and meta data on the host - LUMarshallData& mdata = A_gpu.marshall_data; - mdata.batchsize = k_end - k_st; - - MarshallTRSMLFunc func( - k_st, mdata.dev_diag_ptrs, mdata.dev_diag_ld_array, mdata.dev_diag_dim_array, - mdata.dev_panel_ptrs, mdata.dev_panel_ld_array, mdata.dev_panel_dim_array, dA_gpu - ); - - thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(mdata.batchsize), func - ); - - // double **panel_ptrs = mdata.host_panel_ptrs.data(); - // int *panel_ld_batch = mdata.host_panel_ld_array.data(); - // int *panel_dim_batch = mdata.host_panel_dim_array.data(); - // double **diag_ptrs = mdata.host_diag_ptrs.data(); - // int *diag_ld_batch = mdata.host_diag_ld_array.data(); - // int *diag_dim_batch = mdata.host_diag_dim_array.data(); - - // mdata.batchsize = 0; - - // for (int_t k0 = k_st; k0 < k_end; k0++) - // { - // int_t k = perm_c_supno[k0]; - // int_t buffer_offset = k0 - k_st; - // int ksupc = SuperSize(k); - - // if (mycol == kcol(k)) - // { - // lpanel_t &lpanel = lPanelVec[g2lCol(k)]; - // if(!lpanel.isEmpty()) - // { - // assert(mdata.batchsize < mdata.host_diag_ptrs.size()); - - // double *lPanelStPtr = lpanel.blkPtrGPU(0); - // int_t len = lpanel.nzrows(); - // if(lpanel.haveDiag()) - // { - // /* code */ - // lPanelStPtr = lpanel.blkPtrGPU(1); - // len -= lpanel.nbrow(0); - // } - // panel_ptrs[mdata.batchsize] = lPanelStPtr; - // panel_ld_batch[mdata.batchsize] = lpanel.LDA(); - // panel_dim_batch[mdata.batchsize] = len; - - // // Hackathon change: using the original diagonal block instead of the bcast buffer - // // diag_ptrs[mdata.batchsize] = A_gpu.dFBufs[buffer_offset]; - // // diag_ld_batch[mdata.batchsize] = ksupc; - // // diag_dim_batch[mdata.batchsize] = ksupc; - // diag_ptrs[mdata.batchsize] = lpanel.blkPtrGPU(0); - // diag_ld_batch[mdata.batchsize] = lpanel.LDA(); - // 
diag_dim_batch[mdata.batchsize] = ksupc; - - // mdata.batchsize++; - // } - // } - // } - - // mdata.setMaxDiag(); - // mdata.setMaxPanel(); - - // // Then copy the marshalled data over to the GPU - // cudaMemcpy(mdata.dev_diag_ptrs, diag_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_diag_ld_array, diag_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_diag_dim_array, diag_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_panel_ptrs, panel_ptrs, mdata.batchsize * sizeof(double*), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_panel_ld_array, panel_ld_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice); - // cudaMemcpy(mdata.dev_panel_dim_array, panel_dim_batch, mdata.batchsize * sizeof(int), cudaMemcpyHostToDevice); -} - -void LUstruct_v100::initSCUMarshallData(int k_st, int k_end, int_t *perm_c_supno) -{ - SCUMarshallData& sc_mdata = A_gpu.sc_marshall_data; - sc_mdata.batchsize = k_end - k_st; - - MarshallInitSCUFunc func( - k_st, sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_maxGemmRows, sc_mdata.dev_maxGemmCols, - sc_mdata.dev_gpu_lpanels, sc_mdata.dev_gpu_upanels, dA_gpu - ); - - thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(sc_mdata.batchsize), func - ); - - // for (int_t k0 = k_st; k0 < k_end; k0++) - // { - // int_t k = perm_c_supno[k0]; - // int_t buffer_offset = k0 - k_st; - - // assert(buffer_offset < sc_mdata.upanels.size()); - - // // Wajih: TODO: figure out what this offset does - // int offset = 0; - // if (UidxSendCounts[k] > 0 && LidxSendCounts[k] > 0) - // { - // sc_mdata.upanels[buffer_offset] = getKUpanel(k, offset); - // sc_mdata.lpanels[buffer_offset] = getKLpanel(k, offset); - - // // Set gemm loop parameters for the panels - // upanel_t& upanel = sc_mdata.upanels[buffer_offset]; - // lpanel_t& lpanel = sc_mdata.lpanels[buffer_offset]; - - // sc_mdata.host_gpu_upanels[buffer_offset] = upanel.gpuPanel; - // sc_mdata.host_gpu_lpanels[buffer_offset] = lpanel.gpuPanel; - - // if(!upanel.isEmpty() && !lpanel.isEmpty()) - // { - // int_t st_lb = 0; - // if (myrow == krow(k)) - // st_lb = 1; - - // int_t nlb = lpanel.nblocks(); - // int_t nub = upanel.nblocks(); - - // sc_mdata.ist[buffer_offset] = st_lb; - // sc_mdata.iend[buffer_offset] = sc_mdata.ist[buffer_offset]; - - // int nrows = lpanel.stRow(nlb) - lpanel.stRow(st_lb); - // int ncols = upanel.nzcols(); - - // int maxGemmRows = nrows; - // int maxGemmCols = ncols; - // // entire gemm doesn't fit in gemm buffer - // if (nrows * ncols > A_gpu.gemmBufferSize) - // { - // int maxGemmOpSize = (int)sqrt(A_gpu.gemmBufferSize); - // int numberofRowChunks = (nrows + maxGemmOpSize - 1) / maxGemmOpSize; - // maxGemmRows = nrows / numberofRowChunks; - // maxGemmCols = A_gpu.gemmBufferSize / maxGemmRows; - // } - - // sc_mdata.maxGemmRows[buffer_offset] = maxGemmRows; - // sc_mdata.maxGemmCols[buffer_offset] = maxGemmCols; - // } - // } - // } - - // sc_mdata.copyPanelDataToGPU(); -} - -int LUstruct_v100::marshallSCUBatchedDataOuter(int k_st, int k_end, int_t *perm_c_supno) -{ - SCUMarshallData& sc_mdata = A_gpu.sc_marshall_data; - sc_mdata.batchsize = k_end - k_st; - - // Temporarily use the m array for the done flags - int *done_flags = sc_mdata.dev_m_array; - MarshallSCUOuterFunc func( - k_st, sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend, - sc_mdata.dev_maxGemmRows, done_flags, dA_gpu - ); - - 
thrust::for_each( - thrust::system::cuda::par, thrust::counting_iterator(0), - thrust::counting_iterator(sc_mdata.batchsize), func - ); - - bool done = thrust::all_of( - thrust::system::cuda::par, done_flags, done_flags + sc_mdata.batchsize, - MarshallSCUOuter_Predicate() - ); - - return done; - - // int done_i = 1; - // for (int k0 = k_st; k0 < k_end; k0++) - // { - // int k = perm_c_supno[k0]; - // int buffer_index = k0 - k_st; - // lpanel_t& lpanel = sc_mdata.lpanels[buffer_index]; - // upanel_t& upanel = sc_mdata.upanels[buffer_index]; - // if(lpanel.isEmpty() || upanel.isEmpty()) - // continue; - - // int& iEnd = sc_mdata.iend[buffer_index]; - // // Not done if even one operation still has work to do - // if(iEnd < lpanel.nblocks()) - // { - // done_i = 0; - // int& iSt = sc_mdata.ist[buffer_index]; - // iSt = iEnd; - // iEnd = lpanel.getEndBlock(iSt, sc_mdata.maxGemmRows[buffer_index]); - // assert(iEnd > iSt); - // sc_mdata.jst[buffer_index] = sc_mdata.jend[buffer_index] = 0; - // } - // } - - // return done_i; -} - -int LUstruct_v100::marshallSCUBatchedDataInner(int k_st, int k_end, int_t *perm_c_supno) -{ - SCUMarshallData& sc_mdata = A_gpu.sc_marshall_data; - int knum = k_end - k_st; - sc_mdata.batchsize = knum; - - MarshallSCUInnerFunc func( - k_st, sc_mdata.dev_ist, sc_mdata.dev_iend, sc_mdata.dev_jst, sc_mdata.dev_jend, sc_mdata.dev_maxGemmCols, - sc_mdata.dev_A_ptrs, sc_mdata.dev_lda_array, sc_mdata.dev_B_ptrs, sc_mdata.dev_ldb_array, sc_mdata.dev_C_ptrs, - sc_mdata.dev_ldc_array, sc_mdata.dev_m_array, sc_mdata.dev_n_array, sc_mdata.dev_k_array, dA_gpu - ); - - thrust::counting_iterator start(0), end(knum); - thrust::for_each(thrust::system::cuda::par, start, end, func); - - // Set the max dims in the marshalled data - sc_mdata.max_m = thrust::reduce(thrust::system::cuda::par, sc_mdata.dev_m_array, sc_mdata.dev_m_array + knum, 0, thrust::maximum()); - sc_mdata.max_n = thrust::reduce(thrust::system::cuda::par, sc_mdata.dev_n_array, sc_mdata.dev_n_array + knum, 0, thrust::maximum()); - sc_mdata.max_k = thrust::reduce(thrust::system::cuda::par, sc_mdata.dev_k_array, sc_mdata.dev_k_array + knum, 0, thrust::maximum()); - sc_mdata.max_ilen = thrust::transform_reduce(thrust::system::cuda::par, start, end, element_diff(sc_mdata.dev_ist, sc_mdata.dev_iend), 0, thrust::maximum()); - sc_mdata.max_jlen = thrust::transform_reduce(thrust::system::cuda::par, start, end, element_diff(sc_mdata.dev_jst, sc_mdata.dev_jend), 0, thrust::maximum()); - - printf("SCU %d -> %d: %d %d %d %d %d\n", k_st, k_end, sc_mdata.max_m, sc_mdata.max_n, sc_mdata.max_k, sc_mdata.max_ilen, sc_mdata.max_jlen); - - return thrust::all_of( - thrust::system::cuda::par, sc_mdata.dev_m_array, sc_mdata.dev_m_array + sc_mdata.batchsize, - MarshallSCUInner_Predicate() - ); - // int done_j = 1; - // for(int k0 = k_st; k0 < k_end; k0++) - // { - // int k = perm_c_supno[k0]; - // int buffer_index = k0 - k_st; - // lpanel_t& lpanel = sc_mdata.lpanels[buffer_index]; - // upanel_t& upanel = sc_mdata.upanels[buffer_index]; - - // if(lpanel.isEmpty() || upanel.isEmpty()) - // continue; - - // int iSt = sc_mdata.ist[buffer_index]; - // int iEnd = sc_mdata.iend[buffer_index]; - - // int& jSt = sc_mdata.jst[buffer_index]; - // int& jEnd = sc_mdata.jend[buffer_index]; - - // // Not done if even one operation still has work to do - // if(jEnd < upanel.nblocks()) - // { - // jSt = jEnd; - // jEnd = upanel.getEndBlock(jSt, sc_mdata.maxGemmCols[buffer_index]); - - // assert(jEnd > jSt); - // done_j = 0; - // // printf("k = %d, ist = %d, 
iend = %d, jst = %d, jend = %d\n", k, iSt, iEnd, jSt, jEnd); - - // sc_mdata.host_m_array[buffer_index] = lpanel.stRow(iEnd) - lpanel.stRow(iSt); - // sc_mdata.host_n_array[buffer_index] = upanel.stCol(jEnd) - upanel.stCol(jSt); - // sc_mdata.host_k_array[buffer_index] = supersize(k); - - // sc_mdata.host_A_ptrs[buffer_index] = lpanel.blkPtrGPU(iSt); - // sc_mdata.host_B_ptrs[buffer_index] = upanel.blkPtrGPU(jSt); - // sc_mdata.host_C_ptrs[buffer_index] = A_gpu.gpuGemmBuffs[buffer_index]; - - // sc_mdata.host_lda_array[buffer_index] = lpanel.LDA(); - // sc_mdata.host_ldb_array[buffer_index] = upanel.LDA(); - // sc_mdata.host_ldc_array[buffer_index] = sc_mdata.host_m_array[buffer_index]; - // } - // else - // { - // sc_mdata.host_A_ptrs[buffer_index] = NULL; - // sc_mdata.host_B_ptrs[buffer_index] = NULL; - // sc_mdata.host_C_ptrs[buffer_index] = NULL; - - // sc_mdata.host_m_array[buffer_index] = 0; - // sc_mdata.host_n_array[buffer_index] = 0; - // sc_mdata.host_k_array[buffer_index] = 0; - - // sc_mdata.host_lda_array[buffer_index] = 1; - // sc_mdata.host_ldb_array[buffer_index] = 1; - // sc_mdata.host_ldc_array[buffer_index] = 1; - // } - // } - - // if(done_j == 0) - // { - // // Upload the buffers to the gpu - // sc_mdata.setMaxDims(); - // sc_mdata.copyToGPU(); - // } - - // return done_j; -} - -#endif // Sherry: old batch code commented out -////////////////////////////////////////////////////////////////////////////////////////////////////////// - - -/** - * @brief Pack non-zero values into a vector. - * - * @param spNzvalArray The array of non-zero values. - * @param nzvalSize The size of the array of non-zero values. - * @param valOffset The offset of the non-zero values. - * @param packedNzvals The vector to store the non-zero values. - * @param packedNzvalsIndices The vector to store the indices of the non-zero values. 
- */ -void packNzvals(std::vector &packedNzvals, std::vector &packedNzvalsIndices, - double *spNzvalArray, int_t nzvalSize, int_t valOffset) -{ - for (int k = 0; k < nzvalSize; k++) - { - if (spNzvalArray[k] != 0) - { - packedNzvals.push_back(spNzvalArray[k]); - packedNzvalsIndices.push_back(valOffset + k); - } - } -} - -const int AVOID_CPU_NZVAL = 1; -lpanelGPU_t *LUstruct_v100::copyLpanelsToGPU() -{ - lpanelGPU_t *lPanelVec_GPU = new lpanelGPU_t[CEILING(nsupers, Pc)]; - - // TODO: set gpuLvalSize, gpuLidxSize - gpuLvalSize = 0; - gpuLidxSize = 0; - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - { - gpuLvalSize += sizeof(double) * lPanelVec[i].nzvalSize(); - gpuLidxSize += sizeof(int_t) * lPanelVec[i].indexSize(); - } - } - - double *valBuffer = (double *)SUPERLU_MALLOC(gpuLvalSize); - int_t *idxBuffer = (int_t *)SUPERLU_MALLOC(gpuLidxSize); - - // allocate memory buffer on GPU - gpuErrchk(cudaMalloc(&gpuLvalBasePtr, gpuLvalSize)); - gpuErrchk(cudaMalloc(&gpuLidxBasePtr, gpuLidxSize)); - - size_t valOffset = 0; - size_t idxOffset = 0; - double tCopyToCPU = SuperLU_timer_(); - - std::vector packedNzvals; - std::vector packedNzvalsIndices; - - // do a memcpy to CPU buffer - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (i * Pc + mycol < nsupers && isNodeInMyGrid[i * Pc + mycol] == 1) - { - if (lPanelVec[i].isEmpty()) - { - lpanelGPU_t ithLpanel(NULL, NULL); - lPanelVec[i].gpuPanel = ithLpanel; - lPanelVec_GPU[i] = ithLpanel; - } - else - { - lpanelGPU_t ithLpanel(&gpuLidxBasePtr[idxOffset], &gpuLvalBasePtr[valOffset]); - lPanelVec[i].gpuPanel = ithLpanel; - lPanelVec_GPU[i] = ithLpanel; - if (AVOID_CPU_NZVAL) - packNzvals(packedNzvals, packedNzvalsIndices, lPanelVec[i].val, lPanelVec[i].nzvalSize(), valOffset); - else - memcpy(&valBuffer[valOffset], lPanelVec[i].val, sizeof(double) * lPanelVec[i].nzvalSize()); - - memcpy(&idxBuffer[idxOffset], lPanelVec[i].index, sizeof(int_t) * lPanelVec[i].indexSize()); - - valOffset += lPanelVec[i].nzvalSize(); - idxOffset += lPanelVec[i].indexSize(); - } - } - } - tCopyToCPU = SuperLU_timer_() - tCopyToCPU; - std::cout << "Time to copy-L to CPU: " << tCopyToCPU << "\n"; - // do a cudaMemcpy to GPU - double tLsend = SuperLU_timer_(); - if (AVOID_CPU_NZVAL) - copyToGPU(gpuLvalBasePtr, packedNzvals, packedNzvalsIndices); - else - { -#if 0 - cudaMemcpy(gpuLvalBasePtr, valBuffer, gpuLvalSize, cudaMemcpyHostToDevice); -#else - copyToGPU_Sparse(gpuLvalBasePtr, valBuffer, gpuLvalSize); -#endif - } - // find - gpuErrchk(cudaMemcpy(gpuLidxBasePtr, idxBuffer, gpuLidxSize, cudaMemcpyHostToDevice)); - tLsend = SuperLU_timer_() - tLsend; - printf("cudaMemcpy time L =%g \n", tLsend); - - SUPERLU_FREE(valBuffer); - SUPERLU_FREE(idxBuffer); - return lPanelVec_GPU; -} /* copyLpanelsToGPU */ - - -upanelGPU_t *LUstruct_v100::copyUpanelsToGPU() -{ -#if (DEBUGlevel >= 1) - int iam = 0; - CHECK_MALLOC(iam, "Enter copyUpanelsToGPU()"); -#endif - - upanelGPU_t *uPanelVec_GPU = new upanelGPU_t[CEILING(nsupers, Pr)]; - - gpuUvalSize = 0; - gpuUidxSize = 0; - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - gpuUvalSize += sizeof(double) * uPanelVec[i].nzvalSize(); - gpuUidxSize += sizeof(int_t) * uPanelVec[i].indexSize(); - } - } - - // TODO: set gpuUvalSize, gpuUidxSize - - // allocate memory buffer on GPU - gpuErrchk(cudaMalloc(&gpuUvalBasePtr, gpuUvalSize)); - gpuErrchk(cudaMalloc(&gpuUidxBasePtr, 
gpuUidxSize)); - - size_t valOffset = 0; - size_t idxOffset = 0; - - double tCopyToCPU = SuperLU_timer_(); - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - if (uPanelVec[i].isEmpty()) - { - upanelGPU_t ithupanel(NULL, NULL); - uPanelVec[i].gpuPanel = ithupanel; - uPanelVec_GPU[i] = ithupanel; - } - } - } - - int_t *idxBuffer = NULL; - if ( gpuUidxSize>0 ) /* Sherry fix: gpuUidxSize can be 0 */ - idxBuffer = (int_t *)SUPERLU_MALLOC(gpuUidxSize); - - if (AVOID_CPU_NZVAL) - { - printf("AVOID_CPU_NZVAL is set\n"); - std::vector packedNzvals; - std::vector packedNzvalsIndices; - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - if (!uPanelVec[i].isEmpty()) - { - - upanelGPU_t ithupanel(&gpuUidxBasePtr[idxOffset], &gpuUvalBasePtr[valOffset]); - uPanelVec[i].gpuPanel = ithupanel; - uPanelVec_GPU[i] = ithupanel; - packNzvals(packedNzvals, packedNzvalsIndices, uPanelVec[i].val, - uPanelVec[i].nzvalSize(), valOffset); - memcpy(&idxBuffer[idxOffset], uPanelVec[i].index, sizeof(int_t) * uPanelVec[i].indexSize()); - - valOffset += uPanelVec[i].nzvalSize(); - idxOffset += uPanelVec[i].indexSize(); - } - } - } - tCopyToCPU = SuperLU_timer_() - tCopyToCPU; - printf("copyU to CPU-buff time = %g\n", tCopyToCPU); - - // do a cudaMemcpy to GPU - double tLsend = SuperLU_timer_(); - copyToGPU(gpuUvalBasePtr, packedNzvals, packedNzvalsIndices); - gpuErrchk(cudaMemcpy(gpuUidxBasePtr, idxBuffer, gpuUidxSize, cudaMemcpyHostToDevice)); - tLsend = SuperLU_timer_() - tLsend; - printf("cudaMemcpy time U =%g \n", tLsend); - // SUPERLU_FREE(valBuffer); - } - else /* AVOID_CPU_NZVAL not set */ - { - // do a memcpy to CPU buffer - double *valBuffer = (double *)SUPERLU_MALLOC(gpuUvalSize); - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (i * Pr + myrow < nsupers && isNodeInMyGrid[i * Pr + myrow] == 1) - { - if (!uPanelVec[i].isEmpty()) - { - - upanelGPU_t ithupanel(&gpuUidxBasePtr[idxOffset], &gpuUvalBasePtr[valOffset]); - uPanelVec[i].gpuPanel = ithupanel; - uPanelVec_GPU[i] = ithupanel; - memcpy(&valBuffer[valOffset], uPanelVec[i].val, sizeof(double) * uPanelVec[i].nzvalSize()); - memcpy(&idxBuffer[idxOffset], uPanelVec[i].index, sizeof(int_t) * uPanelVec[i].indexSize()); - - valOffset += uPanelVec[i].nzvalSize(); - idxOffset += uPanelVec[i].indexSize(); - } - } - } - tCopyToCPU = SuperLU_timer_() - tCopyToCPU; - printf("copyU to CPU-buff time = %g\n", tCopyToCPU); - - // do a cudaMemcpy to GPU - double tLsend = SuperLU_timer_(); - const int USE_GPU_COPY = 1; - if (USE_GPU_COPY) - { - gpuErrchk(cudaMemcpy(gpuUvalBasePtr, valBuffer, gpuUvalSize, cudaMemcpyHostToDevice)); - } - else - copyToGPU_Sparse(gpuUvalBasePtr, valBuffer, gpuUvalSize); - - gpuErrchk(cudaMemcpy(gpuUidxBasePtr, idxBuffer, gpuUidxSize, cudaMemcpyHostToDevice)); - tLsend = SuperLU_timer_() - tLsend; - printf("cudaMemcpy time U =%g \n", tLsend); - - SUPERLU_FREE(valBuffer); - } /* end else AVOID_CPU_NZVAL not set */ - - if ( gpuUidxSize>0 ) /* Sherry fix: gpuUidxSize can be 0 */ - SUPERLU_FREE(idxBuffer); - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Exit copyUpanelsToGPU()"); -#endif - - return uPanelVec_GPU; - -} /* copyUpanelsToGPU */ - -#endif diff --git a/SRC/CplusplusFactor/schurCompUpdate_impl.cuh b/SRC/CplusplusFactor/schurCompUpdate_impl.cuh index 5b958e40..7085f85d 100644 --- a/SRC/CplusplusFactor/schurCompUpdate_impl.cuh +++ b/SRC/CplusplusFactor/schurCompUpdate_impl.cuh 
@@ -1,4 +1,4 @@ -#pragma once +#pragma once #include #include #include @@ -1063,16 +1063,6 @@ int_t xLUstruct_t::setLUstruct_GPU() cublasCreate(&A_gpu.lookAheadUHandle[stream]); cudaStreamCreate(&A_gpu.lookAheadUStream[stream]); - // Wajih: Need to create at least one magma queue -#ifdef HAVE_MAGMA - if(stream == 0) - { - magma_queue_create_from_cuda( - device_id, A_gpu.cuStreams[stream], A_gpu.cuHandles[stream], - NULL, &A_gpu.magma_queue - ); - } -#endif } tcuStreamCreate = SuperLU_timer_() - tcuStreamCreate; tRegion[3] = SuperLU_timer_() - tRegion[3]; diff --git a/SRC/CplusplusFactor/xlupanels_GPU.cuh b/SRC/CplusplusFactor/xlupanels_GPU.cuh index a048b9a6..f4ce8e9c 100644 --- a/SRC/CplusplusFactor/xlupanels_GPU.cuh +++ b/SRC/CplusplusFactor/xlupanels_GPU.cuh @@ -7,9 +7,6 @@ #ifdef HAVE_CUDA #include #include -#ifdef HAVE_MAGMA - #include "magma.h" -#endif #endif #include "lu_common.hpp" @@ -412,15 +409,6 @@ struct xLUstructGPU_t cudaStream_t cuStreams[MAX_CUDA_STREAMS]; cublasHandle_t cuHandles[MAX_CUDA_STREAMS]; - // Magma is needed for non-uniform batched execution -#ifdef HAVE_MAGMA - magma_queue_t magma_queue; -#endif -#if 0 //////////////////////////// - LUMarshallData marshall_data; - xSCUMarshallData sc_marshall_data; -#endif - int* dperm_c_supno; /* Sherry: Allocate an array of buffers for the diagonal blocks @@ -475,4 +463,4 @@ void scatterGPU_batchDriver( int max_ilen, int max_jlen, Ftype **gemmBuff_ptrs, int *LDgemmBuff_batch, int maxSuperSize, int ldt, xlpanelGPU_t *lpanels, xupanelGPU_t *upanels, xLUstructGPU_t *dA, int batchCount, cudaStream_t cuStream -); \ No newline at end of file +); diff --git a/SRC/complex16/pzgssvx3d.c b/SRC/complex16/pzgssvx3d.c index 833392be..b81f6012 100755 --- a/SRC/complex16/pzgssvx3d.c +++ b/SRC/complex16/pzgssvx3d.c @@ -1,3 +1,10 @@ +// +// Solve-only setup +// turn off: equil, rowperm, colperm, +// options->ilu_level = 0; +// 1. DOFACT -> distribution +// 2. FACTORED -> solve +// /*! \file Copyright (c) 2003, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required @@ -507,207 +514,6 @@ at the top-level directory. * */ -// Sherry: TODO - this should be moved to a utility file.
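/* Editor's aside -- a hypothetical caller-side sketch for the new "Solve-only setup"
 * noted in the pzgssvx3d.c header comment above; only the option names come from this
 * patch, the rest is illustrative:
 *
 *     superlu_dist_options_t options;
 *     set_default_options_dist(&options);
 *     options.SolveOnly = YES;
 *
 * With SolveOnly == YES, pzgssvx3d() itself forces Fact = DOFACT, Equil = NO,
 * RowPerm = NOROWPERM, ColPerm = NATURAL and ILU_level = 0; per the header comment,
 * a DOFACT pass performs the distribution and a subsequent FACTORED pass performs
 * only the triangular solve. */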
-int zwriteLUtoDisk(int nsupers, int_t *xsup, zLUstruct_t *LUstruct) -{ - - if (getenv("LUFILE")) - { - FILE *fp = fopen(getenv("LUFILE"), "w"); - printf("writing to %s", getenv("LUFILE")); - for (int i = 0; i < nsupers; i++) - { - if (LUstruct->Llu->Lrowind_bc_ptr[i]) - { - int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; - doublecomplex *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; - - int_t len = lsub[1]; /* LDA of the nzval[] */ - int_t len2 = SuperSize(i) * len; - fwrite(nzval, sizeof(doublecomplex), len2, fp); // assume fp will be incremented - } - - if (LUstruct->Llu->Ufstnz_br_ptr[i]) - { - int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; - doublecomplex *nzval = LUstruct->Llu->Unzval_br_ptr[i]; - int_t lenv = usub[1]; - - fwrite(nzval, sizeof(doublecomplex), lenv, fp); // assume fp will be incremented - } - } - - fclose(fp); - } - else - { - printf("Please set environment variable LUFILE to write\n..bye bye"); - exit(0); - } - - return 0; -} - -#define EPSILON 1e-3 - -// Sherry: TODO - this is used for debugging -static int zcheckArr(doublecomplex *A, doublecomplex *B, int n) -{ - for (int i = 0; i < n; i++) - { - doublecomplex temp; - z_sub(&temp, &A[i], &B[i]); - assert(slud_z_abs(&temp) <= EPSILON * SUPERLU_MIN(slud_z_abs(&A[i]), slud_z_abs(&B[i]))); - } - - return 0; -} - -// Sherry: TODO - this should be moved to a utility file. -int zcheckLUFromDisk(int nsupers, int_t *xsup, zLUstruct_t *LUstruct) -{ - zLocalLU_t *Llu = LUstruct->Llu; - - doublecomplex *Lval_buf = doublecomplexMalloc_dist(Llu->bufmax[1]); // DOUBLE_ALLOC(Llu->bufmax[1]); - doublecomplex *Uval_buf = doublecomplexMalloc_dist(Llu->bufmax[3]); // DOUBLE_ALLOC(Llu->bufmax[3]); - - if (getenv("LUFILE")) - { - FILE *fp = fopen(getenv("LUFILE"), "r"); - printf("reading from %s", getenv("LUFILE")); - for (int i = 0; i < nsupers; i++) - { - if (LUstruct->Llu->Lrowind_bc_ptr[i]) - { - int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; - doublecomplex *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; - - int_t len = lsub[1]; /* LDA of the nzval[] */ - int_t len2 = SuperSize(i) * len; - fread(Lval_buf, sizeof(doublecomplex), len2, fp); // assume fp will be incremented - zcheckArr(nzval, Lval_buf, len2); - } - - if (LUstruct->Llu->Ufstnz_br_ptr[i]) - { - int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; - doublecomplex *nzval = LUstruct->Llu->Unzval_br_ptr[i]; - int_t lenv = usub[1]; - - fread(Uval_buf, sizeof(doublecomplex), lenv, fp); // assume fp will be incremented - zcheckArr(nzval, Uval_buf, lenv); - } - } - printf("CHecking LU from %s is succesful ", getenv("LUFILE")); - fclose(fp); - } - else - { - printf("Please set environment variable LUFILE to read\n..bye bye"); - exit(0); - } - - return 0; -} - -// Sherry: TODO - this should be moved to a utility file. -/*! 
\brief Dump the factored matrix L using matlab triple-let format - */ -void zDumpLblocks3D(int_t nsupers, gridinfo3d_t *grid3d, - Glu_persist_t *Glu_persist, zLocalLU_t *Llu) -{ - register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb; - int k, mycol, r, n, nmax; - int_t nnzL; - int_t *xsup = Glu_persist->xsup; - int_t *index; - doublecomplex *nzval; - char filename[256]; - FILE *fp, *fopen(); - gridinfo_t *grid = &(grid3d->grid2d); - int iam = grid->iam; - int iam3d = grid3d->iam; - - // assert(grid->npcol*grid->nprow==1); - - // count nonzeros in the first pass - nnzL = 0; - n = 0; - ncb = nsupers / grid->npcol; - extra = nsupers % grid->npcol; - mycol = MYCOL( iam, grid ); - if ( mycol < extra ) ++ncb; - for (lb = 0; lb < ncb; ++lb) { - index = Llu->Lrowind_bc_ptr[lb]; - if ( index ) { /* Not an empty column */ - nzval = Llu->Lnzval_bc_ptr[lb]; - nb = index[0]; - nsupr = index[1]; - gb = lb * grid->npcol + mycol; - nsupc = SuperSize( gb ); - for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { - len = index[k+1]; - - for (j = 0; j < nsupc; ++j) { - for (i=0; i=xsup[gb]+j+1){ - nnzL ++; - nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1); - n = nmax; - } - - } - } - k += LB_DESCRIPTOR + len; - r += len; - } - } - } - MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm); - MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm); - - snprintf(filename, sizeof(filename), "%s-%d", "L", iam3d); - printf("Dumping L factor to --> %s\n", filename); - if ( !(fp = fopen(filename, "w")) ) { - ABORT("File open failed"); - } - - if(grid->iam==0){ - fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); - } - - ncb = nsupers / grid->npcol; - extra = nsupers % grid->npcol; - mycol = MYCOL( iam, grid ); - if ( mycol < extra ) ++ncb; - for (lb = 0; lb < ncb; ++lb) { - index = Llu->Lrowind_bc_ptr[lb]; - if ( index ) { /* Not an empty column */ - nzval = Llu->Lnzval_bc_ptr[lb]; - nb = index[0]; - nsupr = index[1]; - gb = lb * grid->npcol + mycol; - nsupc = SuperSize( gb ); - for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { - len = index[k+1]; - - for (j = 0; j < nsupc; ++j) { - for (i=0; iSolveOnly == YES ) { + options->Fact = DOFACT; + options->Equil = NO; + options->RowPerm = NOROWPERM; + options->ColPerm = NATURAL; + options->ILU_level = 0; + } Fact = options->Fact; validateInput_pzgssvx3d(options, A, ldb, nrhs, grid3d, info); @@ -887,8 +701,10 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (Equil) { zscaleMatrixDiagonally(Fact, ScalePermstruct, A, stat, grid, &rowequ, &colequ, &iinfo); - if (iinfo < 0) - return; // Sherry: TOO - return a number in INFO + if (iinfo < 0) { + *info = -20 - iinfo; + return; + } } /* end if Equil ... LAPACK style, not involving MC64 */ @@ -897,12 +713,12 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, * Gather A from the distributed compressed row format to * global A in compressed column format. * Numerical values are gathered only when a row permutation - ** for large diagonal is sought after. + * for large diagonal is sought after. 
*/ if (Fact != SamePattern_SameRowPerm && (parSymbFact == NO || options->RowPerm != NO)) { - int_t need_value = (options->RowPerm == LargeDiag_MC64); + int need_value = (options->RowPerm == LargeDiag_MC64); pzCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); GAstore = (NCformat *)GA.Store; nnz = GAstore->nnz; @@ -1045,15 +861,15 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, if (Glu_freeable == NULL) { if (!(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) ABORT("Malloc fails for Glu_freeable."); } zbcastPermutedSparseA(A, ScalePermstruct, Glu_freeable, - LUstruct, grid3d); - } else { - //TODO: need a parmetis version of zbcastPermutedSparseA broadcasting Pslu_freeable - } + LUstruct, grid3d); + } else { + ; /*TODO: need a parmetis version of zbcastPermutedSparseA broadcasting Pslu_freeable*/ } + } perm_r = ScalePermstruct->perm_r; perm_c = ScalePermstruct->perm_c; @@ -1102,7 +918,7 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, t = SuperLU_timer_(); dist_mem_use = pzdistribute3d_Yang(options, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid3d); + Glu_freeable, LUstruct, grid3d); stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factorization. */ @@ -1174,9 +990,6 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, #endif - - - SCT_t *SCT = (SCT_t *)SUPERLU_MALLOC(sizeof(SCT_t)); slu_SCT_init(SCT); @@ -1189,9 +1002,6 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, #endif - - - t = SuperLU_timer_(); /*factorize in grid 1*/ @@ -1427,18 +1237,18 @@ void pzgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, } /* end printing stats */ } /* end if not Factored */ - } + } /* end if grid-0 */ if(Solve3D){ if ( options->Fact == DOFACT || options->Fact == SamePattern ) { - /* Need to reset the solve's communication pattern, - because perm_r[] and/or perm_c[] is changed. */ - if ( options->SolveInitialized == YES ) { /* Initialized before */ + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. 
*/ + if ( options->SolveInitialized == YES ) { /* Initialized before */ zSolveFinalize(options, SOLVEstruct); /* Clean up structure */ pzgstrs_delete_device_lsum_x(SOLVEstruct); options->SolveInitialized = NO; /* Reset the solve state */ - } + } } if (get_new3dsolve()){ @@ -2011,9 +1821,8 @@ if (grid3d->zscp.Iam == 0) /* on 2D grid-0 */ { if (colequ) { - b_col = B; - for (j = 0; j < nrhs; ++j) - { + b_col = B; + for (j = 0; j < nrhs; ++j) { irow = fst_row; for (i = 0; i < m_loc; ++i) { diff --git a/SRC/complex16/zssvx3dAux.c b/SRC/complex16/zssvx3dAux.c index 1bab02b5..b8655419 100644 --- a/SRC/complex16/zssvx3dAux.c +++ b/SRC/complex16/zssvx3dAux.c @@ -406,11 +406,9 @@ void zperform_LargeDiag_MC64( } // int iinfo; - zfindRowPerm_MC64(grid, job, m, n, - nnz, - colptr, - rowind, - a_GA, Equil, perm_r, R1, C1, iinfo); + zfindRowPerm_MC64(grid, job, m, n, nnz, + colptr, rowind, + a_GA, Equil, perm_r, R1, C1, iinfo); if (*iinfo && job == 5) { SUPERLU_FREE(R1); @@ -508,12 +506,11 @@ void zperform_row_permutation( else if (options->RowPerm == LargeDiag_MC64) { - zperform_LargeDiag_MC64( - options, Fact, - ScalePermstruct, LUstruct, - m, n, grid, - A, GA, stat, job, - Equil, rowequ, colequ, iinfo); + zperform_LargeDiag_MC64(options, Fact, + ScalePermstruct, LUstruct, + m, n, grid, + A, GA, stat, job, + Equil, rowequ, colequ, iinfo); } else // LargeDiag_HWPM { diff --git a/SRC/complex16/zutil_dist.c b/SRC/complex16/zutil_dist.c index cdd415b4..b2452cbb 100755 --- a/SRC/complex16/zutil_dist.c +++ b/SRC/complex16/zutil_dist.c @@ -995,6 +995,203 @@ zprint_gsmv_comm(FILE *fp, int_t m_loc, pzgsmv_comm_t *gsmv_comm, return 0; } +int zwriteLUtoDisk(int nsupers, int_t *xsup, zLUstruct_t *LUstruct) +{ + + if (getenv("LUFILE")) + { + FILE *fp = fopen(getenv("LUFILE"), "w"); + printf("writing to %s", getenv("LUFILE")); + for (int i = 0; i < nsupers; i++) + { + if (LUstruct->Llu->Lrowind_bc_ptr[i]) + { + int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; + doublecomplex *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; + + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(i) * len; + fwrite(nzval, sizeof(doublecomplex), len2, fp); // assume fp will be incremented + } + + if (LUstruct->Llu->Ufstnz_br_ptr[i]) + { + int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; + doublecomplex *nzval = LUstruct->Llu->Unzval_br_ptr[i]; + int_t lenv = usub[1]; + + fwrite(nzval, sizeof(doublecomplex), lenv, fp); // assume fp will be incremented + } + } + + fclose(fp); + } + else + { + printf("Please set environment variable LUFILE to write\n..bye bye"); + exit(0); + } + + return 0; +} + +#define EPSILON 1e-3 + +int zcheckArr(doublecomplex *A, doublecomplex *B, int n) +{ + for (int i = 0; i < n; i++) + { + doublecomplex temp; + z_sub(&temp, &A[i], &B[i]); + assert(slud_z_abs(&temp) <= EPSILON * SUPERLU_MIN(slud_z_abs(&A[i]), slud_z_abs(&B[i]))); + } + + return 0; +} + +// Sherry: TODO - this should be moved to a utility file. 
+int zcheckLUFromDisk(int nsupers, int_t *xsup, zLUstruct_t *LUstruct) +{ + zLocalLU_t *Llu = LUstruct->Llu; + + doublecomplex *Lval_buf = doublecomplexMalloc_dist(Llu->bufmax[1]); // DOUBLE_ALLOC(Llu->bufmax[1]); + doublecomplex *Uval_buf = doublecomplexMalloc_dist(Llu->bufmax[3]); // DOUBLE_ALLOC(Llu->bufmax[3]); + + if (getenv("LUFILE")) + { + FILE *fp = fopen(getenv("LUFILE"), "r"); + printf("reading from %s", getenv("LUFILE")); + for (int i = 0; i < nsupers; i++) + { + if (LUstruct->Llu->Lrowind_bc_ptr[i]) + { + int_t *lsub = LUstruct->Llu->Lrowind_bc_ptr[i]; + doublecomplex *nzval = LUstruct->Llu->Lnzval_bc_ptr[i]; + + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(i) * len; + fread(Lval_buf, sizeof(doublecomplex), len2, fp); // assume fp will be incremented + zcheckArr(nzval, Lval_buf, len2); + } + + if (LUstruct->Llu->Ufstnz_br_ptr[i]) + { + int_t *usub = LUstruct->Llu->Ufstnz_br_ptr[i]; + doublecomplex *nzval = LUstruct->Llu->Unzval_br_ptr[i]; + int_t lenv = usub[1]; + + fread(Uval_buf, sizeof(doublecomplex), lenv, fp); // assume fp will be incremented + zcheckArr(nzval, Uval_buf, lenv); + } + } + printf("CHecking LU from %s is succesful ", getenv("LUFILE")); + fclose(fp); + } + else + { + printf("Please set environment variable LUFILE to read\n..bye bye"); + exit(0); + } + + return 0; +} + +// Sherry: TODO - this should be moved to a utility file. +/*! \brief Dump the factored matrix L using matlab triple-let format + */ +void zDumpLblocks3D(int_t nsupers, gridinfo3d_t *grid3d, + Glu_persist_t *Glu_persist, zLocalLU_t *Llu) +{ + register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb; + int k, mycol, r, n, nmax; + int_t nnzL; + int_t *xsup = Glu_persist->xsup; + int_t *index; + doublecomplex *nzval; + char filename[256]; + FILE *fp, *fopen(); + gridinfo_t *grid = &(grid3d->grid2d); + int iam = grid->iam; + int iam3d = grid3d->iam; + + // assert(grid->npcol*grid->nprow==1); + + // count nonzeros in the first pass + nnzL = 0; + n = 0; + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; i=xsup[gb]+j+1){ + nnzL ++; + nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1); + n = nmax; + } + + } + } + k += LB_DESCRIPTOR + len; + r += len; + } + } + } + MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm); + MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm); + + snprintf(filename, sizeof(filename), "%s-%d", "L", iam3d); + printf("Dumping L factor to --> %s\n", filename); + if ( !(fp = fopen(filename, "w")) ) { + ABORT("File open failed"); + } + + if(grid->iam==0){ + fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); + } + + ncb = nsupers / grid->npcol; + extra = nsupers % grid->npcol; + mycol = MYCOL( iam, grid ); + if ( mycol < extra ) ++ncb; + for (lb = 0; lb < ncb; ++lb) { + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) { /* Not an empty column */ + nzval = Llu->Lnzval_bc_ptr[lb]; + nb = index[0]; + nsupr = index[1]; + gb = lb * grid->npcol + mycol; + nsupc = SuperSize( gb ); + for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) { + len = index[k+1]; + + for (j = 0; j < nsupc; ++j) { + for (i=0; i + * -- Distributed 
SuperLU routine (version 9.0) -- + * Lawrence Berkeley National Lab, Univ. of California Berkeley. + * May 20, 2024 + * + * Modified: + */ + +#include "superlu_defs.h" + +/** + * @brief This function performs the level-based ILU symbolic factorization on + * matrix Pc*Pr*A*Pc' and sets up the nonzero data structures for L & U matrices. + * In the process, the matrix is also ordered and its memory usage information is gathered. + * + * @param options Input parameters to control how the ILU decomposition will be performed. + * @param A Pointer to the global supermatrix A, permuted by columns in NCPfomat. + * @param perm_c The column permutation vector. + * @param Glu_persist Pointer to the structure which tracks the symbolic factorization information. + * @param Glu_freeable Pointer to the structure which tracks the space used to store L/U data structures. + * @param stat Information on program execution. + */ + +int ilu_level_symbfact +/************************************************************************/ +( + superlu_dist_options_t *options, /* input options */ + SuperMatrix *A, /* original matrix A permuted by columns, GAC from caller (input) */ + int_t *perm_c, /* column permutation vector (input) */ + int_t *etree, /* column elimination tree (input) */ + Glu_persist_t *Glu_persist, /* output */ + Glu_freeable_t *Glu_freeable /* output */ + ) +{ + + if ( options->ILU_level != 0 ) { + printf("ERROR: ILU(k>0) is not implemented yet\n"); + return (-1); + } + + /* Now, do ILU(0) */ + + int_t iinfo; + int i, n = A->ncol; + double t; + + /* Set up supernode partition */ + Glu_persist->supno = (int_t *) SUPERLU_MALLOC(n * sizeof(int_t)); + Glu_persist->xsup = (int_t *) SUPERLU_MALLOC((n+1) * sizeof(int_t)); + for (i = 0; i < n; i++) { + Glu_persist->supno[i] = i; + Glu_persist->xsup[i] = i; + } + Glu_persist->xsup[n] = n; + + + NCPformat *GACstore = A->Store; + int_t *colbeg, *colend, *rowind, irow; + rowind = GACstore->rowind; + colbeg = GACstore->colbeg; + colend = GACstore->colend; + + Glu_freeable->xlsub = (int_t *) intCalloc_dist(n+1); + Glu_freeable->xusub = (int_t *) intCalloc_dist(n+1); + int_t nnzL = 0, nnzU = 0; + + /* Count nonzeros per column for L & U */ + for (int j = 0; j < n; ++j) { +#if 0 + fscanf(fpU, "%d %d", &col_num, &Glu_freeable.usub[i]); + Glu_freeable.xusub[col_num] += 1; +#endif + for (i = colbeg[j]; i < colend[j]; ++i) { // (j,j) is diagonal + irow = rowind[i]; + if ( irow < j ) { // in U + nnzU++; + Glu_freeable->xusub[j] += 1; + } else { // in L, including diagonal of U + nnzL++; + Glu_freeable->xlsub[j] += 1; + } + } + } + + Glu_freeable->nnzLU = nnzU + nnzL; + Glu_freeable->lsub = (int_t *) SUPERLU_MALLOC(nnzL * sizeof(int_t)); + Glu_freeable->usub = (int_t *) SUPERLU_MALLOC(nnzU * sizeof(int_t)); + + /* Do prefix sum to set up column pointers */ + for(i = 1; i <= n; i++) { + Glu_freeable->xlsub[i] += Glu_freeable->xlsub[i-1]; + Glu_freeable->xusub[i] += Glu_freeable->xusub[i-1]; + } + + return 0; +} + diff --git a/SRC/prec-independent/psymbfact.c b/SRC/prec-independent/psymbfact.c index 8c0b98c3..ff4b617e 100755 --- a/SRC/prec-independent/psymbfact.c +++ b/SRC/prec-independent/psymbfact.c @@ -2157,9 +2157,11 @@ symbfact_vtx /* TEST available memory */ if (next >= x_aind_end) { if (domain_symb) { + int mem_type = (computeL ? 
LSUB : USUB); // Sherry fix 5/26/24 if ( (mem_error = psymbfact_LUXpandMem (iam, n, vtx, next, 0, - computeL, DOMAIN_SYMB, 1, + //computeL, DOMAIN_SYMB, 1, + mem_type, DOMAIN_SYMB, 1, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); } else if ( (mem_error = @@ -4009,9 +4011,11 @@ dnsCurSep_symbfact j = x_newelts[vtx_lid_x+1] + lstVtx - vtx; if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) || (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) { + int mem_type = (computeL ? LSUB : USUB); // Sherry fix 5/26/24 if ( (mem_error = psymbfact_LUXpandMem (iam, n, vtx, next, next + j, - computeL, DNS_CURSEP, 1, + //computeL, DNS_CURSEP, 1, + mem_type, DNS_CURSEP, 1, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); if (computeL) sub = Llu_symbfact->lsub; diff --git a/SRC/prec-independent/trfAux.c b/SRC/prec-independent/trfAux.c index 20fed84c..b2d46d38 100755 --- a/SRC/prec-independent/trfAux.c +++ b/SRC/prec-independent/trfAux.c @@ -2534,15 +2534,16 @@ void applyRowPerm(int_t* colptr, int_t* rowind, int_t* perm_r, int_t n) { * its memory usage information is fetched. * * @param options The options for the SuperLU distribution. + * @param n Dimension of the global matrix A. * @param GA A pointer to the global matrix A. * @param perm_c The column permutation vector. * @param etree The elimination tree of Pc*Pr*A*Pc'. - * @param iam The processor number (0 <= iam < Pr*Pc). * @param Glu_persist Pointer to the structure which tracks the symbolic factorization information. * @param Glu_freeable Pointer to the structure which tracks the space used to store L/U data structures. * @param stat Information on program execution. + * @param grid3d The 3D process grid. */ -void permCol_SymbolicFact3d(superlu_dist_options_t *options, int_t n, SuperMatrix *GA, int_t *perm_c, int_t *etree, +void permCol_SymbolicFact3d(superlu_dist_options_t *options, int n, SuperMatrix *GA, int_t *perm_c, int_t *etree, Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable, SuperLUStat_t *stat, superlu_dist_mem_usage_t*symb_mem_usage, gridinfo3d_t* grid3d) @@ -2577,7 +2578,13 @@ void permCol_SymbolicFact3d(superlu_dist_options_t *options, int_t n, SuperMatri #endif t = SuperLU_timer_(); - iinfo = symbfact(options, iam, &GAC, perm_c, etree, Glu_persist, Glu_freeable); + + if ( options->ILU_level != SLU_EMPTY ) { + iinfo = ilu_level_symbfact(options, &GAC, perm_c, etree, Glu_persist, Glu_freeable); + } else { + iinfo = symbfact(options, iam, &GAC, perm_c, etree, Glu_persist, Glu_freeable); + } + stat->utime[SYMBFAC] = SuperLU_timer_() - t; if (iinfo < 0) diff --git a/SRC/prec-independent/util.c b/SRC/prec-independent/util.c index 91227306..8e7cdd90 100755 --- a/SRC/prec-independent/util.c +++ b/SRC/prec-independent/util.c @@ -1,3 +1,4 @@ + /*! \file Copyright (c) 2003, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required @@ -209,6 +210,7 @@ void set_default_options_dist(superlu_dist_options_t *options) { options->Fact = DOFACT; options->Equil = YES; + options->ILU_level = SLU_EMPTY; options->ParSymbFact = NO; #ifdef HAVE_PARMETIS options->ColPerm = METIS_AT_PLUS_A;
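
The new ilu_level_symbfact() builds the ILU(0) pattern directly from the column-permuted matrix: every nonzero (i, j) with i < j is counted into U, everything else (the diagonal included) into L, and a prefix sum over the per-column counts produces the xlsub/xusub column pointers, exactly the path that permCol_SymbolicFact3d now takes when options->ILU_level != SLU_EMPTY. The standalone sketch below reproduces only that counting and prefix-sum step on a small 4x4 example, using plain int arrays in place of the library's int_t buffers and SuperMatrix storage; it is illustrative, not the library routine.

/* Standalone sketch of the ILU(0) split used in ilu_level_symbfact():
   nonzeros above the diagonal go to U, the rest (diagonal included) to L,
   and per-column counts are turned into column pointers by a prefix sum. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* 4x4 example in compressed-column form (colbeg/colend style). */
    int n = 4;
    int colbeg[] = {0, 3, 5, 7};
    int colend[] = {3, 5, 7, 9};
    int rowind[] = {0, 1, 3,   /* column 0 */
                    1, 2,      /* column 1 */
                    0, 2,      /* column 2 */
                    2, 3};     /* column 3 */

    int *xlsub = calloc(n + 1, sizeof(int));
    int *xusub = calloc(n + 1, sizeof(int));
    int nnzL = 0, nnzU = 0;

    /* Count nonzeros per column for L and U. */
    for (int j = 0; j < n; ++j) {
        for (int p = colbeg[j]; p < colend[j]; ++p) {
            int i = rowind[p];
            if (i < j) { ++nnzU; ++xusub[j]; }   /* strictly upper -> U   */
            else       { ++nnzL; ++xlsub[j]; }   /* diagonal + lower -> L */
        }
    }

    /* Prefix sums turn the counts into cumulative column offsets,
       mirroring the loop at the end of ilu_level_symbfact(). */
    for (int j = 1; j <= n; ++j) {
        xlsub[j] += xlsub[j - 1];
        xusub[j] += xusub[j - 1];
    }

    printf("nnzL = %d, nnzU = %d\n", nnzL, nnzU);
    printf("xlsub:");
    for (int j = 0; j <= n; ++j) printf(" %d", xlsub[j]);
    printf("\nxusub:");
    for (int j = 0; j <= n; ++j) printf(" %d", xusub[j]);
    printf("\n");

    free(xlsub);
    free(xusub);
    return 0;
}

For the matrix above this prints nnzL = 7, nnzU = 2, matching the nine stored nonzeros; filling lsub/usub with the actual row indices is the part the new file still leaves open.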
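
The solve-only path is driven entirely through the options structure: when options->SolveOnly == YES, pzgssvx3d forces Fact = DOFACT, Equil = NO, RowPerm = NOROWPERM, ColPerm = NATURAL and ILU_level = 0, and the comment block added at the top of pzgssvx3d.c describes a two-pass usage (DOFACT for distribution, then FACTORED for the solve). The caller-side sketch below is an assumption built on that comment: the helper name is hypothetical, the pzgssvx3d argument list follows its usual interface, and the numerical L/U factors are assumed to be supplied outside this fragment (e.g. loaded from disk via the relocated zwriteLUtoDisk/zcheckLUFromDisk utilities).

/* Hypothetical caller-side sketch of the new solve-only path; the
   two-pass DOFACT/FACTORED sequence is inferred from the comment block
   at the top of pzgssvx3d.c, not code contained in this patch. */
#include "superlu_zdefs.h"

void zsolve_only_sketch(SuperMatrix *A, doublecomplex *B, int ldb, int nrhs,
                        gridinfo3d_t *grid3d,
                        zScalePermstruct_t *ScalePermstruct,
                        zLUstruct_t *LUstruct, zSOLVEstruct_t *SOLVEstruct,
                        SuperLUStat_t *stat, double *berr, int *info)
{
    superlu_dist_options_t options;
    set_default_options_dist(&options);

    /* Request the solve-only setup; pzgssvx3d then disables equilibration
       and row/column permutations and sets ILU_level = 0. */
    options.SolveOnly = YES;

    /* Pass 1: DOFACT -> symbolic setup and distribution of the L/U
       data structures (numerical factors assumed available elsewhere). */
    options.Fact = DOFACT;
    pzgssvx3d(&options, A, ScalePermstruct, B, ldb, nrhs, grid3d,
              LUstruct, SOLVEstruct, berr, stat, info);

    /* Pass 2: FACTORED -> triangular solves reusing the distributed
       structures; can be repeated for new right-hand sides in B. */
    options.Fact = FACTORED;
    pzgssvx3d(&options, A, ScalePermstruct, B, ldb, nrhs, grid3d,
              LUstruct, SOLVEstruct, berr, stat, info);
}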