From a71306fc4b2a4056ae6b9300293b4ebfe3ef3884 Mon Sep 17 00:00:00 2001 From: xiaoye Date: Tue, 19 Mar 2024 14:14:14 -0700 Subject: [PATCH] Commented out (potentially) unused files; Does not compile yet. --- EXAMPLE/pddrive.c | 36 +++--- EXAMPLE/pddrive3d.c | 112 ++++++++--------- EXAMPLE/psdrive.c | 4 +- EXAMPLE/psdrive3d.c | 112 ++++++++--------- EXAMPLE/pzdrive.c | 70 +++++------ EXAMPLE/pzdrive3d.c | 114 +++++++++--------- SRC/CMakeLists.txt | 43 ++++--- .../LUgpuCHandle_interface_impl.cu | 4 +- SRC/CplusplusFactor/batch_factorize.cu | 5 +- .../dsparseTreeFactorGPU_impl.hpp | 1 + SRC/CplusplusFactor/l_panels_impl.hpp | 4 +- SRC/CplusplusFactor/lupanels.hpp | 4 +- SRC/CplusplusFactor/lupanels_impl.hpp | 8 +- .../pdgstrf3d_upacked_impl.hpp | 8 +- SRC/CplusplusFactor/sparseTreeFactor_impl.hpp | 2 +- SRC/CplusplusFactor/xlupanels.hpp | 4 +- SRC/CplusplusFactor/xlupanels_GPU.cuh | 1 + SRC/complex16/pzgstrf.c | 2 +- SRC/complex16/pzgstrf3d.c | 2 +- SRC/complex16/ztrfAux.c | 4 +- SRC/double/dtrfAux.c | 4 +- SRC/double/pdgstrf.c | 2 +- SRC/double/pdgstrf3d.c | 2 +- SRC/include/superlu_ddefs.h | 16 +-- SRC/include/superlu_defs.h | 2 +- SRC/include/superlu_dist_config.h | 4 +- SRC/include/superlu_sdefs.h | 9 +- SRC/include/superlu_zdefs.h | 9 +- SRC/prec-independent/sec_structs.c | 9 +- SRC/prec-independent/util.c | 13 +- SRC/single/psgstrf.c | 2 +- SRC/single/psgstrf3d.c | 2 +- SRC/single/strfAux.c | 4 +- 33 files changed, 324 insertions(+), 294 deletions(-) diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c index 3c0f6f70..aec1f123 100755 --- a/EXAMPLE/pddrive.c +++ b/EXAMPLE/pddrive.c @@ -92,6 +92,7 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + #if ( VAMPIR>=1 ) VT_traceoff(); #endif @@ -99,7 +100,7 @@ int main(int argc, char *argv[]) #if ( VTUNE>=1 ) __itt_pause(); #endif - + /* Set the default input options: options.Fact = DOFACT; options.Equil = YES; @@ -115,16 +116,11 @@ int 
main(int argc, char *argv[]) options.DiagInv = NO; */ set_default_options_dist(&options); - options.ReplaceTinyPivot = YES; - options.IterRefine = NOREFINE; - options.DiagInv = YES; #if 0 - options.ParSymbFact = YES; - options.ColPerm = PARMETIS; options.RowPerm = LargeDiag_HWPM; options.IterRefine = NOREFINE; options.ColPerm = NATURAL; - options.Equil = NO; + options.Equil = NO; options.ReplaceTinyPivot = YES; #endif @@ -178,6 +174,8 @@ int main(int argc, char *argv[]) if (ir != -1) options.IterRefine = ir; if (symbfact != -1) options.ParSymbFact = symbfact; + int superlu_acc_offload = sp_ienv_dist(10, &options); //get_acc_offload(); + /* In the batch mode: create multiple SuperLU grids, each grid solving one linear system. */ if ( batch ) { @@ -187,7 +185,7 @@ int main(int argc, char *argv[]) MPI_Comm_rank(MPI_COMM_WORLD, &myrank); usermap = SUPERLU_MALLOC(nprow*npcol * sizeof(int)); ldumap = nprow; - + /* Assuming each grid uses the same number of nprow and npcol */ int color = myrank/(nprow*npcol); MPI_Comm_split(MPI_COMM_WORLD, color, myrank, &SubComm); @@ -196,9 +194,8 @@ int main(int argc, char *argv[]) for (int j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; superlu_gridmap(SubComm, nprow, npcol, usermap, ldumap, &grid); SUPERLU_FREE(usermap); - + #ifdef GPU_ACC - int superlu_acc_offload = get_acc_offload(); if (superlu_acc_offload) { /* Binding each MPI to a GPU device */ char *ttemp; @@ -211,12 +208,12 @@ int main(int argc, char *argv[]) gpuSetDevice(rank % devs); // Set device to be used for GPU executions } - // This is to initialize GPU, which can be costly. - double t1 = SuperLU_timer_(); + // This is to initialize GPU, which can be costly. 
+ double t1 = SuperLU_timer_(); gpuFree(0); - double t2 = SuperLU_timer_(); + double t2 = SuperLU_timer_(); if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1); - gpublasHandle_t hb; + gpublasHandle_t hb; gpublasCreate(&hb); if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2); gpublasDestroy(hb); @@ -229,19 +226,16 @@ int main(int argc, char *argv[]) /* ------------------------------------------------------------ INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------ */ - // nv_init_wrapper(grid.comm); - superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); #ifdef GPU_ACC - int superlu_acc_offload = get_acc_offload(); if (superlu_acc_offload) { MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - double t1 = SuperLU_timer_(); + double t1 = SuperLU_timer_(); gpuFree(0); - double t2 = SuperLU_timer_(); + double t2 = SuperLU_timer_(); if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1); - gpublasHandle_t hb; + gpublasHandle_t hb; gpublasCreate(&hb); if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2); gpublasDestroy(hb); @@ -294,7 +288,7 @@ int main(int argc, char *argv[]) print_options_dist(&options); fflush(stdout); } - + #if ( VAMPIR>=1 ) VT_traceoff(); #endif diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c index efb4ad59..2dc70360 100755 --- a/EXAMPLE/pddrive3d.c +++ b/EXAMPLE/pddrive3d.c @@ -154,6 +154,17 @@ int main (int argc, char *argv[]) } } + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + iam = grid.iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* Parse command line argv[]. 
*/ for (cpp = argv + 1; *cpp; ++cpp) { @@ -205,12 +216,53 @@ int main (int argc, char *argv[]) } } - /* ------------------------------------------------------------ - INITIALIZE THE SUPERLU PROCESS GRID. - ------------------------------------------------------------ */ - superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = SLU_DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); + options.Algo3d = YES; + options.IterRefine = NOREFINE; + // options.ParSymbFact = YES; + // options.ColPerm = PARMETIS; +#if 0 + options.DiagInv = YES; // only if SLU_HAVE_LAPACK + options.ReplaceTinyPivot = YES; + options.RowPerm = NOROWPERM; + options.ColPerm = NATURAL; + options.ReplaceTinyPivot = YES; +#endif + + if ( batchCount > 0 ) + options.batchCount = batchCount; + + if (equil != -1) options.Equil = equil; + if (rowperm != -1) options.RowPerm = rowperm; + if (colperm != -1) options.ColPerm = colperm; + if (ir != -1) options.IterRefine = ir; + if (lookahead != -1) options.num_lookaheads = lookahead; + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + #ifdef GPU_ACC - int superlu_acc_offload = get_acc_offload(); + int superlu_acc_offload = sp_ienv_dist(10, &options); //get_acc_offload(); if (superlu_acc_offload) { MPI_Comm_rank(MPI_COMM_WORLD, &myrank); double t1 = SuperLU_timer_(); @@ -247,7 +299,6 @@ int main (int argc, char *argv[]) } /* Bail out if I do not belong in the grid. 
*/ - iam = grid.iam; if (iam == -1) goto out; if (!iam) { int v_major, v_minor, v_bugfix; @@ -265,55 +316,6 @@ int main (int argc, char *argv[]) fflush(stdout); } -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC (iam, "Enter main()"); -#endif - - /* Set the default input options: - options.Fact = DOFACT; - options.Equil = YES; - options.ParSymbFact = NO; - options.ColPerm = METIS_AT_PLUS_A; - options.RowPerm = LargeDiag_MC64; - options.ReplaceTinyPivot = NO; - options.IterRefine = SLU_DOUBLE; - options.Trans = NOTRANS; - options.SolveInitialized = NO; - options.RefineInitialized = NO; - options.PrintStat = YES; - options->num_lookaheads = 10; - options->lookahead_etree = NO; - options->SymPattern = NO; - options.DiagInv = NO; - */ - set_default_options_dist (&options); - options.Algo3d = YES; - options.IterRefine = NOREFINE; - // options.ParSymbFact = YES; - // options.ColPerm = PARMETIS; -#if 0 - options.DiagInv = YES; // only if SLU_HAVE_LAPACK - options.ReplaceTinyPivot = YES; - options.RowPerm = NOROWPERM; - options.ColPerm = NATURAL; - options.ReplaceTinyPivot = YES; -#endif - - if ( batchCount > 0 ) - options.batchCount = batchCount; - - if (equil != -1) options.Equil = equil; - if (rowperm != -1) options.RowPerm = rowperm; - if (colperm != -1) options.ColPerm = colperm; - if (ir != -1) options.IterRefine = ir; - if (lookahead != -1) options.num_lookaheads = lookahead; - - if (!iam) { - print_sp_ienv_dist(&options); - print_options_dist(&options); - fflush(stdout); - } - /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
------------------------------------------------------------ */ diff --git a/EXAMPLE/psdrive.c b/EXAMPLE/psdrive.c index 14ac5b95..666e69f9 100644 --- a/EXAMPLE/psdrive.c +++ b/EXAMPLE/psdrive.c @@ -175,6 +175,8 @@ int main(int argc, char *argv[]) if (ir != -1) options.IterRefine = ir; if (symbfact != -1) options.ParSymbFact = symbfact; + int superlu_acc_offload = sp_ienv_dist(10, &options); //get_acc_offload(); + /* In the batch mode: create multiple SuperLU grids, each grid solving one linear system. */ if ( batch ) { @@ -195,7 +197,6 @@ int main(int argc, char *argv[]) SUPERLU_FREE(usermap); #ifdef GPU_ACC - int superlu_acc_offload = get_acc_offload(); if (superlu_acc_offload) { /* Binding each MPI to a GPU device */ char *ttemp; @@ -229,7 +230,6 @@ int main(int argc, char *argv[]) superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); #ifdef GPU_ACC - int superlu_acc_offload = get_acc_offload(); if (superlu_acc_offload) { MPI_Comm_rank(MPI_COMM_WORLD, &myrank); double t1 = SuperLU_timer_(); diff --git a/EXAMPLE/psdrive3d.c b/EXAMPLE/psdrive3d.c index 39cf409f..43b73fc0 100644 --- a/EXAMPLE/psdrive3d.c +++ b/EXAMPLE/psdrive3d.c @@ -154,6 +154,17 @@ int main (int argc, char *argv[]) } } + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + iam = grid.iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* Parse command line argv[]. */ for (cpp = argv + 1; *cpp; ++cpp) { @@ -205,12 +216,53 @@ int main (int argc, char *argv[]) } } - /* ------------------------------------------------------------ - INITIALIZE THE SUPERLU PROCESS GRID. 
- ------------------------------------------------------------ */ - superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.IterRefine = SLU_DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); + options.Algo3d = YES; + options.IterRefine = NOREFINE; + // options.ParSymbFact = YES; + // options.ColPerm = PARMETIS; +#if 0 + options.DiagInv = YES; // only if SLU_HAVE_LAPACK + options.ReplaceTinyPivot = YES; + options.RowPerm = NOROWPERM; + options.ColPerm = NATURAL; + options.ReplaceTinyPivot = YES; +#endif + + if ( batchCount > 0 ) + options.batchCount = batchCount; + + if (equil != -1) options.Equil = equil; + if (rowperm != -1) options.RowPerm = rowperm; + if (colperm != -1) options.ColPerm = colperm; + if (ir != -1) options.IterRefine = ir; + if (lookahead != -1) options.num_lookaheads = lookahead; + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + #ifdef GPU_ACC - int superlu_acc_offload = get_acc_offload(); + int superlu_acc_offload = sp_ienv_dist(10, &options); //get_acc_offload(); if (superlu_acc_offload) { MPI_Comm_rank(MPI_COMM_WORLD, &myrank); double t1 = SuperLU_timer_(); @@ -247,7 +299,6 @@ int main (int argc, char *argv[]) } /* Bail out if I do not belong in the grid. 
*/ - iam = grid.iam; if (iam == -1) goto out; if (!iam) { int v_major, v_minor, v_bugfix; @@ -265,55 +316,6 @@ int main (int argc, char *argv[]) fflush(stdout); } -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC (iam, "Enter main()"); -#endif - - /* Set the default input options: - options.Fact = DOFACT; - options.Equil = YES; - options.ParSymbFact = NO; - options.ColPerm = METIS_AT_PLUS_A; - options.RowPerm = LargeDiag_MC64; - options.ReplaceTinyPivot = NO; - options.IterRefine = SLU_DOUBLE; - options.Trans = NOTRANS; - options.SolveInitialized = NO; - options.RefineInitialized = NO; - options.PrintStat = YES; - options->num_lookaheads = 10; - options->lookahead_etree = NO; - options->SymPattern = NO; - options.DiagInv = NO; - */ - set_default_options_dist (&options); - options.Algo3d = YES; - options.IterRefine = NOREFINE; - // options.ParSymbFact = YES; - // options.ColPerm = PARMETIS; -#if 0 - options.DiagInv = YES; // only if SLU_HAVE_LAPACK - options.ReplaceTinyPivot = YES; - options.RowPerm = NOROWPERM; - options.ColPerm = NATURAL; - options.ReplaceTinyPivot = YES; -#endif - - if ( batchCount > 0 ) - options.batchCount = batchCount; - - if (equil != -1) options.Equil = equil; - if (rowperm != -1) options.RowPerm = rowperm; - if (colperm != -1) options.ColPerm = colperm; - if (ir != -1) options.IterRefine = ir; - if (lookahead != -1) options.num_lookaheads = lookahead; - - if (!iam) { - print_sp_ienv_dist(&options); - print_options_dist(&options); - fflush(stdout); - } - /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------ */ diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index 357aec50..d6e2a7a2 100755 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -1,15 +1,15 @@ /*! 
\file Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) -All rights reserved. +All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ -/*! @file +/*! @file * \brief Driver program for PZGSSVX example * *
@@ -34,7 +34,7 @@ at the top-level directory.
  *
  * This example illustrates how to use PZGSSVX with the full
  * (default) options to solve a linear system.
- * 
+ *
  * Five basic steps are required:
  *   1. Initialize the MPI environment and the SuperLU process grid
  *   2. Set up the input matrix and the right-hand side
@@ -86,20 +86,20 @@ int main(int argc, char *argv[])
     batch = 0;
 
     /* ------------------------------------------------------------
-       INITIALIZE MPI ENVIRONMENT. 
+       INITIALIZE MPI ENVIRONMENT.
        ------------------------------------------------------------*/
     //MPI_Init( &argc, &argv );
-    MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-	
+    MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
+
 
 #if ( VAMPIR>=1 )
-    VT_traceoff(); 
+    VT_traceoff();
 #endif
 
 #if ( VTUNE>=1 )
 	__itt_pause();
 #endif
-	
+
     /* Set the default input options:
         options.Fact              = DOFACT;
         options.Equil             = YES;
@@ -119,7 +119,7 @@ int main(int argc, char *argv[])
     options.RowPerm = LargeDiag_HWPM;
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
-    options.Equil = NO; 
+    options.Equil = NO;
     options.ReplaceTinyPivot = YES;
 #endif
 
@@ -173,27 +173,28 @@ int main(int argc, char *argv[])
     if (ir != -1) options.IterRefine = ir;
     if (symbfact != -1) options.ParSymbFact = symbfact;
 
+    int superlu_acc_offload = sp_ienv_dist(10, &options); //get_acc_offload();
+    
     /* In the batch mode: create multiple SuperLU grids,
         each grid solving one linear system. */
     if ( batch ) {
 	/* ------------------------------------------------------------
-	   INITIALIZE MULTIPLE SUPERLU PROCESS GRIDS. 
+	   INITIALIZE MULTIPLE SUPERLU PROCESS GRIDS.
 	   ------------------------------------------------------------*/
         MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
         usermap = SUPERLU_MALLOC(nprow*npcol * sizeof(int));
         ldumap = nprow;
-	
+
         /* Assuming each grid uses the same number of nprow and npcol */
 	int color = myrank/(nprow*npcol);
 	MPI_Comm_split(MPI_COMM_WORLD, color, myrank, &SubComm);
-        p = 0;    
+        p = 0;
         for (int i = 0; i < nprow; ++i)
     	    for (int j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++;
         superlu_gridmap(SubComm, nprow, npcol, usermap, ldumap, &grid);
         SUPERLU_FREE(usermap);
 
 #ifdef GPU_ACC
-        int superlu_acc_offload = get_acc_offload();
         if (superlu_acc_offload) {
             /* Binding each MPI to a GPU device */
             char *ttemp;
@@ -206,12 +207,12 @@ int main(int argc, char *argv[])
 	        gpuSetDevice(rank % devs); // Set device to be used for GPU executions
             }
 
-            // This is to initialize GPU, which can be costly. 
-            double t1 = SuperLU_timer_();                       
+            // This is to initialize GPU, which can be costly.
+            double t1 = SuperLU_timer_();
             gpuFree(0);
-            double t2 = SuperLU_timer_();    
+            double t2 = SuperLU_timer_();
             if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
-            gpublasHandle_t hb;           
+            gpublasHandle_t hb;
             gpublasCreate(&hb);
             if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
             gpublasDestroy(hb);
@@ -225,23 +226,22 @@ int main(int argc, char *argv[])
            INITIALIZE THE SUPERLU PROCESS GRID.
            ------------------------------------------------------------ */
         superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
-	
+
 #ifdef GPU_ACC
-        int superlu_acc_offload = get_acc_offload();
         if (superlu_acc_offload) {
             MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-            double t1 = SuperLU_timer_();                       
+            double t1 = SuperLU_timer_();
             gpuFree(0);
-            double t2 = SuperLU_timer_();    
+            double t2 = SuperLU_timer_();
             if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
-            gpublasHandle_t hb;           
+            gpublasHandle_t hb;
             gpublasCreate(&hb);
             if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
             gpublasDestroy(hb);
 	}
 #endif
     }
-    
+
     if(grid.iam==0){
 	MPI_Query_thread(&omp_mpi_level);
         switch (omp_mpi_level) {
@@ -263,7 +263,7 @@ int main(int argc, char *argv[])
 	        break;
         }
     }
-	
+
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
     if ( (iam >= nprow * npcol) || (iam == -1) ) goto out;
@@ -287,7 +287,7 @@ int main(int argc, char *argv[])
 	print_options_dist(&options);
 	fflush(stdout);
     }
-    
+
 #if ( VAMPIR>=1 )
     VT_traceoff();
 #endif
@@ -302,9 +302,9 @@ int main(int argc, char *argv[])
 	}
     }
     // printf("%s\n", postfix);
-	
+
     /* ------------------------------------------------------------
-       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
        ------------------------------------------------------------*/
     zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid);
 
@@ -362,10 +362,10 @@ int main(int argc, char *argv[])
        ------------------------------------------------------------*/
 out:
     if ( batch ) {
-        result_min[0] = stat.utime[FACT];   
-        result_min[1] = stat.utime[SOLVE];  
-        result_max[0] = stat.utime[FACT];   
-        result_max[1] = stat.utime[SOLVE];    
+        result_min[0] = stat.utime[FACT];
+        result_min[1] = stat.utime[SOLVE];
+        result_max[0] = stat.utime[FACT];
+        result_max[1] = stat.utime[SOLVE];
         MPI_Allreduce(MPI_IN_PLACE, result_min, 2, MPI_FLOAT,MPI_MIN, MPI_COMM_WORLD);
         MPI_Allreduce(MPI_IN_PLACE, result_max, 2, MPI_FLOAT,MPI_MAX, MPI_COMM_WORLD);
         if (!myrank) {
@@ -376,10 +376,10 @@ int main(int argc, char *argv[])
             fflush(stdout);
         }
     }
-    
+
     superlu_gridexit(&grid);
     if ( iam != -1 ) PStatFree(&stat);
-    
+
     /* ------------------------------------------------------------
        TERMINATES THE MPI EXECUTION ENVIRONMENT.
        ------------------------------------------------------------*/
diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c
index 95507ec6..f6dc9eb5 100755
--- a/EXAMPLE/pzdrive3d.c
+++ b/EXAMPLE/pzdrive3d.c
@@ -154,6 +154,17 @@ int main (int argc, char *argv[])
         }
     }
 
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+    superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
+    iam = grid.iam;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Enter main()");
+#endif
+
+
     /* Parse command line argv[]. */
     for (cpp = argv + 1; *cpp; ++cpp)
     {
@@ -205,12 +216,53 @@ int main (int argc, char *argv[])
         }
     }
 
-    /* ------------------------------------------------------------
-       INITIALIZE THE SUPERLU PROCESS GRID.
-       ------------------------------------------------------------ */
-    superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
+    /* Set the default input options:
+       options.Fact              = DOFACT;
+       options.Equil             = YES;
+       options.ParSymbFact       = NO;
+       options.ColPerm           = METIS_AT_PLUS_A;
+       options.RowPerm           = LargeDiag_MC64;
+       options.ReplaceTinyPivot  = NO;
+       options.IterRefine        = SLU_DOUBLE;
+       options.Trans             = NOTRANS;
+       options.SolveInitialized  = NO;
+       options.RefineInitialized = NO;
+       options.PrintStat         = YES;
+       options->num_lookaheads    = 10;
+       options->lookahead_etree   = NO;
+       options->SymPattern        = NO;
+       options.DiagInv           = NO;
+     */
+    set_default_options_dist (&options);
+    options.Algo3d = YES;
+    options.IterRefine = NOREFINE;
+    // options.ParSymbFact       = YES;
+    // options.ColPerm           = PARMETIS;
+#if 0
+    options.DiagInv           = YES; // only if SLU_HAVE_LAPACK
+    options.ReplaceTinyPivot = YES;
+    options.RowPerm = NOROWPERM;
+    options.ColPerm = NATURAL;
+    options.ReplaceTinyPivot = YES;
+#endif
+
+    if ( batchCount > 0 )
+        options.batchCount = batchCount;
+
+    if (equil != -1) options.Equil = equil;
+    if (rowperm != -1) options.RowPerm = rowperm;
+    if (colperm != -1) options.ColPerm = colperm;
+    if (ir != -1) options.IterRefine = ir;
+    if (lookahead != -1) options.num_lookaheads = lookahead;
+
+    if (!iam) {
+	print_sp_ienv_dist(&options);
+	print_options_dist(&options);
+	fflush(stdout);
+    }
+    
 #ifdef GPU_ACC
-    int superlu_acc_offload = get_acc_offload();
+    int superlu_acc_offload = sp_ienv_dist(10, &options); //get_acc_offload();
     if (superlu_acc_offload) {
         MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
         double t1 = SuperLU_timer_();
@@ -247,7 +299,6 @@ int main (int argc, char *argv[])
     }
 	
     /* Bail out if I do not belong in the grid. */
-    iam = grid.iam;
     if (iam == -1)     goto out;
     if (!iam) {
 	int v_major, v_minor, v_bugfix;
@@ -265,62 +316,13 @@ int main (int argc, char *argv[])
 	fflush(stdout);
     }
 
-#if ( DEBUGlevel>=1 )
-    CHECK_MALLOC (iam, "Enter main()");
-#endif
-
-    /* Set the default input options:
-       options.Fact              = DOFACT;
-       options.Equil             = YES;
-       options.ParSymbFact       = NO;
-       options.ColPerm           = METIS_AT_PLUS_A;
-       options.RowPerm           = LargeDiag_MC64;
-       options.ReplaceTinyPivot  = NO;
-       options.IterRefine        = SLU_DOUBLE;
-       options.Trans             = NOTRANS;
-       options.SolveInitialized  = NO;
-       options.RefineInitialized = NO;
-       options.PrintStat         = YES;
-       options->num_lookaheads    = 10;
-       options->lookahead_etree   = NO;
-       options->SymPattern        = NO;
-       options.DiagInv           = NO;
-     */
-    set_default_options_dist (&options);
-    options.Algo3d = YES;
-    options.IterRefine = NOREFINE;
-    // options.ParSymbFact       = YES;
-    // options.ColPerm           = PARMETIS;
-#if 0
-    options.DiagInv           = YES; // only if SLU_HAVE_LAPACK
-    options.ReplaceTinyPivot = YES;
-    options.RowPerm = NOROWPERM;
-    options.ColPerm = NATURAL;
-    options.ReplaceTinyPivot = YES;
-#endif
-
-    if ( batchCount > 0 )
-        options.batchCount = batchCount;
-
-    if (equil != -1) options.Equil = equil;
-    if (rowperm != -1) options.RowPerm = rowperm;
-    if (colperm != -1) options.ColPerm = colperm;
-    if (ir != -1) options.IterRefine = ir;
-    if (lookahead != -1) options.num_lookaheads = lookahead;
-    
-    if (!iam) {
-	print_sp_ienv_dist(&options);
-	print_options_dist(&options);
-	fflush(stdout);
-    }
-
     /* ------------------------------------------------------------
        GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
        ------------------------------------------------------------ */
     for (ii = 0; ii
 #include 
 #include 
+#include 
+#include 
 
-#include "lupanels.hpp" // For gpuErrchk - maybe move that function to utils 
+// #include "lupanels.hpp" // For gpuErrchk - maybe move that function to utils
+#include "gpuCommon.hpp"
 #include "batch_factorize_marshall.h"
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp b/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp
index e65c2862..000b8a01 100644
--- a/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp
+++ b/SRC/CplusplusFactor/dsparseTreeFactorGPU_impl.hpp
@@ -3,6 +3,7 @@
 #include 
 #include "superlu_ddefs.h"
 #include "lupanels.hpp"
+#include "xlupanels.hpp"
 #ifdef HAVE_CUDA
 #include "lupanels_GPU.cuh"
 #include "batch_block_copy.h"
diff --git a/SRC/CplusplusFactor/l_panels_impl.hpp b/SRC/CplusplusFactor/l_panels_impl.hpp
index 51a49d3b..1a97dae4 100644
--- a/SRC/CplusplusFactor/l_panels_impl.hpp
+++ b/SRC/CplusplusFactor/l_panels_impl.hpp
@@ -1,5 +1,5 @@
 #pragma once 
-#include "lupanels.hpp"
+// #include "lupanels.hpp"
 #include "xgstrf2.hpp"
 template 
 xlpanel_t::xlpanel_t(int_t k, int_t *lsub, Ftype* lval, int_t *xsup, int_t isDiagIncluded)
@@ -141,4 +141,4 @@ int xlpanel_t::getEndBlock(int iSt, int maxRows)
         iEnd = ii - 1;
 #endif 
     return iEnd; 
-}
\ No newline at end of file
+}
diff --git a/SRC/CplusplusFactor/lupanels.hpp b/SRC/CplusplusFactor/lupanels.hpp
index 9a4b4146..d44a7afb 100644
--- a/SRC/CplusplusFactor/lupanels.hpp
+++ b/SRC/CplusplusFactor/lupanels.hpp
@@ -369,7 +369,7 @@ struct LUstruct_v100
     int_t maxUvalCount = 0;
     int_t maxUidxCount = 0;
     std::vector diagFactBufs; /* stores diagonal blocks, 
-					   each one is a normal dense matrix.
+					   each one is a normal dense matrix.
 					Sherry: where are they free'd ?? */
     std::vector LvalRecvBufs;
     std::vector UvalRecvBufs;
@@ -474,7 +474,7 @@ struct LUstruct_v100
 	for (i = 0; i < numDiagBufs; i++) SUPERLU_FREE(diagFactBufs[i]);
 
 	/* Sherry added the following, which comes from batch setup */
-    superlu_acc_offload = get_acc_offload();    
+	superlu_acc_offload = sp_ienv_dist(10, options); //get_acc_offload();    
     if (superlu_acc_offload){
 	printf(".. free batch buffers\n"); fflush(stdout);
 	SUPERLU_FREE(A_gpu.dFBufs);
diff --git a/SRC/CplusplusFactor/lupanels_impl.hpp b/SRC/CplusplusFactor/lupanels_impl.hpp
index c905cd17..f0277996 100644
--- a/SRC/CplusplusFactor/lupanels_impl.hpp
+++ b/SRC/CplusplusFactor/lupanels_impl.hpp
@@ -5,9 +5,11 @@
 #include "superlu_defs.h"
 #include "luAuxStructTemplated.hpp"
 #ifdef HAVE_CUDA
-#include "lupanels_GPU.cuh"
+//#include "lupanels_GPU.cuh"
+#include "xlupanels_GPU.cuh"
 #endif
-#include "lupanels.hpp"
+// #include "lupanels.hpp"
+#include "xlupanels.hpp"
 #include "superlu_blas.hpp"
 
 template 
@@ -93,7 +95,7 @@ xLUstruct_t::xLUstruct_t(int_t nsupers_, int_t ldt_,
 {
     maxLvl = log2i(grid3d->zscp.Np) + 1;
     isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, trf3Dpartition->myNodeCount, trf3Dpartition->treePerm);
-    superlu_acc_offload = get_acc_offload();
+    superlu_acc_offload = sp_ienv_dist(10, options); // get_acc_offload();
 
 #if (DEBUGlevel >= 1)
     CHECK_MALLOC(grid3d_in->iam, "Enter xLUstruct_t constructor");
diff --git a/SRC/CplusplusFactor/pdgstrf3d_upacked_impl.hpp b/SRC/CplusplusFactor/pdgstrf3d_upacked_impl.hpp
index 8e0da126..1709bd38 100644
--- a/SRC/CplusplusFactor/pdgstrf3d_upacked_impl.hpp
+++ b/SRC/CplusplusFactor/pdgstrf3d_upacked_impl.hpp
@@ -10,12 +10,12 @@
 // #include "acc_aux.c"
 #endif
 
-#include "lupanels.hpp"
+// #include "lupanels.hpp"
 #include "superlu_upacked.h"
 #include "luAuxStructTemplated.hpp"
-#include "anc25d-GPU_impl.hpp"
+// #include "anc25d-GPU_impl.hpp"
 #include "dAncestorFactor_impl.hpp"
-#include "anc25d_impl.hpp"
+// #include "anc25d_impl.hpp"
 #include "dsparseTreeFactorGPU_impl.hpp"
 #include "dsparseTreeFactor_upacked_impl.hpp"
 #include "schurCompUpdate_impl.cuh"
@@ -101,7 +101,7 @@ int_t pdgstrf3d_upacked(superlu_dist_options_t *options, int m, int n, AnormType
          * ******************************************/
         // Create the new LU structure
         int *isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm);
-        int superlu_acc_offload = get_acc_offload();
+        int superlu_acc_offload = sp_ienv_dist(10, options); //get_acc_offload();
         double tConst = SuperLU_timer_();
         xLUstruct_t LU_packed(nsupers, ldt, trf3Dpartition, LUstruct, grid3d,
                                 SCT, options, stat, thresh, info);
diff --git a/SRC/CplusplusFactor/sparseTreeFactor_impl.hpp b/SRC/CplusplusFactor/sparseTreeFactor_impl.hpp
index 9d3d0db7..b1fc457c 100644
--- a/SRC/CplusplusFactor/sparseTreeFactor_impl.hpp
+++ b/SRC/CplusplusFactor/sparseTreeFactor_impl.hpp
@@ -1,6 +1,6 @@
 #pragma once 
 #include "superlu_ddefs.h"
-#include "lupanels.hpp"
+// #include "lupanels.hpp"
 
 template 
 int_t xLUstruct_t::dsparseTreeFactor(
diff --git a/SRC/CplusplusFactor/xlupanels.hpp b/SRC/CplusplusFactor/xlupanels.hpp
index 2eb8a9ab..65103345 100644
--- a/SRC/CplusplusFactor/xlupanels.hpp
+++ b/SRC/CplusplusFactor/xlupanels.hpp
@@ -4,7 +4,7 @@
 #include "superlu_ddefs.h"
 #include "lu_common.hpp"
 #ifdef HAVE_CUDA
-#include "lupanels_GPU.cuh"
+// #include "lupanels_GPU.cuh"
 #include "xlupanels_GPU.cuh"
 #include "gpuCommon.hpp"
 #endif
@@ -487,7 +487,7 @@ struct xLUstruct_t
             SUPERLU_FREE(diagFactBufs[i]);
 
         /* Sherry added the following, which comes from batch setup */
-        superlu_acc_offload = get_acc_offload();
+        superlu_acc_offload = sp_ienv_dist(10, options); //get_acc_offload();
         if (superlu_acc_offload)
         {
             printf(".. free batch buffers\n");
diff --git a/SRC/CplusplusFactor/xlupanels_GPU.cuh b/SRC/CplusplusFactor/xlupanels_GPU.cuh
index 88ae4dbc..b8cef51e 100644
--- a/SRC/CplusplusFactor/xlupanels_GPU.cuh
+++ b/SRC/CplusplusFactor/xlupanels_GPU.cuh
@@ -14,6 +14,7 @@
 
 #include "lu_common.hpp"
 // #include "lupanels.hpp" 
+#include "lupanels_GPU.cuh" 
 
 #ifdef __CUDACC__
 #define DEVICE_CALLABLE __device__
diff --git a/SRC/complex16/pzgstrf.c b/SRC/complex16/pzgstrf.c
index 96408305..3aafca13 100755
--- a/SRC/complex16/pzgstrf.c
+++ b/SRC/complex16/pzgstrf.c
@@ -800,7 +800,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #endif
 
 #ifdef GPU_ACC /*-- use GPU --*/
-    int superlu_acc_offload = get_acc_offload();
+    int superlu_acc_offload = sp_ienv_dist(10, options); //get_acc_offload();
 
     int gpublas_nb = get_gpublas_nb(); // default 64
     int nstreams = get_num_gpu_streams (); // default 8
diff --git a/SRC/complex16/pzgstrf3d.c b/SRC/complex16/pzgstrf3d.c
index 7496d1ed..80f66878 100755
--- a/SRC/complex16/pzgstrf3d.c
+++ b/SRC/complex16/pzgstrf3d.c
@@ -261,7 +261,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t mcb =    (nsupers + Pc - 1) / Pc;
     HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t));
 
-    zInit_HyP(HyP, Llu, mcb, mrb);
+    zInit_HyP(options, HyP, Llu, mcb, mrb);
     HyP->first_l_block_acc = first_l_block_acc;
     HyP->first_u_block_acc = first_u_block_acc;
 
diff --git a/SRC/complex16/ztrfAux.c b/SRC/complex16/ztrfAux.c
index 6b8354ee..ce683d6b 100755
--- a/SRC/complex16/ztrfAux.c
+++ b/SRC/complex16/ztrfAux.c
@@ -27,7 +27,7 @@ at the top-level directory.
 #endif
 
 /* Inititalize the data structure to assist HALO offload of Schur-complement. */
-void zInit_HyP(HyP_t* HyP, zLocalLU_t *Llu, int_t mcb, int_t mrb )
+void zInit_HyP(superlu_dist_options_t *options, HyP_t* HyP, zLocalLU_t *Llu, int_t mcb, int_t mrb )
 {
     HyP->last_offload = -1;
 #if 0
@@ -63,7 +63,7 @@ void zInit_HyP(HyP_t* HyP, zLocalLU_t *Llu, int_t mcb, int_t mrb )
     }
 
     HyP->last_offload = -1;
-    HyP->superlu_acc_offload = get_acc_offload ();
+    HyP->superlu_acc_offload = sp_ienv_dist(10, options); // get_acc_offload();
 
     HyP->nGPUStreams =0;
 } /* zInit_HyP */
diff --git a/SRC/double/dtrfAux.c b/SRC/double/dtrfAux.c
index f98a1566..ea5a4878 100755
--- a/SRC/double/dtrfAux.c
+++ b/SRC/double/dtrfAux.c
@@ -28,7 +28,7 @@ at the top-level directory.
 #endif
 
 /* Inititalize the data structure to assist HALO offload of Schur-complement. */
-void dInit_HyP(HyP_t* HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb )
+void dInit_HyP(superlu_dist_options_t *options, HyP_t* HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb )
 {
     HyP->last_offload = -1;
 #if 0
@@ -64,7 +64,7 @@ void dInit_HyP(HyP_t* HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb )
     }
 
     HyP->last_offload = -1;
-    HyP->superlu_acc_offload = get_acc_offload ();
+    HyP->superlu_acc_offload = sp_ienv_dist(10, options); // get_acc_offload();
 
     HyP->nGPUStreams =0;
 } /* dInit_HyP */
diff --git a/SRC/double/pdgstrf.c b/SRC/double/pdgstrf.c
index ac813663..ee42c0d6 100755
--- a/SRC/double/pdgstrf.c
+++ b/SRC/double/pdgstrf.c
@@ -800,7 +800,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #endif
 
 #ifdef GPU_ACC /*-- use GPU --*/
-    int superlu_acc_offload = get_acc_offload();
+    int superlu_acc_offload = sp_ienv_dist(10, options); //get_acc_offload();
 
     int gpublas_nb = get_gpublas_nb(); // default 64
     int nstreams = get_num_gpu_streams (); // default 8
diff --git a/SRC/double/pdgstrf3d.c b/SRC/double/pdgstrf3d.c
index 09bfd55f..73d5e0cb 100755
--- a/SRC/double/pdgstrf3d.c
+++ b/SRC/double/pdgstrf3d.c
@@ -262,7 +262,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t mcb =    (nsupers + Pc - 1) / Pc;
     HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t));
 
-    dInit_HyP(HyP, Llu, mcb, mrb);
+    dInit_HyP(options, HyP, Llu, mcb, mrb);
     HyP->first_l_block_acc = first_l_block_acc;
     HyP->first_u_block_acc = first_u_block_acc;
 
diff --git a/SRC/include/superlu_ddefs.h b/SRC/include/superlu_ddefs.h
index 0016578e..c1d9278b 100755
--- a/SRC/include/superlu_ddefs.h
+++ b/SRC/include/superlu_ddefs.h
@@ -771,6 +771,9 @@ extern int_t dleafForestForwardSolve3d(superlu_dist_options_t *options, int_t tr
                                dSOLVEstruct_t * SOLVEstruct, SuperLUStat_t * stat, xtrsTimer_t *xtrsTimer);
 
 
+extern int_t dtrs_compute_communication_structure(superlu_dist_options_t *options, int_t n, dLUstruct_t * LUstruct,
+                           dScalePermstruct_t * ScalePermstruct,
+                           int* supernodeMask, gridinfo_t *grid, SuperLUStat_t * stat);
 extern int_t dreduceSolvedX_newsolve(int_t treeId, int_t sender, int_t receiver, double* x, int nrhs,
                       dtrf3Dpartition_t*  trf3Dpartition, dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, double* recvbuf, xtrsTimer_t *xtrsTimer);
 
@@ -986,7 +989,6 @@ extern void dperform_row_permutation(superlu_dist_options_t *, fact_t Fact,
            dScalePermstruct_t *, dLUstruct_t *LUstruct, int_t m, int_t n,
 	       gridinfo_t *, SuperMatrix *A, SuperMatrix *GA, SuperLUStat_t *,
 	       int job, int Equil, int_t *rowequ, int *colequ, int *iinfo);
-           
 extern double dcomputeA_Norm(int notran, SuperMatrix *, gridinfo_t *);
 extern int dtrs_compute_communication_structure(superlu_dist_options_t *options,
        int_t n, dLUstruct_t *, dScalePermstruct_t * ScalePermstruct,
@@ -1121,7 +1123,7 @@ extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *,
 extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm,
 		       dtrf3Dpartition_t*, SCT_t *, dLUstruct_t *,
 		       gridinfo3d_t *, SuperLUStat_t *, int *);
-extern void dInit_HyP(HyP_t* HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb );
+extern void dInit_HyP(superlu_dist_options_t *, HyP_t* HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb);
 extern void Free_HyP(HyP_t* HyP);
 extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
 
@@ -1546,11 +1548,11 @@ extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_
 
 extern int pdflatten_LDATA(superlu_dist_options_t *options, int_t n, dLUstruct_t * LUstruct,
                            gridinfo_t *grid, SuperLUStat_t * stat);
-extern void pdconvert_flatten_skyline2UROWDATA(superlu_dist_options_t *options, gridinfo_t *grid,
-	   dLUstruct_t *LUstruct, SuperLUStat_t *stat, int n);
-extern void pdconvertUROWDATA2skyline(superlu_dist_options_t *options, gridinfo_t *grid,
-	   dLUstruct_t *LUstruct, SuperLUStat_t *stat, int n);
-       
+extern void pdconvert_flatten_skyline2UROWDATA(superlu_dist_options_t *, gridinfo_t *,
+	                 dLUstruct_t *, SuperLUStat_t *, int n);
+extern void pdconvertUROWDATA2skyline(superlu_dist_options_t *, gridinfo_t *,
+       	    		dLUstruct_t *, SuperLUStat_t *, int n);
+
 extern int_t
 dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
                 Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno,
diff --git a/SRC/include/superlu_defs.h b/SRC/include/superlu_defs.h
index 4dce4f4d..e8fea9c0 100755
--- a/SRC/include/superlu_defs.h
+++ b/SRC/include/superlu_defs.h
@@ -1228,7 +1228,7 @@ extern int_t get_min (int_t *, int_t);
 extern int compare_pair (const void *, const void *);
 extern int_t static_partition (struct superlu_pair *, int_t, int_t *, int_t,
 			       int_t *, int_t *, int);
-extern int get_acc_offload(void);
+extern int get_acc_offload(superlu_dist_options_t *);
 extern int get_acc_solve(void);
 extern int get_new3dsolve(void);
 extern int get_new3dsolvetreecomm(void);
diff --git a/SRC/include/superlu_dist_config.h b/SRC/include/superlu_dist_config.h
index e27651a0..62f057db 100644
--- a/SRC/include/superlu_dist_config.h
+++ b/SRC/include/superlu_dist_config.h
@@ -1,7 +1,7 @@
 /* superlu_dist_config.h.in */
 
 /* Enable CUDA */
-/* #undef HAVE_CUDA */
+#define HAVE_CUDA TRUE
 
 /* Enable NVSHMEM */
 /* #undef HAVE_NVSHMEM */
@@ -22,7 +22,7 @@
 /* #undef HAVE_COMBBLAS */
 
 /* Enable MAGMA */
-/* #undef HAVE_MAGMA */
+#define HAVE_MAGMA TRUE
 
 /* enable 64bit index mode */
 /* #undef XSDK_INDEX_SIZE */
diff --git a/SRC/include/superlu_sdefs.h b/SRC/include/superlu_sdefs.h
index 903fbd88..18e999dd 100755
--- a/SRC/include/superlu_sdefs.h
+++ b/SRC/include/superlu_sdefs.h
@@ -771,6 +771,9 @@ extern int_t sleafForestForwardSolve3d(superlu_dist_options_t *options, int_t tr
                                sSOLVEstruct_t * SOLVEstruct, SuperLUStat_t * stat, xtrsTimer_t *xtrsTimer);
 
 
+extern int_t strs_compute_communication_structure(superlu_dist_options_t *options, int_t n, sLUstruct_t * LUstruct,
+                           sScalePermstruct_t * ScalePermstruct,
+                           int* supernodeMask, gridinfo_t *grid, SuperLUStat_t * stat);
 extern int_t sreduceSolvedX_newsolve(int_t treeId, int_t sender, int_t receiver, float* x, int nrhs,
                       strf3Dpartition_t*  trf3Dpartition, sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, float* recvbuf, xtrsTimer_t *xtrsTimer);
 
@@ -1120,7 +1123,7 @@ extern void psgssvx3d (superlu_dist_options_t *, SuperMatrix *,
 extern int_t psgstrf3d(superlu_dist_options_t *, int m, int n, float anorm,
 		       strf3Dpartition_t*, SCT_t *, sLUstruct_t *,
 		       gridinfo3d_t *, SuperLUStat_t *, int *);
-extern void sInit_HyP(HyP_t* HyP, sLocalLU_t *Llu, int_t mcb, int_t mrb );
+extern void sInit_HyP(superlu_dist_options_t *, HyP_t* HyP, sLocalLU_t *Llu, int_t mcb, int_t mrb);
 extern void Free_HyP(HyP_t* HyP);
 extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
 
@@ -1545,6 +1548,10 @@ extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_
 
 extern int psflatten_LDATA(superlu_dist_options_t *options, int_t n, sLUstruct_t * LUstruct,
                            gridinfo_t *grid, SuperLUStat_t * stat);
+extern void psconvert_flatten_skyline2UROWDATA(superlu_dist_options_t *, gridinfo_t *,
+	                 sLUstruct_t *, SuperLUStat_t *, int n);
+extern void psconvertUROWDATA2skyline(superlu_dist_options_t *, gridinfo_t *,
+       	    		sLUstruct_t *, SuperLUStat_t *, int n);
 
 extern int_t
 sReDistribute_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct,
diff --git a/SRC/include/superlu_zdefs.h b/SRC/include/superlu_zdefs.h
index 708063f1..61f7e1c3 100755
--- a/SRC/include/superlu_zdefs.h
+++ b/SRC/include/superlu_zdefs.h
@@ -771,6 +771,9 @@ extern int_t zleafForestForwardSolve3d(superlu_dist_options_t *options, int_t tr
                                zSOLVEstruct_t * SOLVEstruct, SuperLUStat_t * stat, xtrsTimer_t *xtrsTimer);
 
 
+extern int_t ztrs_compute_communication_structure(superlu_dist_options_t *options, int_t n, zLUstruct_t * LUstruct,
+                           zScalePermstruct_t * ScalePermstruct,
+                           int* supernodeMask, gridinfo_t *grid, SuperLUStat_t * stat);
 extern int_t zreduceSolvedX_newsolve(int_t treeId, int_t sender, int_t receiver, doublecomplex* x, int nrhs,
                       ztrf3Dpartition_t*  trf3Dpartition, zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, doublecomplex* recvbuf, xtrsTimer_t *xtrsTimer);
 
@@ -1122,7 +1125,7 @@ extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *,
 extern int_t pzgstrf3d(superlu_dist_options_t *, int m, int n, double anorm,
 		       ztrf3Dpartition_t*, SCT_t *, zLUstruct_t *,
 		       gridinfo3d_t *, SuperLUStat_t *, int *);
-extern void zInit_HyP(HyP_t* HyP, zLocalLU_t *Llu, int_t mcb, int_t mrb );
+extern void zInit_HyP(superlu_dist_options_t *, HyP_t* HyP, zLocalLU_t *Llu, int_t mcb, int_t mrb);
 extern void Free_HyP(HyP_t* HyP);
 extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
 
@@ -1547,6 +1550,10 @@ extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_
 
 extern int pzflatten_LDATA(superlu_dist_options_t *options, int_t n, zLUstruct_t * LUstruct,
                            gridinfo_t *grid, SuperLUStat_t * stat);
+extern void pzconvert_flatten_skyline2UROWDATA(superlu_dist_options_t *, gridinfo_t *,
+	                 zLUstruct_t *, SuperLUStat_t *, int n);
+extern void pzconvertUROWDATA2skyline(superlu_dist_options_t *, gridinfo_t *,
+       	    		zLUstruct_t *, SuperLUStat_t *, int n);
 
 extern int_t
 zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
diff --git a/SRC/prec-independent/sec_structs.c b/SRC/prec-independent/sec_structs.c
index 3565cd89..f0120201 100755
--- a/SRC/prec-independent/sec_structs.c
+++ b/SRC/prec-independent/sec_structs.c
@@ -581,15 +581,10 @@ void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT)
 }
 
 int
-get_acc_offload ()
+get_acc_offload (superlu_dist_options_t *options)
 {
-    char *ttemp;
-    ttemp = getenv ("SUPERLU_ACC_OFFLOAD");
 #ifdef GPU_ACC
-    if (ttemp)
-        return atoi (ttemp);
-    else
-        return 1;  // default
+    return sp_ienv_dist(10, options); /* result must be returned; a bare call leaves the return value indeterminate */
 #else
     return 0;  
 #endif        
diff --git a/SRC/prec-independent/util.c b/SRC/prec-independent/util.c
index 00c866f7..1a8978ee 100755
--- a/SRC/prec-independent/util.c
+++ b/SRC/prec-independent/util.c
@@ -249,7 +249,7 @@ void print_options_dist(superlu_dist_options_t *options)
 {
     if (options->PrintStat == NO)
         return;
-
+    
     printf("**************************************************\n");
     printf(".. options:\n");
     printf("**    Fact                      : %4d\n", options->Fact);
@@ -284,12 +284,21 @@ void print_sp_ienv_dist(superlu_dist_options_t *options)
     if (options->PrintStat == NO)
         return;
 
+    int num_threads = omp_get_max_threads (); /* omp_get_num_threads() is always 1 outside a parallel region */
+    int gpu_enabled = 0;
+#ifdef GPU_ACC
+    gpu_enabled = sp_ienv_dist(10, options);
+#endif
+    
     printf("**************************************************\n");
     printf(".. blocking parameters from sp_ienv():\n");
     printf("**    relaxation                 : %d\n", sp_ienv_dist(2, options));
     printf("**    max supernode              : %d\n", sp_ienv_dist(3, options));
     printf("**    estimated fill ratio       : %d\n", sp_ienv_dist(6, options));
     printf("**    min GEMM m*k*n to use GPU  : %d\n", sp_ienv_dist(7, options));
+    printf("** parallel environment:\n");
+    printf("**    OpenMP threads            : %4d\n", num_threads);
+    printf("**    GPU enable?               : %4d\n", gpu_enabled);
     printf("**************************************************\n");
 }
 
@@ -1389,7 +1398,7 @@ gemm_division_cpu_gpu(
 {
     int Ngem = sp_ienv_dist(7, options);  /*get_mnk_dgemm ();*/
     int min_gpu_col = get_gpublas_nb (); /* default 64 */
-    int superlu_acc_offload = get_acc_offload();
+    int superlu_acc_offload = sp_ienv_dist(10, options); //get_acc_offload();
     int ncols = full_u_cols[num_blks - 1];
     // int ncolsExcludingFirst =full_u_cols[num_blks - 1]
 
diff --git a/SRC/single/psgstrf.c b/SRC/single/psgstrf.c
index 10d1f430..1d47ca8b 100755
--- a/SRC/single/psgstrf.c
+++ b/SRC/single/psgstrf.c
@@ -800,7 +800,7 @@ psgstrf(superlu_dist_options_t * options, int m, int n, float anorm,
 #endif
 
 #ifdef GPU_ACC /*-- use GPU --*/
-    int superlu_acc_offload = get_acc_offload();
+    int superlu_acc_offload = sp_ienv_dist(10, options); //get_acc_offload();
 
     int gpublas_nb = get_gpublas_nb(); // default 64
     int nstreams = get_num_gpu_streams (); // default 8
diff --git a/SRC/single/psgstrf3d.c b/SRC/single/psgstrf3d.c
index 6236c11c..4393852a 100755
--- a/SRC/single/psgstrf3d.c
+++ b/SRC/single/psgstrf3d.c
@@ -262,7 +262,7 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
     int_t mcb =    (nsupers + Pc - 1) / Pc;
     HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t));
 
-    sInit_HyP(HyP, Llu, mcb, mrb);
+    sInit_HyP(options, HyP, Llu, mcb, mrb);
     HyP->first_l_block_acc = first_l_block_acc;
     HyP->first_u_block_acc = first_u_block_acc;
 
diff --git a/SRC/single/strfAux.c b/SRC/single/strfAux.c
index b91495c5..6051ac20 100755
--- a/SRC/single/strfAux.c
+++ b/SRC/single/strfAux.c
@@ -28,7 +28,7 @@ at the top-level directory.
 #endif
 
 /* Inititalize the data structure to assist HALO offload of Schur-complement. */
-void sInit_HyP(HyP_t* HyP, sLocalLU_t *Llu, int_t mcb, int_t mrb )
+void sInit_HyP(superlu_dist_options_t *options, HyP_t* HyP, sLocalLU_t *Llu, int_t mcb, int_t mrb )
 {
     HyP->last_offload = -1;
 #if 0
@@ -64,7 +64,7 @@ void sInit_HyP(HyP_t* HyP, sLocalLU_t *Llu, int_t mcb, int_t mrb )
     }
 
     HyP->last_offload = -1;
-    HyP->superlu_acc_offload = get_acc_offload ();
+    HyP->superlu_acc_offload = sp_ienv_dist(10, options); // get_acc_offload();
 
     HyP->nGPUStreams =0;
 } /* sInit_HyP */