#include "superlu_ddefs.h"
+
/*! \brief
*
*
@@ -89,9 +90,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
//MPI_Init( &argc, &argv );
- //MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
+// MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
+#ifdef GPU_SOLVE
nv_init_wrapper(&argc,argv,&omp_mpi_level);
-
+#else
+ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
+#endif
#if ( VAMPIR>=1 )
VT_traceoff();
@@ -101,7 +105,33 @@ int main(int argc, char *argv[])
__itt_pause();
#endif
- /* Parse command line argv[]. */
+ /* Set the default input options:
+ options.Fact = DOFACT;
+ options.Equil = YES;
+ options.ParSymbFact = NO;
+ options.ColPerm = METIS_AT_PLUS_A;
+ options.RowPerm = LargeDiag_MC64;
+ options.ReplaceTinyPivot = NO;
+ options.IterRefine = SLU_DOUBLE;
+ options.Trans = NOTRANS;
+ options.SolveInitialized = NO;
+ options.RefineInitialized = NO;
+ options.PrintStat = YES;
+ options.DiagInv = NO;
+ */
+ set_default_options_dist(&options);
+ options.IterRefine = NOREFINE;
+ options.DiagInv = YES;
+ options.ReplaceTinyPivot = YES;
+#if 0
+ options.RowPerm = LargeDiag_HWPM;
+ options.IterRefine = NOREFINE;
+ options.ColPerm = NATURAL;
+ options.Equil = NO;
+ options.ReplaceTinyPivot = YES;
+#endif
+
+ /* Parse command line argv[], may modify default options */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
c = *(*cpp+1);
@@ -109,8 +139,14 @@ int main(int argc, char *argv[])
switch (c) {
case 'h':
printf("Options:\n");
- printf("\t-r : process rows (default %4d)\n", nprow);
- printf("\t-c : process columns (default %4d)\n", npcol);
+ printf("\t-r : process rows (default %4d)\n", nprow);
+ printf("\t-c : process columns (default %4d)\n", npcol);
+ printf("\t-p : row permutation (default %4d)\n", options.RowPerm);
+ printf("\t-q : col permutation (default %4d)\n", options.ColPerm);
+ printf("\t-s : parallel symbolic? (default %4d)\n", options.ParSymbFact);
+ printf("\t-l : lookahead level (default %4d)\n", options.num_lookaheads);
+ printf("\t-i : iter. refinement (default %4d)\n", options.IterRefine);
+ printf("\t-b : use batch mode? (default %4d)\n", batch);
exit(0);
break;
case 'r': nprow = atoi(*cpp);
@@ -138,15 +174,25 @@ int main(int argc, char *argv[])
}
}
- if ( batch ) { /* in the batch mode: create multiple SuperLU grids,
- each grid solving one linear system. */
+ /* Command line input to modify default options */
+ if (rowperm != -1) options.RowPerm = rowperm;
+ if (colperm != -1) options.ColPerm = colperm;
+ if (lookahead != -1) options.num_lookaheads = lookahead;
+ if (ir != -1) options.IterRefine = ir;
+ if (symbfact != -1) options.ParSymbFact = symbfact;
+
+ /* In the batch mode: create multiple SuperLU grids,
+ each grid solving one linear system. */
+ if ( batch ) {
/* ------------------------------------------------------------
INITIALIZE MULTIPLE SUPERLU PROCESS GRIDS.
------------------------------------------------------------*/
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
usermap = SUPERLU_MALLOC(nprow*npcol * sizeof(int));
ldumap = nprow;
- int color = myrank/(nprow*npcol); /* Assuming each grid uses the same number of nprow and npcol */
+
+ /* Assuming each grid uses the same number of nprow and npcol */
+ int color = myrank/(nprow*npcol);
MPI_Comm_split(MPI_COMM_WORLD, color, myrank, &SubComm);
p = 0;
for (int i = 0; i < nprow; ++i)
@@ -162,27 +208,27 @@ int main(int argc, char *argv[])
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");
if (ttemp) {
- int devs, rank;
- MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
+ int devs, rank;
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
}
- // This is to initialize GPU, which can be costly.
- double t1 = SuperLU_timer_();
+ // This is to initialize GPU, which can be costly.
+ double t1 = SuperLU_timer_();
gpuFree(0);
- double t2 = SuperLU_timer_();
+ double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
- gpublasHandle_t hb;
+ gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
- }
+ }
#endif
// printf("grid.iam %5d, myrank %5d\n",grid.iam,myrank);
// fflush(stdout);
- } else {
+ } else { /* not batch mode */
/* ------------------------------------------------------------
INITIALIZE THE SUPERLU PROCESS GRID.
------------------------------------------------------------ */
@@ -192,11 +238,11 @@ int main(int argc, char *argv[])
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
- double t1 = SuperLU_timer_();
+ double t1 = SuperLU_timer_();
gpuFree(0);
- double t2 = SuperLU_timer_();
+ double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
- gpublasHandle_t hb;
+ gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
@@ -244,6 +290,12 @@ int main(int argc, char *argv[])
fflush(stdout);
}
+ /* print solver options */
+ if (!iam) {
+ print_options_dist(&options);
+ fflush(stdout);
+ }
+
#if ( VAMPIR>=1 )
VT_traceoff();
#endif
@@ -271,39 +323,6 @@ int main(int argc, char *argv[])
NOW WE SOLVE THE LINEAR SYSTEM.
------------------------------------------------------------*/
- /* Set the default input options:
- options.Fact = DOFACT;
- options.Equil = YES;
- options.ParSymbFact = NO;
- options.ColPerm = METIS_AT_PLUS_A;
- options.RowPerm = LargeDiag_MC64;
- options.ReplaceTinyPivot = NO;
- options.IterRefine = SLU_DOUBLE;
- options.Trans = NOTRANS;
- options.SolveInitialized = NO;
- options.RefineInitialized = NO;
- options.PrintStat = YES;
- options.DiagInv = NO;
- */
- set_default_options_dist(&options);
-#if 0
- options.RowPerm = LargeDiag_HWPM;
- options.IterRefine = NOREFINE;
- options.ColPerm = NATURAL;
- options.Equil = NO;
- options.ReplaceTinyPivot = YES;
-#endif
-
- if (rowperm != -1) options.RowPerm = rowperm;
- if (colperm != -1) options.ColPerm = colperm;
- if (lookahead != -1) options.num_lookaheads = lookahead;
- if (ir != -1) options.IterRefine = ir;
- if (symbfact != -1) options.ParSymbFact = symbfact;
-
- if (!iam) {
- print_options_dist(&options);
- fflush(stdout);
- }
m = A.nrow;
n = A.ncol;
@@ -331,8 +350,7 @@ int main(int argc, char *argv[])
}
PStatPrint(&options, &stat, &grid); /* Print the statistics. */
- fflush(stdout);
- return 0;
+
/* ------------------------------------------------------------
DEALLOCATE STORAGE.
------------------------------------------------------------*/
@@ -347,7 +365,6 @@ int main(int argc, char *argv[])
SUPERLU_FREE(berr);
fclose(fp);
-
/* ------------------------------------------------------------
RELEASE THE SUPERLU PROCESS GRID.
------------------------------------------------------------*/
@@ -374,16 +391,10 @@ int main(int argc, char *argv[])
/* ------------------------------------------------------------
TERMINATES THE MPI EXECUTION ENVIRONMENT.
------------------------------------------------------------*/
- //nvshmem_free(ready_x);
- //nvshmem_free(ready_lsum);
- //nvshmem_free(flag_bc_q);
- //nvshmem_free(flag_rd_q);
- //nvshmem_finalize();
MPI_Finalize();
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit main()");
- fflush(stdout);
#endif
}
diff --git a/EXAMPLE/psdrive.c b/EXAMPLE/psdrive.c
index b7a5709d..99a00e31 100644
--- a/EXAMPLE/psdrive.c
+++ b/EXAMPLE/psdrive.c
@@ -101,7 +101,31 @@ int main(int argc, char *argv[])
__itt_pause();
#endif
- /* Parse command line argv[]. */
+ /* Set the default input options:
+ options.Fact = DOFACT;
+ options.Equil = YES;
+ options.ParSymbFact = NO;
+ options.ColPerm = METIS_AT_PLUS_A;
+ options.RowPerm = LargeDiag_MC64;
+ options.ReplaceTinyPivot = NO;
+ options.IterRefine = SLU_DOUBLE;
+ options.Trans = NOTRANS;
+ options.SolveInitialized = NO;
+ options.RefineInitialized = NO;
+ options.PrintStat = YES;
+ options.DiagInv = NO;
+ */
+ set_default_options_dist(&options);
+ options.IterRefine = SLU_SINGLE;
+#if 0
+ options.RowPerm = LargeDiag_HWPM;
+ options.IterRefine = NOREFINE;
+ options.ColPerm = NATURAL;
+ options.Equil = NO;
+ options.ReplaceTinyPivot = YES;
+#endif
+
+ /* Parse command line argv[], may modify default options */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
c = *(*cpp+1);
@@ -109,8 +133,14 @@ int main(int argc, char *argv[])
switch (c) {
case 'h':
printf("Options:\n");
- printf("\t-r : process rows (default %4d)\n", nprow);
- printf("\t-c : process columns (default %4d)\n", npcol);
+ printf("\t-r : process rows (default %4d)\n", nprow);
+ printf("\t-c : process columns (default %4d)\n", npcol);
+ printf("\t-p : row permutation (default %4d)\n", options.RowPerm);
+ printf("\t-q : col permutation (default %4d)\n", options.ColPerm);
+ printf("\t-s : parallel symbolic? (default %4d)\n", options.ParSymbFact);
+ printf("\t-l : lookahead level (default %4d)\n", options.num_lookaheads);
+ printf("\t-i : iter. refinement (default %4d)\n", options.IterRefine);
+ printf("\t-b : use batch mode? (default %4d)\n", batch);
exit(0);
break;
case 'r': nprow = atoi(*cpp);
@@ -138,15 +168,25 @@ int main(int argc, char *argv[])
}
}
- if ( batch ) { /* in the batch mode: create multiple SuperLU grids,
- each grid solving one linear system. */
+ /* Command line input to modify default options */
+ if (rowperm != -1) options.RowPerm = rowperm;
+ if (colperm != -1) options.ColPerm = colperm;
+ if (lookahead != -1) options.num_lookaheads = lookahead;
+ if (ir != -1) options.IterRefine = ir;
+ if (symbfact != -1) options.ParSymbFact = symbfact;
+
+ /* In the batch mode: create multiple SuperLU grids,
+ each grid solving one linear system. */
+ if ( batch ) {
/* ------------------------------------------------------------
INITIALIZE MULTIPLE SUPERLU PROCESS GRIDS.
------------------------------------------------------------*/
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
usermap = SUPERLU_MALLOC(nprow*npcol * sizeof(int));
ldumap = nprow;
- int color = myrank/(nprow*npcol); /* Assuming each grid uses the same number of nprow and npcol */
+
+ /* Assuming each grid uses the same number of nprow and npcol */
+ int color = myrank/(nprow*npcol);
MPI_Comm_split(MPI_COMM_WORLD, color, myrank, &SubComm);
p = 0;
for (int i = 0; i < nprow; ++i)
@@ -182,7 +222,7 @@ int main(int argc, char *argv[])
// printf("grid.iam %5d, myrank %5d\n",grid.iam,myrank);
// fflush(stdout);
- } else {
+ } else { /* not batch mode */
/* ------------------------------------------------------------
INITIALIZE THE SUPERLU PROCESS GRID.
------------------------------------------------------------ */
@@ -244,6 +284,12 @@ int main(int argc, char *argv[])
fflush(stdout);
}
+ /* print solver options */
+ if (!iam) {
+ print_options_dist(&options);
+ fflush(stdout);
+ }
+
#if ( VAMPIR>=1 )
VT_traceoff();
#endif
@@ -271,40 +317,6 @@ int main(int argc, char *argv[])
NOW WE SOLVE THE LINEAR SYSTEM.
------------------------------------------------------------*/
- /* Set the default input options:
- options.Fact = DOFACT;
- options.Equil = YES;
- options.ParSymbFact = NO;
- options.ColPerm = METIS_AT_PLUS_A;
- options.RowPerm = LargeDiag_MC64;
- options.ReplaceTinyPivot = NO;
- options.IterRefine = SLU_DOUBLE;
- options.Trans = NOTRANS;
- options.SolveInitialized = NO;
- options.RefineInitialized = NO;
- options.PrintStat = YES;
- options.DiagInv = NO;
- */
- set_default_options_dist(&options);
- options.IterRefine = SLU_SINGLE;
-#if 0
- options.RowPerm = LargeDiag_HWPM;
- options.IterRefine = NOREFINE;
- options.ColPerm = NATURAL;
- options.Equil = NO;
- options.ReplaceTinyPivot = YES;
-#endif
-
- if (rowperm != -1) options.RowPerm = rowperm;
- if (colperm != -1) options.ColPerm = colperm;
- if (lookahead != -1) options.num_lookaheads = lookahead;
- if (ir != -1) options.IterRefine = ir;
- if (symbfact != -1) options.ParSymbFact = symbfact;
-
- if (!iam) {
- print_options_dist(&options);
- fflush(stdout);
- }
m = A.nrow;
n = A.ncol;
diff --git a/EXAMPLE/psgsrfs_tracking.c b/EXAMPLE/psgsrfs_tracking.c
index c0a970b3..0945132d 100755
--- a/EXAMPLE/psgsrfs_tracking.c
+++ b/EXAMPLE/psgsrfs_tracking.c
@@ -127,15 +127,15 @@ psgsrfs_tracking(superlu_dist_options_t *options,
#define ITMAX 10
float *ax, *R, *dx, *temp, *work, *B_col, *X_col;
- int_t count, i, j, lwork, nz;
- int iam;
+ int_t lwork, nz;
+ int iam, i, j, count;
float eps, lstres;
float s, safmin, safe1, safe2;
/* Data structures used by matrix-vector multiply routine. */
psgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm;
NRformat_loc *Astore;
- int_t m_loc, fst_row;
+ int m_loc, fst_row;
/* Initialization. */
@@ -351,5 +351,5 @@ psgsrfs_tracking(superlu_dist_options_t *options,
CHECK_MALLOC(iam, "Exit psgsrfs()");
#endif
-} /* PSGSRFS */
+} /* psgsrfs_tracking */
diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c
index eaf9c9ff..1e97b9d8 100755
--- a/EXAMPLE/pzdrive.c
+++ b/EXAMPLE/pzdrive.c
@@ -100,7 +100,30 @@ int main(int argc, char *argv[])
__itt_pause();
#endif
- /* Parse command line argv[]. */
+ /* Set the default input options:
+ options.Fact = DOFACT;
+ options.Equil = YES;
+ options.ParSymbFact = NO;
+ options.ColPerm = METIS_AT_PLUS_A;
+ options.RowPerm = LargeDiag_MC64;
+ options.ReplaceTinyPivot = NO;
+ options.IterRefine = SLU_DOUBLE;
+ options.Trans = NOTRANS;
+ options.SolveInitialized = NO;
+ options.RefineInitialized = NO;
+ options.PrintStat = YES;
+ options.DiagInv = NO;
+ */
+ set_default_options_dist(&options);
+#if 0
+ options.RowPerm = LargeDiag_HWPM;
+ options.IterRefine = NOREFINE;
+ options.ColPerm = NATURAL;
+ options.Equil = NO;
+ options.ReplaceTinyPivot = YES;
+#endif
+
+ /* Parse command line argv[], may modify default options */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
c = *(*cpp+1);
@@ -108,8 +131,14 @@ int main(int argc, char *argv[])
switch (c) {
case 'h':
printf("Options:\n");
- printf("\t-r : process rows (default %4d)\n", nprow);
- printf("\t-c : process columns (default %4d)\n", npcol);
+ printf("\t-r : process rows (default %4d)\n", nprow);
+ printf("\t-c : process columns (default %4d)\n", npcol);
+ printf("\t-p : row permutation (default %4d)\n", options.RowPerm);
+ printf("\t-q : col permutation (default %4d)\n", options.ColPerm);
+ printf("\t-s : parallel symbolic? (default %4d)\n", options.ParSymbFact);
+ printf("\t-l : lookahead level (default %4d)\n", options.num_lookaheads);
+ printf("\t-i : iter. refinement (default %4d)\n", options.IterRefine);
+ printf("\t-b : use batch mode? (default %4d)\n", batch);
exit(0);
break;
case 'r': nprow = atoi(*cpp);
@@ -137,15 +166,25 @@ int main(int argc, char *argv[])
}
}
- if ( batch ) { /* in the batch mode: create multiple SuperLU grids,
- each grid solving one linear system. */
+ /* Command line input to modify default options */
+ if (rowperm != -1) options.RowPerm = rowperm;
+ if (colperm != -1) options.ColPerm = colperm;
+ if (lookahead != -1) options.num_lookaheads = lookahead;
+ if (ir != -1) options.IterRefine = ir;
+ if (symbfact != -1) options.ParSymbFact = symbfact;
+
+ /* In the batch mode: create multiple SuperLU grids,
+ each grid solving one linear system. */
+ if ( batch ) {
/* ------------------------------------------------------------
INITIALIZE MULTIPLE SUPERLU PROCESS GRIDS.
------------------------------------------------------------*/
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
usermap = SUPERLU_MALLOC(nprow*npcol * sizeof(int));
ldumap = nprow;
- int color = myrank/(nprow*npcol); /* Assuming each grid uses the same number of nprow and npcol */
+
+ /* Assuming each grid uses the same number of nprow and npcol */
+ int color = myrank/(nprow*npcol);
MPI_Comm_split(MPI_COMM_WORLD, color, myrank, &SubComm);
p = 0;
for (int i = 0; i < nprow; ++i)
@@ -181,7 +220,7 @@ int main(int argc, char *argv[])
// printf("grid.iam %5d, myrank %5d\n",grid.iam,myrank);
// fflush(stdout);
- } else {
+ } else { /* not batch mode */
/* ------------------------------------------------------------
INITIALIZE THE SUPERLU PROCESS GRID.
------------------------------------------------------------ */
@@ -243,6 +282,12 @@ int main(int argc, char *argv[])
fflush(stdout);
}
+ /* print solver options */
+ if (!iam) {
+ print_options_dist(&options);
+ fflush(stdout);
+ }
+
#if ( VAMPIR>=1 )
VT_traceoff();
#endif
@@ -270,39 +315,6 @@ int main(int argc, char *argv[])
NOW WE SOLVE THE LINEAR SYSTEM.
------------------------------------------------------------*/
- /* Set the default input options:
- options.Fact = DOFACT;
- options.Equil = YES;
- options.ParSymbFact = NO;
- options.ColPerm = METIS_AT_PLUS_A;
- options.RowPerm = LargeDiag_MC64;
- options.ReplaceTinyPivot = NO;
- options.IterRefine = SLU_DOUBLE;
- options.Trans = NOTRANS;
- options.SolveInitialized = NO;
- options.RefineInitialized = NO;
- options.PrintStat = YES;
- options.DiagInv = NO;
- */
- set_default_options_dist(&options);
-#if 0
- options.RowPerm = LargeDiag_HWPM;
- options.IterRefine = NOREFINE;
- options.ColPerm = NATURAL;
- options.Equil = NO;
- options.ReplaceTinyPivot = YES;
-#endif
-
- if (rowperm != -1) options.RowPerm = rowperm;
- if (colperm != -1) options.ColPerm = colperm;
- if (lookahead != -1) options.num_lookaheads = lookahead;
- if (ir != -1) options.IterRefine = ir;
- if (symbfact != -1) options.ParSymbFact = symbfact;
-
- if (!iam) {
- print_options_dist(&options);
- fflush(stdout);
- }
m = A.nrow;
n = A.ncol;
diff --git a/FORTRAN/f_5x5.F90.old b/FORTRAN/f_5x5.F90.old
new file mode 100644
index 00000000..fec77adc
--- /dev/null
+++ b/FORTRAN/f_5x5.F90.old
@@ -0,0 +1,226 @@
+
+! -- Distributed SuperLU routine (version 2.0) --
+! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+! July 20, 2004
+!
+!
+ program f_5x5
+!
+! Purpose
+! =======
+!
+! This example illustrates how to use F_PDGSSVX with the full
+! (default) options to solve a linear system.
+! The input matrix is a small 5x5 example that appears in the SuperLU Users' Guide,
+! Section 2.2:
+!
+!     [ s     u  u    ]     [ 19      21  21     ]
+!     [ l  u          ]     [ 12  21             ]
+!     [    l  p       ]  =  [     12  16         ]
+!     [          e  u ]     [              5  21 ]
+!     [ l  l        r ]     [ 12  12          18 ]
+!
+! It is set up to use 2 processes (MPI ranks are 0-based):
+! process 0 contains the first 2 rows
+! process 1 contains the last 3 rows
+!
+! Seven basic steps are required:
+! 1. Create C structures used in SuperLU_DIST
+! 2. Initialize the MPI environment and the SuperLU process grid
+! 3. Set up the input matrix and the right-hand side
+! 4. Set the options argument
+! 5. Call f_pdgssvx
+! 6. Release the process grid and terminate the MPI environment
+! 7. Release all structures
+!
+ use superlu_mod
+! implicit none
+ include 'mpif.h'
+ integer maxn, maxnz, maxnrhs
+ parameter ( maxn = 10, maxnz = 100, maxnrhs = 10 )
+ integer colind(maxnz), rowptr(maxn+1)
+ real*8 nzval(maxnz), b(maxn), berr(maxnrhs)
+ integer n, m, nnz, nrhs, ldb, nprow, npcol, init
+ integer*4 iam, info, i, ierr, ldb4
+ integer nnz_loc, m_loc, fst_row
+ real*8 s, u, p, e, r, l
+
+ integer(superlu_ptr) :: grid
+ integer(superlu_ptr) :: options
+ integer(superlu_ptr) :: ScalePermstruct
+ integer(superlu_ptr) :: LUstruct
+ integer(superlu_ptr) :: SOLVEstruct
+ integer(superlu_ptr) :: A
+ integer(superlu_ptr) :: stat
+
+! Initialize MPI environment
+ call mpi_init(ierr)
+
+! Check malloc
+! call f_check_malloc(iam)
+
+! Create Fortran handles for the C structures used in SuperLU_DIST
+ call f_create_gridinfo_handle(grid)
+ call f_create_options_handle(options)
+ call f_create_ScalePerm_handle(ScalePermstruct)
+ call f_create_LUstruct_handle(LUstruct)
+ call f_create_SOLVEstruct_handle(SOLVEstruct)
+ call f_create_SuperMatrix_handle(A)
+ call f_create_SuperLUStat_handle(stat)
+
+! Initialize the SuperLU_DIST process grid
+ nprow = 1
+ npcol = 2
+ call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid)
+
+! Bail out if I do not belong in the grid.
+ call get_GridInfo(grid, iam=iam)
+ if ( iam >= nprow * npcol ) then
+ go to 100
+ endif
+ if ( iam == 0 ) then
+ write(*,*) ' Process grid ', nprow, ' X ', npcol
+ write(*,*) ' default integer size ', kind(0)
+ endif
+!
+!*************************************************************************
+! Set up the input matrix A
+!*************************************************************************
+! The input matrix is a small 5x5 example that appears in the SuperLU Users' Guide:
+!
+!     [ s     u  u    ]     [ 19      21  21     ]
+!     [ l  u          ]     [ 12  21             ]
+!     [    l  p       ]  =  [     12  16         ]
+!     [          e  u ]     [              5  21 ]
+!     [ l  l        r ]     [ 12  12          18 ]
+!
+! It is set up to use 2 processes (MPI ranks are 0-based):
+! process 0 contains the first 2 rows
+! process 1 contains the last 3 rows
+!
+ m = 5
+ n = 5
+ nnz = 12
+ s = 19.0
+ u = 21.0
+ p = 16.0
+ e = 5.0
+ r = 18.0
+ l = 12.0
+!
+ if ( iam == 0 ) then
+! Processor 0 owns the first 2 rows of the matrix
+! NOTE: 0-based indexing must be used for the C routines.
+ nnz_loc = 5
+ m_loc = 2
+ fst_row = 0 ! 0-based indexing
+ nzval(1) = s
+ colind(1) = 0 ! 0-based indexing
+ nzval(2) = u
+ colind(2) = 2
+ nzval(3) = u
+ colind(3) = 3
+ nzval(4) = l
+ colind(4) = 0
+ nzval(5) = u
+ colind(5) = 1
+ rowptr(1) = 0 ! 0-based indexing
+ rowptr(2) = 3
+ rowptr(3) = 5
+ else
+! Processor 1 owns the last 3 rows of the matrix
+ nnz_loc = 7
+ m_loc = 3
+ fst_row = 2 ! 0-based indexing
+ nzval(1) = l
+ colind(1) = 1
+ nzval(2) = p
+ colind(2) = 2
+ nzval(3) = e
+ colind(3) = 3
+ nzval(4) = u
+ colind(4) = 4
+ nzval(5) = l
+ colind(5) = 0
+ nzval(6) = l
+ colind(6) = 1
+ nzval(7) = r
+ colind(7) = 4
+ rowptr(1) = 0 ! 0-based indexing
+ rowptr(2) = 2
+ rowptr(3) = 4
+ rowptr(4) = 7
+ endif
+
+ if ( iam == 0 ) then
+ write(*,*) ' Matrix A was set up'
+ endif
+
+! Create the distributed compressed row matrix pointed to by the F90 handle A
+ call f_dCreate_CompRowLoc_Mat_dist(A, m, n, nnz_loc, m_loc, fst_row, &
+ nzval, colind, rowptr, SLU_NR_loc, SLU_D, SLU_GE)
+
+! Setup the right hand side
+ call get_CompRowLoc_Matrix(A, nrow_loc=ldb)
+ do i = 1, ldb
+ b(i) = 1.0
+ enddo
+ nrhs = 1
+ ldb4 = ldb
+
+! Set the default input options
+ call f_set_default_options(options)
+
+! Modify one or more options
+ call set_superlu_options(options,ColPerm=NATURAL)
+ call set_superlu_options(options,RowPerm=NOROWPERM)
+
+! Initialize ScalePermstruct and LUstruct
+ call get_SuperMatrix(A,nrow=m,ncol=n)
+ call f_ScalePermstructInit(m, n, ScalePermstruct)
+ call f_LUstructInit(m, n, LUstruct)
+
+! Initialize the statistics variables
+ call f_PStatInit(stat)
+
+! Call the linear equation solver
+ call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, &
+ grid, LUstruct, SOLVEstruct, berr, stat, info)
+
+ if (info == 0 .and. iam == 1) then
+ write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+ else
+ write(*,*) 'INFO from f_pdgssvx = ', info
+ endif
+
+! Deallocate the storage allocated by SuperLU_DIST
+ call f_PStatFree(stat)
+ call f_Destroy_SuperMat_Store_dist(A)
+ call f_ScalePermstructFree(ScalePermstruct)
+ call f_Destroy_LU(n, grid, LUstruct)
+ call f_LUstructFree(LUstruct)
+ call get_superlu_options(options, SolveInitialized=init)
+ if (init == YES) then
+ call f_dSolveFinalize(options, SOLVEstruct)
+ endif
+
+! Release the SuperLU process grid
+100 call f_superlu_gridexit(grid)
+
+! Deallocate the C structures pointed to by the Fortran handles
+ call f_destroy_gridinfo_handle(grid)
+ call f_destroy_options_handle(options)
+ call f_destroy_ScalePerm_handle(ScalePermstruct)
+ call f_destroy_LUstruct_handle(LUstruct)
+ call f_destroy_SOLVEstruct_handle(SOLVEstruct)
+ call f_destroy_SuperMatrix_handle(A)
+ call f_destroy_SuperLUStat_handle(stat)
+
+! Check malloc
+! call f_check_malloc(iam)
+
+! Terminate the MPI execution environment
+ call mpi_finalize(ierr)
+
+ stop
+ end
diff --git a/FORTRAN/f_pddrive.F90.old b/FORTRAN/f_pddrive.F90.old
new file mode 100644
index 00000000..33803d99
--- /dev/null
+++ b/FORTRAN/f_pddrive.F90.old
@@ -0,0 +1,161 @@
+
+
+!> @file
+!! \brief The driver program to solve a linear system with default options.
+!!
+!!
+!! -- Distributed SuperLU routine (version 3.2) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! October, 2012
+!!
+!
+ program f_pddrive
+!
+! Purpose
+! =======
+!
+! The driver program F_PDDRIVE.
+!
+! This example illustrates how to use F_PDGSSVX with the full
+! (default) options to solve a linear system.
+!
+! Seven basic steps are required:
+! 1. Create C structures used in SuperLU_DIST
+! 2. Initialize the MPI environment and the SuperLU process grid
+! 3. Set up the input matrix and the right-hand side
+! 4. Set the options argument
+! 5. Call f_pdgssvx
+! 6. Release the process grid and terminate the MPI environment
+! 7. Release all structures
+!
+!
+ use superlu_mod
+! implicit none
+ include 'mpif.h'
+ integer maxn, maxnz, maxnrhs
+ parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
+ integer rowind(maxnz), colptr(maxn)
+ real*8 values(maxnz), b(maxn), berr(maxnrhs)
+ integer n, m, nnz, nprow, npcol, ldb, init
+ integer*4 iam, info, i, ierr, ldb4, nrhs
+
+ integer(superlu_ptr) :: grid
+ integer(superlu_ptr) :: options
+ integer(superlu_ptr) :: ScalePermstruct
+ integer(superlu_ptr) :: LUstruct
+ integer(superlu_ptr) :: SOLVEstruct
+ integer(superlu_ptr) :: A
+ integer(superlu_ptr) :: stat
+
+! Initialize MPI environment
+ call mpi_init(ierr)
+
+! Check malloc
+! call f_check_malloc(iam)
+
+! Create Fortran handles for the C structures used in SuperLU_DIST
+ call f_create_gridinfo_handle(grid)
+ call f_create_options_handle(options)
+ call f_create_ScalePerm_handle(ScalePermstruct)
+ call f_create_LUstruct_handle(LUstruct)
+ call f_create_SOLVEstruct_handle(SOLVEstruct)
+ call f_create_SuperMatrix_handle(A)
+ call f_create_SuperLUStat_handle(stat)
+
+! Initialize the SuperLU_DIST process grid
+ nprow = 2
+ npcol = 2
+ call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid)
+
+! Bail out if I do not belong in the grid.
+ call get_GridInfo(grid, iam=iam)
+ if ( iam >= nprow * npcol ) then
+ go to 100
+ endif
+ if ( iam == 0 ) then
+ write(*,*) ' Process grid ', nprow, ' X ', npcol
+ endif
+
+! Read Harwell-Boeing matrix, and adjust the pointers and indices
+! to 0-based indexing, as required by C routines.
+ if ( iam == 0 ) then
+ open(file = "../EXAMPLE/g20.rua", status = "old", unit = 5)
+ call dhbcode1(m, n, nnz, values, rowind, colptr)
+ close(unit = 5)
+!
+ do i = 1, n+1
+ colptr(i) = colptr(i) - 1
+ enddo
+ do i = 1, nnz
+ rowind(i) = rowind(i) - 1
+ enddo
+ endif
+
+! Distribute the matrix to the process grid
+ call f_dcreate_dist_matrix(A, m, n, nnz, values, rowind, colptr, grid)
+
+! Setup the right hand side
+ call get_CompRowLoc_Matrix(A, nrow_loc=ldb)
+ do i = 1, ldb
+ b(i) = 1.0
+ enddo
+ nrhs = 1
+ ldb4 = ldb
+
+! Set the default input options
+ call f_set_default_options(options)
+
+! Change one or more options
+! call set_superlu_options(options,Fact=FACTORED)
+! call set_superlu_options(options,ParSymbFact=YES)
+
+! Initialize ScalePermstruct and LUstruct
+ call get_SuperMatrix(A, nrow=m, ncol=n)
+ call f_ScalePermstructInit(m, n, ScalePermstruct)
+ call f_LUstructInit(m, n, LUstruct)
+
+! Initialize the statistics variables
+ call f_PStatInit(stat)
+
+! Call the linear equation solver
+ call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, &
+ grid, LUstruct, SOLVEstruct, berr, stat, info)
+
+ if (info == 0) then
+ write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+ else
+ write(*,*) 'INFO from f_pdgssvx = ', info
+ endif
+
+! Deallocate the storage allocated by SuperLU_DIST
+ call f_PStatFree(stat)
+ call f_Destroy_CompRowLoc_Mat_dist(A)
+ call f_ScalePermstructFree(ScalePermstruct)
+ call f_Destroy_LU(n, grid, LUstruct)
+ call f_LUstructFree(LUstruct)
+ call get_superlu_options(options, SolveInitialized=init)
+ if (init == YES) then
+ call f_dSolveFinalize(options, SOLVEstruct)
+ endif
+
+! Release the SuperLU process grid
+100 call f_superlu_gridexit(grid)
+
+! Deallocate the C structures pointed to by the Fortran handles
+ call f_destroy_gridinfo_handle(grid)
+ call f_destroy_options_handle(options)
+ call f_destroy_ScalePerm_handle(ScalePermstruct)
+ call f_destroy_LUstruct_handle(LUstruct)
+ call f_destroy_SOLVEstruct_handle(SOLVEstruct)
+ call f_destroy_SuperMatrix_handle(A)
+ call f_destroy_SuperLUStat_handle(stat)
+
+! Check malloc
+! call f_check_malloc(iam)
+
+
+! Terminate the MPI execution environment
+ call mpi_finalize(ierr)
+
+ stop
+ end
diff --git a/FORTRAN/f_pzdrive.F90.old b/FORTRAN/f_pzdrive.F90.old
new file mode 100644
index 00000000..9c9db5b0
--- /dev/null
+++ b/FORTRAN/f_pzdrive.F90.old
@@ -0,0 +1,160 @@
+
+!> @file
+!! \brief The driver program to solve a linear system with default options.
+!!
+!!
+!! -- Distributed SuperLU routine (version 3.2) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! October, 2012
+!!
+!
+ program f_pzdrive
+!
+! Purpose
+! =======
+!
+! The driver program F_PZDRIVE.
+!
+! This example illustrates how to use F_PZGSSVX with the full
+! (default) options to solve a linear system.
+!
+! Seven basic steps are required:
+! 1. Create C structures used in SuperLU_DIST
+! 2. Initialize the MPI environment and the SuperLU process grid
+! 3. Set up the input matrix and the right-hand side
+! 4. Set the options argument
+! 5. Call f_pzgssvx
+! 6. Release the process grid and terminate the MPI environment
+! 7. Release all structures
+!
+!
+ use superlu_mod
+! implicit none
+ include 'mpif.h'
+ integer maxn, maxnz, maxnrhs
+ parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
+ integer rowind(maxnz), colptr(maxn)
+ double complex values(maxnz), b(maxn), berr(maxnrhs)
+ integer n, m, nnz, nprow, npcol, ldb, init
+ integer*4 iam, info, i, ierr, ldb4, nrhs
+
+ integer(superlu_ptr) :: grid
+ integer(superlu_ptr) :: options
+ integer(superlu_ptr) :: ScalePermstruct
+ integer(superlu_ptr) :: LUstruct
+ integer(superlu_ptr) :: SOLVEstruct
+ integer(superlu_ptr) :: A
+ integer(superlu_ptr) :: stat
+
+! Initialize MPI environment
+ call mpi_init(ierr)
+
+! Check malloc
+! call f_check_malloc(iam)
+
+! Create Fortran handles for the C structures used in SuperLU_DIST
+ call f_create_gridinfo_handle(grid)
+ call f_create_options_handle(options)
+ call f_create_ScalePerm_handle(ScalePermstruct)
+ call f_create_LUstruct_handle(LUstruct)
+ call f_create_SOLVEstruct_handle(SOLVEstruct)
+ call f_create_SuperMatrix_handle(A)
+ call f_create_SuperLUStat_handle(stat)
+
+! Initialize the SuperLU_DIST process grid
+ nprow = 2
+ npcol = 2
+ call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid)
+
+! Bail out if I do not belong in the grid.
+ call get_GridInfo(grid, iam=iam)
+ if ( iam >= nprow * npcol ) then
+ go to 100
+ endif
+ if ( iam == 0 ) then
+ write(*,*) ' Process grid ', nprow, ' X ', npcol
+ endif
+
+! Read Harwell-Boeing matrix, and adjust the pointers and indices
+! to 0-based indexing, as required by C routines.
+ if ( iam == 0 ) then
+ open(file = "../EXAMPLE/cg20.cua", status = "old", unit = 5)
+ call zhbcode1(m, n, nnz, values, rowind, colptr)
+ close(unit = 5)
+!
+ do i = 1, n+1
+ colptr(i) = colptr(i) - 1
+ enddo
+ do i = 1, nnz
+ rowind(i) = rowind(i) - 1
+ enddo
+ endif
+
+! Distribute the matrix to the process grid
+ call f_zcreate_dist_matrix(A, m, n, nnz, values, rowind, colptr, grid)
+
+! Setup the right hand side
+ call get_CompRowLoc_Matrix(A, nrow_loc=ldb)
+ do i = 1, ldb
+ b(i) = 1.0
+ enddo
+ nrhs = 1
+ ldb4 = ldb
+
+! Set the default input options
+ call f_set_default_options(options)
+
+! Change one or more options
+! call set_superlu_options(options,Fact=FACTORED)
+! call set_superlu_options(options,ParSymbFact=YES)
+
+! Initialize ScalePermstruct and LUstruct
+ call get_SuperMatrix(A, nrow=m, ncol=n)
+ call f_ScalePermstructInit(m, n, ScalePermstruct)
+ call f_LUstructInit(m, n, LUstruct)
+
+! Initialize the statistics variables
+ call f_PStatInit(stat)
+
+! Call the linear equation solver
+ call f_pzgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, &
+ grid, LUstruct, SOLVEstruct, berr, stat, info)
+
+ if (info == 0) then
+ write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+ else
+ write(*,*) 'INFO from f_pdgssvx = ', info
+ endif
+
+! Deallocate the storage allocated by SuperLU_DIST
+ call f_PStatFree(stat)
+ call f_Destroy_CompRowLoc_Mat_dist(A)
+ call f_ScalePermstructFree(ScalePermstruct)
+ call f_Destroy_LU(n, grid, LUstruct)
+ call f_LUstructFree(LUstruct)
+ call get_superlu_options(options, SolveInitialized=init)
+ if (init == YES) then
+ call f_zSolveFinalize(options, SOLVEstruct)
+ endif
+
+! Release the SuperLU process grid
+100 call f_superlu_gridexit(grid)
+
+! Deallocate the C structures pointed to by the Fortran handles
+ call f_destroy_gridinfo_handle(grid)
+ call f_destroy_options_handle(options)
+ call f_destroy_ScalePerm_handle(ScalePermstruct)
+ call f_destroy_LUstruct_handle(LUstruct)
+ call f_destroy_SOLVEstruct_handle(SOLVEstruct)
+ call f_destroy_SuperMatrix_handle(A)
+ call f_destroy_SuperLUStat_handle(stat)
+
+! Check malloc
+! call f_check_malloc(iam)
+
+
+! Terminate the MPI execution environment
+ call mpi_finalize(ierr)
+
+ stop
+ end
diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 8df55e0f..680cb5ad 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -293,8 +293,14 @@ if (BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS)
list(APPEND targets superlu_dist-static)
endif()
+if (TPL_ENABLE_NVSHMEM)
+set(superlu_dist_libs ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES} ${BLAS_LIB} ${LAPACK_LIB}
+ ${PARMETIS_LIB} ${COLAMD_LIB} ${COMBBLAS_LIB} ${CUDA_LIB} ${HIP_LIB} ${NVSHMEM_LIB})
+else()
set(superlu_dist_libs ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES} ${BLAS_LIB} ${LAPACK_LIB}
${PARMETIS_LIB} ${COLAMD_LIB} ${COMBBLAS_LIB} ${CUDA_LIB} ${HIP_LIB})
+endif()
+
if (NOT MSVC)
list(APPEND superlu_dist_libs m)
endif ()
diff --git a/SRC/ddistribute.c b/SRC/ddistribute.c
index 86ff5000..de7f77f8 100644
--- a/SRC/ddistribute.c
+++ b/SRC/ddistribute.c
@@ -17,6 +17,8 @@ at the top-level directory.
* -- Distributed SuperLU routine (version 2.3) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 15, 2008
+ *
+ * January 9, 2023
*
*/
#include "superlu_ddefs.h"
@@ -35,7 +37,8 @@ at the top-level directory.
* =========
*
* options (input) superlu_dist_options_t *
- * options->Fact specifies whether or not the L and U structures will be r * = SamePattern_SameRowPerm: L and U structures are input, and
+ * options->Fact specifies whether or not the L and U structures will be re-used.
+ * = SamePattern_SameRowPerm: L and U structures are input, and
* unchanged on exit.
* = DOFACT or SamePattern: L and U structures are computed and output.
*
@@ -62,7 +65,7 @@ at the top-level directory.
float
ddistribute(superlu_dist_options_t *options,
- int_t n, SuperMatrix *A,
+ int_t n, SuperMatrix *A,
Glu_freeable_t *Glu_freeable,
dLUstruct_t *LUstruct, gridinfo_t *grid)
{
@@ -94,39 +97,40 @@ ddistribute(superlu_dist_options_t *options,
int_t *index_srt; /* indices consist of headers and row subscripts */
int *index1; /* temporary pointer to array of int */
double *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
- double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
- double *Lnzval_bc_dat; /* size sum of sizes of Lnzval_bc_ptr[lk]) */
+ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ double *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */
long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
- int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */
- long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
-
- int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Lindval_loc_bc_dat; /* size sum of sizes of Lindval_loc_bc_ptr[lk]) */
- long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
-
- int_t *Unnz; /* size ceil(NSUPERS/Pc) */
+
+ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lrowind_bc_dat; /* size: sum of sizes of Lrowind_bc_ptr[lk]) */
+ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+
+ int_t *Unnz; /* size ceil(NSUPERS/Pc) */
double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
- double *Unzval_br_dat; /* size sum of sizes of Unzval_br_ptr[lk]) */
- long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
+ double *Unzval_br_dat; /* size: sum of sizes of Unzval_br_ptr[lk]) */
+ long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
long int Unzval_br_cnt=0;
- int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
- int_t *Ufstnz_br_dat; /* size sum of sizes of Ufstnz_br_ptr[lk]) */
+ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */
long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */
long int Ufstnz_br_cnt=0;
- C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
- int msgsize;
+ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
+
+ int msgsize;
int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
Ucb_indptr_t *Ucb_inddat;
long int *Ucb_indoffset;
long int Ucb_indcnt=0;
-
- int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+
+ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
int_t *Ucb_valdat;
long int *Ucb_valoffset;
long int Ucb_valcnt=0;
@@ -178,11 +182,12 @@ ddistribute(superlu_dist_options_t *options,
int *frecv, *brecv;
int_t *lloc;
double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
- double *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */
+ double *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */
long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
- double *Uinv_bc_dat; /* size sum of sizes of Uinv_bc_ptr[lk]) */
+ double *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */
long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */
+
double *SeedSTD_BC,*SeedSTD_RD;
int_t idx_indx,idx_lusup;
int_t nbrow;
@@ -190,7 +195,6 @@ ddistribute(superlu_dist_options_t *options,
int_t lptr1_tmp, idx_i, idx_v,m, uu;
int_t nub;
int tag;
-
#if ( PRNTlevel>=1 )
int_t nLblocks = 0, nUblocks = 0;
@@ -232,7 +236,7 @@ ddistribute(superlu_dist_options_t *options,
L and U data structures. */
ilsum = Llu->ilsum;
ldaspa = Llu->ldalsum;
- if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3,options))) )
+ if ( !(dense = doubleCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3, options))) )
ABORT("Calloc fails for SPA dense[].");
nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
if ( !(Urb_length = intCalloc_dist(nrbu)) )
@@ -246,7 +250,7 @@ ddistribute(superlu_dist_options_t *options,
Unzval_br_ptr = Llu->Unzval_br_ptr;
Unnz = Llu->Unnz;
- mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3,options)*dword;
+ mem_use += 2.0*nrbu*iword + ldaspa * sp_ienv_dist(3, options) * dword;
#if ( PROFlevel>=1 )
t = SuperLU_timer_();
@@ -393,6 +397,7 @@ ddistribute(superlu_dist_options_t *options,
fprintf(stderr, "Malloc fails for Unzval_br_offset[].");
}
Unzval_br_offset[k-1] = -1;
+
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Ufstnz_br_ptr[].");
if ( !(Ufstnz_br_offset =
@@ -488,12 +493,14 @@ ddistribute(superlu_dist_options_t *options,
if ( !(index = intMalloc_dist(len1+1)) )
ABORT("Malloc fails for Uindex[].");
Ufstnz_br_ptr[lb] = index;
- Ufstnz_br_offset[lb]=len1+1;
+ Ufstnz_br_offset[lb] = len1+1;
Ufstnz_br_cnt += Ufstnz_br_offset[lb];
+
if ( !(Unzval_br_ptr[lb] = doubleMalloc_dist(len)) )
ABORT("Malloc fails for Unzval_br_ptr[*][].");
Unzval_br_offset[lb]=len;
Unzval_br_cnt += Unzval_br_offset[lb];
+
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
index[0] = Ucbs[lb]; /* Number of column blocks */
@@ -532,7 +539,7 @@ ddistribute(superlu_dist_options_t *options,
if ( !(Lrb_valptr = intMalloc_dist(k)) )
ABORT("Malloc fails for Lrb_valptr[].");
if (!(dense=doubleCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa)
- *sp_ienv_dist(3,options)))))
+ *sp_ienv_dist(3, options)))))
ABORT("Calloc fails for SPA dense[].");
/* These counts will be used for triangular solves. */
@@ -541,7 +548,7 @@ ddistribute(superlu_dist_options_t *options,
if ( !(bmod = int32Calloc_dist(k)) )
ABORT("Calloc fails for bmod[].");
#if ( PRNTlevel>=1 )
- mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3,options)*dword;
+ mem_use += 6.0*k*iword + ldaspa * sp_ienv_dist(3, options) * dword;
#endif
k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
@@ -563,7 +570,6 @@ ddistribute(superlu_dist_options_t *options,
}
Lnzval_bc_offset[k-1] = -1;
-
if ( !(Lindval_loc_bc_ptr =
(int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
@@ -595,9 +601,8 @@ ddistribute(superlu_dist_options_t *options,
Linv_bc_offset[k-1] = -1;
Uinv_bc_offset[k-1] = -1;
- if ( !(Unnz =
- (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
- ABORT("Malloc fails for Unnz[].");
+ if ( !(Unnz = (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
+ ABORT("Malloc fails for Unnz[].");
/* These lists of processes will be used for triangular solves. */
if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
@@ -627,6 +632,7 @@ ddistribute(superlu_dist_options_t *options,
long int Lrowind_bc_cnt=0;
long int Lnzval_bc_cnt=0;
long int Lindval_loc_bc_cnt=0;
+
for (jb = 0; jb < nsupers; ++jb) {
pc = PCOL( jb, grid );
if ( mycol == pc ) { /* Block column jb in my process column */
@@ -767,38 +773,38 @@ ddistribute(superlu_dist_options_t *options,
index[] and nzval[]. */
/* Add room for descriptors */
len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
- if ( !(index = intMalloc_dist(len1)) )
- ABORT("Malloc fails for index[]");
- Lrowind_bc_offset[ljb]=len1;
- Lrowind_bc_cnt += Lrowind_bc_offset[ljb];
- if (!(lusup = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double))))
+ if ( !(index = intMalloc_dist(len1)) )
+ ABORT("Malloc fails for index[]");
+ Lrowind_bc_offset[ljb]=len1;
+ Lrowind_bc_cnt += Lrowind_bc_offset[ljb];
+ if (!(lusup = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double))))
ABORT("Malloc fails for lusup[]");
- Lnzval_bc_offset[ljb]=len*nsupc;
- Lnzval_bc_cnt += Lnzval_bc_offset[ljb];
-
- if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) ))
+ Lnzval_bc_offset[ljb]=len*nsupc;
+ Lnzval_bc_cnt += Lnzval_bc_offset[ljb];
+
+ if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) ))
ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
- Lindval_loc_bc_offset[ljb]=nrbl*3;
- Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb];
-
- myrow = MYROW( iam, grid );
- krow = PROW( jb, grid );
- if(myrow==krow){ /* diagonal block */
- if (!(Linv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
- ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
- Linv_bc_offset[ljb]=nsupc*nsupc;
- Linv_bc_cnt += Linv_bc_offset[ljb];
- if (!(Uinv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
- ABORT("Malloc fails for Uinv_bc_ptr[ljb][]");
- Uinv_bc_offset[ljb]=nsupc*nsupc;
- Uinv_bc_cnt += Uinv_bc_offset[ljb];
- }else{
- Linv_bc_ptr[ljb] = NULL;
- Linv_bc_offset[ljb] = -1;
- Uinv_bc_ptr[ljb] = NULL;
- Uinv_bc_offset[ljb] = -1;
- }
-
+ Lindval_loc_bc_offset[ljb]=nrbl*3;
+ Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb];
+
+ myrow = MYROW( iam, grid );
+ krow = PROW( jb, grid );
+ if(myrow==krow){ /* diagonal block */
+ if (!(Linv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
+ ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
+ Linv_bc_offset[ljb]=nsupc*nsupc;
+ Linv_bc_cnt += Linv_bc_offset[ljb];
+ if (!(Uinv_bc_ptr[ljb] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
+ ABORT("Malloc fails for Uinv_bc_ptr[ljb][]");
+ Uinv_bc_offset[ljb]=nsupc*nsupc;
+ Uinv_bc_cnt += Uinv_bc_offset[ljb];
+ }else{
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ }
+
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
@@ -910,14 +916,14 @@ ddistribute(superlu_dist_options_t *options,
} else {
Lrowind_bc_ptr[ljb] = NULL;
Lnzval_bc_ptr[ljb] = NULL;
- Linv_bc_ptr[ljb] = NULL;
- Linv_bc_offset[ljb] = -1;
- Lrowind_bc_offset[ljb]=-1;
- Lindval_loc_bc_offset[ljb]=-1;
- Lnzval_bc_offset[ljb]=-1;
- Uinv_bc_ptr[ljb] = NULL;
- Uinv_bc_offset[ljb] = -1;
- Lindval_loc_bc_ptr[ljb] = NULL;
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Lrowind_bc_offset[ljb]=-1;
+ Lindval_loc_bc_offset[ljb]=-1;
+ Lnzval_bc_offset[ljb]=-1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ Lindval_loc_bc_ptr[ljb] = NULL;
} /* if nrbl ... */
#if ( PROFlevel>=1 )
t_l += SuperLU_timer_() - t;
@@ -926,7 +932,6 @@ ddistribute(superlu_dist_options_t *options,
} /* for jb ... */
-
Linv_bc_cnt +=1; // safe guard
Uinv_bc_cnt +=1;
Lrowind_bc_cnt +=1;
@@ -934,24 +939,24 @@ ddistribute(superlu_dist_options_t *options,
Lnzval_bc_cnt +=1;
if ( !(Linv_bc_dat =
- (double*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(double))) ) {
+ (double*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(double))) ) {
fprintf(stderr, "Malloc fails for Linv_bc_dat[].");
}
if ( !(Uinv_bc_dat =
- (double*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(double))) ) {
+ (double*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(double))) ) {
fprintf(stderr, "Malloc fails for Uinv_bc_dat[].");
}
if ( !(Lrowind_bc_dat =
- (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
+ (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
fprintf(stderr, "Malloc fails for Lrowind_bc_dat[].");
}
if ( !(Lindval_loc_bc_dat =
- (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
+ (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
fprintf(stderr, "Malloc fails for Lindval_loc_bc_dat[].");
}
if ( !(Lnzval_bc_dat =
- (double*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(double))) ) {
+ (double*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(double))) ) {
fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
}
@@ -1016,9 +1021,8 @@ ddistribute(superlu_dist_options_t *options,
Lindval_loc_bc_offset[jb]=Lindval_loc_bc_cnt;
Lindval_loc_bc_cnt+=tmp_cnt;
}
-
- }
+ } /* for jb ... */
/////////////////////////////////////////////////////////////////
@@ -1043,6 +1047,7 @@ ddistribute(superlu_dist_options_t *options,
fprintf(stderr, "Malloc fails for Ucb_indoffset[].");
}
Ucb_indoffset[nub-1] = -1;
+
nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
/* Count number of row blocks in a block column.
@@ -1065,11 +1070,10 @@ ddistribute(superlu_dist_options_t *options,
for (lb = 0; lb < nub; ++lb) {
if ( Urbs[lb] ) { /* Not an empty block column. */
if ( !(Ucb_indptr[lb]
- = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+ = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
ABORT("Malloc fails for Ucb_indptr[lb][]");
Ucb_indoffset[lb]=Urbs[lb];
Ucb_indcnt += Ucb_indoffset[lb];
-
if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
ABORT("Malloc fails for Ucb_valptr[lb][]");
Ucb_valoffset[lb]=Urbs[lb];
@@ -1123,28 +1127,26 @@ ddistribute(superlu_dist_options_t *options,
}
}
-
Unzval_br_cnt +=1; // safe guard
Ufstnz_br_cnt +=1;
Ucb_valcnt +=1 ;
Ucb_indcnt +=1;
if ( !(Unzval_br_dat =
- (double*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(double))) ) {
+ (double*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(double))) ) {
fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
}
if ( !(Ufstnz_br_dat =
- (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
+ (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
fprintf(stderr, "Malloc fails for Ufstnz_br_dat[].");
}
if ( !(Ucb_valdat =
- (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
+ (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
fprintf(stderr, "Malloc fails for Ucb_valdat[].");
}
if ( !(Ucb_inddat =
- (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
+ (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
fprintf(stderr, "Malloc fails for Ucb_inddat[].");
}
-
/* use contingous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr */
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -1174,7 +1176,6 @@ ddistribute(superlu_dist_options_t *options,
}
}
-
k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
Ucb_valcnt=0;
Ucb_indcnt=0;
@@ -1295,10 +1296,11 @@ ddistribute(superlu_dist_options_t *options,
// rseed=rand();
// rseed=1.0;
msgsize = SuperSize( jb );
- // LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
- // BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d');
+ //LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
+ //BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d');
C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'd');
LBtree_ptr[ljb].tag_=BC_L;
+
// printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt);
// fflush(stdout);
@@ -1365,10 +1367,9 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
}
/* Every process receives the count, but it is only useful on the
diagonal processes. */
+ //MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
-
-
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(LRtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
ABORT("Malloc fails for LRtree_ptr[].");
@@ -1479,8 +1480,8 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
// if(ib==0){
- // LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
- // RdTree_SetTag(LRtree_ptr[lib], RD_L,'d');
+ //LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
+ //RdTree_SetTag(LRtree_ptr[lib], RD_L,'d');
C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'd');
LRtree_ptr[lib].tag_=RD_L;
// }
@@ -1513,7 +1514,6 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
SUPERLU_FREE(mod_bit);
SUPERLU_FREE(frecv);
-
SUPERLU_FREE(ActiveFlag);
SUPERLU_FREE(ActiveFlagAll);
SUPERLU_FREE(ranks);
@@ -1638,9 +1638,8 @@ if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t);
// rseed=rand();
// rseed=1.0;
msgsize = SuperSize( jb );
- // UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
- // BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d');
-
+ //UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
+ //BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d');
C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'd');
UBtree_ptr[ljb].tag_=BC_U;
@@ -1697,10 +1696,9 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
}
/* Every process receives the count, but it is only useful on the
diagonal processes. */
+ //MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
-
-
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(URtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
ABORT("Malloc fails for URtree_ptr[].");
@@ -1842,8 +1840,8 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
// if(ib==0){
- // URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
- // RdTree_SetTag(URtree_ptr[lib], RD_U,'d');
+ //URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
+ //RdTree_SetTag(URtree_ptr[lib], RD_U,'d');
C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'd');
URtree_ptr[lib].tag_=RD_U;
// }
@@ -1866,7 +1864,6 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
SUPERLU_FREE(mod_bit);
SUPERLU_FREE(brecv);
-
SUPERLU_FREE(ActiveFlag);
SUPERLU_FREE(ActiveFlagAll);
SUPERLU_FREE(ranks);
@@ -1884,32 +1881,31 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
////////////////////////////////////////////////////////
-
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
Llu->Lrowind_bc_dat = Lrowind_bc_dat;
Llu->Lrowind_bc_offset = Lrowind_bc_offset;
Llu->Lrowind_bc_cnt = Lrowind_bc_cnt;
-
+
Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat;
Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset;
Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt;
-
+
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
Llu->Lnzval_bc_dat = Lnzval_bc_dat;
Llu->Lnzval_bc_offset = Lnzval_bc_offset;
Llu->Lnzval_bc_cnt = Lnzval_bc_cnt;
-
+
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
- Llu->Ufstnz_br_dat = Ufstnz_br_dat;
- Llu->Ufstnz_br_offset = Ufstnz_br_offset;
- Llu->Ufstnz_br_cnt = Ufstnz_br_cnt;
-
+ Llu->Ufstnz_br_dat = Ufstnz_br_dat;
+ Llu->Ufstnz_br_offset = Ufstnz_br_offset;
+ Llu->Ufstnz_br_cnt = Ufstnz_br_cnt;
+
Llu->Unzval_br_ptr = Unzval_br_ptr;
Llu->Unzval_br_dat = Unzval_br_dat;
Llu->Unzval_br_offset = Unzval_br_offset;
Llu->Unzval_br_cnt = Unzval_br_cnt;
-
+
Llu->Unnz = Unnz;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
@@ -1924,7 +1920,7 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
Llu->nbsendx = nbsendx;
Llu->ilsum = ilsum;
Llu->ldalsum = ldaspa;
-
+
Llu->LRtree_ptr = LRtree_ptr;
Llu->LBtree_ptr = LBtree_ptr;
Llu->URtree_ptr = URtree_ptr;
@@ -1934,11 +1930,12 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
Llu->Linv_bc_dat = Linv_bc_dat;
Llu->Linv_bc_offset = Linv_bc_offset;
Llu->Linv_bc_cnt = Linv_bc_cnt;
-
+
Llu->Uinv_bc_ptr = Uinv_bc_ptr;
Llu->Uinv_bc_dat = Uinv_bc_dat;
Llu->Uinv_bc_offset = Uinv_bc_offset;
- Llu->Uinv_bc_cnt = Uinv_bc_cnt;
+ Llu->Uinv_bc_cnt = Uinv_bc_cnt;
+
Llu->Urbs = Urbs;
Llu->Ucb_indptr = Ucb_indptr;
Llu->Ucb_inddat = Ucb_inddat;
@@ -1949,7 +1946,6 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
Llu->Ucb_valoffset = Ucb_valoffset;
Llu->Ucb_valcnt = Ucb_valcnt;
-
#ifdef GPU_ACC
checkGPU(gpuMalloc( (void**)&Llu->d_xsup, (n+1) * sizeof(int_t)));
@@ -1977,7 +1973,7 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
// some dummy allocation to avoid checking whether they are null pointers later
checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, sizeof(int_t)));
checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, sizeof(int64_t)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(double)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(double) ));
checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, sizeof(int64_t)));
checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t)));
checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t)));
@@ -1992,11 +1988,12 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
/* gpuMemcpy for the following is performed in pxgssvx */
- checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(double)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(double)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(double)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(double) ));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(double) ));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(double) ));
-#endif
+#endif /* match ifdef GPU_ACC */
+
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
nLblocks, nUblocks);
diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h
index 6f236eb8..1103aa48 100644
--- a/SRC/dlustruct_gpu.h
+++ b/SRC/dlustruct_gpu.h
@@ -96,8 +96,6 @@ typedef struct //LUstruct_gpu_
local_u_blk_info_t *local_u_blk_infoVec;
int_t *local_u_blk_infoPtr;
- int_t *ijb_lookupVec;
- int_t *ijb_lookupPtr;
// GPU buffers for performing Schur Complement Update on GPU
dSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS];
diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index 9c78ba5d..01bef401 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -795,12 +795,8 @@ int dfree_LUstruct_gpu (
checkGPU(gpuFree(A_gpu->local_l_blk_infoVec));
checkGPU(gpuFree(A_gpu->local_l_blk_infoPtr));
- checkGPU(gpuFree(A_gpu->jib_lookupVec));
- checkGPU(gpuFree(A_gpu->jib_lookupPtr));
checkGPU(gpuFree(A_gpu->local_u_blk_infoVec));
checkGPU(gpuFree(A_gpu->local_u_blk_infoPtr));
- checkGPU(gpuFree(A_gpu->ijb_lookupVec));
- checkGPU(gpuFree(A_gpu->ijb_lookupPtr));
/* Destroy all the meta-structures associated with the streams. */
gpuStreamDestroy(sluGPU->CopyStream);
diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c
index e0792561..34660bc8 100644
--- a/SRC/dutil_dist.c
+++ b/SRC/dutil_dist.c
@@ -733,7 +733,7 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL);
}
- ncb = nsupers / grid->npcol;
+ ncb = nsupers / grid->npcol;
extra = nsupers % grid->npcol;
mycol = MYCOL( iam, grid );
if ( mycol < extra ) ++ncb;
@@ -747,15 +747,15 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
nsupc = SuperSize( gb );
for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
len = index[k+1];
- //for (j = 0; j < nsupc; ++j) {
- //fprintf(fp, IFMT IFMT " %e\n", index[k+LB_DESCRIPTOR+i]+1, xsup[gb]+1, len,);
- //for (i=0; ixsup;
- int_t *index;
- double *nzval;
- int *idx_block;
- if ( !(idx_block = (int*)SUPERLU_MALLOC( h_nfrecvmod[1] *2* sizeof(int))) )
- ABORT("Malloc fails for SeedSTD_BC[].");
- // assert(grid->npcol*grid->nprow==1);
-
- // count nonzeros in the first pass
- nnzL = 0;
- n = 0;
- ncb = nsupers / grid->npcol;
- extra = nsupers % grid->npcol;
- mycol = MYCOL( iam, grid );
- if ( mycol < extra ) ++ncb;
- for (lb = 0; lb < ncb; ++lb) {
- index = Llu->Lrowind_bc_ptr[lb];
- if ( index ) { /* Not an empty column */
- nzval = Llu->Lnzval_bc_ptr[lb];
- nb = index[0];
- nsupr = index[1];
- gb = lb * grid->npcol + mycol;
- nsupc = SuperSize( gb );
- for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
- len = index[k+1];
-
- for (j = 0; j < nsupc; ++j) {
- for (i=0; i=xsup[gb]+j+1){
- nnzL ++;
- nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1);
- n = nmax;
- }
-
- }
- }
- k += LB_DESCRIPTOR + len;
- r += len;
- }
- }
- }
- MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm);
- MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm);
-
- ncb = nsupers / grid->npcol;
- extra = nsupers % grid->npcol;
- mycol = MYCOL( iam, grid );
- if ( mycol < extra ) ++ncb;
- int tmp_c=0;
- for (lb = 0; lb < ncb; ++lb) {
- index = Llu->Lrowind_bc_ptr[lb];
- if ( index ) { /* Not an empty column */
- nzval = Llu->Lnzval_bc_ptr[lb];
- nb = index[0];
- nsupr = index[1];
- gb = lb * grid->npcol + mycol;
- nsupc = SuperSize( gb );
- for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
- len = index[k+1];
- //fprintf(fp, "%d,%d,%d,%d,%d\n", index[k+LB_DESCRIPTOR+2], gb, (double)iam,len,nsupc);
- idx_block[tmp_c*2]=index[k+LB_DESCRIPTOR+2];
- idx_block[tmp_c*2+1]=gb;
- tmp_c+=1;
- k += LB_DESCRIPTOR + len;
- r += len;
- }
- }
- }
-
- //find_critical_path(idx_block);
-} /* criticalpath */
/*! \Compute the level sets in the L factor
diff --git a/SRC/gpu_wrapper.h b/SRC/gpu_wrapper.h
index 0e6297ec..e7ec2d8d 100644
--- a/SRC/gpu_wrapper.h
+++ b/SRC/gpu_wrapper.h
@@ -38,8 +38,6 @@ at the top-level directory.
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuMalloc cudaMalloc
-#define gpuMallocHost cudaMallocHost
-#define gpuMemset cudaMemset
#define gpuHostMalloc cudaHostAlloc
#define gpuHostMallocDefault cudaHostAllocDefault
#define gpuMallocManaged cudaMallocManaged
diff --git a/SRC/pddistribute.c b/SRC/pddistribute.c
index 2ff5d670..5854487d 100644
--- a/SRC/pddistribute.c
+++ b/SRC/pddistribute.c
@@ -2427,27 +2427,29 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
-
- /////* reuse: L and U *////
int maxrecvsz = sp_ienv_dist(3, options)* nrhs + SUPERLU_MAX( XK_H, LSUM_H );
- flag_bc_q = (int *)nvshmem_malloc(RDMA_FLAG_SIZE * (k+1) * sizeof(int)); // for sender
- flag_rd_q = (int *)nvshmem_malloc( RDMA_FLAG_SIZE * nlb * 2 * sizeof(int)); // for sender
- ready_x = (double *)nvshmem_malloc(maxrecvsz*CEILING( nsupers, grid->npcol) * sizeof(double)); // for receiver
- ready_lsum = (double *)nvshmem_malloc(2*maxrecvsz*CEILING( nsupers, grid->nprow) * sizeof(double)); // for receiver
- //printf("(%d) k=%d, flag_size=%d int, data_size=%d double, nlb=%d, flag_size=%d int, data_size=%d double, int=%d B, double=%d B\n",
- // iam,k,RDMA_FLAG_SIZE * (k+1), maxrecvsz*CEILING( nsupers, grid->npcol),
- // nlb,RDMA_FLAG_SIZE * nlb * 2 , 2*maxrecvsz*CEILING( nsupers, grid->nprow), sizeof(int), sizeof(double) );
+ int flag_bc_size = RDMA_FLAG_SIZE * (k+1);
+ int flag_rd_size = RDMA_FLAG_SIZE * nlb * 2;
+ int ready_x_size = maxrecvsz*CEILING( nsupers, grid->npcol);
+ int ready_lsum_size = 2*maxrecvsz*CEILING( nsupers, grid->nprow);
+ int my_flag_bc_size = RDMA_FLAG_SIZE * (CEILING( nsupers, grid->npcol)+1);
+ int my_flag_rd_size = RDMA_FLAG_SIZE * nlb * 2;
+ //printf("(%d) in pddistribute:\n "
+ // "flag_bc_size=%d int, ready_x=%d double, "
+ // "flag_rd_size=%d int, ready_lsum=%d double, "
+ // "int=%d B, double=%d B\n",
+ // iam,
+ // flag_bc_size, ready_x_size,
+ // flag_rd_size , ready_lsum_size,
+ // sizeof(int), sizeof(double) );
//fflush(stdout);
+ prepare_multiGPU_buffers(flag_bc_size,flag_rd_size,ready_x_size,ready_lsum_size,my_flag_bc_size,my_flag_rd_size);
-
- my_flag_bc = (int *) nvshmem_malloc ( RDMA_FLAG_SIZE * (CEILING( nsupers, grid->npcol)+1) * sizeof(int)); // for sender
- my_flag_rd = (int *) nvshmem_malloc (RDMA_FLAG_SIZE * nlb * 2 * sizeof(int)); // for sender
+ /////* for L solve *////
checkGPU(gpuMemset(my_flag_bc, 0, RDMA_FLAG_SIZE * (CEILING( nsupers, grid->npcol)+1) * sizeof(int)));
checkGPU(gpuMemset(my_flag_rd, 0, RDMA_FLAG_SIZE * nlb * 2 * sizeof(int)));
- //checkGPU(gpuMemset(ready_x, 0, maxrecvsz*CEILING( nsupers, grid->npcol) * sizeof(double)));
- //checkGPU(gpuMemset(ready_lsum, 0, 2*maxrecvsz*CEILING( nsupers, grid->nprow) * sizeof(double)));
-
- /////* for L solve *////
+ checkGPU(gpuMemset(ready_x, 0, maxrecvsz*CEILING( nsupers, grid->npcol) * sizeof(double)));
+ checkGPU(gpuMemset(ready_lsum, 0, 2*maxrecvsz*CEILING( nsupers, grid->nprow) * sizeof(double)));
checkGPU(gpuMalloc( (void**)&d_status, CEILING( nsupers, grid->npcol) * sizeof(int)));
checkGPU(gpuMalloc( (void**)&d_nfrecv, 3 * sizeof(int)));
checkGPU(gpuMemcpy(d_status, mystatus, CEILING( nsupers, grid->npcol) * sizeof(int), gpuMemcpyHostToDevice));
diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c
index b5b67cc8..c9555e6f 100644
--- a/SRC/pdgssvx.c
+++ b/SRC/pdgssvx.c
@@ -538,7 +538,8 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
int_t nnz_loc, nnz, iinfo;
int m_loc, fst_row, icol;
int colequ, Equil, factored, job, notran, rowequ, need_value;
- int_t i, j, irow, m, n, permc_spec;
+ int_t i, j, irow, m, n;
+ int permc_spec;
int iam, iam_g;
int ldx; /* LDA for matrix X (local). */
char equed[1], norm[1];
@@ -716,15 +717,12 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
} else { /* Compute R & C from scratch */
/* Compute the row and column scalings. */
pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);
+
if ( iinfo > 0 ) {
if ( iinfo <= m ) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "The %d-th row of A is exactly zero\n", (int)iinfo);
-#endif
} else {
-#if ( PRNTlevel>=1 )
- fprintf(stderr, "The %d-th column of A is exactly zero\n", (int)iinfo-n);
-#endif
+ fprintf(stderr, "The %d-th column of A is exactly zero\n", (int)(iinfo-n));
}
} else if ( iinfo < 0 ) return;
@@ -998,7 +996,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
printf("{" IFMT "," IFMT "}: pdgssvx: invalid ColPerm option when ParSymbfact is used\n",
MYROW(grid->iam, grid), MYCOL(grid->iam, grid));
}
- }
+ } /* end preparing for parallel symbolic */
if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
/* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */
@@ -1018,9 +1016,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
// }
// }
if (flinfo > 0) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n");
-#endif
*info = flinfo;
return;
}
@@ -1070,10 +1066,11 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
ABORT("Malloc fails for Glu_freeable.");
- /* Every process does this. */
+ /* Every process does this.
+ returned value (-iinfo) is the size of lsub[], including pruned graph.*/
iinfo = symbfact(options, iam, &GAC, perm_c, etree,
Glu_persist, Glu_freeable);
- nnzLU = Glu_freeable->nnzLU;
+ nnzLU = Glu_freeable->nnzLU;
stat->utime[SYMBFAC] = SuperLU_timer_() - t;
if ( iinfo <= 0 ) { /* Successful return */
QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
@@ -1093,10 +1090,8 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
}
#endif
} else { /* symbfact out of memory */
-#if ( PRNTlevel>=1 )
if ( !iam )
fprintf(stderr,"symbfact() error returns " IFMT "\n",iinfo);
-#endif
*info = iinfo;
return;
}
@@ -1111,9 +1106,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
nnzLU = Pslu_freeable.nnzLU;
stat->utime[SYMBFAC] = SuperLU_timer_() - t;
if (flinfo > 0) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "Insufficient memory for parallel symbolic factorization.");
-#endif
*info = flinfo;
return;
}
@@ -1143,7 +1136,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
distribution routine. */
t = SuperLU_timer_();
dist_mem_use = pddistribute(options, n, A, ScalePermstruct,
- Glu_freeable, LUstruct, grid,nrhs);
+ Glu_freeable, LUstruct, grid, nrhs);
stat->utime[DIST] = SuperLU_timer_() - t;
/* Deallocate storage used in symbolic factorization. */
@@ -1161,6 +1154,8 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
t = SuperLU_timer_();
dist_mem_use = ddist_psymbtonum(options, n, A, ScalePermstruct,
&Pslu_freeable, LUstruct, grid);
+
+ /* dist_mem_use = memDist + memNLU */
if (dist_mem_use > 0)
ABORT ("Not enough memory available for dist_psymbtonum\n");
@@ -1275,30 +1270,66 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
float mem_stage[3];
struct { float val; int rank; } local_struct, global_struct;
- MPI_Reduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, MPI_SUM, 0, grid->comm );
+ MPI_Reduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t,
+ MPI_SUM, 0, grid->comm );
stat->TinyPivots = TinyPivots;
+ if ( iam==0 ) {
+ printf("\n** Memory Usage **********************************\n");
+ }
+
+ /* Compute numerical factorization memory */
+ dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
+
/*-- Compute high watermark of all stages --*/
if (parSymbFact == TRUE) {
/* The memory used in the redistribution routine
includes the memory used for storing the symbolic
structure and the memory allocated for numerical
factorization */
- mem_stage[0] = (-flinfo); /* parallel symbfact step */
- mem_stage[1] = (-dist_mem_use); /* distribution step */
+ /* parallel symbfact step:
+ (-flinfo) is the allocMem returned from symbfact_dist() */
+ mem_stage[0] = symb_mem_usage.total + (-flinfo);
+
+ /* see leading comment of dist_symbLU() */
+ /* dist_mem_use = (memDist + memNLU) in ddist_psymbtonum() */
+ mem_stage[1] = symb_mem_usage.for_lu + (-dist_mem_use); /* distribution step */
loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1] );
if ( options->RowPerm != NO )
loc_max = SUPERLU_MAX(loc_max, GA_mem_use);
- } else {
+
+#if ( PRNTlevel>=1 )
+ if ( iam==0 ) {
+ printf("\t(P0) Globle A for MC64: GA_mem_use %.2f\n", GA_mem_use*1e-6);
+ printf("\t(P0) parallel symbolic::stage[0]: symb_memory %.2f, allocMem %.2f\n",
+ symb_mem_usage.total*1e-6, (-flinfo)*1e-6);
+ printf("\t(P0) parallel distribution::stage[1]: symb_LU %.2f, dist_mem_use %.2f\n",
+ symb_mem_usage.for_lu*1e-6, (-dist_mem_use)*1e-6);
+ fflush(stdout);
+
+ }
+#endif
+ } else { /* Serial symbolic. GA_mem_use is for global A */
mem_stage[0] = symb_mem_usage.total + GA_mem_use; /* symbfact step */
- mem_stage[1] = symb_mem_usage.for_lu + dist_mem_use
- + num_mem_usage.for_lu; /* distribution step */
+ mem_stage[1] = symb_mem_usage.for_lu
+ + dist_mem_use
+ + num_mem_usage.for_lu; /* distribution step */
loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1] );
+#if ( PRNTlevel>=1 )
+ if ( iam==0 ) {
+ printf("\t(P0) serial symbolic::stage[0]: symb_memory %.2f, GA_mem_use %.2f\n",
+ symb_mem_usage.total*1e-6, GA_mem_use*1e-6);
+ printf("\t(P0) serial distribution::stage[1]:"
+ "symb_LU %.2f, dist_mem_use %.2f, num_mem_usage.for_lu %.2f\n",
+ symb_mem_usage.for_lu*1e-6, dist_mem_use*1e-6,
+ num_mem_usage.for_lu*1e-6);
+ fflush(stdout);
+
+ }
+#endif
}
- dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
mem_stage[2] = num_mem_usage.total; /* numerical factorization step */
-
loc_max = SUPERLU_MAX( loc_max, mem_stage[2] ); /* local max of 3 stages */
local_struct.val = loc_max;
@@ -1326,7 +1357,6 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
float buffer_peak = global_struct.val*1e-6;
if ( iam==0 ) {
- printf("\n** Memory Usage **********************************\n");
printf("** Total highmark (MB):\n"
" Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n",
avg * 1e-6,
@@ -1353,7 +1383,6 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
} /* end if (!factored) */
-
if ( options->Fact == DOFACT || options->Fact == SamePattern ) {
/* Need to reset the solve's communication pattern,
because perm_r[] and/or perm_c[] is changed. */
@@ -1444,10 +1473,11 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
if ( options->DiagInv==YES && (Fact != FACTORED) ) {
pdCompute_Diag_Inv(n, LUstruct, grid, stat, info);
-
+
#ifdef GPU_ACC
- pdconvertU(options, grid,LUstruct, stat, n);
+ pdconvertU(options, grid, LUstruct, stat, n);
+
checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat,
(LUstruct->Llu->Linv_bc_cnt) * sizeof(double), gpuMemcpyHostToDevice));
checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat,
@@ -1457,7 +1487,6 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
#endif
}
-
// #pragma omp parallel
// {
// #pragma omp master
@@ -1599,7 +1628,6 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale);
- fflush(stdout);
#endif
/* Deallocate R and/or C if it was not used. */
@@ -1627,14 +1655,12 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
CHECK_MALLOC(iam, "Exit pdgssvx()");
#endif
-}
-
-
-
+} /* pdgssvx */
+#ifdef GPU_ACC
void
pdconvertU(superlu_dist_options_t *options, gridinfo_t *grid,
- dLUstruct_t *LUstruct, SuperLUStat_t *stat, int_t n)
+ dLUstruct_t *LUstruct, SuperLUStat_t *stat, int n)
{
int64_t nnz_ind,nnz_offset;
int64_t nnz_val;
@@ -1649,8 +1675,8 @@ int_t knsupc,iknsupc,ikfrow,iklrow;
int_t *xsup = Glu_persist->xsup;;
int iam = grid->iam;
-int_t mycol = MYCOL (iam, grid);
-int_t myrow = MYROW (iam, grid);
+int mycol = MYCOL (iam, grid);
+int myrow = MYROW (iam, grid);
int_t *usub;
double *uval;
@@ -1670,48 +1696,45 @@ if ( !(Llu->Ucolind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))
Llu->Ucolind_bc_ptr[nsupers_j-1] = NULL;
if ( !(Llu->Unzval_bc_ptr =
- (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) )
+ (double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) )
ABORT("Malloc fails for Llu->Unzval_bc_ptr[].");
Llu->Unzval_bc_ptr[nsupers_j-1] = NULL;
if ( !(Llu->Uindval_loc_bc_ptr =
- (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) )
+ (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) )
ABORT("Malloc fails for Llu->Uindval_loc_bc_ptr[].");
Llu->Uindval_loc_bc_ptr[nsupers_j-1] = NULL;
if ( !(Llu->Uindval_loc_bc_offset =
- (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
- fprintf(stderr, "Malloc fails for Llu->Uindval_loc_bc_offset[].");
+ (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Uindval_loc_bc_offset[].");
}
Llu->Uindval_loc_bc_offset[nsupers_j-1] = -1;
-
if ( !(Llu->Ucolind_bc_offset =
- (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
+ (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
fprintf(stderr, "Malloc fails for Llu->Ucolind_bc_offset[].");
}
Llu->Ucolind_bc_offset[nsupers_j-1] = -1;
if ( !(Llu->Unzval_bc_offset =
- (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
+ (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
fprintf(stderr, "Malloc fails for Llu->Lnzval_bc_offset[].");
}
Llu->Unzval_bc_offset[nsupers_j-1] = -1;
-
-
for (lk=0;lknpcol + mycol;/* Global block number, col-wise. */
knsupc = SuperSize( k );
nub = Urbs[lk]; /* Number of U blocks in block column lk */
if(nub>0){
- // First pass count sizes of Llu->Ucolind_bc_ptr[lk] and Llu->Unzval_bc_ptr[lk]
- nnz_ind=0;
- nnz_val=0;
- nnz_ind+=BC_HEADER_NEWU;
- nrow=0;
- for (ub = 0; ub < nub; ++ub) {
+ // First pass count sizes of Llu->Ucolind_bc_ptr[lk] and Llu->Unzval_bc_ptr[lk]
+ nnz_ind=0;
+ nnz_val=0;
+ nnz_ind+=BC_HEADER_NEWU;
+ nrow=0;
+ for (ub = 0; ub < nub; ++ub) {
ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
usub = Llu->Ufstnz_br_ptr[ik];
uval = Llu->Unzval_br_ptr[ik];
@@ -1727,47 +1750,45 @@ for (lk=0;lkops[SOLVE] += 2 * (iklrow - fnz);
- }
+ }
} /* for jj ... */
- } /* for ub ... */
-
- // Second pass fills Llu->Ucolind_bc_ptr[lk] and Llu->Unzval_bc_ptr[lk]
- if ( !(Llu->Ucolind_bc_ptr[lk] = intMalloc_dist(nnz_ind+nrow*2)) )
- ABORT("Malloc fails for Llu->Ucolind_bc_ptr[lk]");
- Llu->Ucolind_bc_offset[lk]=nnz_ind+nrow*2;
- Ucolind_bc_cnt += Llu->Ucolind_bc_offset[lk];
-
-
- if (!(Llu->Unzval_bc_ptr[lk]=doubleCalloc_dist(nnz_val)))
- ABORT("Calloc fails for Llu->Unzval_bc_ptr[lk].");
- Llu->Unzval_bc_offset[lk]=nnz_val;
- Unzval_bc_cnt += Llu->Unzval_bc_offset[lk];
-
-
- if ( !(Llu->Uindval_loc_bc_ptr[lk] = intCalloc_dist(nub*3)) )
- ABORT("Malloc fails for Llu->Uindval_loc_bc_ptr[lk][]");
- Llu->Uindval_loc_bc_offset[lk]=nub*3;
- Uindval_loc_bc_cnt += Llu->Uindval_loc_bc_offset[lk];
-
- Llu->Ucolind_bc_ptr[lk][0]=nub;
- Llu->Ucolind_bc_ptr[lk][1]=nrow;
- Llu->Ucolind_bc_ptr[lk][2]=nnz_ind;
- nnz_offset=nnz_ind;
-
- nnz_ind=0;
- nnz_val=0;
- ncol=0;
- nnz_ind+=BC_HEADER_NEWU;
- nrow=0;
- for (ub = 0; ub < nub; ++ub) {
+ } /* for ub ... */
+
+ // Second pass fills Llu->Ucolind_bc_ptr[lk] and Llu->Unzval_bc_ptr[lk]
+ if ( !(Llu->Ucolind_bc_ptr[lk] = intMalloc_dist(nnz_ind+nrow*2)) )
+ ABORT("Malloc fails for Llu->Ucolind_bc_ptr[lk]");
+ Llu->Ucolind_bc_offset[lk]=nnz_ind+nrow*2;
+ Ucolind_bc_cnt += Llu->Ucolind_bc_offset[lk];
+
+ if (!(Llu->Unzval_bc_ptr[lk]=doubleCalloc_dist(nnz_val)))
+ ABORT("Calloc fails for Llu->Unzval_bc_ptr[lk].");
+ Llu->Unzval_bc_offset[lk]=nnz_val;
+ Unzval_bc_cnt += Llu->Unzval_bc_offset[lk];
+
+ if ( !(Llu->Uindval_loc_bc_ptr[lk] = intCalloc_dist(nub*3)) )
+ ABORT("Malloc fails for Llu->Uindval_loc_bc_ptr[lk][]");
+ Llu->Uindval_loc_bc_offset[lk]=nub*3;
+ Uindval_loc_bc_cnt += Llu->Uindval_loc_bc_offset[lk];
+
+ Llu->Ucolind_bc_ptr[lk][0]=nub;
+ Llu->Ucolind_bc_ptr[lk][1]=nrow;
+ Llu->Ucolind_bc_ptr[lk][2]=nnz_ind;
+ nnz_offset=nnz_ind;
+
+ nnz_ind=0;
+ nnz_val=0;
+ ncol=0;
+ nnz_ind+=BC_HEADER_NEWU;
+ nrow=0;
+ for (ub = 0; ub < nub; ++ub) {
ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
usub = Llu->Ufstnz_br_ptr[ik];
uval = Llu->Unzval_br_ptr[ik];
@@ -1787,8 +1808,8 @@ for (lk=0;lkUcolind_bc_ptr[lk][nnz_ind+ncol_loc+UB_DESCRIPTOR_NEWU]=FstBlockC(k)+jj; /* Global column number */
ncol_loc++;
for (irow = fnz; irow < iklrow; ++irow){
@@ -1799,8 +1820,8 @@ for (lk=0;lkUcolind_bc_ptr[lk][nnz_ind]=gik;
Llu->Ucolind_bc_ptr[lk][nnz_ind+1]=ncol_loc;
@@ -1811,9 +1832,9 @@ for (lk=0;lkUcolind_bc_ptr[lk] = NULL;
Llu->Unzval_bc_ptr[lk] = NULL;
Llu->Ucolind_bc_offset[lk]=-1;
@@ -1821,24 +1842,22 @@ for (lk=0;lkUindval_loc_bc_ptr[lk] = NULL;
Llu->Uindval_loc_bc_offset[lk]=-1;
}
-}
-
-
+} /* end for lk ... */
// safe guard
Ucolind_bc_cnt +=1;
Unzval_bc_cnt +=1;
Uindval_loc_bc_cnt +=1;
if ( !(Llu->Ucolind_bc_dat =
- (int_t*)SUPERLU_MALLOC(Ucolind_bc_cnt * sizeof(int_t))) ) {
+ (int_t*)SUPERLU_MALLOC(Ucolind_bc_cnt * sizeof(int_t))) ) {
fprintf(stderr, "Malloc fails for Llu->Ucolind_bc_dat[].");
}
if ( !(Llu->Unzval_bc_dat =
- (double*)SUPERLU_MALLOC(Unzval_bc_cnt * sizeof(double))) ) {
+ (double*)SUPERLU_MALLOC(Unzval_bc_cnt * sizeof(double))) ) {
fprintf(stderr, "Malloc fails for Llu->Unzval_bc_dat[].");
}
if ( !(Llu->Uindval_loc_bc_dat =
- (int_t*)SUPERLU_MALLOC(Uindval_loc_bc_cnt * sizeof(int_t))) ) {
+ (int_t*)SUPERLU_MALLOC(Uindval_loc_bc_cnt * sizeof(int_t))) ) {
fprintf(stderr, "Malloc fails for Llu->Uindval_loc_bc_dat[].");
}
@@ -1852,7 +1871,7 @@ for (lk=0;lkUcolind_bc_ptr[jb]!=NULL){
for (jj = 0; jj < Llu->Ucolind_bc_offset[jb]; ++jj) {
- Llu->Ucolind_bc_dat[Ucolind_bc_cnt+jj]=Llu->Ucolind_bc_ptr[jb][jj];
+ Llu->Ucolind_bc_dat[Ucolind_bc_cnt+jj]=Llu->Ucolind_bc_ptr[jb][jj];
}
SUPERLU_FREE(Llu->Ucolind_bc_ptr[jb]);
Llu->Ucolind_bc_ptr[jb]=&Llu->Ucolind_bc_dat[Ucolind_bc_cnt];
@@ -1863,7 +1882,7 @@ for (lk=0;lkUnzval_bc_ptr[jb]!=NULL){
for (jj = 0; jj < Llu->Unzval_bc_offset[jb]; ++jj) {
- Llu->Unzval_bc_dat[Unzval_bc_cnt+jj]=Llu->Unzval_bc_ptr[jb][jj];
+ Llu->Unzval_bc_dat[Unzval_bc_cnt+jj]=Llu->Unzval_bc_ptr[jb][jj];
}
SUPERLU_FREE(Llu->Unzval_bc_ptr[jb]);
Llu->Unzval_bc_ptr[jb]=&Llu->Unzval_bc_dat[Unzval_bc_cnt];
@@ -1873,18 +1892,18 @@ for (lk=0;lkUindval_loc_bc_ptr[jb]!=NULL){
- for (jj = 0; jj < Llu->Uindval_loc_bc_offset[jb]; ++jj) {
- Llu->Uindval_loc_bc_dat[Uindval_loc_bc_cnt+jj]=Llu->Uindval_loc_bc_ptr[jb][jj];
- }
- SUPERLU_FREE(Llu->Uindval_loc_bc_ptr[jb]);
- Llu->Uindval_loc_bc_ptr[jb]=&Llu->Uindval_loc_bc_dat[Uindval_loc_bc_cnt];
- tmp_cnt = Llu->Uindval_loc_bc_offset[jb];
- Llu->Uindval_loc_bc_offset[jb]=Uindval_loc_bc_cnt;
- Uindval_loc_bc_cnt+=tmp_cnt;
+ for (jj = 0; jj < Llu->Uindval_loc_bc_offset[jb]; ++jj) {
+ Llu->Uindval_loc_bc_dat[Uindval_loc_bc_cnt+jj]=Llu->Uindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Llu->Uindval_loc_bc_ptr[jb]);
+ Llu->Uindval_loc_bc_ptr[jb]=&Llu->Uindval_loc_bc_dat[Uindval_loc_bc_cnt];
+ tmp_cnt = Llu->Uindval_loc_bc_offset[jb];
+ Llu->Uindval_loc_bc_offset[jb]=Uindval_loc_bc_cnt;
+ Uindval_loc_bc_cnt+=tmp_cnt;
}
-
- }
+ } /* end for jb ... */
+
Llu->Ucolind_bc_cnt = Ucolind_bc_cnt;
Llu->Unzval_bc_cnt = Unzval_bc_cnt;
Llu->Uindval_loc_bc_cnt = Uindval_loc_bc_cnt;
@@ -1892,7 +1911,6 @@ for (lk=0;lkUcolind_bc_offset %10d\n",Llu->Ucolind_bc_offset[0]);
-#ifdef GPU_ACC
checkGPU(gpuFree(Llu->d_Ucolind_bc_dat));
checkGPU(gpuFree(Llu->d_Ucolind_bc_offset));
checkGPU(gpuFree(Llu->d_Unzval_bc_dat));
@@ -1900,7 +1918,6 @@ for (lk=0;lkd_Uindval_loc_bc_dat));
checkGPU(gpuFree(Llu->d_Uindval_loc_bc_offset));
-
checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, (Llu->Ucolind_bc_cnt) * sizeof(int_t)));
checkGPU(gpuMemcpy(Llu->d_Ucolind_bc_dat, Llu->Ucolind_bc_dat, (Llu->Ucolind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t)));
@@ -1914,7 +1931,6 @@ for (lk=0;lkd_Uindval_loc_bc_dat, Llu->Uindval_loc_bc_dat, (Llu->Uindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t)));
checkGPU(gpuMemcpy(Llu->d_Uindval_loc_bc_offset, Llu->Uindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t), gpuMemcpyHostToDevice));
-#endif
SUPERLU_FREE (Llu->Ucolind_bc_dat);
SUPERLU_FREE (Llu->Ucolind_bc_offset);
@@ -1923,4 +1939,5 @@ for (lk=0;lkUindval_loc_bc_dat);
SUPERLU_FREE (Llu->Uindval_loc_bc_offset);
-}
+} /* pdconvertU */
+#endif /* ifdef GPU_ACC */
diff --git a/SRC/pdgstrs.c b/SRC/pdgstrs.c
index 799917ef..799ece33 100644
--- a/SRC/pdgstrs.c
+++ b/SRC/pdgstrs.c
@@ -398,9 +398,9 @@ pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb,
#endif
{
// t = SuperLU_timer_();
-#ifdef _OPENMP
-#pragma omp taskloop private (i,l,irow,k,j,knsupc) untied
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (i,l,irow,k,j,knsupc) untied
+//#endif
for (i = 0; i < m_loc; ++i) {
irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*B */
@@ -652,9 +652,9 @@ pdReDistribute_X_to_B(int_t n, double *B, int_t m_loc, int_t ldb, int_t fst_row,
#endif
{
// t = SuperLU_timer_();
-#ifdef _OPENMP
-#pragma omp taskloop private (k,knsupc,lk,irow,l,i,j) untied
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (k,knsupc,lk,irow,l,i,j) untied
+//#endif
for (k = 0; k < nsupers; k++) {
knsupc = SuperSize( k );
lk = LBi( k, grid ); /* Local block number */
@@ -2134,9 +2134,9 @@ thread_id=0;
#endif
{
-#ifdef _OPENMP
-#pragma omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup
+//#endif
for (jj=0;jj>>(target, mype, npes);
-//CUDA_CHECK(cudaDeviceSynchronize());
+ //int *target;
+ //target = (int *)nvshmem_malloc(sizeof(int)*256);
+ //printf("(%d) nvshmem malloc target success\n",mype);
+ //fflush(stdout);
+ //simple_shift<<<1, 256>>>(target, mype, npes);
+ //CUDA_CHECK(cudaDeviceSynchronize());
}
+void prepare_multiGPU_buffers(int flag_bc_size,int flag_rd_size,int ready_x_size,int ready_lsum_size,int my_flag_bc_size,int my_flag_rd_size){
+ int iam;
+ MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &iam));
+ flag_bc_q = (int *)nvshmem_malloc( flag_bc_size * sizeof(int)); // for sender
+ flag_rd_q = (int *)nvshmem_malloc( flag_rd_size * sizeof(int)); // for sender
+ ready_x = (double *)nvshmem_malloc( ready_x_size * sizeof(double)); // for receiver
+ ready_lsum = (double *)nvshmem_malloc( ready_lsum_size * sizeof(double)); // for receiver
+ my_flag_bc = (int *) nvshmem_malloc ( my_flag_bc_size * sizeof(int)); // for sender
+ my_flag_rd = (int *) nvshmem_malloc ( my_flag_rd_size * sizeof(int)); // for sender
+
+ //printf("(%d) in prepare_multiGPU_buffers:\n "
+ // "flag_bc_size=%d int, ready_x=%d double, "
+ // "flag_rd_size=%d int, ready_lsum=%d double, "
+ // "int=%d B, double=%d B\n",
+ // iam,
+ // flag_bc_size, ready_x_size,
+ // flag_rd_size , ready_lsum_size,
+ // sizeof(int), sizeof(double) );
+ //fflush(stdout);
+
+}
__device__ void C_BcTree_forwardMessageSimple_Device(C_Tree* tree, int* flag_bc_q, int* my_flag_bc, int mype, int tid,double* ready_x, int maxrecvsz){
//int BCsendoffset;
diff --git a/SRC/pdsymbfact_distdata.c b/SRC/pdsymbfact_distdata.c
index 1eb28504..d7a58ebf 100644
--- a/SRC/pdsymbfact_distdata.c
+++ b/SRC/pdsymbfact_distdata.c
@@ -42,8 +42,8 @@ at the top-level directory.
* Redistribute the symbolic structure of L and U from the distribution
* used in the parallel symbolic factorization step to the distdibution
* used in the parallel numeric factorization step. On exit, the L and U
- * structure for the 2D distribution used in the numeric factorization step is
- * stored in p_xlsub, p_lsub, p_xusub, p_usub. The global supernodal
+ * structure for the 2D distribution used in the numeric factorization step
+ * is stored in p_xlsub, p_lsub, p_xusub, p_usub. The global supernodal
* information is also computed and it is stored in Glu_persist->supno
* and Glu_persist->xsup.
*
@@ -53,7 +53,7 @@ at the top-level directory.
* Glu_persist->supno, Glu_persist->xsup.
*
* This routine also deallocates memory allocated during symbolic
- * factorization routine. That is, the folloing arrays are freed:
+ * factorization routine. That is, the following arrays are free'd:
* Pslu_freeable->xlsub, Pslu_freeable->lsub,
* Pslu_freeable->xusub, Pslu_freeable->usub,
* Pslu_freeable->globToLoc, Pslu_freeable->supno_loc,
@@ -68,8 +68,9 @@ at the top-level directory.
*
* n (Input) int_t
* Order of the input matrix
+ *
* Pslu_freeable (Input) Pslu_freeable_t *
- * Local L and U structure,
+ * Local L and U structure: lsub[] / usub[]. They are free'd after distribution.
* global to local indexing information.
*
* Glu_persist (Output) Glu_persist_t *
@@ -101,26 +102,25 @@ at the top-level directory.
* (an approximation).
*
*/
-
static float
-dist_symbLU (superlu_dist_options_t *options,
- int_t n, Pslu_freeable_t *Pslu_freeable,
- Glu_persist_t *Glu_persist,
+dist_symbLU (superlu_dist_options_t *options, int_t n,
+ Pslu_freeable_t *Pslu_freeable, Glu_persist_t *Glu_persist,
int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub,
gridinfo_t *grid
)
{
int iam, nprocs, pc, pr, p, np, p_diag;
int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u,
- *tmp_ptrToSend, *mem;
+ *tmp_ptrToSend, *mem; // temp memory
int_t *nnzToRecv_l, *nnzToRecv_u;
int_t *send_1, *send_2, nsend_1, nsend_2;
- int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
+ int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; // temp memory
int_t nsupers, nsupers_i, nsupers_j;
int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
int_t maxszsn, maxNvtcsPProc;
int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
- int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
+ int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; /* computed from symbfact_dist(),
+ free'd in this routine after distribution */
int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
int_t *globToLoc, nvtcs_loc;
@@ -128,8 +128,8 @@ dist_symbLU (superlu_dist_options_t *options,
RecvCnt_l, RecvCnt_u, ind_loc;
int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
int_t nelts, isize;
- float memAux; /* Memory used during this routine and freed on return */
- float memRet; /* Memory allocated and not freed on return */
+ float memAux; /* Memory used during this routine and free'd before return */
+ float memRet; /* Memory allocated and not free'd on return */
int_t iword, dword;
/* ------------------------------------------------------------
@@ -561,6 +561,13 @@ dist_symbLU (superlu_dist_options_t *options,
else
nnzToRecv[iam] = nnz_loc_u;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.dist_symbLU [1] memAux %.2f, memRet %.2f (MB)\n", memAux*1e-6, memRet*1e-6);
+ fflush(stdout);
+ }
+#endif
+
/* ------------------------------------------------------------
DEALLOCATE TEMPORARY STORAGE.
-------------------------------------------------------------*/
@@ -633,8 +640,8 @@ dist_symbLU (superlu_dist_options_t *options,
while (i < k + nnzToRecv[p]) {
gb = rcv_luind[i];
if (gb >= nsupers)
- printf ("Pe[%d] p %d gb " IFMT " nsupers " IFMT " i " IFMT " i-k " IFMT "\n",
- iam, p, gb, nsupers, i, i-k);
+ printf ("Pe[%d] p %d gb %d nsupers %d i " IFMT " i-k " IFMT "\n",
+ iam, p, (int) gb, (int) nsupers, i, i-k);
i += 2;
if (sendL) gb_l = LBj( gb, grid );
if (sendU) gb_l = LBi( gb, grid );
@@ -649,7 +656,16 @@ dist_symbLU (superlu_dist_options_t *options,
}
else
sendU = FALSE;
- }
+
+ /* Sherry: this loop goes around twice ? */
+
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.dist_symbLU [2] end while: memAux %.4f\t memRet %.4f (MB)\n", memAux*1e-6, memRet*1e-6);
+ fflush(stdout);
+ }
+#endif
+ } /* end while sendL || sendU */
/* deallocate memory allocated during symbolic factorization routine */
if (rcv_luind != NULL) {
@@ -677,6 +693,14 @@ dist_symbLU (superlu_dist_options_t *options,
*p_xlsub = xlsub_n; *p_lsub = lsub_n;
*p_xusub = xusub_n; *p_usub = usub_n;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.dist_symbLU [3] before return: memAux %.4f\t memRet %.4f (MB)\n", memAux*1e-6, memRet*1e-6);
+ fflush(stdout);
+ }
+#endif
+
+ /* It is confirmed that memAux is 0 now */
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit dist_symbLU()");
#endif
@@ -778,8 +802,8 @@ ddist_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
MPI_Status status;
int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */
int_t *supno = Glu_persist->supno;
- float memAux; /* Memory used during this routine and freed on return */
- float memRet; /* Memory allocated and not freed on return */
+ float memAux; /* Memory used during this routine and free'd before return */
+ float memRet; /* Memory allocated and not free'd on return */
int_t iword, dword, szbuf;
/* ------------------------------------------------------------
@@ -1139,7 +1163,7 @@ ddist_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
#endif
return (-memRet);
-} /* dist_A */
+} /* ddist_A */
/*! \brief
*
@@ -1191,7 +1215,6 @@ ddist_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
* (an approximation).
*
*/
-
float
ddist_psymbtonum(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
dScalePermstruct_t *ScalePermstruct,
@@ -1226,57 +1249,54 @@ ddist_psymbtonum(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int_t *index; /* indices consist of headers and row subscripts */
int *index1; /* temporary pointer to array of int */
double *lusup, *uval; /* nonzero values in L and U */
- int *recvBuf; //int_t *recvBuf;
+ int *recvBuf; // 1/16/22 Sherry changed to int, was: int_t *recvBuf;
int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
- double *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */
+ double *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */
long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
- double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
- double *Uinv_bc_dat; /* size sum of sizes of Uinv_bc_ptr[lk]) */
- long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */
- double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
- double *Lnzval_bc_dat; /* size sum of sizes of Lnzval_bc_ptr[lk]) */
+ double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ double *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */
+ long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */
+ double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ double *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */
long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
-
- int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */
+
+ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lrowind_bc_dat; /* size: sum of sizes of Lrowind_bc_ptr[lk]) */
long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
-
- int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Lindval_loc_bc_dat; /* size sum of sizes of Lindval_loc_bc_ptr[lk]) */
- long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
-
-
- int_t *index_srt; /* indices consist of headers and row subscripts */
+ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+
+ int_t *index_srt; /* indices consist of headers and row subscripts */
double *lusup_srt; /* nonzero values in L and U */
-
- double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
- double *Unzval_br_dat; /* size sum of sizes of Unzval_br_ptr[lk]) */
- long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
- long int Unzval_br_cnt=0;
- int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
- int_t *Ufstnz_br_dat; /* size sum of sizes of Ufstnz_br_ptr[lk]) */
- long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */
- long int Ufstnz_br_cnt=0;
+ double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
+ double *Unzval_br_dat; /* size: sum of sizes of Unzval_br_ptr[lk]) */
+ long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Unzval_br_cnt=0;
+
+ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */
+ long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Ufstnz_br_cnt=0;
int_t *Unnz; /* size ceil(NSUPERS/Pc) */
- C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
int msgsize;
- int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
- Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
- Ucb_indptr_t *Ucb_inddat;
- long int *Ucb_indoffset;
- long int Ucb_indcnt=0;
-
- int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
- int_t *Ucb_valdat;
- long int *Ucb_valoffset;
- long int Ucb_valcnt=0;
+ int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+ Ucb_indptr_t *Ucb_inddat;
+ long int *Ucb_indoffset;
+ long int Ucb_indcnt=0;
+ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+ int_t *Ucb_valdat;
+ long int *Ucb_valoffset;
+ long int Ucb_valcnt=0;
/*-- Counts to be used in factorization. --*/
int *ToRecv, *ToSendD, **ToSendR;
@@ -1293,12 +1313,12 @@ ddist_psymbtonum(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int **bsendx_plist; /* Column process list to send down Xk. */
int nbrecvx = 0; /* Number of Xk I will receive. */
int nbsendx = 0; /* Number of Xk I will send */
-
int_t *ilsum; /* starting position of each supernode in
- the full array (local) */
+ the full array (local, blockwise) */
int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in
the full array (local, block column wise) */
- /*-- Auxiliary arrays; freed on return --*/
+ /*-- Auxiliary arrays; free'd on return --*/
+ // Sherry check
int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */
int_t *LUb_length; /* L,U block length; size nsupers_ij */
int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
@@ -1328,15 +1348,21 @@ double *dense, *dense_col; /* SPA */
int_t lptr1_tmp, idx_i, idx_v,m, uu;
int_t nub;
- float memStrLU, memA,
+ /* counting memory */
+ float memA, /* memory used by ddist_A: distributing A values. */
+ memStrLU, /* memory used by dist_symbLU: distributing symbolic LU */
memDist = 0.; /* memory used for redistributing the data, which does
not include the memory for the numerical values
- of L and U (positive number)*/
+ of L and U (positive number).
+ It includes memA and memStrLU.
+ */
float memNLU = 0.; /* memory allocated for storing the numerical values of
L and U, that will be used in the numeric
- factorization (positive number) */
- float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/
-
+ factorization (positive number).
+ It also contains dense-SPA[] array */
+ float memTRS = 0.; /* memory allocated for storing the meta-data for
+ triangular solve (positive number)*/
+
#if ( PRNTlevel>=1 )
int_t nLblocks = 0, nUblocks = 0;
#endif
@@ -1348,7 +1374,7 @@ double *dense, *dense_col; /* SPA */
/* Initialization. */
iam = grid->iam;
#if ( DEBUGlevel>=1 )
- CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
+ CHECK_MALLOC(iam, "Enter ddist_psymbtonum()");
#endif
myrow = MYROW( iam, grid );
mycol = MYCOL( iam, grid );
@@ -1392,7 +1418,7 @@ double *dense, *dense_col; /* SPA */
if ( myrow == PROW( gb, grid ) ) {
i = SuperSize( gb );
ldaspa += i;
- lb = LBi( gb, grid );
+ lb = LBi( gb, grid ); // local block number
ilsum[lb + 1] = ilsum[lb] + i;
}
ilsum[nsupers_i] = ldaspa;
@@ -1402,7 +1428,7 @@ double *dense, *dense_col; /* SPA */
if (mycol == PCOL( gb, grid )) {
i = SuperSize( gb );
ldaspa_j += i;
- lb = LBj( gb, grid );
+ lb = LBj( gb, grid ); // local block number
ilsum_j[lb + 1] = ilsum_j[lb] + i;
}
ilsum_j[nsupers_j] = ldaspa_j;
@@ -1445,7 +1471,7 @@ double *dense, *dense_col; /* SPA */
for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
/* Auxiliary arrays used to set up L and U block data structures.
- They are freed on return. */
+ They are free'd on return. */
if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
fprintf(stderr, "Calloc fails for LUb_length[].");
return (memDist + memNLU + memTRS);
@@ -1471,28 +1497,33 @@ double *dense, *dense_col; /* SPA */
fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
return (memDist + memNLU + memTRS);
}
- if ( !(Unzval_br_offset =
- (long int*)SUPERLU_MALLOC(nsupers_i * sizeof(long int))) ) {
- fprintf(stderr, "Malloc fails for Unzval_br_offset[].");
- }
- Unzval_br_offset[nsupers_i-1] = -1;
-
-
-
+ if ( !(Unzval_br_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_i * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Unzval_br_offset[].");
+ return (memDist + memNLU + memTRS);
+ }
+ Unzval_br_offset[nsupers_i-1] = -1;
+
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
return (memDist + memNLU + memTRS);
}
- if ( !(Ufstnz_br_offset =
- (long int*)SUPERLU_MALLOC(nsupers_i * sizeof(long int))) ) {
- fprintf(stderr, "Malloc fails for Ufstnz_br_offset[].");
- return (memDist + memNLU + memTRS);
- }
- Ufstnz_br_offset[nsupers_i-1] = -1;
-
+ if ( !(Ufstnz_br_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_i * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ufstnz_br_offset[].");
+ return (memDist + memNLU + memTRS);
+ }
+ Ufstnz_br_offset[nsupers_i-1] = -1;
+ memTRS += 2 * nsupers_i * sizeof(long int);
memNLU += nsupers_i*sizeof(double*) + nsupers_i*sizeof(int_t*);
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.ddist_psymbtonum [1] memDist %.4f, memNLU %.4f\n", memDist*1e-6, memNLU*1e-6);
+ }
+#endif
+
Unzval_br_ptr[nsupers_i-1] = NULL;
Ufstnz_br_ptr[nsupers_i-1] = NULL;
@@ -1514,7 +1545,7 @@ double *dense, *dense_col; /* SPA */
memDist += (nsupers_i + nsupers_j)*iword;
/* Auxiliary arrays used to set up L, U block data structures.
- They are freed on return.
+ They are free'd on return.
k is the number of local row blocks. */
if ( !(dense = doubleCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j)
* sp_ienv_dist(3, options))) ) {
@@ -1532,7 +1563,13 @@ double *dense, *dense_col; /* SPA */
}
/* ------------------------------------------------ */
memNLU += 2*nsupers_i*iword +
- SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3, options)*dword;
+ SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3, options)*dword;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.ddist_psymbtonum [[2]] memDist %.2f, memNLU %.2f [+ dense SPA]\n", memDist*1e-6, memNLU*1e-6);
+ fflush(stdout);
+ }
+#endif
/* Pointers to the beginning of each block column of L. */
if ( !(Lnzval_bc_ptr =
@@ -1544,60 +1581,70 @@ double *dense, *dense_col; /* SPA */
fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
- if ( !(Lrowind_bc_offset =
- (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
- fprintf(stderr, "Malloc fails for Lrowind_bc_offset[].");
- }
- Lrowind_bc_offset[nsupers_j-1] = -1;
- if ( !(Lnzval_bc_offset =
- (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
- fprintf(stderr, "Malloc fails for Lnzval_bc_offset[].");
- }
-
+ if ( !(Lrowind_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lrowind_bc_offset[].");
+ }
+ Lrowind_bc_offset[nsupers_j-1] = -1;
+ if ( !(Lnzval_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_offset[].");
+ }
if ( !(Linv_bc_ptr =
(double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) {
fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
- if ( !(Linv_bc_offset =
- (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
- fprintf(stderr, "Malloc fails for Linv_bc_offset[].");
- }
+ if ( !(Linv_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Linv_bc_offset[].");
+ }
+
if ( !(Uinv_bc_ptr =
(double**)SUPERLU_MALLOC(nsupers_j * sizeof(double*))) ) {
fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
- if ( !(Uinv_bc_offset =
- (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
- fprintf(stderr, "Malloc fails for Uinv_bc_offset[].");
- return (memDist + memNLU + memTRS);
- }
- Linv_bc_ptr[nsupers_j-1] = NULL;
- Uinv_bc_ptr[nsupers_j-1] = NULL;
- Linv_bc_offset[nsupers_j-1] = -1;
- Uinv_bc_offset[nsupers_j-1] = -1;
+ if ( !(Uinv_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Uinv_bc_offset[].");
+ return (memDist + memNLU + memTRS);
+ }
+ Linv_bc_ptr[nsupers_j-1] = NULL;
+ Uinv_bc_ptr[nsupers_j-1] = NULL;
+ Linv_bc_offset[nsupers_j-1] = -1;
+ Uinv_bc_offset[nsupers_j-1] = -1;
+
if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){
fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
- if ( !(Lindval_loc_bc_offset =
- (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
- fprintf(stderr, "Malloc fails for Lindval_loc_bc_offset[].");
- }
-
+ if ( !(Lindval_loc_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lindval_loc_bc_offset[].");
+ }
if ( !(Unnz = (int_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int_t))) ){
fprintf(stderr, "Malloc fails for Unnz[].");
return (memDist + memNLU + memTRS);
}
- memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr
+
+ //account for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr, Uinv_bc_ptr, and the 5 long int offset arrays (Lrowind_bc, Lnzval_bc, Linv_bc, Uinv_bc, Lindval_loc_bc)
+ memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double) + nsupers_j*iword
+ + 5 * nsupers_j * sizeof(long int);
memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*)+ nsupers_j * sizeof(int_t*);
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.ddist_psymbtonum [[3]] memNLU %.2f, memTRS %.2f\n", memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
+
Lnzval_bc_ptr[nsupers_j-1] = NULL;
Lrowind_bc_ptr[nsupers_j-1] = NULL;
Linv_bc_ptr[nsupers_j-1] = NULL;
@@ -1630,16 +1677,23 @@ double *dense, *dense_col; /* SPA */
bsendx_plist[i] = &index1[j];
/* -------------------------------------------------------------- */
memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.ddist_psymbtonum [[4]] memNLU %.2f, memTRS %.2f\n", memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
/*------------------------------------------------------------
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
- long int Linv_bc_cnt=0;
- long int Uinv_bc_cnt=0;
- long int Lrowind_bc_cnt=0;
- long int Lnzval_bc_cnt=0;
- long int Lindval_loc_bc_cnt=0;
+ long int Linv_bc_cnt=0;
+ long int Uinv_bc_cnt=0;
+ long int Lrowind_bc_cnt=0;
+ long int Lnzval_bc_cnt=0;
+ long int Lindval_loc_bc_cnt=0;
+
for (jb = 0; jb < nsupers; jb++) {
jbcol = PCOL( jb, grid );
jbrow = PROW( jb, grid );
@@ -1648,40 +1702,43 @@ double *dense, *dense_col; /* SPA */
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
-
+ /*------------------------------------------------
+ * SET UP U BLOCKS.
+ *------------------------------------------------*/
if ( myrow == jbrow ) { /* Block row jb in my process row */
- Ufstnz_br_ptr[ljb_i] = NULL;
- Unzval_br_ptr[ljb_i] = NULL;
- Unzval_br_offset[ljb_i]=-1;
- Ufstnz_br_offset[ljb_i]=-1;
+ Ufstnz_br_ptr[ljb_i] = NULL;
+ Unzval_br_ptr[ljb_i] = NULL;
+ Unzval_br_offset[ljb_i]=-1;
+ Ufstnz_br_offset[ljb_i]=-1;
+
/* Scatter A into SPA. */
for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
if (i >= asup_rowptr[ilsum[nsupers_i]])
printf ("ERR7\n");
- jcol = asup_colind[i];
+ jcol = asup_colind[i]; // upper triangular part
if (jcol >= n)
- printf ("Pe[%d] ERR distsn jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
- iam, jb, gb, j, jcol);
+ printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
+ iam, (int) jb, (int) gb, (int) j, jcol);
gb = BlockNum( jcol );
lb = LBj( gb, grid );
if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n");
jcol = ilsum_j[lb] + jcol - FstBlockC( gb );
if (jcol >= ldaspa_j)
- printf ("Pe[%d] ERR1 jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n",
- iam, jb, gb, j, jcol);
+ printf ("Pe[%d] ERR1 jb %d gb %d j %d jcol %d\n",
+ iam, (int) jb, (int) gb, (int) j, jcol);
dense_col[jcol] = asup_val[i];
}
dense_col += ldaspa_j;
}
- /*------------------------------------------------
- * SET UP U BLOCKS.
- *------------------------------------------------*/
/* Count number of blocks and length of each block. */
nrbu = 0;
len = 0; /* Number of column subscripts I own. */
len1 = 0; /* number of fstnz subscripts */
+
+ /* ljb_i is the current local row block number in U.
+ Loop through every nonzero in this row block */
for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
if (i >= xusub[nsupers_i]) printf ("ERR10\n");
jcol = usub[i];
@@ -1698,7 +1755,7 @@ double *dense, *dense_col; /* SPA */
pr = PROW( gb, grid );
if ( pr != jbrow && mycol == pc)
bsendx_plist[lb][jbrow] = YES;
- if (mycol == pc) {
+ if (mycol == pc) { /* I own this block */
len += nsupc;
LUb_length[lb] += nsupc;
ToSendD[ljb_i] = YES;
@@ -1742,10 +1799,9 @@ double *dense, *dense_col; /* SPA */
return (memDist + memNLU + memTRS);
}
Ufstnz_br_ptr[ljb_i] = index;
-
Ufstnz_br_offset[ljb_i]=len1+1;
Ufstnz_br_cnt += Ufstnz_br_offset[ljb_i];
-
+
if (!(Unzval_br_ptr[ljb_i] =
doubleMalloc_dist(len))) {
fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
@@ -1753,8 +1809,16 @@ double *dense, *dense_col; /* SPA */
}
Unzval_br_offset[ljb_i]=len;
Unzval_br_cnt += Unzval_br_offset[ljb_i];
-
+
memNLU += (len1+1)*iword + len*dword;
+#if ( PRNTlevel>=1 )
+ if (iam==0 && (jb %10000 == 0) ) {
+ printf("\t.ddist_psymbtonum [jb %d setup-U] memNLU %.4f, memTRS %.4f\n",
+ (int) jb, memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
+
uval = Unzval_br_ptr[ljb_i];
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
@@ -1806,7 +1870,10 @@ double *dense, *dense_col; /* SPA */
}
}
}
- } /* if nrbu ... */
+ } else {
+ Ufstnz_br_ptr[ljb_i] = NULL;
+ Unzval_br_ptr[ljb_i] = NULL;
+ } /* end if-else nrbu ... */
} /* if myrow == jbrow */
/*------------------------------------------------
@@ -1898,26 +1965,27 @@ double *dense, *dense_col; /* SPA */
fprintf (stderr, "Malloc fails for index[]");
return (memDist + memNLU + memTRS);
}
- Lrowind_bc_offset[ljb_j]=len1;
- Lrowind_bc_cnt += Lrowind_bc_offset[ljb_j];
+
+ Lrowind_bc_offset[ljb_j]=len1;
+ Lrowind_bc_cnt += Lrowind_bc_offset[ljb_j];
Lrowind_bc_ptr[ljb_j] = index;
+
if (!(Lnzval_bc_ptr[ljb_j] =
doubleMalloc_dist(len*nsupc))) {
- fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb);
+ fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d\n", (int) jb);
return (memDist + memNLU + memTRS);
}
- Lnzval_bc_offset[ljb_j]=len*nsupc;
- Lnzval_bc_cnt += Lnzval_bc_offset[ljb_j];
+ Lnzval_bc_offset[ljb_j]=len*nsupc;
+ Lnzval_bc_cnt += Lnzval_bc_offset[ljb_j];
myrow = MYROW( iam, grid );
krow = PROW( jb, grid );
if(myrow==krow){ /* diagonal block */
-
- if (!(Linv_bc_ptr[ljb_j] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
+ if (!(Linv_bc_ptr[ljb_j] = (double*)doubleMalloc_dist(nsupc*nsupc)) )
ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]");
Linv_bc_offset[ljb_j]=nsupc*nsupc;
Linv_bc_cnt += Linv_bc_offset[ljb_j];
- if (!(Uinv_bc_ptr[ljb_j] = (double*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(double))))
+ if (!(Uinv_bc_ptr[ljb_j] = (double*)doubleMalloc_dist(nsupc*nsupc)) )
ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]");
Uinv_bc_offset[ljb_j]=nsupc*nsupc;
Uinv_bc_cnt += Uinv_bc_offset[ljb_j];
@@ -1932,9 +2000,12 @@ double *dense, *dense_col; /* SPA */
if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3)))
ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]");
- memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]
- Lindval_loc_bc_offset[ljb_j]=nrbl*3;
- Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb_j];
+
+ //account for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]
+ memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;
+
+ Lindval_loc_bc_offset[ljb_j]=nrbl*3;
+ Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb_j];
lusup = Lnzval_bc_ptr[ljb_j];
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
@@ -1982,8 +2053,6 @@ double *dense, *dense_col; /* SPA */
}
} /* for i ... */
-
-
/* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/
if(nrbl>1){
krow = PROW( jb, grid );
@@ -1997,7 +2066,6 @@ double *dense, *dense_col; /* SPA */
quickSortM(lloc,0,uu,nrbl,0,3);
}
-
if ( !(index_srt = intMalloc_dist(len1)) )
ABORT("Malloc fails for index_srt[]");
if (!(lusup_srt = (double*)SUPERLU_MALLOC(len*nsupc * sizeof(double))))
@@ -2038,18 +2106,25 @@ double *dense, *dense_col; /* SPA */
Lrowind_bc_ptr[ljb_j] = NULL;
Lnzval_bc_ptr[ljb_j] = NULL;
Linv_bc_ptr[ljb_j] = NULL;
- Linv_bc_offset[ljb_j] = -1;
- Lrowind_bc_offset[ljb_j]=-1;
- Lindval_loc_bc_offset[ljb_j]=-1;
- Lnzval_bc_offset[ljb_j]=-1;
+ Linv_bc_offset[ljb_j] = -1;
+ Lrowind_bc_offset[ljb_j]=-1;
+ Lindval_loc_bc_offset[ljb_j]=-1;
+ Lnzval_bc_offset[ljb_j]=-1;
Uinv_bc_ptr[ljb_j] = NULL;
Uinv_bc_offset[ljb_j] = -1;
Lindval_loc_bc_ptr[ljb_j] = NULL;
} /* if nrbl ... */
} /* if mycol == pc */
- } /* for jb ... */
+ } /* end for jb ... */
SUPERLU_FREE(ilsum_j);
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.ddist_psymbtonum [[5]] memNLU %.2f, memTRS %.2f\n", memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
+
SUPERLU_FREE(Urb_marker);
SUPERLU_FREE(LUb_length);
SUPERLU_FREE(LUb_indptr);
@@ -2072,7 +2147,7 @@ double *dense, *dense_col; /* SPA */
/* exchange information about bsendx_plist in between column of processors */
k = SUPERLU_MAX( grid->nprow, grid->npcol);
- if ( !(recvBuf = (int *) SUPERLU_MALLOC(nsupers*k * sizeof(int))) ) {
+ if ( !(recvBuf = (int *) SUPERLU_MALLOC(nsupers*k* sizeof(int))) ) {
fprintf (stderr, "Malloc fails for recvBuf[].");
return (memDist + memNLU + memTRS);
}
@@ -2128,9 +2203,13 @@ double *dense, *dense_col; /* SPA */
}
}
- //MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
+#if 0 // Sherry
+ MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t,
+ recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm);
+#else
MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, MPI_INT,
recvBuf, nnzToRecv, ptrToRecv, MPI_INT, grid->comm);
+#endif
for (jb = 0; jb < nsupers; jb++) {
jbcol = PCOL( jb, grid );
@@ -2161,9 +2240,13 @@ double *dense, *dense_col; /* SPA */
}
/* exchange information about bsendx_plist in between column of processors */
- //MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
+#if 0 // Sherry 1/16/2022
+ MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t,
+ MPI_MAX, grid->cscp.comm);
+#else
MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, MPI_INT,
MPI_MAX, grid->cscp.comm);
+#endif
for (jb = 0; jb < nsupers; jb ++) {
jbcol = PCOL( jb, grid);
@@ -2182,101 +2265,94 @@ double *dense, *dense_col; /* SPA */
(*bsendx_plist)[k] = EMPTY;
}
}
- }
-
-
- Linv_bc_cnt +=1; // safe guard
- Uinv_bc_cnt +=1;
- Lrowind_bc_cnt +=1 ;
- Lindval_loc_bc_cnt +=1;
- Lnzval_bc_cnt +=1;
- if ( !(Linv_bc_dat =
- (double*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(double))) ) {
- fprintf(stderr, "Malloc fails for Linv_bc_dat[].");
- }
- if ( !(Uinv_bc_dat =
- (double*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(double))) ) {
- fprintf(stderr, "Malloc fails for Uinv_bc_dat[].");
- }
- if ( !(Lrowind_bc_dat =
- (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
- fprintf(stderr, "Malloc fails for Lrowind_bc_dat[].");
- }
- if ( !(Lindval_loc_bc_dat =
- (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
- fprintf(stderr, "Malloc fails for Lindval_loc_bc_dat[].");
- }
- if ( !(Lnzval_bc_dat =
- (double*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(double))) ) {
- fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
- }
-
- /* use contingous memory for Linv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr*/
- k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
- Linv_bc_cnt=0;
- Uinv_bc_cnt=0;
- Lrowind_bc_cnt=0;
- Lnzval_bc_cnt=0;
- Lindval_loc_bc_cnt=0;
- long int tmp_cnt;
- for (jb = 0; jb < k; ++jb) { /* for each block column ... */
- if(Linv_bc_ptr[jb]!=NULL){
- for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) {
- Linv_bc_dat[Linv_bc_cnt+jj]=Linv_bc_ptr[jb][jj];
- }
- SUPERLU_FREE(Linv_bc_ptr[jb]);
- Linv_bc_ptr[jb]=&Linv_bc_dat[Linv_bc_cnt];
- tmp_cnt = Linv_bc_offset[jb];
- Linv_bc_offset[jb]=Linv_bc_cnt;
- Linv_bc_cnt+=tmp_cnt;
- }
- if(Uinv_bc_ptr[jb]!=NULL){
- for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) {
- Uinv_bc_dat[Uinv_bc_cnt+jj]=Uinv_bc_ptr[jb][jj];
- }
- SUPERLU_FREE(Uinv_bc_ptr[jb]);
- Uinv_bc_ptr[jb]=&Uinv_bc_dat[Uinv_bc_cnt];
- tmp_cnt = Uinv_bc_offset[jb];
- Uinv_bc_offset[jb]=Uinv_bc_cnt;
- Uinv_bc_cnt+=tmp_cnt;
- }
-
-
- if(Lrowind_bc_ptr[jb]!=NULL){
- for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) {
- Lrowind_bc_dat[Lrowind_bc_cnt+jj]=Lrowind_bc_ptr[jb][jj];
- }
- SUPERLU_FREE(Lrowind_bc_ptr[jb]);
- Lrowind_bc_ptr[jb]=&Lrowind_bc_dat[Lrowind_bc_cnt];
- tmp_cnt = Lrowind_bc_offset[jb];
- Lrowind_bc_offset[jb]=Lrowind_bc_cnt;
- Lrowind_bc_cnt+=tmp_cnt;
- }
-
- if(Lnzval_bc_ptr[jb]!=NULL){
- for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) {
- Lnzval_bc_dat[Lnzval_bc_cnt+jj]=Lnzval_bc_ptr[jb][jj];
- }
- SUPERLU_FREE(Lnzval_bc_ptr[jb]);
- Lnzval_bc_ptr[jb]=&Lnzval_bc_dat[Lnzval_bc_cnt];
- tmp_cnt = Lnzval_bc_offset[jb];
- Lnzval_bc_offset[jb]=Lnzval_bc_cnt;
- Lnzval_bc_cnt+=tmp_cnt;
- }
-
- if(Lindval_loc_bc_ptr[jb]!=NULL){
- for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) {
- Lindval_loc_bc_dat[Lindval_loc_bc_cnt+jj]=Lindval_loc_bc_ptr[jb][jj];
- }
- SUPERLU_FREE(Lindval_loc_bc_ptr[jb]);
- Lindval_loc_bc_ptr[jb]=&Lindval_loc_bc_dat[Lindval_loc_bc_cnt];
- tmp_cnt = Lindval_loc_bc_offset[jb];
- Lindval_loc_bc_offset[jb]=Lindval_loc_bc_cnt;
- Lindval_loc_bc_cnt+=tmp_cnt;
- }
-
-
- }
+ } /* end for jb ... */
+
+ Linv_bc_cnt +=1; // safe guard
+ Uinv_bc_cnt +=1;
+ Lrowind_bc_cnt +=1 ;
+ Lindval_loc_bc_cnt +=1;
+ Lnzval_bc_cnt +=1;
+ if ( !(Linv_bc_dat =
+ (double*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(double))) ) {
+ fprintf(stderr, "Malloc fails for Linv_bc_dat[].");
+ }
+ if ( !(Uinv_bc_dat =
+ (double*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(double))) ) {
+ fprintf(stderr, "Malloc fails for Uinv_bc_dat[].");
+ }
+ if ( !(Lrowind_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Lrowind_bc_dat[].");
+ }
+ if ( !(Lindval_loc_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Lindval_loc_bc_dat[].");
+ }
+ if ( !(Lnzval_bc_dat =
+ (double*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(double))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
+ }
+ /* use contiguous memory for Linv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr*/
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Linv_bc_cnt=0;
+ Uinv_bc_cnt=0;
+ Lrowind_bc_cnt=0;
+ Lnzval_bc_cnt=0;
+ Lindval_loc_bc_cnt=0;
+ long int tmp_cnt;
+ for (jb = 0; jb < k; ++jb) { /* for each block column ... */
+ if(Linv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) {
+ Linv_bc_dat[Linv_bc_cnt+jj]=Linv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Linv_bc_ptr[jb]);
+ Linv_bc_ptr[jb]=&Linv_bc_dat[Linv_bc_cnt];
+ tmp_cnt = Linv_bc_offset[jb];
+ Linv_bc_offset[jb]=Linv_bc_cnt;
+ Linv_bc_cnt+=tmp_cnt;
+ }
+ if(Uinv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) {
+ Uinv_bc_dat[Uinv_bc_cnt+jj]=Uinv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Uinv_bc_ptr[jb]);
+ Uinv_bc_ptr[jb]=&Uinv_bc_dat[Uinv_bc_cnt];
+ tmp_cnt = Uinv_bc_offset[jb];
+ Uinv_bc_offset[jb]=Uinv_bc_cnt;
+ Uinv_bc_cnt+=tmp_cnt;
+ }
+ if(Lrowind_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) {
+ Lrowind_bc_dat[Lrowind_bc_cnt+jj]=Lrowind_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lrowind_bc_ptr[jb]);
+ Lrowind_bc_ptr[jb]=&Lrowind_bc_dat[Lrowind_bc_cnt];
+ tmp_cnt = Lrowind_bc_offset[jb];
+ Lrowind_bc_offset[jb]=Lrowind_bc_cnt;
+ Lrowind_bc_cnt+=tmp_cnt;
+ }
+ if(Lnzval_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) {
+ Lnzval_bc_dat[Lnzval_bc_cnt+jj]=Lnzval_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lnzval_bc_ptr[jb]);
+ Lnzval_bc_ptr[jb]=&Lnzval_bc_dat[Lnzval_bc_cnt];
+ tmp_cnt = Lnzval_bc_offset[jb];
+ Lnzval_bc_offset[jb]=Lnzval_bc_cnt;
+ Lnzval_bc_cnt+=tmp_cnt;
+ }
+ if(Lindval_loc_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) {
+ Lindval_loc_bc_dat[Lindval_loc_bc_cnt+jj]=Lindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lindval_loc_bc_ptr[jb]);
+ Lindval_loc_bc_ptr[jb]=&Lindval_loc_bc_dat[Lindval_loc_bc_cnt];
+ tmp_cnt = Lindval_loc_bc_offset[jb];
+ Lindval_loc_bc_offset[jb]=Lindval_loc_bc_cnt;
+ Lindval_loc_bc_cnt+=tmp_cnt;
+ }
+ } /* end for jb ... */
+
/////////////////////////////////////////////////////////////////
/* Set up additional pointers for the index and value arrays of U.
@@ -2300,6 +2376,7 @@ double *dense, *dense_col; /* SPA */
fprintf(stderr, "Malloc fails for Ucb_indoffset[].");
}
Ucb_indoffset[nub-1] = -1;
+
nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
/* Count number of row blocks in a block column.
@@ -2321,21 +2398,22 @@ double *dense, *dense_col; /* SPA */
One pass of the skeleton graph of U. */
for (lb = 0; lb < nub; ++lb) {
if ( Urbs[lb] ) { /* Not an empty block column. */
- if ( !(Ucb_indptr[lb]
- = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+ if ( !(Ucb_indptr[lb]
+ = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
ABORT("Malloc fails for Ucb_indptr[lb][]");
- Ucb_indoffset[lb]=Urbs[lb];
- Ucb_indcnt += Ucb_indoffset[lb];
- if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+ Ucb_indoffset[lb]=Urbs[lb];
+ Ucb_indcnt += Ucb_indoffset[lb];
+
+ if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
ABORT("Malloc fails for Ucb_valptr[lb][]");
- Ucb_valoffset[lb]=Urbs[lb];
- Ucb_valcnt += Ucb_valoffset[lb];
+ Ucb_valoffset[lb]=Urbs[lb];
+ Ucb_valcnt += Ucb_valoffset[lb];
}else{
- Ucb_valptr[lb]=NULL;
- Ucb_valoffset[lb]=-1;
- Ucb_indptr[lb]=NULL;
- Ucb_indoffset[lb]=-1;
- }
+ Ucb_valptr[lb]=NULL;
+ Ucb_valoffset[lb]=-1;
+ Ucb_indptr[lb]=NULL;
+ Ucb_indoffset[lb]=-1;
+ }
}
for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
usub1 = Ufstnz_br_ptr[lk];
@@ -2358,9 +2436,7 @@ double *dense, *dense_col; /* SPA */
}
}
-
-
-/* Count the nnzs per block column */
+ /* Count the nnzs per block column */
for (lb = 0; lb < nub; ++lb) {
Unnz[lb] = 0;
k = lb * grid->npcol + mycol;/* Global block number, column-wise. */
@@ -2378,29 +2454,28 @@ double *dense, *dense_col; /* SPA */
}
} /* for jj ... */
}
- }
-
+ } /* end for lb ... */
+
Unzval_br_cnt +=1; // safe guard
Ufstnz_br_cnt +=1;
Ucb_valcnt +=1;
Ucb_indcnt +=1;
if ( !(Unzval_br_dat =
- (double*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(double))) ) {
+ (double*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(double))) ) {
fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
}
if ( !(Ufstnz_br_dat =
- (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
+ (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
fprintf(stderr, "Malloc fails for Ufstnz_br_dat[].");
}
if ( !(Ucb_valdat =
- (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
+ (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
fprintf(stderr, "Malloc fails for Ucb_valdat[].");
}
if ( !(Ucb_inddat =
- (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
+ (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
fprintf(stderr, "Malloc fails for Ucb_inddat[].");
}
-
/* use contingous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr */
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
@@ -2428,8 +2503,7 @@ double *dense, *dense_col; /* SPA */
Ufstnz_br_offset[lb]=Ufstnz_br_cnt;
Ufstnz_br_cnt+=tmp_cnt;
}
- }
-
+ } /* end for lb ... */
k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
Ucb_valcnt=0;
@@ -2455,7 +2529,7 @@ double *dense, *dense_col; /* SPA */
Ucb_indoffset[lb]=Ucb_indcnt;
Ucb_indcnt+=tmp_cnt;
}
- }
+ } /* end for lb ... */
/////////////////////////////////////////////////////////////////
@@ -2556,9 +2630,8 @@ double *dense, *dense_col; /* SPA */
// rseed=rand();
// rseed=1.0;
msgsize = SuperSize( jb );
- // LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
- // BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d');
-
+ //LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
+ //BcTree_SetTag(LBtree_ptr[ljb],BC_L,'d');
C_BcTree_Create(&LBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'd');
LBtree_ptr[ljb].tag_=BC_L;
@@ -2628,10 +2701,11 @@ double *dense, *dense_col; /* SPA */
}
/* Every process receives the count, but it is only useful on the
diagonal processes. */
- //MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+#if 0 // Sherry
+ MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+#else
MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
-
-
+#endif
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(LRtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
@@ -2729,8 +2803,8 @@ double *dense, *dense_col; /* SPA */
// if(ib==0){
- // LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
- // RdTree_SetTag(LRtree_ptr[lib], RD_L,'d');
+ //LRtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
+ //RdTree_SetTag(LRtree_ptr[lib], RD_L,'d');
C_RdTree_Create(&LRtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'd');
LRtree_ptr[lib].tag_=RD_L;
// }
@@ -2885,9 +2959,8 @@ double *dense, *dense_col; /* SPA */
// rseed=rand();
// rseed=1.0;
msgsize = SuperSize( jb );
- // UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
- // BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d');
-
+ //UBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'d');
+ //BcTree_SetTag(UBtree_ptr[ljb],BC_U,'d');
C_BcTree_Create(&UBtree_ptr[ljb], grid->comm, ranks, rank_cnt, msgsize, 'd');
UBtree_ptr[ljb].tag_=BC_U;
@@ -2945,10 +3018,11 @@ double *dense, *dense_col; /* SPA */
}
/* Every process receives the count, but it is only useful on the
diagonal processes. */
- //MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+#if 0 // Sherry
+ MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
+#else
MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
-
-
+#endif
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(URtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
@@ -3045,8 +3119,8 @@ double *dense, *dense_col; /* SPA */
// if(ib==0){
- // URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
- // RdTree_SetTag(URtree_ptr[lib], RD_U,'d');
+ //URtree_ptr[lib] = RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'d');
+ //RdTree_SetTag(URtree_ptr[lib], RD_U,'d');
C_RdTree_Create(&URtree_ptr[lib], grid->comm, ranks, rank_cnt, msgsize, 'd');
URtree_ptr[lib].tag_=RD_U;
// }
@@ -3089,45 +3163,44 @@ double *dense, *dense_col; /* SPA */
////////////////////////////////////////////////////////
- /* Free the memory used for storing L and U */
+ /* Free the memory used for storing symbolic structures of L and U */
SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
if (lsub != NULL)
SUPERLU_FREE(lsub);
if (usub != NULL)
SUPERLU_FREE(usub);
-
SUPERLU_FREE(nnzToRecv);
SUPERLU_FREE(ptrToRecv);
SUPERLU_FREE(nnzToSend);
SUPERLU_FREE(ptrToSend);
SUPERLU_FREE(recvBuf);
- Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
- Llu->Lrowind_bc_dat = Lrowind_bc_dat;
- Llu->Lrowind_bc_offset = Lrowind_bc_offset;
- Llu->Lrowind_bc_cnt = Lrowind_bc_cnt;
-
- Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
- Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat;
- Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset;
- Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt;
-
- Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
- Llu->Lnzval_bc_dat = Lnzval_bc_dat;
- Llu->Lnzval_bc_offset = Lnzval_bc_offset;
- Llu->Lnzval_bc_cnt = Lnzval_bc_cnt;
-
+ Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+ Llu->Lrowind_bc_dat = Lrowind_bc_dat;
+ Llu->Lrowind_bc_offset = Lrowind_bc_offset;
+ Llu->Lrowind_bc_cnt = Lrowind_bc_cnt;
+
+ Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
+ Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat;
+ Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset;
+ Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt;
+
+ Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+ Llu->Lnzval_bc_dat = Lnzval_bc_dat;
+ Llu->Lnzval_bc_offset = Lnzval_bc_offset;
+ Llu->Lnzval_bc_cnt = Lnzval_bc_cnt;
- Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+ Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
Llu->Ufstnz_br_dat = Ufstnz_br_dat;
Llu->Ufstnz_br_offset = Ufstnz_br_offset;
Llu->Ufstnz_br_cnt = Ufstnz_br_cnt;
+
+ Llu->Unzval_br_ptr = Unzval_br_ptr;
+ Llu->Unzval_br_dat = Unzval_br_dat;
+ Llu->Unzval_br_offset = Unzval_br_offset;
+ Llu->Unzval_br_cnt = Unzval_br_cnt;
- Llu->Unzval_br_ptr = Unzval_br_ptr;
- Llu->Unzval_br_dat = Unzval_br_dat;
- Llu->Unzval_br_offset = Unzval_br_offset;
- Llu->Unzval_br_cnt = Unzval_br_cnt;
Llu->Unnz = Unnz;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
@@ -3147,73 +3220,72 @@ double *dense, *dense_col; /* SPA */
Llu->LBtree_ptr = LBtree_ptr;
Llu->URtree_ptr = URtree_ptr;
Llu->UBtree_ptr = UBtree_ptr;
+
Llu->Linv_bc_ptr = Linv_bc_ptr;
- Llu->Linv_bc_dat = Linv_bc_dat;
- Llu->Linv_bc_offset = Linv_bc_offset;
- Llu->Linv_bc_cnt = Linv_bc_cnt;
-
- Llu->Uinv_bc_ptr = Uinv_bc_ptr;
- Llu->Uinv_bc_dat = Uinv_bc_dat;
- Llu->Uinv_bc_offset = Uinv_bc_offset;
- Llu->Uinv_bc_cnt = Uinv_bc_cnt;
-
- Llu->Urbs = Urbs;
- Llu->Ucb_indptr = Ucb_indptr;
- Llu->Ucb_inddat = Ucb_inddat;
- Llu->Ucb_indoffset = Ucb_indoffset;
- Llu->Ucb_indcnt = Ucb_indcnt;
- Llu->Ucb_valptr = Ucb_valptr;
- Llu->Ucb_valdat = Ucb_valdat;
- Llu->Ucb_valoffset = Ucb_valoffset;
- Llu->Ucb_valcnt = Ucb_valcnt;
+ Llu->Linv_bc_dat = Linv_bc_dat;
+ Llu->Linv_bc_offset = Linv_bc_offset;
+ Llu->Linv_bc_cnt = Linv_bc_cnt;
+
+ Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+ Llu->Uinv_bc_dat = Uinv_bc_dat;
+ Llu->Uinv_bc_offset = Uinv_bc_offset;
+ Llu->Uinv_bc_cnt = Uinv_bc_cnt;
+
+ Llu->Urbs = Urbs;
+ Llu->Ucb_indptr = Ucb_indptr;
+ Llu->Ucb_inddat = Ucb_inddat;
+ Llu->Ucb_indoffset = Ucb_indoffset;
+ Llu->Ucb_indcnt = Ucb_indcnt;
+ Llu->Ucb_valptr = Ucb_valptr;
+ Llu->Ucb_valdat = Ucb_valdat;
+ Llu->Ucb_valoffset = Ucb_valoffset;
+ Llu->Ucb_valcnt = Ucb_valcnt;
#ifdef GPU_ACC
- checkGPU(gpuMalloc( (void**)&Llu->d_xsup, (n+1) * sizeof(int_t)));
- checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n+1) * sizeof(int_t), gpuMemcpyHostToDevice));
- checkGPU(gpuMalloc( (void**)&Llu->d_LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
- checkGPU(gpuMalloc( (void**)&Llu->d_LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
- checkGPU(gpuMalloc( (void**)&Llu->d_URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
- checkGPU(gpuMalloc( (void**)&Llu->d_UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
- checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
- checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
- checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
- checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
- checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t)));
- checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
- checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
- checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
- checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
- checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
- checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
- checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
- checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
- checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_xsup, (n+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
- // some dummy allocation to avoid checking whether they are null pointers later
- checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, sizeof(int_t)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, sizeof(int64_t)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(double)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, sizeof(int64_t)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t)));
-
-
- checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
- checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
- checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
- checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
- checkGPU(gpuMalloc( (void**)&Llu->d_ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t)));
- checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t), gpuMemcpyHostToDevice));
-
-
- /* gpuMemcpy for the following is performed in pxgssvx */
- checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(double)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(double)));
- checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(double)));
+ // some dummy allocation to avoid checking whether they are null pointers later
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(double)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t)));
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+
+ /* gpuMemcpy for the following is performed in pxgssvx */
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(double)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(double)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(double)));
-#endif
+# endif /* end ifdef GPU_ACC */
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
@@ -3229,11 +3301,22 @@ double *dense, *dense_col; /* SPA */
MPI_MAX, grid->comm);
#if ( DEBUGlevel>=1 )
- /* Memory allocated but not freed:
+ /* Memory allocated but not free'd:
ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
ToRecv, ToSendR, ToSendD, mod_bit
*/
- CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
+ CHECK_MALLOC(iam, "Exit ddist_psymbtonum()");
+#endif
+
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t. end ddist_psymbtonum: memDist %.4f, memNLU %.4f, memTRS %.2f\n",
+ memDist*1e-6, memNLU*1e-6, memTRS*1e-6);
+ printf("\t\t. dense[] SPA %.4f (MB), ldaspa %d, ldaspa_j %d\n",
+ SUPERLU_MAX(ldaspa, ldaspa_j) * sp_ienv_dist(3, options) * dword * 1e-6,
+ (int) ldaspa, (int) ldaspa_j);
+ fflush(stdout);
+ }
#endif
return (- (memDist+memNLU));
diff --git a/SRC/pdutil.c b/SRC/pdutil.c
index 04bf7905..71b7e730 100755
--- a/SRC/pdutil.c
+++ b/SRC/pdutil.c
@@ -17,6 +17,9 @@ at the top-level directory.
* -- Distributed SuperLU routine (version 2.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* March 15, 2003
+ *
+ * Last modified:
+ * December 28, 2022
*
*/
@@ -433,53 +436,6 @@ void dLUstructFree(dLUstruct_t *LUstruct)
#endif
}
-void
-dDestroy_Tree(int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
-{
- int_t i, nb, nsupers;
- Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
- dLocalLU_t *Llu = LUstruct->Llu;
-#if ( DEBUGlevel>=1 )
- int iam;
- MPI_Comm_rank( MPI_COMM_WORLD, &iam );
- CHECK_MALLOC(iam, "Enter dDestroy_Tree()");
-#endif
-
- nsupers = Glu_persist->supno[n-1] + 1;
-
- nb = CEILING(nsupers, grid->npcol);
- for (i=0;i<nb;++i){
- if(Llu->LBtree_ptr[i].empty_==NO){
- // BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt);
- C_BcTree_Nullify(&Llu->LBtree_ptr[i]);
- }
- if(Llu->UBtree_ptr[i].empty_==NO){
- // BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt);
- C_BcTree_Nullify(&Llu->UBtree_ptr[i]);
- }
- }
- SUPERLU_FREE(Llu->LBtree_ptr);
- SUPERLU_FREE(Llu->UBtree_ptr);
-
- nb = CEILING(nsupers, grid->nprow);
- for (i=0;i<nb;++i){
- if(Llu->LRtree_ptr[i].empty_==NO){
- // RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt);
- C_RdTree_Nullify(&Llu->LRtree_ptr[i]);
- }
- if(Llu->URtree_ptr[i].empty_==NO){
- // RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt);
- C_RdTree_Nullify(&Llu->URtree_ptr[i]);
- }
- }
- SUPERLU_FREE(Llu->LRtree_ptr);
- SUPERLU_FREE(Llu->URtree_ptr);
-
-#if ( DEBUGlevel>=1 )
- CHECK_MALLOC(iam, "Exit dDestroy_Tree()");
-#endif
-}
-
/*! \brief Destroy distributed L & U matrices. */
void
dDestroy_LU(int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
@@ -498,25 +454,32 @@ dDestroy_LU(int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
nsupers = Glu_persist->supno[n-1] + 1;
- nb = CEILING(nsupers, grid->npcol);
- // for (i = 0; i < nb; ++i)
- // if ( Llu->Lrowind_bc_ptr[i] ) {
- // SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
- // SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
- // }
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->npcol);
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Lrowind_bc_ptr[i] ) {
+ // SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#if 0 // Sherry: the following is not allocated with cudaHostAlloc
+ //#ifdef GPU_ACC
+ checkGPU(gpuFreeHost(Llu->Lnzval_bc_ptr[i]));
+#endif
+ // SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+ // }
+
SUPERLU_FREE (Llu->Lrowind_bc_ptr);
SUPERLU_FREE (Llu->Lrowind_bc_dat);
SUPERLU_FREE (Llu->Lrowind_bc_offset);
SUPERLU_FREE (Llu->Lnzval_bc_ptr);
SUPERLU_FREE (Llu->Lnzval_bc_dat);
SUPERLU_FREE (Llu->Lnzval_bc_offset);
-
+
+ /* Following are free'd in distribution routines */
// nb = CEILING(nsupers, grid->nprow);
// for (i = 0; i < nb; ++i)
- // if ( Llu->Ufstnz_br_ptr[i] ) {
- // SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
- // SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
- // }
+ // if ( Llu->Ufstnz_br_ptr[i] ) {
+ // SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+ // SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+ // }
SUPERLU_FREE (Llu->Ufstnz_br_ptr);
SUPERLU_FREE (Llu->Ufstnz_br_dat);
SUPERLU_FREE (Llu->Ufstnz_br_offset);
@@ -533,31 +496,32 @@ dDestroy_LU(int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
/* The following can be freed only after iterative refinement. */
SUPERLU_FREE(Llu->ilsum);
SUPERLU_FREE(Llu->fmod);
- SUPERLU_FREE(Llu->fsendx_plist[0]);
+ SUPERLU_FREE((Llu->fsendx_plist)[0]);
SUPERLU_FREE(Llu->fsendx_plist);
SUPERLU_FREE(Llu->bmod);
- SUPERLU_FREE(Llu->bsendx_plist[0]);
+ SUPERLU_FREE((Llu->bsendx_plist)[0]);
SUPERLU_FREE(Llu->bsendx_plist);
SUPERLU_FREE(Llu->mod_bit);
+ /* Following are free'd in distribution routines */
// nb = CEILING(nsupers, grid->npcol);
- // for (i = 0; i < nb; ++i)
- // if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
- // SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
- // }
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
+ // SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
+ // }
SUPERLU_FREE(Llu->Lindval_loc_bc_ptr);
SUPERLU_FREE(Llu->Lindval_loc_bc_dat);
SUPERLU_FREE(Llu->Lindval_loc_bc_offset);
-
+
+ /* Following are free'd in distribution routines */
// nb = CEILING(nsupers, grid->npcol);
// for (i=0; i<nb; ++i) {
- // // if(Llu->Linv_bc_ptr[i]!=NULL) {
- // // SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
- // // }
-
- // if(Llu->Uinv_bc_ptr[i]!=NULL){
- // SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
- // }
+ // if(Llu->Linv_bc_ptr[i]!=NULL) {
+ // SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
+ // }
+ // if(Llu->Uinv_bc_ptr[i]!=NULL){
+ // SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
+ // }
// }
SUPERLU_FREE(Llu->Linv_bc_ptr);
SUPERLU_FREE(Llu->Linv_bc_dat);
@@ -566,42 +530,42 @@ dDestroy_LU(int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
SUPERLU_FREE(Llu->Uinv_bc_dat);
SUPERLU_FREE(Llu->Uinv_bc_offset);
SUPERLU_FREE(Llu->Unnz);
-
+
+ /* Following are free'd in distribution routines */
// nb = CEILING(nsupers, grid->npcol);
// for (i = 0; i < nb; ++i)
- // if ( Llu->Urbs[i] ) {
- // SUPERLU_FREE(Llu->Ucb_indptr[i]);
- // SUPERLU_FREE(Llu->Ucb_valptr[i]);
- // }
+ // if ( Llu->Urbs[i] ) {
+ // SUPERLU_FREE(Llu->Ucb_indptr[i]);
+ // SUPERLU_FREE(Llu->Ucb_valptr[i]);
+ // }
SUPERLU_FREE(Llu->Ucb_indptr);
SUPERLU_FREE(Llu->Ucb_inddat);
SUPERLU_FREE(Llu->Ucb_indoffset);
-
SUPERLU_FREE(Llu->Ucb_valptr);
SUPERLU_FREE(Llu->Ucb_valdat);
SUPERLU_FREE(Llu->Ucb_valoffset);
SUPERLU_FREE(Llu->Urbs);
-
+
SUPERLU_FREE(Glu_persist->xsup);
SUPERLU_FREE(Glu_persist->supno);
#ifdef GPU_ACC
- checkGPU (gpuFree (Llu->d_xsup));
- checkGPU (gpuFree (Llu->d_LRtree_ptr));
- checkGPU (gpuFree (Llu->d_LBtree_ptr));
- checkGPU (gpuFree (Llu->d_URtree_ptr));
- checkGPU (gpuFree (Llu->d_UBtree_ptr));
- checkGPU (gpuFree (Llu->d_ilsum));
- checkGPU (gpuFree (Llu->d_Lrowind_bc_dat));
- checkGPU (gpuFree (Llu->d_Lrowind_bc_offset));
- checkGPU (gpuFree (Llu->d_Lnzval_bc_dat));
- checkGPU (gpuFree (Llu->d_Lnzval_bc_offset));
- checkGPU (gpuFree (Llu->d_Linv_bc_dat));
- checkGPU (gpuFree (Llu->d_Uinv_bc_dat));
- checkGPU (gpuFree (Llu->d_Linv_bc_offset));
- checkGPU (gpuFree (Llu->d_Uinv_bc_offset));
- checkGPU (gpuFree (Llu->d_Lindval_loc_bc_dat));
- checkGPU (gpuFree (Llu->d_Lindval_loc_bc_offset));
+ checkGPU (gpuFree (Llu->d_xsup));
+ checkGPU (gpuFree (Llu->d_LRtree_ptr));
+ checkGPU (gpuFree (Llu->d_LBtree_ptr));
+ checkGPU (gpuFree (Llu->d_URtree_ptr));
+ checkGPU (gpuFree (Llu->d_UBtree_ptr));
+ checkGPU (gpuFree (Llu->d_ilsum));
+ checkGPU (gpuFree (Llu->d_Lrowind_bc_dat));
+ checkGPU (gpuFree (Llu->d_Lrowind_bc_offset));
+ checkGPU (gpuFree (Llu->d_Lnzval_bc_dat));
+ checkGPU (gpuFree (Llu->d_Lnzval_bc_offset));
+ checkGPU (gpuFree (Llu->d_Linv_bc_dat));
+ checkGPU (gpuFree (Llu->d_Uinv_bc_dat));
+ checkGPU (gpuFree (Llu->d_Linv_bc_offset));
+ checkGPU (gpuFree (Llu->d_Uinv_bc_offset));
+ checkGPU (gpuFree (Llu->d_Lindval_loc_bc_dat));
+ checkGPU (gpuFree (Llu->d_Lindval_loc_bc_offset));
checkGPU (gpuFree (Llu->d_Ucolind_bc_dat));
checkGPU (gpuFree (Llu->d_Ucolind_bc_offset));
@@ -609,10 +573,8 @@ dDestroy_LU(int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
checkGPU (gpuFree (Llu->d_Unzval_bc_offset));
checkGPU (gpuFree (Llu->d_Uindval_loc_bc_dat));
checkGPU (gpuFree (Llu->d_Uindval_loc_bc_offset));
-
#endif
-
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit dDestroy_LU()");
#endif
@@ -625,7 +587,7 @@ dDestroy_LU(int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
* =======
* Set up the communication pattern for redistribution between B and X
* in the triangular solution.
- *
+ *
* Arguments
* =========
*
@@ -696,7 +658,7 @@ pdgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row,
p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
++SendCnt[p];
}
-
+
/* Set up the displacements for alltoall. */
MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
sdispls[0] = rdispls[0] = 0;
@@ -899,7 +861,7 @@ void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid
SUPERLU_FREE( A2d->colind );
SUPERLU_FREE( A2d->nzval );
}
- SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts
+ SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts
SUPERLU_FREE(A3d->row_disp);
SUPERLU_FREE(A3d->nnz_counts_int);
SUPERLU_FREE(A3d->nnz_disp);
@@ -925,27 +887,82 @@ void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx,
double err, xnorm, temperr, tempxnorm;
double *x_work, *xtrue_work;
int i, j;
+ double errcomp; // componentwise error: max_i |x_i - xtrue_i| / |x_i|
+ double derr; // |x_i - xtrue_i| of the entry currently visited
for (j = 0; j < nrhs; j++) {
x_work = &x[j*ldx];
xtrue_work = &xtrue[j*ldxtrue];
- err = xnorm = 0.0;
+ err = xnorm = errcomp = 0.0;
for (i = 0; i < n; i++) {
- err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
+ derr = fabs(x_work[i] - xtrue_work[i]);
+ err = SUPERLU_MAX(err, derr);
xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
+ if (derr > 0.0) errcomp = SUPERLU_MAX(errcomp, derr / fabs(x_work[i])); /* an exact match adds nothing; skipping it also avoids 0/0 = NaN entering the running max */
}
/* get the golbal max err & xnrom */
temperr = err;
- tempxnorm = xnorm;
MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm);
+ tempxnorm = xnorm;
MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm);
+ temperr = errcomp;
+ MPI_Allreduce( &temperr, &errcomp, 1, MPI_DOUBLE, MPI_MAX, slucomm); /* both buffers are double, so the datatype must be MPI_DOUBLE */
err = err / xnorm;
- if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
+ if ( !iam ) {
+ printf(".. Sol %2d: ||X - Xtrue|| / ||X|| = %e\t max_i |x - xtrue|_i / |x|_i = %e\n", j, err, errcomp);
+ fflush(stdout);
+ }
}
}
-/*! \brief Destroy distributed L & U matrices. */
+/*! \brief Destroy broadcast and reduction trees used in triangular solve: nullifies every non-empty L/U tree entry, then frees the four tree arrays held in LUstruct->Llu. */
+void
+dDestroy_Tree(int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct)
+{
+ int i, nb, nsupers;
+ Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+ dLocalLU_t *Llu = LUstruct->Llu;
+#if ( DEBUGlevel>=1 )
+ int iam;
+ MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+ CHECK_MALLOC(iam, "Enter dDestroy_Tree()");
+#endif
+
+ nsupers = Glu_persist->supno[n-1] + 1; /* supernode index of the last column, plus one */
+
+ nb = CEILING(nsupers, grid->npcol); /* length of the broadcast-tree arrays walked below */
+ for (i=0;i<nb;++i){
+ if(Llu->LBtree_ptr[i].empty_==NO){
+ // BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt);
+ C_BcTree_Nullify(&Llu->LBtree_ptr[i]);
+ }
+ if(Llu->UBtree_ptr[i].empty_==NO){
+ // BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt);
+ C_BcTree_Nullify(&Llu->UBtree_ptr[i]);
+ }
+ }
+ SUPERLU_FREE(Llu->LBtree_ptr);
+ SUPERLU_FREE(Llu->UBtree_ptr);
+
+ nb = CEILING(nsupers, grid->nprow); /* length of the reduction-tree arrays walked below */
+ for (i=0;i<nb;++i){
+ if(Llu->LRtree_ptr[i].empty_==NO){
+ // RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt);
+ C_RdTree_Nullify(&Llu->LRtree_ptr[i]);
+ }
+ if(Llu->URtree_ptr[i].empty_==NO){
+ // RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt);
+ C_RdTree_Nullify(&Llu->URtree_ptr[i]);
+ }
+ }
+ SUPERLU_FREE(Llu->LRtree_ptr);
+ SUPERLU_FREE(Llu->URtree_ptr);
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit dDestroy_Tree()");
+#endif
+}
diff --git a/SRC/psdistribute.c b/SRC/psdistribute.c
index 8535a6bc..681954ce 100644
--- a/SRC/psdistribute.c
+++ b/SRC/psdistribute.c
@@ -17,11 +17,14 @@ at the top-level directory.
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 15, 2008
* October 18, 2021, minor fix, v7.1.1
+ * January 9, 2023, add new data structures for SpTRSV
*
*/
#include "superlu_sdefs.h"
-
+#ifdef GPU_ACC
+#include "gpu_api_utils.h"
+#endif
/*! \brief
*
@@ -402,21 +405,42 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int *index1; /* temporary pointer to array of int */
float *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */
+ long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
+
int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Unnz; /* size ceil(NSUPERS/Pc) */
- float **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Lrowind_bc_dat; /* size: sum of sizes of Lrowind_bc_ptr[lk]) */
+ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+
+ int_t *Unnz; /* size ceil(NSUPERS/Pc) */
+ float **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
+ float *Unzval_br_dat; /* size: sum of sizes of Unzval_br_ptr[lk]) */
+ long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Unzval_br_cnt=0;
int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */
+ long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Ufstnz_br_cnt=0;
- C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
- int msgsize;
+ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
+ int msgsize;
int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+ Ucb_indptr_t *Ucb_inddat;
+ long int *Ucb_indoffset;
+ long int Ucb_indcnt=0;
int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+ int_t *Ucb_valdat;
+ long int *Ucb_valoffset;
+ long int Ucb_valcnt=0;
+
/*-- Counts to be used in factorization. --*/
int *ToRecv, *ToSendD, **ToSendR;
@@ -464,7 +488,11 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int *frecv, *brecv;
int_t *lloc;
float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */
+ long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */
+ long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */
double *SeedSTD_BC,*SeedSTD_RD;
int_t idx_indx,idx_lusup;
int_t nbrow;
@@ -675,8 +703,18 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(Unzval_br_ptr =
(float**)SUPERLU_MALLOC(k * sizeof(float*))) )
ABORT("Malloc fails for Unzval_br_ptr[].");
+ if ( !(Unzval_br_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Unzval_br_offset[]."); /* must not fall through: the array is written on the next line */
+ }
+ Unzval_br_offset[k-1] = -1;
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Ufstnz_br_ptr[].");
+ if ( !(Ufstnz_br_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Ufstnz_br_offset[]."); /* must not fall through: the array is written on the next line */
+ }
+ Ufstnz_br_offset[k-1] = -1;
if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
ABORT("Malloc fails for ToSendD[].");
@@ -767,8 +805,13 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(index = intMalloc_dist(len1+1)) )
ABORT("Malloc fails for Uindex[].");
Ufstnz_br_ptr[lb] = index;
+ Ufstnz_br_offset[lb]=len1+1;
+ Ufstnz_br_cnt += Ufstnz_br_offset[lb];
if ( !(Unzval_br_ptr[lb] = floatMalloc_dist(len)) )
ABORT("Malloc fails for Unzval_br_ptr[*][].");
+ Unzval_br_offset[lb]=len;
+ Unzval_br_cnt += Unzval_br_offset[lb];
+
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
index[0] = Ucbs[lb]; /* Number of column blocks */
@@ -778,6 +821,8 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} else {
Ufstnz_br_ptr[lb] = NULL;
Unzval_br_ptr[lb] = NULL;
+ Unzval_br_offset[lb]=-1;
+ Ufstnz_br_offset[lb]=-1;
}
Urb_length[lb] = 0; /* Reset block length. */
Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
@@ -826,22 +871,47 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lrowind_bc_ptr[].");
Lrowind_bc_ptr[k-1] = NULL;
+ if ( !(Lrowind_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Lrowind_bc_offset[]."); /* must not fall through: the array is written on the next line */
+ }
+ Lrowind_bc_offset[k-1] = -1;
+ if ( !(Lnzval_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Lnzval_bc_offset[].");
+ }
+ Lnzval_bc_offset[k-1] = -1;
if ( !(Lindval_loc_bc_ptr =
(int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
Lindval_loc_bc_ptr[k-1] = NULL;
+ if ( !(Lindval_loc_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Lindval_loc_bc_offset[].");
+ }
+ Lindval_loc_bc_offset[k-1] = -1;
if ( !(Linv_bc_ptr =
- (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) {
+ (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) {
fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
}
+ if ( !(Linv_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Linv_bc_offset[]."); /* Linv_bc_offset[k-1] is written a few lines below */
+ }
if ( !(Uinv_bc_ptr =
- (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) {
+ (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) {
fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
}
+ if ( !(Uinv_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Uinv_bc_offset[]."); /* Uinv_bc_offset[k-1] is written a few lines below */
+ }
Linv_bc_ptr[k-1] = NULL;
Uinv_bc_ptr[k-1] = NULL;
+ Linv_bc_offset[k-1] = -1;
+ Uinv_bc_offset[k-1] = -1;
if ( !(Unnz =
(int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
@@ -872,6 +942,11 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
+ long int Linv_bc_cnt=0;
+ long int Uinv_bc_cnt=0;
+ long int Lrowind_bc_cnt=0;
+ long int Lnzval_bc_cnt=0;
+ long int Lindval_loc_bc_cnt=0;
for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
pc = PCOL( jb, grid );
@@ -1014,14 +1089,37 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
if ( !(index = intMalloc_dist(len1)) )
ABORT("Malloc fails for index[]");
+ Lrowind_bc_offset[ljb]=len1;
+ Lrowind_bc_cnt += Lrowind_bc_offset[ljb];
+
if (!(lusup = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float))))
ABORT("Malloc fails for lusup[]");
+ Lnzval_bc_offset[ljb]=len*nsupc;
+ Lnzval_bc_cnt += Lnzval_bc_offset[ljb];
if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) )
ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
- if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
- ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
- if (!(Uinv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
- ABORT("Malloc fails for Uinv_bc_ptr[ljb][]");
+ Lindval_loc_bc_offset[ljb]=nrbl*3;
+ Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb];
+
+ myrow = MYROW( iam, grid );
+ krow = PROW( jb, grid );
+ if(myrow==krow){ /* diagonal block */
+ if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
+ ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
+ Linv_bc_offset[ljb]=nsupc*nsupc;
+ Linv_bc_cnt += Linv_bc_offset[ljb];
+
+ if (!(Uinv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
+ ABORT("Malloc fails for Uinv_bc_ptr[ljb][]");
+ Uinv_bc_offset[ljb]=nsupc*nsupc;
+ Uinv_bc_cnt += Uinv_bc_offset[ljb];
+ }else{
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ }
+
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
@@ -1134,9 +1232,14 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} else {
Lrowind_bc_ptr[ljb] = NULL;
Lnzval_bc_ptr[ljb] = NULL;
- Linv_bc_ptr[ljb] = NULL;
- Uinv_bc_ptr[ljb] = NULL;
- Lindval_loc_bc_ptr[ljb] = NULL;
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Lrowind_bc_offset[ljb]=-1;
+ Lindval_loc_bc_offset[ljb]=-1;
+ Lnzval_bc_offset[ljb]=-1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ Lindval_loc_bc_ptr[ljb] = NULL;
} /* if nrbl ... */
#if ( PROFlevel>=1 )
t_l += SuperLU_timer_() - t;
@@ -1145,6 +1248,98 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} /* for jb ... */
+ Linv_bc_cnt +=1; // safe guard
+ Uinv_bc_cnt +=1;
+ Lrowind_bc_cnt +=1;
+ Lindval_loc_bc_cnt +=1;
+ Lnzval_bc_cnt +=1;
+ if ( !(Linv_bc_dat =
+ (float*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(float))) ) {
+ ABORT("Malloc fails for Linv_bc_dat[]."); /* the packing loop below writes into this buffer */
+ }
+ if ( !(Uinv_bc_dat =
+ (float*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(float))) ) {
+ ABORT("Malloc fails for Uinv_bc_dat[].");
+ }
+
+ if ( !(Lrowind_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
+ ABORT("Malloc fails for Lrowind_bc_dat[].");
+ }
+ if ( !(Lindval_loc_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
+ ABORT("Malloc fails for Lindval_loc_bc_dat[].");
+ }
+ if ( !(Lnzval_bc_dat =
+ (float*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(float))) ) {
+ ABORT("Malloc fails for Lnzval_bc_dat[].");
+ }
+
+ /* use contiguous memory for Linv_bc_ptr, Uinv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr, Lindval_loc_bc_ptr */
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Linv_bc_cnt=0;
+ Uinv_bc_cnt=0;
+ Lrowind_bc_cnt=0;
+ Lnzval_bc_cnt=0;
+ Lindval_loc_bc_cnt=0;
+ long int tmp_cnt;
+ for (jb = 0; jb < k; ++jb) { /* for each block column ... */
+ if(Linv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) {
+ Linv_bc_dat[Linv_bc_cnt+jj]=Linv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Linv_bc_ptr[jb]);
+ Linv_bc_ptr[jb]=&Linv_bc_dat[Linv_bc_cnt];
+ tmp_cnt = Linv_bc_offset[jb];
+ Linv_bc_offset[jb]=Linv_bc_cnt;
+ Linv_bc_cnt+=tmp_cnt;
+ }
+
+ if(Uinv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) {
+ Uinv_bc_dat[Uinv_bc_cnt+jj]=Uinv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Uinv_bc_ptr[jb]);
+ Uinv_bc_ptr[jb]=&Uinv_bc_dat[Uinv_bc_cnt];
+ tmp_cnt = Uinv_bc_offset[jb];
+ Uinv_bc_offset[jb]=Uinv_bc_cnt;
+ Uinv_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lrowind_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) {
+ Lrowind_bc_dat[Lrowind_bc_cnt+jj]=Lrowind_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lrowind_bc_ptr[jb]);
+ Lrowind_bc_ptr[jb]=&Lrowind_bc_dat[Lrowind_bc_cnt];
+ tmp_cnt = Lrowind_bc_offset[jb];
+ Lrowind_bc_offset[jb]=Lrowind_bc_cnt;
+ Lrowind_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lnzval_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) {
+ Lnzval_bc_dat[Lnzval_bc_cnt+jj]=Lnzval_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lnzval_bc_ptr[jb]);
+ Lnzval_bc_ptr[jb]=&Lnzval_bc_dat[Lnzval_bc_cnt];
+ tmp_cnt = Lnzval_bc_offset[jb];
+ Lnzval_bc_offset[jb]=Lnzval_bc_cnt;
+ Lnzval_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lindval_loc_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) {
+ Lindval_loc_bc_dat[Lindval_loc_bc_cnt+jj]=Lindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lindval_loc_bc_ptr[jb]);
+ Lindval_loc_bc_ptr[jb]=&Lindval_loc_bc_dat[Lindval_loc_bc_cnt];
+ tmp_cnt = Lindval_loc_bc_offset[jb];
+ Lindval_loc_bc_offset[jb]=Lindval_loc_bc_cnt;
+ Lindval_loc_bc_cnt+=tmp_cnt;
+ }
+ } /* for jb ... */
+
/////////////////////////////////////////////////////////////////
/* Set up additional pointers for the index and value arrays of U.
@@ -1158,6 +1353,17 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
ABORT("Malloc fails for Ucb_indptr[]");
if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
ABORT("Malloc fails for Ucb_valptr[]");
+ if ( !(Ucb_valoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_valoffset[].");
+ }
+ Ucb_valoffset[nub-1] = -1;
+ if ( !(Ucb_indoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_indoffset[].");
+ }
+ Ucb_indoffset[nub-1] = -1;
+
nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
/* Count number of row blocks in a block column.
@@ -1180,10 +1386,19 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
for (lb = 0; lb < nub; ++lb) {
if ( Urbs[lb] ) { /* Not an empty block column. */
if ( !(Ucb_indptr[lb]
- = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+ = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
ABORT("Malloc fails for Ucb_indptr[lb][]");
+ Ucb_indoffset[lb]=Urbs[lb];
+ Ucb_indcnt += Ucb_indoffset[lb];
if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
ABORT("Malloc fails for Ucb_valptr[lb][]");
+ Ucb_valoffset[lb]=Urbs[lb];
+ Ucb_valcnt += Ucb_valoffset[lb];
+ }else{
+ Ucb_valptr[lb]=NULL;
+ Ucb_valoffset[lb]=-1;
+ Ucb_indptr[lb]=NULL;
+ Ucb_indoffset[lb]=-1;
}
}
for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
@@ -1228,6 +1443,81 @@ psdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
}
}
+ Unzval_br_cnt +=1; // safe guard
+ Ufstnz_br_cnt +=1;
+ Ucb_valcnt +=1;
+ Ucb_indcnt +=1;
+ if ( !(Unzval_br_dat =
+ (float*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(float))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
+ }
+ if ( !(Ufstnz_br_dat =
+ (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Ufstnz_br_dat[].");
+ }
+ if ( !(Ucb_valdat =
+ (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_valdat[].");
+ }
+ if ( !(Ucb_inddat =
+ (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_inddat[].");
+ }
+
+ /* use contiguous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr, Ucb_indptr */
+ k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+ Unzval_br_cnt=0;
+ Ufstnz_br_cnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block row ... */
+ if(Unzval_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Unzval_br_offset[lb]; ++jj) {
+ Unzval_br_dat[Unzval_br_cnt+jj]=Unzval_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Unzval_br_ptr[lb]);
+ Unzval_br_ptr[lb]=&Unzval_br_dat[Unzval_br_cnt];
+ tmp_cnt = Unzval_br_offset[lb];
+ Unzval_br_offset[lb]=Unzval_br_cnt;
+ Unzval_br_cnt+=tmp_cnt;
+ }
+
+ if(Ufstnz_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Ufstnz_br_offset[lb]; ++jj) {
+ Ufstnz_br_dat[Ufstnz_br_cnt+jj]=Ufstnz_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Ufstnz_br_ptr[lb]);
+ Ufstnz_br_ptr[lb]=&Ufstnz_br_dat[Ufstnz_br_cnt];
+ tmp_cnt = Ufstnz_br_offset[lb];
+ Ufstnz_br_offset[lb]=Ufstnz_br_cnt;
+ Ufstnz_br_cnt+=tmp_cnt;
+ }
+ }
+
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Ucb_valcnt=0;
+ Ucb_indcnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block column ... */
+ if(Ucb_valptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_valoffset[lb]; ++jj) {
+ Ucb_valdat[Ucb_valcnt+jj]=Ucb_valptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_valptr[lb]);
+ Ucb_valptr[lb]=&Ucb_valdat[Ucb_valcnt];
+ tmp_cnt = Ucb_valoffset[lb];
+ Ucb_valoffset[lb]=Ucb_valcnt;
+ Ucb_valcnt+=tmp_cnt;
+ }
+ if(Ucb_indptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_indoffset[lb]; ++jj) {
+ Ucb_inddat[Ucb_indcnt+jj]=Ucb_indptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_indptr[lb]);
+ Ucb_indptr[lb]=&Ucb_inddat[Ucb_indcnt];
+ tmp_cnt = Ucb_indoffset[lb];
+ Ucb_indoffset[lb]=Ucb_indcnt;
+ Ucb_indcnt+=tmp_cnt;
+ }
+ } /* for lb ... */
+
/////////////////////////////////////////////////////////////////
#if ( PROFlevel>=1 )
@@ -1730,10 +2020,8 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
}
/* Every process receives the count, but it is only useful on the
diagonal processes. */
- //MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
-
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(URtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
ABORT("Malloc fails for URtree_ptr[].");
@@ -1920,12 +2208,31 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
////////////////////////////////////////////////////////
-
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+ Llu->Lrowind_bc_dat = Lrowind_bc_dat;
+ Llu->Lrowind_bc_offset = Lrowind_bc_offset;
+ Llu->Lrowind_bc_cnt = Lrowind_bc_cnt;
+
Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
+ Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat;
+ Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset;
+ Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt;
+
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+ Llu->Lnzval_bc_dat = Lnzval_bc_dat;
+ Llu->Lnzval_bc_offset = Lnzval_bc_offset;
+ Llu->Lnzval_bc_cnt = Lnzval_bc_cnt;
+
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+ Llu->Ufstnz_br_dat = Ufstnz_br_dat;
+ Llu->Ufstnz_br_offset = Ufstnz_br_offset;
+ Llu->Ufstnz_br_cnt = Ufstnz_br_cnt;
+
Llu->Unzval_br_ptr = Unzval_br_ptr;
+ Llu->Unzval_br_dat = Unzval_br_dat;
+ Llu->Unzval_br_offset = Unzval_br_offset;
+ Llu->Unzval_br_cnt = Unzval_br_cnt;
+
Llu->Unnz = Unnz;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
@@ -1945,11 +2252,74 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
Llu->LBtree_ptr = LBtree_ptr;
Llu->URtree_ptr = URtree_ptr;
Llu->UBtree_ptr = UBtree_ptr;
+
Llu->Linv_bc_ptr = Linv_bc_ptr;
+ Llu->Linv_bc_dat = Linv_bc_dat;
+ Llu->Linv_bc_offset = Linv_bc_offset;
+ Llu->Linv_bc_cnt = Linv_bc_cnt;
+
Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+ Llu->Uinv_bc_dat = Uinv_bc_dat;
+ Llu->Uinv_bc_offset = Uinv_bc_offset;
+ Llu->Uinv_bc_cnt = Uinv_bc_cnt;
+
Llu->Urbs = Urbs;
Llu->Ucb_indptr = Ucb_indptr;
+ Llu->Ucb_inddat = Ucb_inddat;
+ Llu->Ucb_indoffset = Ucb_indoffset;
+ Llu->Ucb_indcnt = Ucb_indcnt;
Llu->Ucb_valptr = Ucb_valptr;
+ Llu->Ucb_valdat = Ucb_valdat;
+ Llu->Ucb_valoffset = Ucb_valoffset;
+ Llu->Ucb_valcnt = Ucb_valcnt;
+
+
+#ifdef GPU_ACC
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_xsup, (n+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+
+ // some dummy allocation to avoid checking whether they are null pointers later
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(float)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t)));
+
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+
+
+ /* gpuMemcpy for the following is performed in pxgssvx */
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(float)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(float)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(float)));
+
+#endif
#if ( PRNTlevel>=1 )
diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c
index 2991ec31..620c3584 100644
--- a/SRC/psgssvx.c
+++ b/SRC/psgssvx.c
@@ -720,13 +720,9 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
if ( iinfo > 0 ) {
if ( iinfo <= m ) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "The %d-th row of A is exactly zero\n", (int)iinfo);
-#endif
} else {
-#if ( PRNTlevel>=1 )
- fprintf(stderr, "The %d-th column of A is exactly zero\n", (int)iinfo-n);
-#endif
+ fprintf(stderr, "The %d-th column of A is exactly zero\n", (int)(iinfo-n));
}
} else if ( iinfo < 0 ) return;
@@ -1000,7 +996,7 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
printf("{" IFMT "," IFMT "}: psgssvx: invalid ColPerm option when ParSymbfact is used\n",
MYROW(grid->iam, grid), MYCOL(grid->iam, grid));
}
- }
+ } /* end preparing for parallel symbolic */
if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
/* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */
@@ -1020,9 +1016,7 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
// }
// }
if (flinfo > 0) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n");
-#endif
*info = flinfo;
return;
}
@@ -1072,10 +1066,11 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
ABORT("Malloc fails for Glu_freeable.");
- /* Every process does this. */
+ /* Every process does this.
+ returned value (-iinfo) is the size of lsub[], including pruned graph.*/
iinfo = symbfact(options, iam, &GAC, perm_c, etree,
Glu_persist, Glu_freeable);
- nnzLU = Glu_freeable->nnzLU;
+ nnzLU = Glu_freeable->nnzLU;
stat->utime[SYMBFAC] = SuperLU_timer_() - t;
if ( iinfo <= 0 ) { /* Successful return */
QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
@@ -1095,10 +1090,8 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
}
#endif
} else { /* symbfact out of memory */
-#if ( PRNTlevel>=1 )
if ( !iam )
fprintf(stderr,"symbfact() error returns " IFMT "\n",iinfo);
-#endif
*info = iinfo;
return;
}
@@ -1113,9 +1106,7 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
nnzLU = Pslu_freeable.nnzLU;
stat->utime[SYMBFAC] = SuperLU_timer_() - t;
if (flinfo > 0) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "Insufficient memory for parallel symbolic factorization.");
-#endif
*info = flinfo;
return;
}
@@ -1163,6 +1154,8 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
t = SuperLU_timer_();
dist_mem_use = sdist_psymbtonum(options, n, A, ScalePermstruct,
&Pslu_freeable, LUstruct, grid);
+
+ /* dist_mem_use = memDist + memNLU */
if (dist_mem_use > 0)
ABORT ("Not enough memory available for dist_psymbtonum\n");
@@ -1281,28 +1274,62 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
MPI_SUM, 0, grid->comm );
stat->TinyPivots = TinyPivots;
+ if ( iam==0 ) {
+ printf("\n** Memory Usage **********************************\n");
+ }
+
+ /* Compute numerical factorization memory */
+ sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
+
/*-- Compute high watermark of all stages --*/
if (parSymbFact == TRUE) {
/* The memory used in the redistribution routine
includes the memory used for storing the symbolic
structure and the memory allocated for numerical
factorization */
- mem_stage[0] = (-flinfo); /* symbfact step */
- mem_stage[1] = (-dist_mem_use); /* distribution step */
+ /* parallel symbfact step:
+ (-flinfo) is the allocMem returned from symbfact_dist() */
+ mem_stage[0] = symb_mem_usage.total + (-flinfo);
+
+ /* see leading comment of dist_symbLU() */
+ /* dist_mem_use = (memDist + memNLU) in sdist_psymbtonum() */
+ mem_stage[1] = symb_mem_usage.for_lu + (-dist_mem_use); /* distribution step */
loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1] );
if ( options->RowPerm != NO )
loc_max = SUPERLU_MAX(loc_max, GA_mem_use);
- } else {
+
+#if ( PRNTlevel>=1 )
+ if ( iam==0 ) {
+ printf("\t(P0) Globle A for MC64: GA_mem_use %.2f\n", GA_mem_use*1e-6);
+ printf("\t(P0) parallel symbolic::stage[0]: symb_memory %.2f, allocMem %.2f\n",
+ symb_mem_usage.total*1e-6, (-flinfo)*1e-6);
+ printf("\t(P0) parallel distribution::stage[1]: symb_LU %.2f, dist_mem_use %.2f\n",
+ symb_mem_usage.for_lu*1e-6, (-dist_mem_use)*1e-6);
+ fflush(stdout);
+
+ }
+#endif
+ } else { /* Serial symbolic. GA_mem_use is for global A */
mem_stage[0] = symb_mem_usage.total + GA_mem_use; /* symbfact step */
mem_stage[1] = symb_mem_usage.for_lu
+ dist_mem_use
+ num_mem_usage.for_lu; /* distribution step */
loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1] );
+#if ( PRNTlevel>=1 )
+ if ( iam==0 ) {
+ printf("\t(P0) serial symbolic::stage[0]: symb_memory %.2f, GA_mem_use %.2f\n",
+ symb_mem_usage.total*1e-6, GA_mem_use*1e-6);
+ printf("\t(P0) serial distribution::stage[1]:"
+ "symb_LU %.2f, dist_mem_use %.2f, num_mem_usage.for_lu %.2f\n",
+ symb_mem_usage.for_lu*1e-6, dist_mem_use*1e-6,
+ num_mem_usage.for_lu*1e-6);
+ fflush(stdout);
+
+ }
+#endif
}
- sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
mem_stage[2] = num_mem_usage.total; /* numerical factorization step */
-
loc_max = SUPERLU_MAX( loc_max, mem_stage[2] ); /* local max of 3 stages */
local_struct.val = loc_max;
@@ -1330,7 +1357,6 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
float buffer_peak = global_struct.val*1e-6;
if ( iam==0 ) {
- printf("\n** Memory Usage **********************************\n");
printf("** Total highmark (MB):\n"
" Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n",
avg * 1e-6,
@@ -1357,7 +1383,6 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
} /* end if (!factored) */
-
if ( options->Fact == DOFACT || options->Fact == SamePattern ) {
/* Need to reset the solve's communication pattern,
because perm_r[] and/or perm_c[] is changed. */
@@ -1448,19 +1473,20 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
if ( options->DiagInv==YES && (Fact != FACTORED) ) {
psCompute_Diag_Inv(n, LUstruct, grid, stat, info);
+
#ifdef GPU_ACC
+
+ psconvertU(options, grid, LUstruct, stat, n);
+
checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat,
(LUstruct->Llu->Linv_bc_cnt) * sizeof(float), gpuMemcpyHostToDevice));
checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat,
(LUstruct->Llu->Uinv_bc_cnt) * sizeof(float), gpuMemcpyHostToDevice));
checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat,
(LUstruct->Llu->Lnzval_bc_cnt) * sizeof(float), gpuMemcpyHostToDevice));
- //checkGPU(gpuMemcpy(LUstruct->Llu->d_Unzval_br_dat, LUstruct->Llu->Unzval_br_dat,
- // (LUstruct->Llu->Unzval_br_cnt) * sizeof(float), gpuMemcpyHostToDevice));
#endif
}
-
// #pragma omp parallel
// {
// #pragma omp master
@@ -1629,4 +1655,289 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
CHECK_MALLOC(iam, "Exit psgssvx()");
#endif
+} /* psgssvx */
+
+#ifdef GPU_ACC
+void
+psconvertU(superlu_dist_options_t *options, gridinfo_t *grid,
+ sLUstruct_t *LUstruct, SuperLUStat_t *stat, int n)
+{
+int64_t nnz_ind,nnz_offset;
+int64_t nnz_val;
+Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+int_t nsupers,nsupers_j,ncol,ncol_loc,nrow;
+int_t lk,ik,ub,nub,i,il,gik,k,uptr,jj,ii,fnz,irow,jb;
+sLocalLU_t *Llu = LUstruct->Llu;
+int_t *Urbs = Llu->Urbs;
+int_t **Ucb_valptr = Llu->Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+int_t knsupc,iknsupc,ikfrow,iklrow;
+int_t *xsup = Glu_persist->xsup;;
+
+int iam = grid->iam;
+int mycol = MYCOL (iam, grid);
+int myrow = MYROW (iam, grid);
+
+int_t *usub;
+float *uval;
+
+int64_t Ucolind_bc_cnt=0;
+int64_t Unzval_bc_cnt=0, Unzval_br_cnt=0;
+int64_t Uindval_loc_bc_cnt=0;
+
+int_t next_lind; /* next available position in index[*] */
+int_t next_lval; /* next available position in nzval[*] */
+
+nsupers = Glu_persist->supno[n-1] + 1;
+nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
+
+if ( !(Llu->Ucolind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) )
+ ABORT("Malloc fails for Llu->Ucolind_bc_ptr[].");
+Llu->Ucolind_bc_ptr[nsupers_j-1] = NULL;
+
+if ( !(Llu->Unzval_bc_ptr =
+ (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) )
+ ABORT("Malloc fails for Llu->Unzval_bc_ptr[].");
+Llu->Unzval_bc_ptr[nsupers_j-1] = NULL;
+
+if ( !(Llu->Uindval_loc_bc_ptr =
+ (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) )
+ ABORT("Malloc fails for Llu->Uindval_loc_bc_ptr[].");
+Llu->Uindval_loc_bc_ptr[nsupers_j-1] = NULL;
+
+if ( !(Llu->Uindval_loc_bc_offset =
+ (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Uindval_loc_bc_offset[].");
}
+Llu->Uindval_loc_bc_offset[nsupers_j-1] = -1;
+
+if ( !(Llu->Ucolind_bc_offset =
+ (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Ucolind_bc_offset[].");
+}
+Llu->Ucolind_bc_offset[nsupers_j-1] = -1;
+
+if ( !(Llu->Unzval_bc_offset =
+ (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Lnzval_bc_offset[].");
+}
+Llu->Unzval_bc_offset[nsupers_j-1] = -1;
+
+for (lk=0;lk<nsupers_j;++lk){
+ k = lk * grid->npcol + mycol;/* Global block number, col-wise. */
+ knsupc = SuperSize( k );
+ nub = Urbs[lk]; /* Number of U blocks in block column lk */
+
+ if(nub>0){
+ // First pass count sizes of Llu->Ucolind_bc_ptr[lk] and Llu->Unzval_bc_ptr[lk]
+ nnz_ind=0;
+ nnz_val=0;
+ nnz_ind+=BC_HEADER_NEWU;
+ nrow=0;
+ for (ub = 0; ub < nub; ++ub) {
+ ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
+ usub = Llu->Ufstnz_br_ptr[ik];
+ uval = Llu->Unzval_br_ptr[ik];
+ i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */
+ i += UB_DESCRIPTOR;
+ gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
+ iknsupc = SuperSize( gik );
+ nrow += iknsupc;
+ ikfrow = FstBlockC( gik );
+ iklrow = FstBlockC( gik+1 );
+ uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */
+
+ nnz_ind+=UB_DESCRIPTOR_NEWU;
+
+ for (jj = 0; jj < knsupc; ++jj) {
+ fnz = usub[i + jj];
+ if ( fnz < iklrow ) { /* Nonzero segment. */
+ nnz_val+=iknsupc;
+ nnz_ind+=1;
+ Unzval_br_cnt+=iklrow - fnz;
+ // for (irow = fnz; irow < iklrow; ++irow)
+ // dest[irow - ikfrow] -= uval[uptr++] * y[jj];
+ // stat->ops[SOLVE] += 2 * (iklrow - fnz);
+ }
+ } /* for jj ... */
+ } /* for ub ... */
+
+ // Second pass fills Llu->Ucolind_bc_ptr[lk] and Llu->Unzval_bc_ptr[lk]
+ if ( !(Llu->Ucolind_bc_ptr[lk] = intMalloc_dist(nnz_ind+nrow*2)) )
+ ABORT("Malloc fails for Llu->Ucolind_bc_ptr[lk]");
+ Llu->Ucolind_bc_offset[lk]=nnz_ind+nrow*2;
+ Ucolind_bc_cnt += Llu->Ucolind_bc_offset[lk];
+
+ if (!(Llu->Unzval_bc_ptr[lk]=floatCalloc_dist(nnz_val)))
+ ABORT("Calloc fails for Llu->Unzval_bc_ptr[lk].");
+ Llu->Unzval_bc_offset[lk]=nnz_val;
+ Unzval_bc_cnt += Llu->Unzval_bc_offset[lk];
+
+ if ( !(Llu->Uindval_loc_bc_ptr[lk] = intCalloc_dist(nub*3)) )
+ ABORT("Malloc fails for Llu->Uindval_loc_bc_ptr[lk][]");
+ Llu->Uindval_loc_bc_offset[lk]=nub*3;
+ Uindval_loc_bc_cnt += Llu->Uindval_loc_bc_offset[lk];
+
+ Llu->Ucolind_bc_ptr[lk][0]=nub;
+ Llu->Ucolind_bc_ptr[lk][1]=nrow;
+ Llu->Ucolind_bc_ptr[lk][2]=nnz_ind;
+ nnz_offset=nnz_ind;
+
+ nnz_ind=0;
+ nnz_val=0;
+ ncol=0;
+ nnz_ind+=BC_HEADER_NEWU;
+ nrow=0;
+ for (ub = 0; ub < nub; ++ub) {
+ ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
+ usub = Llu->Ufstnz_br_ptr[ik];
+ uval = Llu->Unzval_br_ptr[ik];
+ i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */
+ i += UB_DESCRIPTOR;
+ gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
+ iknsupc = SuperSize( gik );
+ ikfrow = FstBlockC( gik );
+ iklrow = FstBlockC( gik+1 );
+ uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */
+
+ for(ii=0; ii<iknsupc; ++ii){
+ Llu->Ucolind_bc_ptr[lk][nnz_offset+nrow*2] = ub;
+ Llu->Ucolind_bc_ptr[lk][nnz_offset+nrow*2+1] = ii;
+ nrow++;
+ }
+
+ ncol_loc=0;
+ for (jj = 0; jj < knsupc; ++jj) {
+ fnz = usub[i + jj];
+ if ( fnz < iklrow ) { /* Nonzero segment. */
+ Llu->Ucolind_bc_ptr[lk][nnz_ind+ncol_loc+UB_DESCRIPTOR_NEWU]=FstBlockC(k)+jj; /* Global column number */
+ ncol_loc++;
+ for (irow = fnz; irow < iklrow; ++irow){
+ Llu->Unzval_bc_ptr[lk][nnz_val+irow - ikfrow]=uval[uptr++];
+ // if(lk==2){
+ // printf("uval %5d %5d %5d %f %5d %5d \n",gik, uptr-1, irow - ikfrow, uval[uptr-1], Ucb_valptr[lk][ub],ub);
+ // // printf("Unzval_bc_ptr %5d %f\n",gik, Llu->Unzval_bc_ptr[lk][nnz_val+irow - ikfrow]);
+ // }
+ }
+ nnz_val+=iknsupc;
+ }
+ } /* for jj ... */
+ Llu->Ucolind_bc_ptr[lk][nnz_ind]=gik;
+ Llu->Ucolind_bc_ptr[lk][nnz_ind+1]=ncol_loc;
+
+ Llu->Uindval_loc_bc_ptr[lk][ub] = ik;
+ Llu->Uindval_loc_bc_ptr[lk][ub+nub] = nnz_ind;
+ Llu->Uindval_loc_bc_ptr[lk][ub+nub*2] = ncol;
+ // if(lk==69)
+ // printf("ub ncol_loc %5d %5d \n",ub, ncol_loc);
+ ncol+=ncol_loc*iknsupc;
+ nnz_ind+=ncol_loc+UB_DESCRIPTOR_NEWU;
+ } /* for ub ... */
+
+ }else{ /* nub <= 0 */
+ Llu->Ucolind_bc_ptr[lk] = NULL;
+ Llu->Unzval_bc_ptr[lk] = NULL;
+ Llu->Ucolind_bc_offset[lk]=-1;
+ Llu->Unzval_bc_offset[lk]=-1;
+ Llu->Uindval_loc_bc_ptr[lk] = NULL;
+ Llu->Uindval_loc_bc_offset[lk]=-1;
+ }
+} /* end for lk ... */
+
+ // safe guard
+ Ucolind_bc_cnt +=1;
+ Unzval_bc_cnt +=1;
+ Uindval_loc_bc_cnt +=1;
+ if ( !(Llu->Ucolind_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Ucolind_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Ucolind_bc_dat[].");
+ }
+ if ( !(Llu->Unzval_bc_dat =
+ (float*)SUPERLU_MALLOC(Unzval_bc_cnt * sizeof(float))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Unzval_bc_dat[].");
+ }
+ if ( !(Llu->Uindval_loc_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Uindval_loc_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Uindval_loc_bc_dat[].");
+ }
+
+ /* use contiguous memory for Ucolind_bc_ptr, Unzval_bc_ptr, Uindval_loc_bc_ptr*/
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Ucolind_bc_cnt=0;
+ Unzval_bc_cnt=0;
+ Uindval_loc_bc_cnt=0;
+ int64_t tmp_cnt;
+
+ for (jb = 0; jb < k; ++jb) { /* for each block column ... */
+ if(Llu->Ucolind_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Llu->Ucolind_bc_offset[jb]; ++jj) {
+ Llu->Ucolind_bc_dat[Ucolind_bc_cnt+jj]=Llu->Ucolind_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Llu->Ucolind_bc_ptr[jb]);
+ Llu->Ucolind_bc_ptr[jb]=&Llu->Ucolind_bc_dat[Ucolind_bc_cnt];
+ tmp_cnt = Llu->Ucolind_bc_offset[jb];
+ Llu->Ucolind_bc_offset[jb]=Ucolind_bc_cnt;
+ Ucolind_bc_cnt+=tmp_cnt;
+ }
+
+ if(Llu->Unzval_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Llu->Unzval_bc_offset[jb]; ++jj) {
+ Llu->Unzval_bc_dat[Unzval_bc_cnt+jj]=Llu->Unzval_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Llu->Unzval_bc_ptr[jb]);
+ Llu->Unzval_bc_ptr[jb]=&Llu->Unzval_bc_dat[Unzval_bc_cnt];
+ tmp_cnt = Llu->Unzval_bc_offset[jb];
+ Llu->Unzval_bc_offset[jb]=Unzval_bc_cnt;
+ Unzval_bc_cnt+=tmp_cnt;
+ }
+
+ if(Llu->Uindval_loc_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Llu->Uindval_loc_bc_offset[jb]; ++jj) {
+ Llu->Uindval_loc_bc_dat[Uindval_loc_bc_cnt+jj]=Llu->Uindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Llu->Uindval_loc_bc_ptr[jb]);
+ Llu->Uindval_loc_bc_ptr[jb]=&Llu->Uindval_loc_bc_dat[Uindval_loc_bc_cnt];
+ tmp_cnt = Llu->Uindval_loc_bc_offset[jb];
+ Llu->Uindval_loc_bc_offset[jb]=Uindval_loc_bc_cnt;
+ Uindval_loc_bc_cnt+=tmp_cnt;
+ }
+
+ } /* end for jb ... */
+
+ Llu->Ucolind_bc_cnt = Ucolind_bc_cnt;
+ Llu->Unzval_bc_cnt = Unzval_bc_cnt;
+ Llu->Uindval_loc_bc_cnt = Uindval_loc_bc_cnt;
+ // printf("Ucolind_bc_cnt %10d\n",Ucolind_bc_cnt);
+ //printf("Unzval_bc_cnt %10ld v.s. Unzval_br_cnt %10ld\n",Unzval_bc_cnt,Unzval_br_cnt);
+ // printf("Llu->Ucolind_bc_offset %10d\n",Llu->Ucolind_bc_offset[0]);
+
+ checkGPU(gpuFree(Llu->d_Ucolind_bc_dat));
+ checkGPU(gpuFree(Llu->d_Ucolind_bc_offset));
+ checkGPU(gpuFree(Llu->d_Unzval_bc_dat));
+ checkGPU(gpuFree(Llu->d_Unzval_bc_offset));
+ checkGPU(gpuFree(Llu->d_Uindval_loc_bc_dat));
+ checkGPU(gpuFree(Llu->d_Uindval_loc_bc_offset));
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, (Llu->Ucolind_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Ucolind_bc_dat, Llu->Ucolind_bc_dat, (Llu->Ucolind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t)));
+ checkGPU(gpuMemcpy(Llu->d_Ucolind_bc_offset, Llu->Ucolind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t)));
+ checkGPU(gpuMemcpy(Llu->d_Unzval_bc_offset, Llu->Unzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, (Llu->Unzval_bc_cnt) * sizeof(float)));
+ checkGPU(gpuMemcpy(LUstruct->Llu->d_Unzval_bc_dat, LUstruct->Llu->Unzval_bc_dat,(LUstruct->Llu->Unzval_bc_cnt) * sizeof(float), gpuMemcpyHostToDevice));
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, (Llu->Uindval_loc_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Uindval_loc_bc_dat, Llu->Uindval_loc_bc_dat, (Llu->Uindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t)));
+ checkGPU(gpuMemcpy(Llu->d_Uindval_loc_bc_offset, Llu->Uindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t), gpuMemcpyHostToDevice));
+
+ SUPERLU_FREE (Llu->Ucolind_bc_dat);
+ SUPERLU_FREE (Llu->Ucolind_bc_offset);
+ SUPERLU_FREE (Llu->Unzval_bc_dat);
+ SUPERLU_FREE (Llu->Unzval_bc_offset);
+ SUPERLU_FREE (Llu->Uindval_loc_bc_dat);
+ SUPERLU_FREE (Llu->Uindval_loc_bc_offset);
+
+} /* psconvertU */
+#endif /* ifdef GPU_ACC */
diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c
index d6bab403..8ac7e954 100644
--- a/SRC/psgssvx3d.c
+++ b/SRC/psgssvx3d.c
@@ -758,13 +758,9 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
if ( iinfo > 0 ) {
if ( iinfo <= m ) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo);
-#endif
} else {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n);
-#endif
}
} else if ( iinfo < 0 ) return;
@@ -1143,7 +1139,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
if (!iam) {
fprintf (stderr, "symbfact() error returns %d\n",
(int) iinfo);
- exit (-1);
+ return;
}
}
diff --git a/SRC/psgstrs.c b/SRC/psgstrs.c
index 046e1151..8a24b316 100644
--- a/SRC/psgstrs.c
+++ b/SRC/psgstrs.c
@@ -210,9 +210,9 @@ psReDistribute_B_to_X(float *B, int_t m_loc, int nrhs, int_t ldb,
#endif
{
// t = SuperLU_timer_();
-#ifdef _OPENMP
-#pragma omp taskloop private (i,l,irow,k,j,knsupc) untied
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (i,l,irow,k,j,knsupc) untied
+//#endif
for (i = 0; i < m_loc; ++i) {
irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*B */
@@ -465,9 +465,9 @@ psReDistribute_X_to_B(int_t n, float *B, int_t m_loc, int_t ldb, int_t fst_row,
#endif
{
// t = SuperLU_timer_();
-#ifdef _OPENMP
-#pragma omp taskloop private (k,knsupc,lk,irow,l,i,j) untied
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (k,knsupc,lk,irow,l,i,j) untied
+//#endif
for (k = 0; k < nsupers; k++) {
knsupc = SuperSize( k );
lk = LBi( k, grid ); /* Local block number */
@@ -1410,9 +1410,9 @@ if(procs==1){
#endif
{
-#ifdef _OPENMP
-#pragma omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup
+//#endif
for (jj=0;jj=1 )
@@ -2016,9 +2016,9 @@ if(procs==1){
#pragma omp master
#endif
{
-#ifdef _OPENMP
-#pragma omp taskloop private (ii,jj,k,lk,thread_id) nogroup
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (ii,jj,k,lk,thread_id) nogroup
+//#endif
for (jj=0;jjsupno
* and Glu_persist->xsup.
*
@@ -53,7 +53,7 @@ at the top-level directory.
* Glu_persist->supno, Glu_persist->xsup.
*
* This routine also deallocates memory allocated during symbolic
- * factorization routine. That is, the folloing arrays are freed:
+ * factorization routine. That is, the following arrays are free'd:
* Pslu_freeable->xlsub, Pslu_freeable->lsub,
* Pslu_freeable->xusub, Pslu_freeable->usub,
* Pslu_freeable->globToLoc, Pslu_freeable->supno_loc,
@@ -70,7 +70,7 @@ at the top-level directory.
* Order of the input matrix
*
* Pslu_freeable (Input) Pslu_freeable_t *
- * Local L and U structure,
+ * Local L and U structure: lsub[] / usub[]. They are free'd after distribution.
* global to local indexing information.
*
* Glu_persist (Output) Glu_persist_t *
@@ -111,15 +111,16 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
{
int iam, nprocs, pc, pr, p, np, p_diag;
int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u,
- *tmp_ptrToSend, *mem;
+ *tmp_ptrToSend, *mem; // temp memory
int_t *nnzToRecv_l, *nnzToRecv_u;
int_t *send_1, *send_2, nsend_1, nsend_2;
- int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
+ int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; // temp memory
int_t nsupers, nsupers_i, nsupers_j;
int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
int_t maxszsn, maxNvtcsPProc;
int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
- int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
+ int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; /* computed from symbfact_dist(),
+ free'd in this routine after distribution */
int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
int_t *globToLoc, nvtcs_loc;
@@ -127,8 +128,8 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
RecvCnt_l, RecvCnt_u, ind_loc;
int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
int_t nelts, isize;
- float memAux; /* Memory used during this routine and freed on return */
- float memRet; /* Memory allocated and not freed on return */
+ float memAux; /* Memory used during this routine and free'd before return */
+ float memRet; /* Memory allocated and not free'd on return */
int_t iword, dword;
/* ------------------------------------------------------------
@@ -560,6 +561,13 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
else
nnzToRecv[iam] = nnz_loc_u;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.dist_symbLU [1] memAux %.2f, memRet %.2f (MB)\n", memAux*1e-6, memRet*1e-6);
+ fflush(stdout);
+ }
+#endif
+
/* ------------------------------------------------------------
DEALLOCATE TEMPORARY STORAGE.
-------------------------------------------------------------*/
@@ -648,7 +656,16 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
}
else
sendU = FALSE;
- }
+
+ /* Sherry: this loop goes around twice ? */
+
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.dist_symbLU [2] end while: memAux %.4f\t memRet %.4f (MB)\n", memAux*1e-6, memRet*1e-6);
+ fflush(stdout);
+ }
+#endif
+ } /* end while sendL || sendU */
/* deallocate memory allocated during symbolic factorization routine */
if (rcv_luind != NULL) {
@@ -676,6 +693,14 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
*p_xlsub = xlsub_n; *p_lsub = lsub_n;
*p_xusub = xusub_n; *p_usub = usub_n;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.dist_symbLU [3] before return: memAux %.4f\t memRet %.4f (MB)\n", memAux*1e-6, memRet*1e-6);
+ fflush(stdout);
+ }
+#endif
+
+ /* It is confirmed that memAux is 0 now */
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit dist_symbLU()");
#endif
@@ -777,8 +802,8 @@ sdist_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct,
MPI_Status status;
int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */
int_t *supno = Glu_persist->supno;
- float memAux; /* Memory used during this routine and freed on return */
- float memRet; /* Memory allocated and not freed on return */
+ float memAux; /* Memory used during this routine and free'd before return */
+ float memRet; /* Memory allocated and not free'd on return */
int_t iword, dword, szbuf;
/* ------------------------------------------------------------
@@ -1138,7 +1163,7 @@ sdist_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct,
#endif
return (-memRet);
-} /* dist_A */
+} /* sdist_A */
/*! \brief
*
@@ -1226,27 +1251,52 @@ sdist_psymbtonum(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
float *lusup, *uval; /* nonzero values in L and U */
int *recvBuf; // 1/16/22 Sherry changed to int, was: int_t *recvBuf;
int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
- float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */
+ long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
+
float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */
+ long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */
+ float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */
+ long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
+
int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lrowind_bc_dat; /* size: sum of sizes of Lrowind_bc_ptr[lk]) */
+ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *index_srt; /* indices consist of headers and row subscripts */
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+
+ int_t *index_srt; /* indices consist of headers and row subscripts */
float *lusup_srt; /* nonzero values in L and U */
float **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
+ float *Unzval_br_dat; /* size: sum of sizes of Unzval_br_ptr[lk]) */
+ long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Unzval_br_cnt=0;
+
int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */
+ long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Ufstnz_br_cnt=0;
int_t *Unnz; /* size ceil(NSUPERS/Pc) */
- C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
int msgsize;
- int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
+ int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+ Ucb_indptr_t *Ucb_inddat;
+ long int *Ucb_indoffset;
+ long int Ucb_indcnt=0;
int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
-
+ int_t *Ucb_valdat;
+ long int *Ucb_valoffset;
+ long int Ucb_valcnt=0;
/*-- Counts to be used in factorization. --*/
int *ToRecv, *ToSendD, **ToSendR;
@@ -1264,10 +1314,11 @@ sdist_psymbtonum(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int nbrecvx = 0; /* Number of Xk I will receive. */
int nbsendx = 0; /* Number of Xk I will send */
int_t *ilsum; /* starting position of each supernode in
- the full array (local) */
+ the full array (local, blockwise) */
int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in
the full array (local, block column wise) */
- /*-- Auxiliary arrays; freed on return --*/
+ /*-- Auxiliary arrays; free'd on return --*/
+ // Sherry check
int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */
int_t *LUb_length; /* L,U block length; size nsupers_ij */
int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
@@ -1297,15 +1348,21 @@ float *dense, *dense_col; /* SPA */
int_t lptr1_tmp, idx_i, idx_v,m, uu;
int_t nub;
- float memStrLU, memA,
+ /* counting memory */
+ float memA, /* memory used by sdist_A: distributing A values. */
+ memStrLU, /* memory used by dist_symbLU: distributing symbolic LU */
memDist = 0.; /* memory used for redistributing the data, which does
not include the memory for the numerical values
- of L and U (positive number)*/
+ of L and U (positive number).
+ It includes memA and memStrLU.
+ */
float memNLU = 0.; /* memory allocated for storing the numerical values of
L and U, that will be used in the numeric
- factorization (positive number) */
- float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/
-
+ factorization (positive number).
+ It also contains dense-SPA[] array */
+ float memTRS = 0.; /* memory allocated for storing the meta-data for
+ triangular solve (positive number)*/
+
#if ( PRNTlevel>=1 )
int_t nLblocks = 0, nUblocks = 0;
#endif
@@ -1317,7 +1374,7 @@ float *dense, *dense_col; /* SPA */
/* Initialization. */
iam = grid->iam;
#if ( DEBUGlevel>=1 )
- CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
+ CHECK_MALLOC(iam, "Enter sdist_psymbtonum()");
#endif
myrow = MYROW( iam, grid );
mycol = MYCOL( iam, grid );
@@ -1361,7 +1418,7 @@ float *dense, *dense_col; /* SPA */
if ( myrow == PROW( gb, grid ) ) {
i = SuperSize( gb );
ldaspa += i;
- lb = LBi( gb, grid );
+ lb = LBi( gb, grid ); // local block number
ilsum[lb + 1] = ilsum[lb] + i;
}
ilsum[nsupers_i] = ldaspa;
@@ -1371,7 +1428,7 @@ float *dense, *dense_col; /* SPA */
if (mycol == PCOL( gb, grid )) {
i = SuperSize( gb );
ldaspa_j += i;
- lb = LBj( gb, grid );
+ lb = LBj( gb, grid ); // local block number
ilsum_j[lb + 1] = ilsum_j[lb] + i;
}
ilsum_j[nsupers_j] = ldaspa_j;
@@ -1414,7 +1471,7 @@ float *dense, *dense_col; /* SPA */
for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
/* Auxiliary arrays used to set up L and U block data structures.
- They are freed on return. */
+ They are free'd on return. */
if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
fprintf(stderr, "Calloc fails for LUb_length[].");
return (memDist + memNLU + memTRS);
@@ -1440,11 +1497,33 @@ float *dense, *dense_col; /* SPA */
fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
return (memDist + memNLU + memTRS);
}
+
+ if ( !(Unzval_br_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_i * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Unzval_br_offset[].");
+ return (memDist + memNLU + memTRS);
+ }
+ Unzval_br_offset[nsupers_i-1] = -1;
+
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
return (memDist + memNLU + memTRS);
}
+ if ( !(Ufstnz_br_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_i * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ufstnz_br_offset[].");
+ return (memDist + memNLU + memTRS);
+ }
+ Ufstnz_br_offset[nsupers_i-1] = -1;
+ memTRS += 2 * nsupers_i * sizeof(long int);
+
memNLU += nsupers_i*sizeof(float*) + nsupers_i*sizeof(int_t*);
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.sdist_psymbtonum [1] memDist %.4f, memNLU %.4f\n", memDist*1e-6, memNLU*1e-6);
+ }
+#endif
+
Unzval_br_ptr[nsupers_i-1] = NULL;
Ufstnz_br_ptr[nsupers_i-1] = NULL;
@@ -1466,7 +1545,7 @@ float *dense, *dense_col; /* SPA */
memDist += (nsupers_i + nsupers_j)*iword;
/* Auxiliary arrays used to set up L, U block data structures.
- They are freed on return.
+ They are free'd on return.
k is the number of local row blocks. */
if ( !(dense = floatCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j)
* sp_ienv_dist(3, options))) ) {
@@ -1485,6 +1564,12 @@ float *dense, *dense_col; /* SPA */
/* ------------------------------------------------ */
memNLU += 2*nsupers_i*iword +
SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3, options)*dword;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.sdist_psymbtonum [[2]] memDist %.2f, memNLU %.2f [+ dense SPA]\n", memDist*1e-6, memNLU*1e-6);
+ fflush(stdout);
+ }
+#endif
/* Pointers to the beginning of each block column of L. */
if ( !(Lnzval_bc_ptr =
@@ -1497,28 +1582,69 @@ float *dense, *dense_col; /* SPA */
return (memDist + memNLU + memTRS);
}
+ if ( !(Lrowind_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lrowind_bc_offset[].");
+ }
+ Lrowind_bc_offset[nsupers_j-1] = -1;
+ if ( !(Lnzval_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_offset[].");
+ }
+
if ( !(Linv_bc_ptr =
(float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) {
fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
+ if ( !(Linv_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Linv_bc_offset[].");
+ }
+
if ( !(Uinv_bc_ptr =
(float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) {
fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
+
+ if ( !(Uinv_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Uinv_bc_offset[].");
+ return (memDist + memNLU + memTRS);
+ }
+ Linv_bc_ptr[nsupers_j-1] = NULL;
+ Uinv_bc_ptr[nsupers_j-1] = NULL;
+ Linv_bc_offset[nsupers_j-1] = -1;
+ Uinv_bc_offset[nsupers_j-1] = -1;
+
+
if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){
fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
+ if ( !(Lindval_loc_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lindval_loc_bc_offset[].");
+ }
if ( !(Unnz = (int_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int_t))) ){
fprintf(stderr, "Malloc fails for Unnz[].");
return (memDist + memNLU + memTRS);
}
- memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr
+
+ // account for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr, Uinv_bc_ptr, and the 5 long int *_bc_offset arrays
+ memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(float) + nsupers_j*iword
+ + 5 * nsupers_j * sizeof(long int);
memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*)+ nsupers_j * sizeof(int_t*);
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.sdist_psymbtonum [[3]] memNLU %.2f, memTRS %.2f\n", memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
+
Lnzval_bc_ptr[nsupers_j-1] = NULL;
Lrowind_bc_ptr[nsupers_j-1] = NULL;
Linv_bc_ptr[nsupers_j-1] = NULL;
@@ -1551,11 +1677,23 @@ float *dense, *dense_col; /* SPA */
bsendx_plist[i] = &index1[j];
/* -------------------------------------------------------------- */
memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.sdist_psymbtonum [[4]] memNLU %.2f, memTRS %.2f\n", memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
/*------------------------------------------------------------
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
+ long int Linv_bc_cnt=0;
+ long int Uinv_bc_cnt=0;
+ long int Lrowind_bc_cnt=0;
+ long int Lnzval_bc_cnt=0;
+ long int Lindval_loc_bc_cnt=0;
+
for (jb = 0; jb < nsupers; jb++) {
jbcol = PCOL( jb, grid );
jbrow = PROW( jb, grid );
@@ -1564,13 +1702,21 @@ float *dense, *dense_col; /* SPA */
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
+ /*------------------------------------------------
+ * SET UP U BLOCKS.
+ *------------------------------------------------*/
if ( myrow == jbrow ) { /* Block row jb in my process row */
+ Ufstnz_br_ptr[ljb_i] = NULL;
+ Unzval_br_ptr[ljb_i] = NULL;
+ Unzval_br_offset[ljb_i]=-1;
+ Ufstnz_br_offset[ljb_i]=-1;
+
/* Scatter A into SPA. */
for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
if (i >= asup_rowptr[ilsum[nsupers_i]])
printf ("ERR7\n");
- jcol = asup_colind[i];
+ jcol = asup_colind[i]; // upper triangular part
if (jcol >= n)
printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
iam, (int) jb, (int) gb, (int) j, jcol);
@@ -1586,13 +1732,13 @@ float *dense, *dense_col; /* SPA */
dense_col += ldaspa_j;
}
- /*------------------------------------------------
- * SET UP U BLOCKS.
- *------------------------------------------------*/
/* Count number of blocks and length of each block. */
nrbu = 0;
len = 0; /* Number of column subscripts I own. */
len1 = 0; /* number of fstnz subscripts */
+
+ /* ljb_i is the current local row block number in U.
+ Loop through every nonzero in this row block */
for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
if (i >= xusub[nsupers_i]) printf ("ERR10\n");
jcol = usub[i];
@@ -1609,7 +1755,7 @@ float *dense, *dense_col; /* SPA */
pr = PROW( gb, grid );
if ( pr != jbrow && mycol == pc)
bsendx_plist[lb][jbrow] = YES;
- if (mycol == pc) {
+ if (mycol == pc) { /* I own this block */
len += nsupc;
LUb_length[lb] += nsupc;
ToSendD[ljb_i] = YES;
@@ -1653,12 +1799,26 @@ float *dense, *dense_col; /* SPA */
return (memDist + memNLU + memTRS);
}
Ufstnz_br_ptr[ljb_i] = index;
+ Ufstnz_br_offset[ljb_i]=len1+1;
+ Ufstnz_br_cnt += Ufstnz_br_offset[ljb_i];
+
if (!(Unzval_br_ptr[ljb_i] =
floatMalloc_dist(len))) {
fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
return (memDist + memNLU + memTRS);
}
+ Unzval_br_offset[ljb_i]=len;
+ Unzval_br_cnt += Unzval_br_offset[ljb_i];
+
memNLU += (len1+1)*iword + len*dword;
+#if ( PRNTlevel>=1 )
+ if (iam==0 && (jb %10000 == 0) ) {
+ printf("\t.sdist_psymbtonum [jb %d setup-U] memNLU %.4f, memTRS %.4f\n",
+ (int) jb, memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
+
uval = Unzval_br_ptr[ljb_i];
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
@@ -1713,7 +1873,7 @@ float *dense, *dense_col; /* SPA */
} else {
Ufstnz_br_ptr[ljb_i] = NULL;
Unzval_br_ptr[ljb_i] = NULL;
- } /* if nrbu ... */
+ } /* end if-else nrbu ... */
} /* if myrow == jbrow */
/*------------------------------------------------
@@ -1805,23 +1965,47 @@ float *dense, *dense_col; /* SPA */
fprintf (stderr, "Malloc fails for index[]");
return (memDist + memNLU + memTRS);
}
+
+ Lrowind_bc_offset[ljb_j]=len1;
+ Lrowind_bc_cnt += Lrowind_bc_offset[ljb_j];
Lrowind_bc_ptr[ljb_j] = index;
+
if (!(Lnzval_bc_ptr[ljb_j] =
floatMalloc_dist(len*nsupc))) {
fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d\n", (int) jb);
return (memDist + memNLU + memTRS);
}
-
- if (!(Linv_bc_ptr[ljb_j] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
- ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]");
- if (!(Uinv_bc_ptr[ljb_j] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
- ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]");
+ Lnzval_bc_offset[ljb_j]=len*nsupc;
+ Lnzval_bc_cnt += Lnzval_bc_offset[ljb_j];
+
+ myrow = MYROW( iam, grid );
+ krow = PROW( jb, grid );
+ if(myrow==krow){ /* diagonal block */
+ if (!(Linv_bc_ptr[ljb_j] = (float*)floatMalloc_dist(nsupc*nsupc)) )
+ ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]");
+ Linv_bc_offset[ljb_j]=nsupc*nsupc;
+ Linv_bc_cnt += Linv_bc_offset[ljb_j];
+ if (!(Uinv_bc_ptr[ljb_j] = (float*)floatMalloc_dist(nsupc*nsupc)) )
+ ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]");
+ Uinv_bc_offset[ljb_j]=nsupc*nsupc;
+ Uinv_bc_cnt += Uinv_bc_offset[ljb_j];
+ }else{
+ Linv_bc_ptr[ljb_j] = NULL;
+ Linv_bc_offset[ljb_j] = -1;
+ Uinv_bc_ptr[ljb_j] = NULL;
+ Uinv_bc_offset[ljb_j] = -1;
+ }
memNLU += len1*iword + len*nsupc*dword;
if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3)))
ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]");
- memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]
+
+ // account for Lindval_loc_bc_ptr[ljb_j], Linv_bc_ptr[ljb_j], Uinv_bc_ptr[ljb_j]
+ memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;
+
+ Lindval_loc_bc_offset[ljb_j]=nrbl*3;
+ Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb_j];
lusup = Lnzval_bc_ptr[ljb_j];
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
@@ -1869,8 +2053,6 @@ float *dense, *dense_col; /* SPA */
}
} /* for i ... */
-
-
/* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/
if(nrbl>1){
krow = PROW( jb, grid );
@@ -1884,7 +2066,6 @@ float *dense, *dense_col; /* SPA */
quickSortM(lloc,0,uu,nrbl,0,3);
}
-
if ( !(index_srt = intMalloc_dist(len1)) )
ABORT("Malloc fails for index_srt[]");
if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float))))
@@ -1925,13 +2106,25 @@ float *dense, *dense_col; /* SPA */
Lrowind_bc_ptr[ljb_j] = NULL;
Lnzval_bc_ptr[ljb_j] = NULL;
Linv_bc_ptr[ljb_j] = NULL;
+ Linv_bc_offset[ljb_j] = -1;
+ Lrowind_bc_offset[ljb_j]=-1;
+ Lindval_loc_bc_offset[ljb_j]=-1;
+ Lnzval_bc_offset[ljb_j]=-1;
Uinv_bc_ptr[ljb_j] = NULL;
+ Uinv_bc_offset[ljb_j] = -1;
Lindval_loc_bc_ptr[ljb_j] = NULL;
} /* if nrbl ... */
} /* if mycol == pc */
- } /* for jb ... */
+ } /* end for jb ... */
SUPERLU_FREE(ilsum_j);
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.ddist_psymbtonum [[5]] memNLU %.2f, memTRS %.2f\n", memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
+
SUPERLU_FREE(Urb_marker);
SUPERLU_FREE(LUb_length);
SUPERLU_FREE(LUb_indptr);
@@ -2072,8 +2265,94 @@ float *dense, *dense_col; /* SPA */
(*bsendx_plist)[k] = EMPTY;
}
}
- }
-
+ } /* end for jb ... */
+
+ Linv_bc_cnt +=1; // safe guard
+ Uinv_bc_cnt +=1;
+ Lrowind_bc_cnt +=1 ;
+ Lindval_loc_bc_cnt +=1;
+ Lnzval_bc_cnt +=1;
+ if ( !(Linv_bc_dat =
+ (float*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(float))) ) {
+ fprintf(stderr, "Malloc fails for Linv_bc_dat[].");
+ }
+ if ( !(Uinv_bc_dat =
+ (float*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(float))) ) {
+ fprintf(stderr, "Malloc fails for Uinv_bc_dat[].");
+ }
+ if ( !(Lrowind_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Lrowind_bc_dat[].");
+ }
+ if ( !(Lindval_loc_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Lindval_loc_bc_dat[].");
+ }
+ if ( !(Lnzval_bc_dat =
+ (float*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(float))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
+ }
+ /* use contiguous memory for Linv_bc_ptr, Uinv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr, Lindval_loc_bc_ptr */
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Linv_bc_cnt=0;
+ Uinv_bc_cnt=0;
+ Lrowind_bc_cnt=0;
+ Lnzval_bc_cnt=0;
+ Lindval_loc_bc_cnt=0;
+ long int tmp_cnt;
+ for (jb = 0; jb < k; ++jb) { /* for each block column ... */
+ if(Linv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) {
+ Linv_bc_dat[Linv_bc_cnt+jj]=Linv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Linv_bc_ptr[jb]);
+ Linv_bc_ptr[jb]=&Linv_bc_dat[Linv_bc_cnt];
+ tmp_cnt = Linv_bc_offset[jb];
+ Linv_bc_offset[jb]=Linv_bc_cnt;
+ Linv_bc_cnt+=tmp_cnt;
+ }
+ if(Uinv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) {
+ Uinv_bc_dat[Uinv_bc_cnt+jj]=Uinv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Uinv_bc_ptr[jb]);
+ Uinv_bc_ptr[jb]=&Uinv_bc_dat[Uinv_bc_cnt];
+ tmp_cnt = Uinv_bc_offset[jb];
+ Uinv_bc_offset[jb]=Uinv_bc_cnt;
+ Uinv_bc_cnt+=tmp_cnt;
+ }
+ if(Lrowind_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) {
+ Lrowind_bc_dat[Lrowind_bc_cnt+jj]=Lrowind_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lrowind_bc_ptr[jb]);
+ Lrowind_bc_ptr[jb]=&Lrowind_bc_dat[Lrowind_bc_cnt];
+ tmp_cnt = Lrowind_bc_offset[jb];
+ Lrowind_bc_offset[jb]=Lrowind_bc_cnt;
+ Lrowind_bc_cnt+=tmp_cnt;
+ }
+ if(Lnzval_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) {
+ Lnzval_bc_dat[Lnzval_bc_cnt+jj]=Lnzval_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lnzval_bc_ptr[jb]);
+ Lnzval_bc_ptr[jb]=&Lnzval_bc_dat[Lnzval_bc_cnt];
+ tmp_cnt = Lnzval_bc_offset[jb];
+ Lnzval_bc_offset[jb]=Lnzval_bc_cnt;
+ Lnzval_bc_cnt+=tmp_cnt;
+ }
+ if(Lindval_loc_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) {
+ Lindval_loc_bc_dat[Lindval_loc_bc_cnt+jj]=Lindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lindval_loc_bc_ptr[jb]);
+ Lindval_loc_bc_ptr[jb]=&Lindval_loc_bc_dat[Lindval_loc_bc_cnt];
+ tmp_cnt = Lindval_loc_bc_offset[jb];
+ Lindval_loc_bc_offset[jb]=Lindval_loc_bc_cnt;
+ Lindval_loc_bc_cnt+=tmp_cnt;
+ }
+ } /* end for jb ... */
+
/////////////////////////////////////////////////////////////////
/* Set up additional pointers for the index and value arrays of U.
@@ -2087,6 +2366,17 @@ float *dense, *dense_col; /* SPA */
ABORT("Malloc fails for Ucb_indptr[]");
if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
ABORT("Malloc fails for Ucb_valptr[]");
+ if ( !(Ucb_valoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_valoffset[].");
+ }
+ Ucb_valoffset[nub-1] = -1;
+ if ( !(Ucb_indoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_indoffset[].");
+ }
+ Ucb_indoffset[nub-1] = -1;
+
nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
/* Count number of row blocks in a block column.
@@ -2108,11 +2398,21 @@ float *dense, *dense_col; /* SPA */
One pass of the skeleton graph of U. */
for (lb = 0; lb < nub; ++lb) {
if ( Urbs[lb] ) { /* Not an empty block column. */
- if ( !(Ucb_indptr[lb]
- = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+ if ( !(Ucb_indptr[lb]
+ = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
ABORT("Malloc fails for Ucb_indptr[lb][]");
- if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+ Ucb_indoffset[lb]=Urbs[lb];
+ Ucb_indcnt += Ucb_indoffset[lb];
+
+ if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
ABORT("Malloc fails for Ucb_valptr[lb][]");
+ Ucb_valoffset[lb]=Urbs[lb];
+ Ucb_valcnt += Ucb_valoffset[lb];
+ }else{
+ Ucb_valptr[lb]=NULL;
+ Ucb_valoffset[lb]=-1;
+ Ucb_indptr[lb]=NULL;
+ Ucb_indoffset[lb]=-1;
}
}
for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
@@ -2136,9 +2436,7 @@ float *dense, *dense_col; /* SPA */
}
}
-
-
-/* Count the nnzs per block column */
+ /* Count the nnzs per block column */
for (lb = 0; lb < nub; ++lb) {
Unnz[lb] = 0;
k = lb * grid->npcol + mycol;/* Global block number, column-wise. */
@@ -2156,7 +2454,82 @@ float *dense, *dense_col; /* SPA */
}
} /* for jj ... */
}
- }
+ } /* end for lb ... */
+
+ Unzval_br_cnt +=1; // safe guard
+ Ufstnz_br_cnt +=1;
+ Ucb_valcnt +=1;
+ Ucb_indcnt +=1;
+ if ( !(Unzval_br_dat =
+ (float*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(float))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
+ }
+ if ( !(Ufstnz_br_dat =
+ (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Ufstnz_br_dat[].");
+ }
+ if ( !(Ucb_valdat =
+ (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_valdat[].");
+ }
+ if ( !(Ucb_inddat =
+ (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_inddat[].");
+ }
+
+ /* use contiguous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr, Ucb_indptr */
+ k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+ Unzval_br_cnt=0;
+ Ufstnz_br_cnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block row ... */
+ if(Unzval_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Unzval_br_offset[lb]; ++jj) {
+ Unzval_br_dat[Unzval_br_cnt+jj]=Unzval_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Unzval_br_ptr[lb]);
+ Unzval_br_ptr[lb]=&Unzval_br_dat[Unzval_br_cnt];
+ tmp_cnt = Unzval_br_offset[lb];
+ Unzval_br_offset[lb]=Unzval_br_cnt;
+ Unzval_br_cnt+=tmp_cnt;
+ }
+
+ if(Ufstnz_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Ufstnz_br_offset[lb]; ++jj) {
+ Ufstnz_br_dat[Ufstnz_br_cnt+jj]=Ufstnz_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Ufstnz_br_ptr[lb]);
+ Ufstnz_br_ptr[lb]=&Ufstnz_br_dat[Ufstnz_br_cnt];
+ tmp_cnt = Ufstnz_br_offset[lb];
+ Ufstnz_br_offset[lb]=Ufstnz_br_cnt;
+ Ufstnz_br_cnt+=tmp_cnt;
+ }
+ } /* end for lb ... */
+
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Ucb_valcnt=0;
+ Ucb_indcnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block column ... */
+ if(Ucb_valptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_valoffset[lb]; ++jj) {
+ Ucb_valdat[Ucb_valcnt+jj]=Ucb_valptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_valptr[lb]);
+ Ucb_valptr[lb]=&Ucb_valdat[Ucb_valcnt];
+ tmp_cnt = Ucb_valoffset[lb];
+ Ucb_valoffset[lb]=Ucb_valcnt;
+ Ucb_valcnt+=tmp_cnt;
+ }
+ if(Ucb_indptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_indoffset[lb]; ++jj) {
+ Ucb_inddat[Ucb_indcnt+jj]=Ucb_indptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_indptr[lb]);
+ Ucb_indptr[lb]=&Ucb_inddat[Ucb_indcnt];
+ tmp_cnt = Ucb_indoffset[lb];
+ Ucb_indoffset[lb]=Ucb_indcnt;
+ Ucb_indcnt+=tmp_cnt;
+ }
+ } /* end for lb ... */
/////////////////////////////////////////////////////////////////
@@ -2790,14 +3163,13 @@ float *dense, *dense_col; /* SPA */
////////////////////////////////////////////////////////
- /* Free the memory used for storing L and U */
+ /* Free the memory used for storing symbolic structures of L and U */
SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
if (lsub != NULL)
SUPERLU_FREE(lsub);
if (usub != NULL)
SUPERLU_FREE(usub);
-
SUPERLU_FREE(nnzToRecv);
SUPERLU_FREE(ptrToRecv);
SUPERLU_FREE(nnzToSend);
@@ -2805,12 +3177,30 @@ float *dense, *dense_col; /* SPA */
SUPERLU_FREE(recvBuf);
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+ Llu->Lrowind_bc_dat = Lrowind_bc_dat;
+ Llu->Lrowind_bc_offset = Lrowind_bc_offset;
+ Llu->Lrowind_bc_cnt = Lrowind_bc_cnt;
+
Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
+ Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat;
+ Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset;
+ Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt;
+
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
- Llu->Linv_bc_ptr = Linv_bc_ptr;
- Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+ Llu->Lnzval_bc_dat = Lnzval_bc_dat;
+ Llu->Lnzval_bc_offset = Lnzval_bc_offset;
+ Llu->Lnzval_bc_cnt = Lnzval_bc_cnt;
+
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+ Llu->Ufstnz_br_dat = Ufstnz_br_dat;
+ Llu->Ufstnz_br_offset = Ufstnz_br_offset;
+ Llu->Ufstnz_br_cnt = Ufstnz_br_cnt;
+
Llu->Unzval_br_ptr = Unzval_br_ptr;
+ Llu->Unzval_br_dat = Unzval_br_dat;
+ Llu->Unzval_br_offset = Unzval_br_offset;
+ Llu->Unzval_br_cnt = Unzval_br_cnt;
+
Llu->Unnz = Unnz;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
@@ -2830,9 +3220,72 @@ float *dense, *dense_col; /* SPA */
Llu->LBtree_ptr = LBtree_ptr;
Llu->URtree_ptr = URtree_ptr;
Llu->UBtree_ptr = UBtree_ptr;
+
+ Llu->Linv_bc_ptr = Linv_bc_ptr;
+ Llu->Linv_bc_dat = Linv_bc_dat;
+ Llu->Linv_bc_offset = Linv_bc_offset;
+ Llu->Linv_bc_cnt = Linv_bc_cnt;
+
+ Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+ Llu->Uinv_bc_dat = Uinv_bc_dat;
+ Llu->Uinv_bc_offset = Uinv_bc_offset;
+ Llu->Uinv_bc_cnt = Uinv_bc_cnt;
+
Llu->Urbs = Urbs;
Llu->Ucb_indptr = Ucb_indptr;
+ Llu->Ucb_inddat = Ucb_inddat;
+ Llu->Ucb_indoffset = Ucb_indoffset;
+ Llu->Ucb_indcnt = Ucb_indcnt;
+
Llu->Ucb_valptr = Ucb_valptr;
+ Llu->Ucb_valdat = Ucb_valdat;
+ Llu->Ucb_valoffset = Ucb_valoffset;
+ Llu->Ucb_valcnt = Ucb_valcnt;
+
+#ifdef GPU_ACC
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_xsup, (n+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+
+ // some dummy allocation to avoid checking whether they are null pointers later
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(double)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t)));
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+
+ /* gpuMemcpy for the following is performed in pxgssvx */
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(double)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(double)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(double)));
+
+# endif /* end ifdef GPU_ACC */
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
@@ -2848,11 +3301,22 @@ float *dense, *dense_col; /* SPA */
MPI_MAX, grid->comm);
#if ( DEBUGlevel>=1 )
- /* Memory allocated but not freed:
+ /* Memory allocated but not free'd:
ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
ToRecv, ToSendR, ToSendD, mod_bit
*/
- CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
+ CHECK_MALLOC(iam, "Exit sdist_psymbtonum()");
+#endif
+
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t. end sdist_psymbtonum: memDist %.4f, memNLU %.4f, memTRS %.2f\n",
+ memDist*1e-6, memNLU*1e-6, memTRS*1e-6);
+ printf("\t\t. dense[] SPA %.4f (MB), ldaspa %d, ldaspa_j %d\n",
+ SUPERLU_MAX(ldaspa, ldaspa_j) * sp_ienv_dist(3, options) * dword * 1e-6,
+ (int) ldaspa, (int) ldaspa_j);
+ fflush(stdout);
+ }
#endif
return (- (memDist+memNLU));
diff --git a/SRC/psutil.c b/SRC/psutil.c
index 31c7f962..1f3606aa 100755
--- a/SRC/psutil.c
+++ b/SRC/psutil.c
@@ -17,11 +17,17 @@ at the top-level directory.
* -- Distributed SuperLU routine (version 2.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* March 15, 2003
+ *
+ * Last modified:
+ * December 28, 2022
*
*/
#include
#include "superlu_sdefs.h"
+#ifdef GPU_ACC
+#include "gpu_api_utils.h"
+#endif
/*! \brief Gather A from the distributed compressed row format to global A in compressed column format.
*/
@@ -448,27 +454,38 @@ sDestroy_LU(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct)
nsupers = Glu_persist->supno[n-1] + 1;
- nb = CEILING(nsupers, grid->npcol);
- for (i = 0; i < nb; ++i)
- if ( Llu->Lrowind_bc_ptr[i] ) {
- SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
-#if 0 // Sherry: the following is not allocated with cudaHostAlloc
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->npcol);
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Lrowind_bc_ptr[i] ) {
+ // SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#if 0 // Sherry: the following is not allocated with cudaHostAlloc
//#ifdef GPU_ACC
checkGPU(gpuFreeHost(Llu->Lnzval_bc_ptr[i]));
#endif
- SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
- }
+ // SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+ // }
+
SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+ SUPERLU_FREE (Llu->Lrowind_bc_dat);
+ SUPERLU_FREE (Llu->Lrowind_bc_offset);
SUPERLU_FREE (Llu->Lnzval_bc_ptr);
-
- nb = CEILING(nsupers, grid->nprow);
- for (i = 0; i < nb; ++i)
- if ( Llu->Ufstnz_br_ptr[i] ) {
- SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
- SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
- }
+ SUPERLU_FREE (Llu->Lnzval_bc_dat);
+ SUPERLU_FREE (Llu->Lnzval_bc_offset);
+
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->nprow);
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Ufstnz_br_ptr[i] ) {
+ // SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+ // SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+ // }
SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+ SUPERLU_FREE (Llu->Ufstnz_br_dat);
+ SUPERLU_FREE (Llu->Ufstnz_br_offset);
SUPERLU_FREE (Llu->Unzval_br_ptr);
+ SUPERLU_FREE (Llu->Unzval_br_dat);
+ SUPERLU_FREE (Llu->Unzval_br_offset);
/* The following can be freed after factorization. */
SUPERLU_FREE(Llu->ToRecv);
@@ -486,39 +503,78 @@ sDestroy_LU(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct)
SUPERLU_FREE(Llu->bsendx_plist);
SUPERLU_FREE(Llu->mod_bit);
- nb = CEILING(nsupers, grid->npcol);
- for (i = 0; i < nb; ++i)
- if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
- SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
- }
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->npcol);
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
+ // SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
+ // }
SUPERLU_FREE(Llu->Lindval_loc_bc_ptr);
-
- nb = CEILING(nsupers, grid->npcol);
- for (i=0; iLinv_bc_ptr[i]!=NULL) {
- SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
- }
- if(Llu->Uinv_bc_ptr[i]!=NULL){
- SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
- }
- }
+ SUPERLU_FREE(Llu->Lindval_loc_bc_dat);
+ SUPERLU_FREE(Llu->Lindval_loc_bc_offset);
+
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->npcol);
+ // for (i=0; iLinv_bc_ptr[i]!=NULL) {
+ // SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
+ // }
+ // if(Llu->Uinv_bc_ptr[i]!=NULL){
+ // SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
+ // }
+ // }
SUPERLU_FREE(Llu->Linv_bc_ptr);
+ SUPERLU_FREE(Llu->Linv_bc_dat);
+ SUPERLU_FREE(Llu->Linv_bc_offset);
SUPERLU_FREE(Llu->Uinv_bc_ptr);
+ SUPERLU_FREE(Llu->Uinv_bc_dat);
+ SUPERLU_FREE(Llu->Uinv_bc_offset);
SUPERLU_FREE(Llu->Unnz);
-
- nb = CEILING(nsupers, grid->npcol);
- for (i = 0; i < nb; ++i)
- if ( Llu->Urbs[i] ) {
- SUPERLU_FREE(Llu->Ucb_indptr[i]);
- SUPERLU_FREE(Llu->Ucb_valptr[i]);
- }
+
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->npcol);
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Urbs[i] ) {
+ // SUPERLU_FREE(Llu->Ucb_indptr[i]);
+ // SUPERLU_FREE(Llu->Ucb_valptr[i]);
+ // }
SUPERLU_FREE(Llu->Ucb_indptr);
+ SUPERLU_FREE(Llu->Ucb_inddat);
+ SUPERLU_FREE(Llu->Ucb_indoffset);
SUPERLU_FREE(Llu->Ucb_valptr);
+ SUPERLU_FREE(Llu->Ucb_valdat);
+ SUPERLU_FREE(Llu->Ucb_valoffset);
SUPERLU_FREE(Llu->Urbs);
-
+
SUPERLU_FREE(Glu_persist->xsup);
SUPERLU_FREE(Glu_persist->supno);
+#ifdef GPU_ACC
+ checkGPU (gpuFree (Llu->d_xsup));
+ checkGPU (gpuFree (Llu->d_LRtree_ptr));
+ checkGPU (gpuFree (Llu->d_LBtree_ptr));
+ checkGPU (gpuFree (Llu->d_URtree_ptr));
+ checkGPU (gpuFree (Llu->d_UBtree_ptr));
+ checkGPU (gpuFree (Llu->d_ilsum));
+ checkGPU (gpuFree (Llu->d_Lrowind_bc_dat));
+ checkGPU (gpuFree (Llu->d_Lrowind_bc_offset));
+ checkGPU (gpuFree (Llu->d_Lnzval_bc_dat));
+ checkGPU (gpuFree (Llu->d_Lnzval_bc_offset));
+ checkGPU (gpuFree (Llu->d_Linv_bc_dat));
+ checkGPU (gpuFree (Llu->d_Uinv_bc_dat));
+ checkGPU (gpuFree (Llu->d_Linv_bc_offset));
+ checkGPU (gpuFree (Llu->d_Uinv_bc_offset));
+ checkGPU (gpuFree (Llu->d_Lindval_loc_bc_dat));
+ checkGPU (gpuFree (Llu->d_Lindval_loc_bc_offset));
+
+ checkGPU (gpuFree (Llu->d_Ucolind_bc_dat));
+ checkGPU (gpuFree (Llu->d_Ucolind_bc_offset));
+ checkGPU (gpuFree (Llu->d_Unzval_bc_dat));
+ checkGPU (gpuFree (Llu->d_Unzval_bc_offset));
+ checkGPU (gpuFree (Llu->d_Uindval_loc_bc_dat));
+ checkGPU (gpuFree (Llu->d_Uindval_loc_bc_offset));
+#endif
+
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit sDestroy_LU()");
#endif
@@ -531,7 +587,7 @@ sDestroy_LU(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct)
* =======
* Set up the communication pattern for redistribution between B and X
* in the triangular solution.
- *
+ *
* Arguments
* =========
*
@@ -602,7 +658,7 @@ psgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row,
p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
++SendCnt[p];
}
-
+
/* Set up the displacements for alltoall. */
MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
sdispls[0] = rdispls[0] = 0;
@@ -805,7 +861,7 @@ void sDestroy_A3d_gathered_on_2d(sSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid
SUPERLU_FREE( A2d->colind );
SUPERLU_FREE( A2d->nzval );
}
- SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts
+ SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts
SUPERLU_FREE(A3d->row_disp);
SUPERLU_FREE(A3d->nnz_counts_int);
SUPERLU_FREE(A3d->nnz_disp);
@@ -830,9 +886,9 @@ void psinf_norm_error(int iam, int_t n, int_t nrhs, float x[], int_t ldx,
{
float err, xnorm, temperr, tempxnorm;
float *x_work, *xtrue_work;
+ int i, j;
float errcomp; // componentwise error
double derr;
- int i, j;
for (j = 0; j < nrhs; j++) {
x_work = &x[j*ldx];
@@ -856,7 +912,6 @@ void psinf_norm_error(int iam, int_t n, int_t nrhs, float x[], int_t ldx,
err = err / xnorm;
if ( !iam ) {
printf(".. Sol %2d: ||X - Xtrue|| / ||X|| = %e\t max_i |x - xtrue|_i / |x|_i = %e\n", j, err, errcomp);
- //printf("\t ||x||_inf = %e\n", xnorm);
fflush(stdout);
}
}
@@ -866,38 +921,38 @@ void psinf_norm_error(int iam, int_t n, int_t nrhs, float x[], int_t ldx,
void
sDestroy_Tree(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct)
{
- int_t i, nb, nsupers;
+ int i, nb, nsupers;
Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
sLocalLU_t *Llu = LUstruct->Llu;
#if ( DEBUGlevel>=1 )
int iam;
MPI_Comm_rank( MPI_COMM_WORLD, &iam );
- CHECK_MALLOC(iam, "Enter Destroy_Tree()");
+ CHECK_MALLOC(iam, "Enter sDestroy_Tree()");
#endif
nsupers = Glu_persist->supno[n-1] + 1;
nb = CEILING(nsupers, grid->npcol);
for (i=0;iLBtree_ptr[i].empty_==NO){
+ if(Llu->LBtree_ptr[i].empty_==NO){
// BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt);
C_BcTree_Nullify(&Llu->LBtree_ptr[i]);
}
- if(Llu->UBtree_ptr[i].empty_==NO){
+ if(Llu->UBtree_ptr[i].empty_==NO){
// BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt);
C_BcTree_Nullify(&Llu->UBtree_ptr[i]);
}
}
SUPERLU_FREE(Llu->LBtree_ptr);
SUPERLU_FREE(Llu->UBtree_ptr);
-
+
nb = CEILING(nsupers, grid->nprow);
for (i=0;iLRtree_ptr[i].empty_==NO){
+ if(Llu->LRtree_ptr[i].empty_==NO){
// RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt);
C_RdTree_Nullify(&Llu->LRtree_ptr[i]);
}
- if(Llu->URtree_ptr[i].empty_==NO){
+ if(Llu->URtree_ptr[i].empty_==NO){
// RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt);
C_RdTree_Nullify(&Llu->URtree_ptr[i]);
}
diff --git a/SRC/psymbfact.c b/SRC/psymbfact.c
index f865c094..9ea385a0 100644
--- a/SRC/psymbfact.c
+++ b/SRC/psymbfact.c
@@ -1,3 +1,4 @@
+
/*! \file
Copyright (c) 2003, The Regents of the University of California, through
Lawrence Berkeley National Laboratory (subject to receipt of any required
@@ -268,12 +269,15 @@ float symbfact_dist
comm_symbfact_t CS; /* information on communication */
/* relaxation parameters (for future release) and
statistics collected during the symbolic factorization */
- psymbfact_stat_t PS;
+ psymbfact_stat_t PS; /* Sherry: PS.allocMem is the total memory allocated (in Bytes)
+ This is locally collected and needs to be summed up.
+ What's the relation with symb_mem_usage? */
/* temp array of size n, used as a marker by the subroutines */
int_t *tempArray;
int_t i, j, k;
int_t fstVtx, lstVtx, mark, fstVtx_lid, vtx_lid, maxNvtcsPProc;
- int_t nnz_asup_loc, nnz_ainf_loc, fill_rcmd;
+ int_t nnz_asup_loc, nnz_ainf_loc;
+ int_t fill_rcmd; /* fill ratio */
float totalMemLU, overestimMem;
MPI_Comm *commLvls;
@@ -343,12 +347,13 @@ float symbfact_dist
if ((flinfo =
symbfact_mapVtcs (iam, nprocs_num, nprocs_symb, A, fstVtxSep, sizes,
Pslu_freeable, &VInfo, tempArray, maxSzBlk, &PS)) > 0)
- return (flinfo);
+ return (flinfo); /* Number of bytes allocated so far when running out of memory */
maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc;
/* Redistribute matrix A on processors following the distribution found
in symbfact_mapVtcs. Store the redistributed A temporarily into AS */
+ /* Sherry: should add argument PS.allocMem ?? */
symbfact_distributeMatrix (iam, nprocs_num, nprocs_symb, A,
perm_c, perm_r, &AS,
Pslu_freeable, &VInfo, tempArray, num_comm);
@@ -568,17 +573,17 @@ float symbfact_dist
t_symbFact_loc[1] = SuperLU_timer_() - t_symbFact_loc[1];
#endif
-#if ( PRNTlevel>=1 )
estimate_memUsage (n, iam, symb_mem_usage,
&totalMemLU, &overestimMem,
Pslu_freeable, &Llu_symbfact, &VInfo, &CS, &PS);
+#if ( PRNTlevel>=1 )
stat_loc[0] = (float) nnzL;
stat_loc[1] = (float) nnzU;
stat_loc[2] = (float) nsuper_loc;
stat_loc[3] = (float) Pslu_freeable->xlsub[VInfo.nvtcs_loc];
stat_loc[4] = (float) Pslu_freeable->xusub[VInfo.nvtcs_loc];
- stat_loc[5] = totalMemLU;
- stat_loc[6] = overestimMem;
+ stat_loc[5] = totalMemLU; // include the unused holes
+ stat_loc[6] = overestimMem; // the unused leftover holes
stat_loc[7] = totalMemLU - overestimMem;
stat_loc[8] = (float) PS.maxSzBuf;
stat_loc[9] = (float) PS.nDnsUpSeps;
@@ -664,6 +669,9 @@ float symbfact_dist
printf("\tParSYMBfact (MB) :\tL\\U MAX %.2f\tAVG %.2f\n",
mem_glob[0]*1e-6,
stat_glob[5]/nprocs_symb*1e-6);
+ /* the allocated memory recorded by PS.allocMem is still needed in distribution */
+ printf("\t\tworking memory PS.allocMem (MB):\t%.2f\n", PS.allocMem*1e-6);
+
#if ( PRNTlevel>=2 )
printf("\tRL overestim (MB):\tL\\U MAX %.2f\tAVG %.2f\n",
mem_glob[1]*1e-6,
@@ -748,6 +756,9 @@ float symbfact_dist
SUPERLU_FREE (time_lvls);
SUPERLU_FREE (time_lvlsT);
#endif
+ /* free communication buffers,
+ but does not seem to free the PS.allocMem part
+ */
symbfact_free (iam, nprocs_symb, &Llu_symbfact, &VInfo, &CS);
} /* if (iam < nprocs_symb) */
else {
@@ -772,8 +783,15 @@ float symbfact_dist
CHECK_MALLOC(iam, "Exit psymbfact()");
#endif
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t\tbefore exit symbfact_dist(): PS.allocMem (MB):\t%.2f\n", PS.allocMem*1e-6);
+ fflush(stdout);
+ }
+#endif
return (- PS.allocMem);
-} /* SYMBFACT_DIST */
+
+} /* end SYMBFACT_DIST */
static int_t
@@ -979,7 +997,7 @@ cntsVtcs
PS->allocMem -= n * sizeof(int_t);
}
return (SUCCES_RET);
-}
+} /* cntsVtcs */
static float
symbfact_mapVtcs
@@ -1024,6 +1042,10 @@ symbfact_mapVtcs
* computed. The array globToLoc and maxNvtcsPProc of Pslu_freeable
* are also computed.
*
+ *
+ * Return: 0 : success
+ * >0 : number of bytes allocated during parallel symbolic factorization
+ * when run out of memory
*/
int szSep, npNode, firstP, p, iSep, jSep, ind_ap_s, ind_ap_d;
int_t k, n, kk;
@@ -1038,6 +1060,10 @@ symbfact_mapVtcs
int_t *vtcs_pe; /* contains the number of vertices on each processor */
int *avail_pes; /* contains the processors to be used at each level */
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Enter symbfact_mapVtcs()");
+#endif
+
n = A->ncol;
/* allocate memory */
if (!(globToLoc = intMalloc_dist(n + 1))) {
@@ -1217,8 +1243,13 @@ symbfact_mapVtcs
VInfo->begEndBlks_loc = begEndBlks_loc;
VInfo->fstVtx_nextLvl = begEndBlks_loc[0];
}
- return SUCCES_RET;
-}
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit symbfact_mapVtcs()");
+#endif
+
+ return SUCCES_RET; /* 0 */
+
+} /* symbfact_mapVtcs */
static void
symbfact_distributeMatrix
@@ -1277,6 +1308,10 @@ symbfact_distributeMatrix
int_t *x_ainf, *x_asup, *ind_ainf, *ind_asup;
int *intBuf1, *intBuf2, *intBuf3, *intBuf4;
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Enter symbfact_distributeMatrix()");
+#endif
+
/* ------------------------------------------------------------
INITIALIZATION.
------------------------------------------------------------*/
@@ -1347,6 +1382,7 @@ symbfact_distributeMatrix
nnz_iam = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
/* Allocate temporary storage for sending/receiving the A triplets. */
+ /* Sherry: need to add to memory peak */
if (!(snd_aind = intMalloc_symbfact(SendCnt)) && SendCnt != 0)
ABORT("Malloc fails for snd_aind[].");
if ( !(rcv_aind = intMalloc_symbfact(nnz_iam + 1)))
@@ -1530,6 +1566,7 @@ symbfact_distributeMatrix
/* ------------------------------------------------------------
Allocate space for storing indices of A after redistribution.
------------------------------------------------------------*/
+ /* Sherry: need to add to memory peak */
if (!(x_ainf = intCalloc_symbfact (nvtcs_loc + 1)))
ABORT("Malloc fails for x_ainf[].");
if (!(x_asup = intCalloc_symbfact (nvtcs_loc + 1)))
@@ -1576,6 +1613,7 @@ symbfact_distributeMatrix
x_asup[nvtcs_loc] = j;
/* Allocate space for storing indices of A after conversion */
+ /* Sherry: need to add to memory peak */
if ( !(ind_ainf = intMalloc_symbfact(x_ainf[nvtcs_loc])) && x_ainf[nvtcs_loc] != 0 )
ABORT("Malloc fails for ind_ainf[].");
if ( !(ind_asup = intMalloc_symbfact(x_asup[nvtcs_loc])) && x_asup[nvtcs_loc] != 0)
@@ -1629,7 +1667,13 @@ symbfact_distributeMatrix
VInfo->nnz_asup_loc = x_asup[nvtcs_loc];
VInfo->nnz_ainf_loc = x_ainf[nvtcs_loc];
}
-}
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit symbfact_distributeMatrix()");
+#endif
+
+} /* end symbfact_distributeMatrix */
+
static
float allocPrune_lvl
@@ -1659,6 +1703,11 @@ float allocPrune_lvl
float alpha = 1.5;
int_t FILL = sp_ienv_dist(6, options);
+#if ( DEBUGlevel>=1 )
+ int iam = -1;
+ CHECK_MALLOC(iam, "Enter allocPrune_lvl()");
+#endif
+
nvtcs_loc = VInfo->nvtcs_loc;
no_expand_pr = 0;
@@ -1731,7 +1780,13 @@ float allocPrune_lvl
Llu_symbfact->indUsubPr = 0;
Llu_symbfact->no_expand_pr += no_expand_pr;
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit allocPrune_lvl()");
+#endif
+
return 0;
+
}
static float
@@ -1764,6 +1819,11 @@ allocPrune_domain
float alpha = 1.5;
int_t FILL = 2 * sp_ienv_dist(6, options);
+#if ( DEBUGlevel>=1 )
+ int iam = -1;
+ CHECK_MALLOC(iam, "Enter allocPrune_domain()");
+#endif
+
nvtcs_loc = VInfo->nvtcs_loc;
no_expand_pr = 0;
@@ -1820,6 +1880,11 @@ allocPrune_domain
Llu_symbfact->no_expand_pr = no_expand_pr;
Llu_symbfact->no_expcp = 0;
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit allocPrune_domain()");
+#endif
+
return 0;
}
@@ -1861,6 +1926,11 @@ int symbfact_alloc
float alpha = 1.5;
int_t FILL = sp_ienv_dist(6, options);
+#if ( DEBUGlevel>=1 )
+ int iam = -1;
+ CHECK_MALLOC(iam, "Enter symbfact_alloc()");
+#endif
+
nvtcs_loc = VInfo->nvtcs_loc;
nnz_a_loc = VInfo->nnz_ainf_loc + VInfo->nnz_asup_loc;
nlvls = (int) LOG2( nprocs ) + 1;
@@ -1938,7 +2008,11 @@ int symbfact_alloc
Llu_symbfact->no_expand = no_expand;
- return SUCCES_RET;
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit symbfact_alloc()");
+#endif
+
+ return SUCCES_RET; /* 0 */
} /* SYMBFACT_ALLOC */
static int_t
@@ -2306,7 +2380,7 @@ updateRcvd_prGraph
}
static int_t
-update_prGraph
+update_prGraph
(
int iam,
int_t n, /* order of the matrix */
@@ -2406,7 +2480,7 @@ update_prGraph
k ++;
}
return SUCCES_RET;
-}
+} /* end update_prGraph */
static int_t
blk_symbfact
@@ -2820,7 +2894,7 @@ blk_symbfact
*p_nsuper_loc = nsuper_loc;
return 0;
-}
+} /* blk_symbfact */
static void
domain_symbfact
@@ -2851,6 +2925,10 @@ domain_symbfact
{
int_t lstVtx_lid, maxNvtcsPProc;
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Enter domain_symbfact()");
+#endif
+
/* call blk_symbfact */
blk_symbfact (A, iam, lvl,
szSep, ind_sizes1, ind_sizes2, sizes, fstVtxSep,
@@ -2873,6 +2951,11 @@ domain_symbfact
Llu_symbfact->xusub[lstVtx_lid] = *p_nextu;
}
VInfo->maxNeltsVtx -= lstVtx - fstVtx;
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit domain_symbfact()");
+#endif
+
}
@@ -2914,6 +2997,10 @@ initLvl_symbfact
int_t use_fillcnts, cntelt_vtx_l, cntelt_vtx_u;
MPI_Status status;
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Enter initLvl_symbfact()");
+#endif
+
fill = PS->fill_par;
VInfo->filledSep = FALSE;
@@ -3078,6 +3165,11 @@ initLvl_symbfact
VInfo->nnz_asup_loc -= nelts_asup;
}
VInfo->fstVtx_nextLvl = fstVtx_nextLvl;
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit initLvl_symbfact()");
+#endif
+
}
@@ -3519,7 +3611,7 @@ rl_update
*pmarkl = markl;
return 0;
-}
+} /* end rl_update */
static int_t
dnsUpSeps_symbfact
@@ -4215,6 +4307,10 @@ interLvl_symbfact
int_t req_ind, sent_msgs, req_ind_snd;
int_t initInfo_loc[2], initInfo_gl[2];
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Enter interLvl_symbfact()");
+#endif
+
/* Initialization */
n = A->ncol;
fstVtx = fstVtxSep[ind_sizes2];
@@ -4575,6 +4671,10 @@ interLvl_symbfact
if (request_rcv != NULL) SUPERLU_FREE (request_rcv);
if (status != NULL) SUPERLU_FREE (status);
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit interLvl_symbfact()");
+#endif
+
return 0;
}
@@ -4713,6 +4813,10 @@ intraLvl_symbfact
MPI_Status status[4];
MPI_Request request[4];
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Enter intraLvl_symbfact()");
+#endif
+
/* Initializations */
lsub = Llu_symbfact->lsub; xlsub = Llu_symbfact->xlsub;
usub = Llu_symbfact->usub; xusub = Llu_symbfact->xusub;
@@ -5123,6 +5227,11 @@ intraLvl_symbfact
/* if current separator dense, then reset value of filledSep */
if (VInfo->filledSep == FILLED_SEP)
VInfo->filledSep = FALSE;
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit intraLvl_symbfact()");
+#endif
+
}
static void
@@ -5177,9 +5286,13 @@ estimate_memUsage
int_t n, /* Input - order of the matrix */
int iam, /* Input - my processor number */
superlu_dist_mem_usage_t *symb_mem_usage,
- float *p_totalMemLU, /* Output -memory used for symbolic factorization */
- float *p_overestimMem, /* Output -memory allocated during to right looking
- overestimation memory usage */
+ float *p_totalMemLU, /* Output -memory used for symbolic factorization.
+ This also includes the overestimMem below.
+ */
+ float *p_overestimMem, /* Output -memory allocated during the right looking
+ overestimation memory usage.
+ This is the "hole" leftover in the LU arrays.
+ */
Pslu_freeable_t *Pslu_freeable, /* global LU data structures (modified) */
Llu_symbfact_t *Llu_symbfact, /* Input - local L, U data structures */
vtcsInfo_symbfact_t *VInfo, /* Input - local info on vertices distribution */
@@ -5218,11 +5331,15 @@ estimate_memUsage
*p_totalMemLU = lu_mem;
*p_overestimMem = overestimMem;
-
+
+
+ /* see Llu_symbfact_t{} structure */
symb_mem_usage->for_lu = (float) ((3 * nvtcs_loc + 2 * nsuper_loc) * lword);
symb_mem_usage->for_lu += (float) (Llu_symbfact->xlsub[nvtcs_loc] * lword);
- symb_mem_usage->for_lu += (float) (Llu_symbfact->xusub[nvtcs_loc] * lword);
+ symb_mem_usage->for_lu += (float) (Llu_symbfact->xusub[nvtcs_loc] * lword);
+
symb_mem_usage->total = lu_mem;
+
}
@@ -5251,4 +5368,3 @@ intCalloc_symbfact(int_t n)
for (i = 0; i < n; i++) buf[i] = 0;
return (buf);
}
-
diff --git a/SRC/psymbfact.h b/SRC/psymbfact.h
index 2ef18fb1..c394d4a6 100644
--- a/SRC/psymbfact.h
+++ b/SRC/psymbfact.h
@@ -238,7 +238,9 @@ typedef struct {
int_t maxSzUPr; /* maximum size of pruned U */
int_t maxSzBuf; /* maximum size of the send and receive buffers */
int_t szDnsSep; /* size of memory used when there are dense separators */
- float allocMem; /* size of the total memory allocated (in bytes) */
+ float allocMem; /* size of the total memory allocated (in bytes)
+ This is the working storage; it does not include the LU data arrays
+ */
} psymbfact_stat_t;
/* MACROS */
diff --git a/SRC/psymbfact_util.c b/SRC/psymbfact_util.c
index d85d02b8..19afa2b4 100644
--- a/SRC/psymbfact_util.c
+++ b/SRC/psymbfact_util.c
@@ -84,6 +84,8 @@ static int_t *expand
* Return value: 0 - successful return
* > 0 - number of bytes allocated when run out of space
*
+ *
+ * Sherry: this function is used inside the domains.
*/
/************************************************************************/
int_t psymbfact_LUXpandMem
@@ -160,6 +162,13 @@ int_t psymbfact_LUXpandMem
iam, mem_type, (long) prev_len, (long) min_new_len, (long) new_len);
return ERROR_RET;
}
+
+#if ( PRNTlevel>=1 )
+ if (iam==0) { // Sherry
+ printf("\t.. psymbfact_LUXpandMem::after expand() PS->allocMem (MB) %.2f\n", PS->allocMem*1e-6);
+ fflush(stdout);
+ }
+#endif
xsub_nextLvl = new_len - len_tcopy_fend;
@@ -196,7 +205,7 @@ int_t psymbfact_LUXpandMem
Llu_symbfact->no_expand ++;
return SUCCES_RET;
-}
+} /* end psymbfact_LUXpandMem */
/*! \brief
*
@@ -205,6 +214,9 @@ int_t psymbfact_LUXpandMem
* Return value: SUCCES_RET - successful return
* ERROR_RET - error due to a memory alocation failure
*
+ *
+ * Sherry: this function is used in the upper separator tree above the domains.
+ * It does not call 'expand()'
*/
/************************************************************************/
int_t psymbfact_LUXpand
@@ -367,7 +379,7 @@ int_t psymbfact_LUXpand
Llu_symbfact->no_expcp ++;
return SUCCES_RET;
-}
+} /* end psymbfact_LUXpand */
/*! \brief
*
@@ -376,6 +388,9 @@ int_t psymbfact_LUXpand
* Return value: 0 - successful return
* > 0 - number of bytes allocated when run out of space
*
+ *
+ * Sherry: this function calls psymbfact_LUXpandMem().
+ *
*/
/************************************************************************/
int_t psymbfact_LUXpand_RL
@@ -485,7 +500,7 @@ int_t psymbfact_LUXpand_RL
Llu_symbfact->no_expcp ++;
return SUCCES_RET;
-}
+} /* end psymbfact_LUXpand_RL */
/*! \brief
*
@@ -494,6 +509,9 @@ int_t psymbfact_LUXpand_RL
* Return value: SUCCES_RET - successful return
* ERROR_RET - error when run out of space
*
+ *
+ * Sherry: this function calls 'expand()' directly.
+ *
*/
/************************************************************************/
int_t psymbfact_prLUXpand
@@ -535,6 +553,13 @@ int_t psymbfact_prLUXpand
fprintf(stderr, "Can't expand MemType %d: \n", mem_type);
return (ERROR_RET);
}
+
+#if ( PRNTlevel>=1 )
+ if (iam==0) { // Sherry
+ printf("\t.. psymbfact_prLUXpand::after expand() PS->allocMem (MB) %.2f\n", PS->allocMem*1e-6);
+ fflush(stdout);
+ }
+#endif
Llu_symbfact->no_expand_pr ++;
if ( mem_type == LSUB_PR ) {
@@ -548,4 +573,4 @@ int_t psymbfact_prLUXpand
SUPERLU_FREE (prev_mem);
return SUCCES_RET;
-}
+} /* end psymbfact_prLUXpand */
diff --git a/SRC/pzdistribute.c b/SRC/pzdistribute.c
index 2a15a0e4..b2f63086 100644
--- a/SRC/pzdistribute.c
+++ b/SRC/pzdistribute.c
@@ -16,11 +16,14 @@ at the top-level directory.
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 15, 2008
* October 18, 2021, minor fix, v7.1.1
+ * January 9, 2023, add new data structures for SpTRSV
*
*/
#include "superlu_zdefs.h"
-
+#ifdef GPU_ACC
+#include "gpu_api_utils.h"
+#endif
/*! \brief
*
@@ -401,21 +404,42 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int *index1; /* temporary pointer to array of int */
doublecomplex *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */
+ long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
+
int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Unnz; /* size ceil(NSUPERS/Pc) */
- doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Lrowind_bc_dat; /* size: sum of sizes of Lrowind_bc_ptr[lk]) */
+ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+
+ int_t *Unnz; /* size ceil(NSUPERS/Pc) */
+ doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
+ doublecomplex *Unzval_br_dat; /* size: sum of sizes of Unzval_br_ptr[lk]) */
+ long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Unzval_br_cnt=0;
int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */
+ long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Ufstnz_br_cnt=0;
- C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
- int msgsize;
+ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
+ int msgsize;
int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+ Ucb_indptr_t *Ucb_inddat;
+ long int *Ucb_indoffset;
+ long int Ucb_indcnt=0;
int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+ int_t *Ucb_valdat;
+ long int *Ucb_valoffset;
+ long int Ucb_valcnt=0;
+
/*-- Counts to be used in factorization. --*/
int *ToRecv, *ToSendD, **ToSendR;
@@ -463,7 +487,11 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int *frecv, *brecv;
int_t *lloc;
doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */
+ long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */
+ long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */
double *SeedSTD_BC,*SeedSTD_RD;
int_t idx_indx,idx_lusup;
int_t nbrow;
@@ -674,8 +702,18 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(Unzval_br_ptr =
(doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
ABORT("Malloc fails for Unzval_br_ptr[].");
+ if ( !(Unzval_br_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Unzval_br_offset[].");
+ }
+ Unzval_br_offset[k-1] = -1;
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Ufstnz_br_ptr[].");
+ if ( !(Ufstnz_br_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ufstnz_br_offset[].");
+ }
+ Ufstnz_br_offset[k-1] = -1;
if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
ABORT("Malloc fails for ToSendD[].");
@@ -766,8 +804,13 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(index = intMalloc_dist(len1+1)) )
ABORT("Malloc fails for Uindex[].");
Ufstnz_br_ptr[lb] = index;
+ Ufstnz_br_offset[lb]=len1+1;
+ Ufstnz_br_cnt += Ufstnz_br_offset[lb];
if ( !(Unzval_br_ptr[lb] = doublecomplexMalloc_dist(len)) )
ABORT("Malloc fails for Unzval_br_ptr[*][].");
+ Unzval_br_offset[lb]=len;
+ Unzval_br_cnt += Unzval_br_offset[lb];
+
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
index[0] = Ucbs[lb]; /* Number of column blocks */
@@ -777,6 +820,8 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} else {
Ufstnz_br_ptr[lb] = NULL;
Unzval_br_ptr[lb] = NULL;
+ Unzval_br_offset[lb]=-1;
+ Ufstnz_br_offset[lb]=-1;
}
Urb_length[lb] = 0; /* Reset block length. */
Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
@@ -825,22 +870,47 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lrowind_bc_ptr[].");
Lrowind_bc_ptr[k-1] = NULL;
+ if ( !(Lrowind_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lrowind_bc_offset[].");
+ }
+ Lrowind_bc_offset[k-1] = -1;
+ if ( !(Lnzval_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_offset[].");
+ }
+ Lnzval_bc_offset[k-1] = -1;
if ( !(Lindval_loc_bc_ptr =
(int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
Lindval_loc_bc_ptr[k-1] = NULL;
+ if ( !(Lindval_loc_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lindval_loc_bc_offset[].");
+ }
+ Lindval_loc_bc_offset[k-1] = -1;
if ( !(Linv_bc_ptr =
- (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
+ (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
}
+ if ( !(Linv_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Linv_bc_offset[].");
+ }
if ( !(Uinv_bc_ptr =
- (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
+ (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
}
+ if ( !(Uinv_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Uinv_bc_offset[].");
+ }
Linv_bc_ptr[k-1] = NULL;
Uinv_bc_ptr[k-1] = NULL;
+ Linv_bc_offset[k-1] = -1;
+ Uinv_bc_offset[k-1] = -1;
if ( !(Unnz =
(int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
@@ -871,6 +941,11 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
+ long int Linv_bc_cnt=0;
+ long int Uinv_bc_cnt=0;
+ long int Lrowind_bc_cnt=0;
+ long int Lnzval_bc_cnt=0;
+ long int Lindval_loc_bc_cnt=0;
for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
pc = PCOL( jb, grid );
@@ -1013,14 +1088,37 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
if ( !(index = intMalloc_dist(len1)) )
ABORT("Malloc fails for index[]");
+ Lrowind_bc_offset[ljb]=len1;
+ Lrowind_bc_cnt += Lrowind_bc_offset[ljb];
+
if (!(lusup = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex))))
ABORT("Malloc fails for lusup[]");
+ Lnzval_bc_offset[ljb]=len*nsupc;
+ Lnzval_bc_cnt += Lnzval_bc_offset[ljb];
if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) )
ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
- if (!(Linv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
- ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
- if (!(Uinv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
- ABORT("Malloc fails for Uinv_bc_ptr[ljb][]");
+ Lindval_loc_bc_offset[ljb]=nrbl*3;
+ Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb];
+
+ myrow = MYROW( iam, grid );
+ krow = PROW( jb, grid );
+ if(myrow==krow){ /* diagonal block */
+ if (!(Linv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
+ ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
+ Linv_bc_offset[ljb]=nsupc*nsupc;
+ Linv_bc_cnt += Linv_bc_offset[ljb];
+
+ if (!(Uinv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
+ ABORT("Malloc fails for Uinv_bc_ptr[ljb][]");
+ Uinv_bc_offset[ljb]=nsupc*nsupc;
+ Uinv_bc_cnt += Uinv_bc_offset[ljb];
+ }else{
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ }
+
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
@@ -1133,9 +1231,14 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} else {
Lrowind_bc_ptr[ljb] = NULL;
Lnzval_bc_ptr[ljb] = NULL;
- Linv_bc_ptr[ljb] = NULL;
- Uinv_bc_ptr[ljb] = NULL;
- Lindval_loc_bc_ptr[ljb] = NULL;
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Lrowind_bc_offset[ljb]=-1;
+ Lindval_loc_bc_offset[ljb]=-1;
+ Lnzval_bc_offset[ljb]=-1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ Lindval_loc_bc_ptr[ljb] = NULL;
} /* if nrbl ... */
#if ( PROFlevel>=1 )
t_l += SuperLU_timer_() - t;
@@ -1144,6 +1247,98 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} /* for jb ... */
+ Linv_bc_cnt +=1; // safe guard
+ Uinv_bc_cnt +=1;
+ Lrowind_bc_cnt +=1;
+ Lindval_loc_bc_cnt +=1;
+ Lnzval_bc_cnt +=1;
+ if ( !(Linv_bc_dat =
+ (doublecomplex*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(doublecomplex))) ) {
+ fprintf(stderr, "Malloc fails for Linv_bc_dat[].");
+ }
+ if ( !(Uinv_bc_dat =
+ (doublecomplex*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(doublecomplex))) ) {
+ fprintf(stderr, "Malloc fails for Uinv_bc_dat[].");
+ }
+
+ if ( !(Lrowind_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Lrowind_bc_dat[].");
+ }
+ if ( !(Lindval_loc_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Lindval_loc_bc_dat[].");
+ }
+ if ( !(Lnzval_bc_dat =
+ (doublecomplex*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(doublecomplex))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
+ }
+
+ /* use contiguous memory for Linv_bc_ptr, Uinv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr, Lindval_loc_bc_ptr */
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Linv_bc_cnt=0;
+ Uinv_bc_cnt=0;
+ Lrowind_bc_cnt=0;
+ Lnzval_bc_cnt=0;
+ Lindval_loc_bc_cnt=0;
+ long int tmp_cnt;
+ for (jb = 0; jb < k; ++jb) { /* for each block column ... */
+ if(Linv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) {
+ Linv_bc_dat[Linv_bc_cnt+jj]=Linv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Linv_bc_ptr[jb]);
+ Linv_bc_ptr[jb]=&Linv_bc_dat[Linv_bc_cnt];
+ tmp_cnt = Linv_bc_offset[jb];
+ Linv_bc_offset[jb]=Linv_bc_cnt;
+ Linv_bc_cnt+=tmp_cnt;
+ }
+
+ if(Uinv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) {
+ Uinv_bc_dat[Uinv_bc_cnt+jj]=Uinv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Uinv_bc_ptr[jb]);
+ Uinv_bc_ptr[jb]=&Uinv_bc_dat[Uinv_bc_cnt];
+ tmp_cnt = Uinv_bc_offset[jb];
+ Uinv_bc_offset[jb]=Uinv_bc_cnt;
+ Uinv_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lrowind_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) {
+ Lrowind_bc_dat[Lrowind_bc_cnt+jj]=Lrowind_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lrowind_bc_ptr[jb]);
+ Lrowind_bc_ptr[jb]=&Lrowind_bc_dat[Lrowind_bc_cnt];
+ tmp_cnt = Lrowind_bc_offset[jb];
+ Lrowind_bc_offset[jb]=Lrowind_bc_cnt;
+ Lrowind_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lnzval_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) {
+ Lnzval_bc_dat[Lnzval_bc_cnt+jj]=Lnzval_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lnzval_bc_ptr[jb]);
+ Lnzval_bc_ptr[jb]=&Lnzval_bc_dat[Lnzval_bc_cnt];
+ tmp_cnt = Lnzval_bc_offset[jb];
+ Lnzval_bc_offset[jb]=Lnzval_bc_cnt;
+ Lnzval_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lindval_loc_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) {
+ Lindval_loc_bc_dat[Lindval_loc_bc_cnt+jj]=Lindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lindval_loc_bc_ptr[jb]);
+ Lindval_loc_bc_ptr[jb]=&Lindval_loc_bc_dat[Lindval_loc_bc_cnt];
+ tmp_cnt = Lindval_loc_bc_offset[jb];
+ Lindval_loc_bc_offset[jb]=Lindval_loc_bc_cnt;
+ Lindval_loc_bc_cnt+=tmp_cnt;
+ }
+ } /* for jb ... */
+
/////////////////////////////////////////////////////////////////
/* Set up additional pointers for the index and value arrays of U.
@@ -1157,6 +1352,17 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
ABORT("Malloc fails for Ucb_indptr[]");
if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
ABORT("Malloc fails for Ucb_valptr[]");
+ if ( !(Ucb_valoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_valoffset[].");
+ }
+ Ucb_valoffset[nub-1] = -1;
+ if ( !(Ucb_indoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_indoffset[].");
+ }
+ Ucb_indoffset[nub-1] = -1;
+
nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
/* Count number of row blocks in a block column.
@@ -1179,10 +1385,19 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
for (lb = 0; lb < nub; ++lb) {
if ( Urbs[lb] ) { /* Not an empty block column. */
if ( !(Ucb_indptr[lb]
- = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+ = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
ABORT("Malloc fails for Ucb_indptr[lb][]");
+ Ucb_indoffset[lb]=Urbs[lb];
+ Ucb_indcnt += Ucb_indoffset[lb];
if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
ABORT("Malloc fails for Ucb_valptr[lb][]");
+ Ucb_valoffset[lb]=Urbs[lb];
+ Ucb_valcnt += Ucb_valoffset[lb];
+ }else{
+ Ucb_valptr[lb]=NULL;
+ Ucb_valoffset[lb]=-1;
+ Ucb_indptr[lb]=NULL;
+ Ucb_indoffset[lb]=-1;
}
}
for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
@@ -1227,6 +1442,81 @@ pzdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
}
}
+ Unzval_br_cnt +=1; // safe guard
+ Ufstnz_br_cnt +=1;
+ Ucb_valcnt +=1;
+ Ucb_indcnt +=1;
+ if ( !(Unzval_br_dat = /* single buffer of Unzval_br_cnt doublecomplex entries */
+ (doublecomplex*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(doublecomplex))) ) {
+ fprintf(stderr, "Malloc fails for Unzval_br_dat[].");
+ } /* the failure is only reported; execution continues past this check */
+ if ( !(Ufstnz_br_dat =
+ (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Ufstnz_br_dat[].");
+ }
+ if ( !(Ucb_valdat =
+ (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_valdat[].");
+ }
+ if ( !(Ucb_inddat =
+ (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_inddat[].");
+ }
+
+ /* use contiguous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr, Ucb_indptr */
+ k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+ Unzval_br_cnt=0;
+ Ufstnz_br_cnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block row ... */
+ if(Unzval_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Unzval_br_offset[lb]; ++jj) {
+ Unzval_br_dat[Unzval_br_cnt+jj]=Unzval_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Unzval_br_ptr[lb]);
+ Unzval_br_ptr[lb]=&Unzval_br_dat[Unzval_br_cnt];
+ tmp_cnt = Unzval_br_offset[lb];
+ Unzval_br_offset[lb]=Unzval_br_cnt;
+ Unzval_br_cnt+=tmp_cnt;
+ }
+
+ if(Ufstnz_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Ufstnz_br_offset[lb]; ++jj) {
+ Ufstnz_br_dat[Ufstnz_br_cnt+jj]=Ufstnz_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Ufstnz_br_ptr[lb]);
+ Ufstnz_br_ptr[lb]=&Ufstnz_br_dat[Ufstnz_br_cnt];
+ tmp_cnt = Ufstnz_br_offset[lb];
+ Ufstnz_br_offset[lb]=Ufstnz_br_cnt;
+ Ufstnz_br_cnt+=tmp_cnt;
+ }
+ }
+
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Ucb_valcnt=0; /* running write position in Ucb_valdat[] */
+ Ucb_indcnt=0; /* running write position in Ucb_inddat[] */
+ for (lb = 0; lb < k; ++lb) { /* for each local block column ... */
+ if(Ucb_valptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_valoffset[lb]; ++jj) { /* Ucb_valoffset[lb] still holds the piece length here */
+ Ucb_valdat[Ucb_valcnt+jj]=Ucb_valptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_valptr[lb]);
+ Ucb_valptr[lb]=&Ucb_valdat[Ucb_valcnt]; /* repoint at the copy inside Ucb_valdat[] */
+ tmp_cnt = Ucb_valoffset[lb]; /* save the length before it is overwritten */
+ Ucb_valoffset[lb]=Ucb_valcnt; /* from here on the entry is the start position in Ucb_valdat[] */
+ Ucb_valcnt+=tmp_cnt;
+ }
+ if(Ucb_indptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_indoffset[lb]; ++jj) { /* Ucb_indoffset[lb] still holds the piece length here */
+ Ucb_inddat[Ucb_indcnt+jj]=Ucb_indptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_indptr[lb]);
+ Ucb_indptr[lb]=&Ucb_inddat[Ucb_indcnt]; /* repoint at the copy inside Ucb_inddat[] */
+ tmp_cnt = Ucb_indoffset[lb]; /* save the length before it is overwritten */
+ Ucb_indoffset[lb]=Ucb_indcnt; /* from here on the entry is the start position in Ucb_inddat[] */
+ Ucb_indcnt+=tmp_cnt;
+ }
+ } /* for lb ... */
+
/////////////////////////////////////////////////////////////////
#if ( PROFlevel>=1 )
@@ -1729,10 +2019,8 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
}
/* Every process receives the count, but it is only useful on the
diagonal processes. */
- //MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);
-
k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(URtree_ptr = (C_Tree*)SUPERLU_MALLOC(k * sizeof(C_Tree))) )
ABORT("Malloc fails for URtree_ptr[].");
@@ -1919,12 +2207,31 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
////////////////////////////////////////////////////////
-
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+ Llu->Lrowind_bc_dat = Lrowind_bc_dat;
+ Llu->Lrowind_bc_offset = Lrowind_bc_offset;
+ Llu->Lrowind_bc_cnt = Lrowind_bc_cnt;
+
Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
+ Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat;
+ Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset;
+ Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt;
+
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+ Llu->Lnzval_bc_dat = Lnzval_bc_dat;
+ Llu->Lnzval_bc_offset = Lnzval_bc_offset;
+ Llu->Lnzval_bc_cnt = Lnzval_bc_cnt;
+
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+ Llu->Ufstnz_br_dat = Ufstnz_br_dat;
+ Llu->Ufstnz_br_offset = Ufstnz_br_offset;
+ Llu->Ufstnz_br_cnt = Ufstnz_br_cnt;
+
Llu->Unzval_br_ptr = Unzval_br_ptr;
+ Llu->Unzval_br_dat = Unzval_br_dat;
+ Llu->Unzval_br_offset = Unzval_br_offset;
+ Llu->Unzval_br_cnt = Unzval_br_cnt;
+
Llu->Unnz = Unnz;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
@@ -1944,11 +2251,74 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
Llu->LBtree_ptr = LBtree_ptr;
Llu->URtree_ptr = URtree_ptr;
Llu->UBtree_ptr = UBtree_ptr;
+
Llu->Linv_bc_ptr = Linv_bc_ptr;
+ Llu->Linv_bc_dat = Linv_bc_dat;
+ Llu->Linv_bc_offset = Linv_bc_offset;
+ Llu->Linv_bc_cnt = Linv_bc_cnt;
+
Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+ Llu->Uinv_bc_dat = Uinv_bc_dat;
+ Llu->Uinv_bc_offset = Uinv_bc_offset;
+ Llu->Uinv_bc_cnt = Uinv_bc_cnt;
+
Llu->Urbs = Urbs;
Llu->Ucb_indptr = Ucb_indptr;
+ Llu->Ucb_inddat = Ucb_inddat;
+ Llu->Ucb_indoffset = Ucb_indoffset;
+ Llu->Ucb_indcnt = Ucb_indcnt;
Llu->Ucb_valptr = Ucb_valptr;
+ Llu->Ucb_valdat = Ucb_valdat;
+ Llu->Ucb_valoffset = Ucb_valoffset;
+ Llu->Ucb_valcnt = Ucb_valcnt;
+
+
+#ifdef GPU_ACC
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_xsup, (n+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+
+ // some dummy allocation to avoid checking whether they are null pointers later
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(doublecomplex)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t)));
+
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+
+
+ /* gpuMemcpy for the following is performed in pxgssvx */
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(doublecomplex)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(doublecomplex)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(doublecomplex)));
+
+#endif
#if ( PRNTlevel>=1 )
diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c
index dbea442e..390c9709 100644
--- a/SRC/pzgssvx.c
+++ b/SRC/pzgssvx.c
@@ -720,13 +720,9 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
if ( iinfo > 0 ) {
if ( iinfo <= m ) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "The %d-th row of A is exactly zero\n", (int)iinfo);
-#endif
} else {
-#if ( PRNTlevel>=1 )
- fprintf(stderr, "The %d-th column of A is exactly zero\n", (int)iinfo-n);
-#endif
+ fprintf(stderr, "The %d-th column of A is exactly zero\n", (int)(iinfo-n));
}
} else if ( iinfo < 0 ) return;
@@ -1001,7 +997,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
printf("{" IFMT "," IFMT "}: pzgssvx: invalid ColPerm option when ParSymbfact is used\n",
MYROW(grid->iam, grid), MYCOL(grid->iam, grid));
}
- }
+ } /* end preparing for parallel symbolic */
if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
/* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */
@@ -1021,9 +1017,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
// }
// }
if (flinfo > 0) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n");
-#endif
*info = flinfo;
return;
}
@@ -1073,10 +1067,11 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
ABORT("Malloc fails for Glu_freeable.");
- /* Every process does this. */
+ /* Every process does this.
+ The returned value (-iinfo) is the size of lsub[], including the pruned graph. */
iinfo = symbfact(options, iam, &GAC, perm_c, etree,
Glu_persist, Glu_freeable);
- nnzLU = Glu_freeable->nnzLU;
+ nnzLU = Glu_freeable->nnzLU;
stat->utime[SYMBFAC] = SuperLU_timer_() - t;
if ( iinfo <= 0 ) { /* Successful return */
QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
@@ -1096,10 +1091,8 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
}
#endif
} else { /* symbfact out of memory */
-#if ( PRNTlevel>=1 )
if ( !iam )
fprintf(stderr,"symbfact() error returns " IFMT "\n",iinfo);
-#endif
*info = iinfo;
return;
}
@@ -1114,9 +1107,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
nnzLU = Pslu_freeable.nnzLU;
stat->utime[SYMBFAC] = SuperLU_timer_() - t;
if (flinfo > 0) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "Insufficient memory for parallel symbolic factorization.");
-#endif
*info = flinfo;
return;
}
@@ -1164,6 +1155,8 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
t = SuperLU_timer_();
dist_mem_use = zdist_psymbtonum(options, n, A, ScalePermstruct,
&Pslu_freeable, LUstruct, grid);
+
+ /* dist_mem_use = memDist + memNLU */
if (dist_mem_use > 0)
ABORT ("Not enough memory available for dist_psymbtonum\n");
@@ -1286,28 +1279,62 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
MPI_SUM, 0, grid->comm );
stat->TinyPivots = TinyPivots;
+ if ( iam==0 ) {
+ printf("\n** Memory Usage **********************************\n");
+ }
+
+ /* Compute numerical factorization memory */
+ zQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
+
/*-- Compute high watermark of all stages --*/
if (parSymbFact == TRUE) {
/* The memory used in the redistribution routine
includes the memory used for storing the symbolic
structure and the memory allocated for numerical
factorization */
- mem_stage[0] = (-flinfo); /* symbfact step */
- mem_stage[1] = (-dist_mem_use); /* distribution step */
+ /* parallel symbfact step:
+ (-flinfo) is the allocMem returned from symbfact_dist() */
+ mem_stage[0] = symb_mem_usage.total + (-flinfo);
+
+ /* see leading comment of dist_symbLU() */
+ /* dist_mem_use = (memDist + memNLU) in zdist_psymbtonum() */
+ mem_stage[1] = symb_mem_usage.for_lu + (-dist_mem_use); /* distribution step */
loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1] );
if ( options->RowPerm != NO )
loc_max = SUPERLU_MAX(loc_max, GA_mem_use);
- } else {
+
+#if ( PRNTlevel>=1 )
+ if ( iam==0 ) {
+ printf("\t(P0) Globle A for MC64: GA_mem_use %.2f\n", GA_mem_use*1e-6);
+ printf("\t(P0) parallel symbolic::stage[0]: symb_memory %.2f, allocMem %.2f\n",
+ symb_mem_usage.total*1e-6, (-flinfo)*1e-6);
+ printf("\t(P0) parallel distribution::stage[1]: symb_LU %.2f, dist_mem_use %.2f\n",
+ symb_mem_usage.for_lu*1e-6, (-dist_mem_use)*1e-6);
+ fflush(stdout);
+
+ }
+#endif
+ } else { /* Serial symbolic. GA_mem_use is for global A */
mem_stage[0] = symb_mem_usage.total + GA_mem_use; /* symbfact step */
mem_stage[1] = symb_mem_usage.for_lu
+ dist_mem_use
+ num_mem_usage.for_lu; /* distribution step */
loc_max = SUPERLU_MAX( mem_stage[0], mem_stage[1] );
+#if ( PRNTlevel>=1 )
+ if ( iam==0 ) {
+ printf("\t(P0) serial symbolic::stage[0]: symb_memory %.2f, GA_mem_use %.2f\n",
+ symb_mem_usage.total*1e-6, GA_mem_use*1e-6);
+ printf("\t(P0) serial distribution::stage[1]:"
+ "symb_LU %.2f, dist_mem_use %.2f, num_mem_usage.for_lu %.2f\n",
+ symb_mem_usage.for_lu*1e-6, dist_mem_use*1e-6,
+ num_mem_usage.for_lu*1e-6);
+ fflush(stdout);
+
+ }
+#endif
}
- zQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage);
mem_stage[2] = num_mem_usage.total; /* numerical factorization step */
-
loc_max = SUPERLU_MAX( loc_max, mem_stage[2] ); /* local max of 3 stages */
local_struct.val = loc_max;
@@ -1335,7 +1362,6 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
float buffer_peak = global_struct.val*1e-6;
if ( iam==0 ) {
- printf("\n** Memory Usage **********************************\n");
printf("** Total highmark (MB):\n"
" Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n",
avg * 1e-6,
@@ -1362,7 +1388,6 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
} /* end if (!factored) */
-
if ( options->Fact == DOFACT || options->Fact == SamePattern ) {
/* Need to reset the solve's communication pattern,
because perm_r[] and/or perm_c[] is changed. */
@@ -1453,19 +1478,20 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
if ( options->DiagInv==YES && (Fact != FACTORED) ) {
pzCompute_Diag_Inv(n, LUstruct, grid, stat, info);
+
#ifdef GPU_ACC
+
+ pzconvertU(options, grid, LUstruct, stat, n);
+
checkGPU(gpuMemcpy(LUstruct->Llu->d_Linv_bc_dat, LUstruct->Llu->Linv_bc_dat,
(LUstruct->Llu->Linv_bc_cnt) * sizeof(doublecomplex), gpuMemcpyHostToDevice));
checkGPU(gpuMemcpy(LUstruct->Llu->d_Uinv_bc_dat, LUstruct->Llu->Uinv_bc_dat,
(LUstruct->Llu->Uinv_bc_cnt) * sizeof(doublecomplex), gpuMemcpyHostToDevice));
checkGPU(gpuMemcpy(LUstruct->Llu->d_Lnzval_bc_dat, LUstruct->Llu->Lnzval_bc_dat,
(LUstruct->Llu->Lnzval_bc_cnt) * sizeof(doublecomplex), gpuMemcpyHostToDevice));
- //checkGPU(gpuMemcpy(LUstruct->Llu->d_Unzval_br_dat, LUstruct->Llu->Unzval_br_dat,
- // (LUstruct->Llu->Unzval_br_cnt) * sizeof(doublecomplex), gpuMemcpyHostToDevice));
#endif
}
-
// #pragma omp parallel
// {
// #pragma omp master
@@ -1634,4 +1660,289 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
CHECK_MALLOC(iam, "Exit pzgssvx()");
#endif
+} /* pzgssvx */
+
+#ifdef GPU_ACC
+void
+pzconvertU(superlu_dist_options_t *options, gridinfo_t *grid,
+ zLUstruct_t *LUstruct, SuperLUStat_t *stat, int n)
+{
+int64_t nnz_ind,nnz_offset;
+int64_t nnz_val;
+Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+int_t nsupers,nsupers_j,ncol,ncol_loc,nrow;
+int_t lk,ik,ub,nub,i,il,gik,k,uptr,jj,ii,fnz,irow,jb;
+zLocalLU_t *Llu = LUstruct->Llu;
+int_t *Urbs = Llu->Urbs;
+int_t **Ucb_valptr = Llu->Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+int_t knsupc,iknsupc,ikfrow,iklrow;
+int_t *xsup = Glu_persist->xsup;;
+
+int iam = grid->iam;
+int mycol = MYCOL (iam, grid);
+int myrow = MYROW (iam, grid);
+
+int_t *usub;
+doublecomplex *uval;
+
+int64_t Ucolind_bc_cnt=0;
+int64_t Unzval_bc_cnt=0, Unzval_br_cnt=0;
+int64_t Uindval_loc_bc_cnt=0;
+
+int_t next_lind; /* next available position in index[*] */
+int_t next_lval; /* next available position in nzval[*] */
+
+nsupers = Glu_persist->supno[n-1] + 1;
+nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
+
+if ( !(Llu->Ucolind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) )
+ ABORT("Malloc fails for Llu->Ucolind_bc_ptr[].");
+Llu->Ucolind_bc_ptr[nsupers_j-1] = NULL;
+
+if ( !(Llu->Unzval_bc_ptr =
+ (doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) )
+ ABORT("Malloc fails for Llu->Unzval_bc_ptr[].");
+Llu->Unzval_bc_ptr[nsupers_j-1] = NULL;
+
+if ( !(Llu->Uindval_loc_bc_ptr =
+ (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) )
+ ABORT("Malloc fails for Llu->Uindval_loc_bc_ptr[].");
+Llu->Uindval_loc_bc_ptr[nsupers_j-1] = NULL;
+
+if ( !(Llu->Uindval_loc_bc_offset =
+ (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Uindval_loc_bc_offset[].");
}
+Llu->Uindval_loc_bc_offset[nsupers_j-1] = -1;
+
+if ( !(Llu->Ucolind_bc_offset =
+ (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Ucolind_bc_offset[].");
+}
+Llu->Ucolind_bc_offset[nsupers_j-1] = -1;
+
+if ( !(Llu->Unzval_bc_offset = /* one int64_t entry per local block column (nsupers_j) */
+ (int64_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int64_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Unzval_bc_offset[].");
+} /* the failure is only reported; execution continues past this check */
+Llu->Unzval_bc_offset[nsupers_j-1] = -1;
+
+for (lk=0;lknpcol + mycol;/* Global block number, col-wise. */
+ knsupc = SuperSize( k );
+ nub = Urbs[lk]; /* Number of U blocks in block column lk */
+
+ if(nub>0){
+ // First pass count sizes of Llu->Ucolind_bc_ptr[lk] and Llu->Unzval_bc_ptr[lk]
+ nnz_ind=0;
+ nnz_val=0;
+ nnz_ind+=BC_HEADER_NEWU;
+ nrow=0;
+ for (ub = 0; ub < nub; ++ub) {
+ ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
+ usub = Llu->Ufstnz_br_ptr[ik];
+ uval = Llu->Unzval_br_ptr[ik];
+ i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */
+ i += UB_DESCRIPTOR;
+ gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
+ iknsupc = SuperSize( gik );
+ nrow += iknsupc;
+ ikfrow = FstBlockC( gik );
+ iklrow = FstBlockC( gik+1 );
+ uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */
+
+ nnz_ind+=UB_DESCRIPTOR_NEWU;
+
+ for (jj = 0; jj < knsupc; ++jj) {
+ fnz = usub[i + jj];
+ if ( fnz < iklrow ) { /* Nonzero segment. */
+ nnz_val+=iknsupc;
+ nnz_ind+=1;
+ Unzval_br_cnt+=iklrow - fnz;
+ // for (irow = fnz; irow < iklrow; ++irow)
+ // dest[irow - ikfrow] -= uval[uptr++] * y[jj];
+ // stat->ops[SOLVE] += 2 * (iklrow - fnz);
+ }
+ } /* for jj ... */
+ } /* for ub ... */
+
+ // Second pass fills Llu->Ucolind_bc_ptr[lk] and Llu->Unzval_bc_ptr[lk]
+ if ( !(Llu->Ucolind_bc_ptr[lk] = intMalloc_dist(nnz_ind+nrow*2)) )
+ ABORT("Malloc fails for Llu->Ucolind_bc_ptr[lk]");
+ Llu->Ucolind_bc_offset[lk]=nnz_ind+nrow*2;
+ Ucolind_bc_cnt += Llu->Ucolind_bc_offset[lk];
+
+ if (!(Llu->Unzval_bc_ptr[lk]=doublecomplexCalloc_dist(nnz_val)))
+ ABORT("Calloc fails for Llu->Unzval_bc_ptr[lk].");
+ Llu->Unzval_bc_offset[lk]=nnz_val;
+ Unzval_bc_cnt += Llu->Unzval_bc_offset[lk];
+
+ if ( !(Llu->Uindval_loc_bc_ptr[lk] = intCalloc_dist(nub*3)) )
+ ABORT("Malloc fails for Llu->Uindval_loc_bc_ptr[lk][]");
+ Llu->Uindval_loc_bc_offset[lk]=nub*3;
+ Uindval_loc_bc_cnt += Llu->Uindval_loc_bc_offset[lk];
+
+ Llu->Ucolind_bc_ptr[lk][0]=nub;
+ Llu->Ucolind_bc_ptr[lk][1]=nrow;
+ Llu->Ucolind_bc_ptr[lk][2]=nnz_ind;
+ nnz_offset=nnz_ind;
+
+ nnz_ind=0;
+ nnz_val=0;
+ ncol=0;
+ nnz_ind+=BC_HEADER_NEWU;
+ nrow=0;
+ for (ub = 0; ub < nub; ++ub) {
+ ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */
+ usub = Llu->Ufstnz_br_ptr[ik];
+ uval = Llu->Unzval_br_ptr[ik];
+ i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */
+ i += UB_DESCRIPTOR;
+ gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */
+ iknsupc = SuperSize( gik );
+ ikfrow = FstBlockC( gik );
+ iklrow = FstBlockC( gik+1 );
+ uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */
+
+ for(ii=0; iiUcolind_bc_ptr[lk][nnz_offset+nrow*2] = ub;
+ Llu->Ucolind_bc_ptr[lk][nnz_offset+nrow*2+1] = ii;
+ nrow++;
+ }
+
+ ncol_loc=0;
+ for (jj = 0; jj < knsupc; ++jj) {
+ fnz = usub[i + jj];
+ if ( fnz < iklrow ) { /* Nonzero segment. */
+ Llu->Ucolind_bc_ptr[lk][nnz_ind+ncol_loc+UB_DESCRIPTOR_NEWU]=FstBlockC(k)+jj; /* Global column number */
+ ncol_loc++;
+ for (irow = fnz; irow < iklrow; ++irow){
+ Llu->Unzval_bc_ptr[lk][nnz_val+irow - ikfrow]=uval[uptr++];
+ // if(lk==2){
+ // printf("uval %5d %5d %5d %f %5d %5d \n",gik, uptr-1, irow - ikfrow, uval[uptr-1], Ucb_valptr[lk][ub],ub);
+ // // printf("Unzval_bc_ptr %5d %f\n",gik, Llu->Unzval_bc_ptr[lk][nnz_val+irow - ikfrow]);
+ // }
+ }
+ nnz_val+=iknsupc;
+ }
+ } /* for jj ... */
+ Llu->Ucolind_bc_ptr[lk][nnz_ind]=gik;
+ Llu->Ucolind_bc_ptr[lk][nnz_ind+1]=ncol_loc;
+
+ Llu->Uindval_loc_bc_ptr[lk][ub] = ik;
+ Llu->Uindval_loc_bc_ptr[lk][ub+nub] = nnz_ind;
+ Llu->Uindval_loc_bc_ptr[lk][ub+nub*2] = ncol;
+ // if(lk==69)
+ // printf("ub ncol_loc %5d %5d \n",ub, ncol_loc);
+ ncol+=ncol_loc*iknsupc;
+ nnz_ind+=ncol_loc+UB_DESCRIPTOR_NEWU;
+ } /* for ub ... */
+
+ }else{ /* nub <= 0 */
+ Llu->Ucolind_bc_ptr[lk] = NULL;
+ Llu->Unzval_bc_ptr[lk] = NULL;
+ Llu->Ucolind_bc_offset[lk]=-1;
+ Llu->Unzval_bc_offset[lk]=-1;
+ Llu->Uindval_loc_bc_ptr[lk] = NULL;
+ Llu->Uindval_loc_bc_offset[lk]=-1;
+ }
+} /* end for lk ... */
+
+ // safe guard
+ Ucolind_bc_cnt +=1;
+ Unzval_bc_cnt +=1;
+ Uindval_loc_bc_cnt +=1;
+ if ( !(Llu->Ucolind_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Ucolind_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Ucolind_bc_dat[].");
+ }
+ if ( !(Llu->Unzval_bc_dat =
+ (doublecomplex*)SUPERLU_MALLOC(Unzval_bc_cnt * sizeof(doublecomplex))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Unzval_bc_dat[].");
+ }
+ if ( !(Llu->Uindval_loc_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Uindval_loc_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Llu->Uindval_loc_bc_dat[].");
+ }
+
+ /* use contiguous memory for Ucolind_bc_ptr, Unzval_bc_ptr, Uindval_loc_bc_ptr*/
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Ucolind_bc_cnt=0;
+ Unzval_bc_cnt=0;
+ Uindval_loc_bc_cnt=0;
+ int64_t tmp_cnt;
+
+ for (jb = 0; jb < k; ++jb) { /* for each block column ... */
+ if(Llu->Ucolind_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Llu->Ucolind_bc_offset[jb]; ++jj) {
+ Llu->Ucolind_bc_dat[Ucolind_bc_cnt+jj]=Llu->Ucolind_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Llu->Ucolind_bc_ptr[jb]);
+ Llu->Ucolind_bc_ptr[jb]=&Llu->Ucolind_bc_dat[Ucolind_bc_cnt];
+ tmp_cnt = Llu->Ucolind_bc_offset[jb];
+ Llu->Ucolind_bc_offset[jb]=Ucolind_bc_cnt;
+ Ucolind_bc_cnt+=tmp_cnt;
+ }
+
+ if(Llu->Unzval_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Llu->Unzval_bc_offset[jb]; ++jj) {
+ Llu->Unzval_bc_dat[Unzval_bc_cnt+jj]=Llu->Unzval_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Llu->Unzval_bc_ptr[jb]);
+ Llu->Unzval_bc_ptr[jb]=&Llu->Unzval_bc_dat[Unzval_bc_cnt];
+ tmp_cnt = Llu->Unzval_bc_offset[jb];
+ Llu->Unzval_bc_offset[jb]=Unzval_bc_cnt;
+ Unzval_bc_cnt+=tmp_cnt;
+ }
+
+ if(Llu->Uindval_loc_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Llu->Uindval_loc_bc_offset[jb]; ++jj) {
+ Llu->Uindval_loc_bc_dat[Uindval_loc_bc_cnt+jj]=Llu->Uindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Llu->Uindval_loc_bc_ptr[jb]);
+ Llu->Uindval_loc_bc_ptr[jb]=&Llu->Uindval_loc_bc_dat[Uindval_loc_bc_cnt];
+ tmp_cnt = Llu->Uindval_loc_bc_offset[jb];
+ Llu->Uindval_loc_bc_offset[jb]=Uindval_loc_bc_cnt;
+ Uindval_loc_bc_cnt+=tmp_cnt;
+ }
+
+ } /* end for jb ... */
+
+ Llu->Ucolind_bc_cnt = Ucolind_bc_cnt;
+ Llu->Unzval_bc_cnt = Unzval_bc_cnt;
+ Llu->Uindval_loc_bc_cnt = Uindval_loc_bc_cnt;
+ // printf("Ucolind_bc_cnt %10d\n",Ucolind_bc_cnt);
+ //printf("Unzval_bc_cnt %10ld v.s. Unzval_br_cnt %10ld\n",Unzval_bc_cnt,Unzval_br_cnt);
+ // printf("Llu->Ucolind_bc_offset %10d\n",Llu->Ucolind_bc_offset[0]);
+
+ checkGPU(gpuFree(Llu->d_Ucolind_bc_dat));
+ checkGPU(gpuFree(Llu->d_Ucolind_bc_offset));
+ checkGPU(gpuFree(Llu->d_Unzval_bc_dat));
+ checkGPU(gpuFree(Llu->d_Unzval_bc_offset));
+ checkGPU(gpuFree(Llu->d_Uindval_loc_bc_dat));
+ checkGPU(gpuFree(Llu->d_Uindval_loc_bc_offset));
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, (Llu->Ucolind_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Ucolind_bc_dat, Llu->Ucolind_bc_dat, (Llu->Ucolind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t)));
+ checkGPU(gpuMemcpy(Llu->d_Ucolind_bc_offset, Llu->Ucolind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t)));
+ checkGPU(gpuMemcpy(Llu->d_Unzval_bc_offset, Llu->Unzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, (Llu->Unzval_bc_cnt) * sizeof(doublecomplex)));
+ checkGPU(gpuMemcpy(LUstruct->Llu->d_Unzval_bc_dat, LUstruct->Llu->Unzval_bc_dat,(LUstruct->Llu->Unzval_bc_cnt) * sizeof(doublecomplex), gpuMemcpyHostToDevice));
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, (Llu->Uindval_loc_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Uindval_loc_bc_dat, Llu->Uindval_loc_bc_dat, (Llu->Uindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t)));
+ checkGPU(gpuMemcpy(Llu->d_Uindval_loc_bc_offset, Llu->Uindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(int64_t), gpuMemcpyHostToDevice));
+
+ SUPERLU_FREE (Llu->Ucolind_bc_dat);
+ SUPERLU_FREE (Llu->Ucolind_bc_offset);
+ SUPERLU_FREE (Llu->Unzval_bc_dat);
+ SUPERLU_FREE (Llu->Unzval_bc_offset);
+ SUPERLU_FREE (Llu->Uindval_loc_bc_dat);
+ SUPERLU_FREE (Llu->Uindval_loc_bc_offset);
+
+} /* pzconvertU */
+#endif /* ifdef GPU_ACC */
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index 649a9c7c..0f8c5aa7 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -758,13 +758,9 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
if ( iinfo > 0 ) {
if ( iinfo <= m ) {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo);
-#endif
} else {
-#if ( PRNTlevel>=1 )
fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n);
-#endif
}
} else if ( iinfo < 0 ) return;
@@ -1144,7 +1140,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
if (!iam) {
fprintf (stderr, "symbfact() error returns %d\n",
(int) iinfo);
- exit (-1);
+ return;
}
}
diff --git a/SRC/pzgstrs.c b/SRC/pzgstrs.c
index d9452655..e37800c9 100644
--- a/SRC/pzgstrs.c
+++ b/SRC/pzgstrs.c
@@ -209,9 +209,9 @@ pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb,
#endif
{
// t = SuperLU_timer_();
-#ifdef _OPENMP
-#pragma omp taskloop private (i,l,irow,k,j,knsupc) untied
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (i,l,irow,k,j,knsupc) untied
+//#endif
for (i = 0; i < m_loc; ++i) {
irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*B */
@@ -466,9 +466,9 @@ pzReDistribute_X_to_B(int_t n, doublecomplex *B, int_t m_loc, int_t ldb, int_t f
#endif
{
// t = SuperLU_timer_();
-#ifdef _OPENMP
-#pragma omp taskloop private (k,knsupc,lk,irow,l,i,j) untied
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (k,knsupc,lk,irow,l,i,j) untied
+//#endif
for (k = 0; k < nsupers; k++) {
knsupc = SuperSize( k );
lk = LBi( k, grid ); /* Local block number */
@@ -1415,9 +1415,9 @@ if(procs==1){
#endif
{
-#ifdef _OPENMP
-#pragma omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup
+//#endif
for (jj=0;jj=1 )
@@ -2029,9 +2029,9 @@ if(procs==1){
#pragma omp master
#endif
{
-#ifdef _OPENMP
-#pragma omp taskloop private (ii,jj,k,lk,thread_id) nogroup
-#endif
+//#ifdef _OPENMP
+//#pragma omp taskloop private (ii,jj,k,lk,thread_id) nogroup
+//#endif
for (jj=0;jjsupno
* and Glu_persist->xsup.
*
@@ -52,7 +52,7 @@ at the top-level directory.
* Glu_persist->supno, Glu_persist->xsup.
*
* This routine also deallocates memory allocated during symbolic
- * factorization routine. That is, the folloing arrays are freed:
+ * factorization routine. That is, the following arrays are free'd:
* Pslu_freeable->xlsub, Pslu_freeable->lsub,
* Pslu_freeable->xusub, Pslu_freeable->usub,
* Pslu_freeable->globToLoc, Pslu_freeable->supno_loc,
@@ -69,7 +69,7 @@ at the top-level directory.
* Order of the input matrix
*
* Pslu_freeable (Input) Pslu_freeable_t *
- * Local L and U structure,
+ * Local L and U structure: lsub[] / usub[]. They are free'd after distribution.
* global to local indexing information.
*
* Glu_persist (Output) Glu_persist_t *
@@ -110,15 +110,16 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
{
int iam, nprocs, pc, pr, p, np, p_diag;
int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u,
- *tmp_ptrToSend, *mem;
+ *tmp_ptrToSend, *mem; // temp memory
int_t *nnzToRecv_l, *nnzToRecv_u;
int_t *send_1, *send_2, nsend_1, nsend_2;
- int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind;
+ int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; // temp memory
int_t nsupers, nsupers_i, nsupers_j;
int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc;
int_t maxszsn, maxNvtcsPProc;
int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s;
- int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s;
+ int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; /* computed from symbfact_dist(),
+ free'd in this routine after distribution */
int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n;
int_t *xsub_s, *sub_s, *xsub_n, *sub_n;
int_t *globToLoc, nvtcs_loc;
@@ -126,8 +127,8 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
RecvCnt_l, RecvCnt_u, ind_loc;
int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc;
int_t nelts, isize;
- float memAux; /* Memory used during this routine and freed on return */
- float memRet; /* Memory allocated and not freed on return */
+ float memAux; /* Memory used during this routine and free'd before return */
+ float memRet; /* Memory allocated and not free'd on return */
int_t iword, dword;
/* ------------------------------------------------------------
@@ -559,6 +560,13 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
else
nnzToRecv[iam] = nnz_loc_u;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.dist_symbLU [1] memAux %.2f, memRet %.2f (MB)\n", memAux*1e-6, memRet*1e-6);
+ fflush(stdout);
+ }
+#endif
+
/* ------------------------------------------------------------
DEALLOCATE TEMPORARY STORAGE.
-------------------------------------------------------------*/
@@ -647,7 +655,16 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
}
else
sendU = FALSE;
- }
+
+ /* Sherry: this loop goes around twice ? */
+
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.dist_symbLU [2] end while: memAux %.4f\t memRet %.4f (MB)\n", memAux*1e-6, memRet*1e-6);
+ fflush(stdout);
+ }
+#endif
+ } /* end while sendL || sendU */
/* deallocate memory allocated during symbolic factorization routine */
if (rcv_luind != NULL) {
@@ -675,6 +692,14 @@ dist_symbLU (superlu_dist_options_t *options, int_t n,
*p_xlsub = xlsub_n; *p_lsub = lsub_n;
*p_xusub = xusub_n; *p_usub = usub_n;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.dist_symbLU [3] before return: memAux %.4f\t memRet %.4f (MB)\n", memAux*1e-6, memRet*1e-6);
+ fflush(stdout);
+ }
+#endif
+
+ /* It is confirmed that memAux is 0 now */
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit dist_symbLU()");
#endif
@@ -776,8 +801,8 @@ zdist_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
MPI_Status status;
int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */
int_t *supno = Glu_persist->supno;
- float memAux; /* Memory used during this routine and freed on return */
- float memRet; /* Memory allocated and not freed on return */
+ float memAux; /* Memory used during this routine and free'd before return */
+ float memRet; /* Memory allocated and not free'd on return */
int_t iword, dword, szbuf;
/* ------------------------------------------------------------
@@ -1137,7 +1162,7 @@ zdist_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
#endif
return (-memRet);
-} /* dist_A */
+} /* zdist_A */
/*! \brief
*
@@ -1225,27 +1250,52 @@ zdist_psymbtonum(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
doublecomplex *lusup, *uval; /* nonzero values in L and U */
int *recvBuf; // 1/16/22 Sherry changed to int, was: int_t *recvBuf;
int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend;
- doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */
+ long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
+
doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */
+ long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */
+ doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */
+ long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
+
int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lrowind_bc_dat; /* size: sum of sizes of Lrowind_bc_ptr[lk]) */
+ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *index_srt; /* indices consist of headers and row subscripts */
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+
+ int_t *index_srt; /* indices consist of headers and row subscripts */
doublecomplex *lusup_srt; /* nonzero values in L and U */
doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
+ doublecomplex *Unzval_br_dat; /* size: sum of sizes of Unzval_br_ptr[lk]) */
+ long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Unzval_br_cnt=0;
+
int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */
+ long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Ufstnz_br_cnt=0;
int_t *Unnz; /* size ceil(NSUPERS/Pc) */
- C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
int msgsize;
- int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
+ int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+ Ucb_indptr_t *Ucb_inddat;
+ long int *Ucb_indoffset;
+ long int Ucb_indcnt=0;
int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
-
+ int_t *Ucb_valdat;
+ long int *Ucb_valoffset;
+ long int Ucb_valcnt=0;
/*-- Counts to be used in factorization. --*/
int *ToRecv, *ToSendD, **ToSendR;
@@ -1263,10 +1313,11 @@ zdist_psymbtonum(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int nbrecvx = 0; /* Number of Xk I will receive. */
int nbsendx = 0; /* Number of Xk I will send */
int_t *ilsum; /* starting position of each supernode in
- the full array (local) */
+ the full array (local, blockwise) */
int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in
the full array (local, block column wise) */
- /*-- Auxiliary arrays; freed on return --*/
+ /*-- Auxiliary arrays; free'd on return --*/
+ // Sherry check
int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */
int_t *LUb_length; /* L,U block length; size nsupers_ij */
int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */
@@ -1296,15 +1347,21 @@ doublecomplex *dense, *dense_col; /* SPA */
int_t lptr1_tmp, idx_i, idx_v,m, uu;
int_t nub;
- float memStrLU, memA,
+ /* counting memory */
+ float memA, /* memory used by zdist_A: distributing A values. */
+ memStrLU, /* memory used by dist_symbLU: distributing symbolic LU */
memDist = 0.; /* memory used for redistributing the data, which does
not include the memory for the numerical values
- of L and U (positive number)*/
+ of L and U (positive number).
+ It includes memA and memStrLU.
+ */
float memNLU = 0.; /* memory allocated for storing the numerical values of
L and U, that will be used in the numeric
- factorization (positive number) */
- float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/
-
+ factorization (positive number).
+ It also contains dense-SPA[] array */
+ float memTRS = 0.; /* memory allocated for storing the meta-data for
+ triangular solve (positive number)*/
+
#if ( PRNTlevel>=1 )
int_t nLblocks = 0, nUblocks = 0;
#endif
@@ -1316,7 +1373,7 @@ doublecomplex *dense, *dense_col; /* SPA */
/* Initialization. */
iam = grid->iam;
#if ( DEBUGlevel>=1 )
- CHECK_MALLOC(iam, "Enter dist_psymbtonum()");
+ CHECK_MALLOC(iam, "Enter zdist_psymbtonum()");
#endif
myrow = MYROW( iam, grid );
mycol = MYCOL( iam, grid );
@@ -1360,7 +1417,7 @@ doublecomplex *dense, *dense_col; /* SPA */
if ( myrow == PROW( gb, grid ) ) {
i = SuperSize( gb );
ldaspa += i;
- lb = LBi( gb, grid );
+ lb = LBi( gb, grid ); // local block number
ilsum[lb + 1] = ilsum[lb] + i;
}
ilsum[nsupers_i] = ldaspa;
@@ -1370,7 +1427,7 @@ doublecomplex *dense, *dense_col; /* SPA */
if (mycol == PCOL( gb, grid )) {
i = SuperSize( gb );
ldaspa_j += i;
- lb = LBj( gb, grid );
+ lb = LBj( gb, grid ); // local block number
ilsum_j[lb + 1] = ilsum_j[lb] + i;
}
ilsum_j[nsupers_j] = ldaspa_j;
@@ -1413,7 +1470,7 @@ doublecomplex *dense, *dense_col; /* SPA */
for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
/* Auxiliary arrays used to set up L and U block data structures.
- They are freed on return. */
+ They are free'd on return. */
if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) {
fprintf(stderr, "Calloc fails for LUb_length[].");
return (memDist + memNLU + memTRS);
@@ -1439,11 +1496,33 @@ doublecomplex *dense, *dense_col; /* SPA */
fprintf(stderr, "Malloc fails for Unzval_br_ptr[].");
return (memDist + memNLU + memTRS);
}
+
+ if ( !(Unzval_br_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_i * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Unzval_br_offset[].");
+ return (memDist + memNLU + memTRS);
+ }
+ Unzval_br_offset[nsupers_i-1] = -1;
+
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) {
fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[].");
return (memDist + memNLU + memTRS);
}
+ if ( !(Ufstnz_br_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_i * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ufstnz_br_offset[].");
+ return (memDist + memNLU + memTRS);
+ }
+ Ufstnz_br_offset[nsupers_i-1] = -1;
+ memTRS += 2 * nsupers_i * sizeof(long int);
+
memNLU += nsupers_i*sizeof(doublecomplex*) + nsupers_i*sizeof(int_t*);
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.zdist_psymbtonum [1] memDist %.4f, memNLU %.4f\n", memDist*1e-6, memNLU*1e-6);
+ }
+#endif
+
Unzval_br_ptr[nsupers_i-1] = NULL;
Ufstnz_br_ptr[nsupers_i-1] = NULL;
@@ -1465,7 +1544,7 @@ doublecomplex *dense, *dense_col; /* SPA */
memDist += (nsupers_i + nsupers_j)*iword;
/* Auxiliary arrays used to set up L, U block data structures.
- They are freed on return.
+ They are free'd on return.
k is the number of local row blocks. */
if ( !(dense = doublecomplexCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j)
* sp_ienv_dist(3, options))) ) {
@@ -1484,6 +1563,12 @@ doublecomplex *dense, *dense_col; /* SPA */
/* ------------------------------------------------ */
memNLU += 2*nsupers_i*iword +
SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3, options)*dword;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.zdist_psymbtonum [[2]] memDist %.2f, memNLU %.2f [+ dense SPA]\n", memDist*1e-6, memNLU*1e-6);
+ fflush(stdout);
+ }
+#endif
/* Pointers to the beginning of each block column of L. */
if ( !(Lnzval_bc_ptr =
@@ -1496,28 +1581,69 @@ doublecomplex *dense, *dense_col; /* SPA */
return (memDist + memNLU + memTRS);
}
+ if ( !(Lrowind_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lrowind_bc_offset[].");
+ }
+ Lrowind_bc_offset[nsupers_j-1] = -1;
+ if ( !(Lnzval_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_offset[].");
+ }
+
if ( !(Linv_bc_ptr =
(doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) {
fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
+ if ( !(Linv_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Linv_bc_offset[].");
+ }
+
if ( !(Uinv_bc_ptr =
(doublecomplex**)SUPERLU_MALLOC(nsupers_j * sizeof(doublecomplex*))) ) {
fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
+
+ if ( !(Uinv_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Uinv_bc_offset[].");
+ return (memDist + memNLU + memTRS);
+ }
+ Linv_bc_ptr[nsupers_j-1] = NULL;
+ Uinv_bc_ptr[nsupers_j-1] = NULL;
+ Linv_bc_offset[nsupers_j-1] = -1;
+ Uinv_bc_offset[nsupers_j-1] = -1;
+
+
if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){
fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[].");
return (memDist + memNLU + memTRS);
}
+ if ( !(Lindval_loc_bc_offset =
+ (long int*)SUPERLU_MALLOC(nsupers_j * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Lindval_loc_bc_offset[].");
+ }
if ( !(Unnz = (int_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int_t))) ){
fprintf(stderr, "Malloc fails for Unnz[].");
return (memDist + memNLU + memTRS);
}
- memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr
+
+ //account for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr, Uinv_bc_ptr, and the 5 long int *_bc_offset arrays (Lrowind, Lnzval, Linv, Uinv, Lindval_loc)
+ memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(doublecomplex) + nsupers_j*iword
+ + 5 * nsupers_j * sizeof(long int);
memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*)+ nsupers_j * sizeof(int_t*);
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.zdist_psymbtonum [[3]] memNLU %.2f, memTRS %.2f\n", memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
+
Lnzval_bc_ptr[nsupers_j-1] = NULL;
Lrowind_bc_ptr[nsupers_j-1] = NULL;
Linv_bc_ptr[nsupers_j-1] = NULL;
@@ -1550,11 +1676,23 @@ doublecomplex *dense, *dense_col; /* SPA */
bsendx_plist[i] = &index1[j];
/* -------------------------------------------------------------- */
memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword;
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.zdist_psymbtonum [[4]] memNLU %.2f, memTRS %.2f\n", memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
/*------------------------------------------------------------
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
+ long int Linv_bc_cnt=0;
+ long int Uinv_bc_cnt=0;
+ long int Lrowind_bc_cnt=0;
+ long int Lnzval_bc_cnt=0;
+ long int Lindval_loc_bc_cnt=0;
+
for (jb = 0; jb < nsupers; jb++) {
jbcol = PCOL( jb, grid );
jbrow = PROW( jb, grid );
@@ -1563,13 +1701,21 @@ doublecomplex *dense, *dense_col; /* SPA */
fsupc = FstBlockC( jb );
nsupc = SuperSize( jb );
+ /*------------------------------------------------
+ * SET UP U BLOCKS.
+ *------------------------------------------------*/
if ( myrow == jbrow ) { /* Block row jb in my process row */
+ Ufstnz_br_ptr[ljb_i] = NULL;
+ Unzval_br_ptr[ljb_i] = NULL;
+ Unzval_br_offset[ljb_i]=-1;
+ Ufstnz_br_offset[ljb_i]=-1;
+
/* Scatter A into SPA. */
for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) {
for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) {
if (i >= asup_rowptr[ilsum[nsupers_i]])
printf ("ERR7\n");
- jcol = asup_colind[i];
+ jcol = asup_colind[i]; // upper triangular part
if (jcol >= n)
printf ("Pe[%d] ERR distsn jb %d gb %d j %d jcol %d\n",
iam, (int) jb, (int) gb, (int) j, jcol);
@@ -1585,13 +1731,13 @@ doublecomplex *dense, *dense_col; /* SPA */
dense_col += ldaspa_j;
}
- /*------------------------------------------------
- * SET UP U BLOCKS.
- *------------------------------------------------*/
/* Count number of blocks and length of each block. */
nrbu = 0;
len = 0; /* Number of column subscripts I own. */
len1 = 0; /* number of fstnz subscripts */
+
+ /* ljb_i is the current local row block number in U.
+ Loop through every nonzero in this row block */
for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) {
if (i >= xusub[nsupers_i]) printf ("ERR10\n");
jcol = usub[i];
@@ -1608,7 +1754,7 @@ doublecomplex *dense, *dense_col; /* SPA */
pr = PROW( gb, grid );
if ( pr != jbrow && mycol == pc)
bsendx_plist[lb][jbrow] = YES;
- if (mycol == pc) {
+ if (mycol == pc) { /* I own this block */
len += nsupc;
LUb_length[lb] += nsupc;
ToSendD[ljb_i] = YES;
@@ -1652,12 +1798,26 @@ doublecomplex *dense, *dense_col; /* SPA */
return (memDist + memNLU + memTRS);
}
Ufstnz_br_ptr[ljb_i] = index;
+ Ufstnz_br_offset[ljb_i]=len1+1;
+ Ufstnz_br_cnt += Ufstnz_br_offset[ljb_i];
+
if (!(Unzval_br_ptr[ljb_i] =
doublecomplexMalloc_dist(len))) {
fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]");
return (memDist + memNLU + memTRS);
}
+ Unzval_br_offset[ljb_i]=len;
+ Unzval_br_cnt += Unzval_br_offset[ljb_i];
+
memNLU += (len1+1)*iword + len*dword;
+#if ( PRNTlevel>=1 )
+ if (iam==0 && (jb %10000 == 0) ) {
+ printf("\t.zdist_psymbtonum [jb %d setup-U] memNLU %.4f, memTRS %.4f\n",
+ (int) jb, memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
+
uval = Unzval_br_ptr[ljb_i];
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
@@ -1712,7 +1872,7 @@ doublecomplex *dense, *dense_col; /* SPA */
} else {
Ufstnz_br_ptr[ljb_i] = NULL;
Unzval_br_ptr[ljb_i] = NULL;
- } /* if nrbu ... */
+ } /* end if-else nrbu ... */
} /* if myrow == jbrow */
/*------------------------------------------------
@@ -1804,23 +1964,47 @@ doublecomplex *dense, *dense_col; /* SPA */
fprintf (stderr, "Malloc fails for index[]");
return (memDist + memNLU + memTRS);
}
+
+ Lrowind_bc_offset[ljb_j]=len1;
+ Lrowind_bc_cnt += Lrowind_bc_offset[ljb_j];
Lrowind_bc_ptr[ljb_j] = index;
+
if (!(Lnzval_bc_ptr[ljb_j] =
doublecomplexMalloc_dist(len*nsupc))) {
fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block %d\n", (int) jb);
return (memDist + memNLU + memTRS);
}
-
- if (!(Linv_bc_ptr[ljb_j] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
- ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]");
- if (!(Uinv_bc_ptr[ljb_j] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
- ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]");
+ Lnzval_bc_offset[ljb_j]=len*nsupc;
+ Lnzval_bc_cnt += Lnzval_bc_offset[ljb_j];
+
+ myrow = MYROW( iam, grid );
+ krow = PROW( jb, grid );
+ if(myrow==krow){ /* diagonal block */
+ if (!(Linv_bc_ptr[ljb_j] = (doublecomplex*)doublecomplexMalloc_dist(nsupc*nsupc)) )
+ ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]");
+ Linv_bc_offset[ljb_j]=nsupc*nsupc;
+ Linv_bc_cnt += Linv_bc_offset[ljb_j];
+ if (!(Uinv_bc_ptr[ljb_j] = (doublecomplex*)doublecomplexMalloc_dist(nsupc*nsupc)) )
+ ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]");
+ Uinv_bc_offset[ljb_j]=nsupc*nsupc;
+ Uinv_bc_cnt += Uinv_bc_offset[ljb_j];
+ }else{
+ Linv_bc_ptr[ljb_j] = NULL;
+ Linv_bc_offset[ljb_j] = -1;
+ Uinv_bc_ptr[ljb_j] = NULL;
+ Uinv_bc_offset[ljb_j] = -1;
+ }
memNLU += len1*iword + len*nsupc*dword;
if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3)))
ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]");
- memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]
+
+ //account for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb]
+ memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword;
+
+ Lindval_loc_bc_offset[ljb_j]=nrbl*3;
+ Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb_j];
lusup = Lnzval_bc_ptr[ljb_j];
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
@@ -1868,8 +2052,6 @@ doublecomplex *dense, *dense_col; /* SPA */
}
} /* for i ... */
-
-
/* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/
if(nrbl>1){
krow = PROW( jb, grid );
@@ -1883,7 +2065,6 @@ doublecomplex *dense, *dense_col; /* SPA */
quickSortM(lloc,0,uu,nrbl,0,3);
}
-
if ( !(index_srt = intMalloc_dist(len1)) )
ABORT("Malloc fails for index_srt[]");
if (!(lusup_srt = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex))))
@@ -1924,13 +2105,25 @@ doublecomplex *dense, *dense_col; /* SPA */
Lrowind_bc_ptr[ljb_j] = NULL;
Lnzval_bc_ptr[ljb_j] = NULL;
Linv_bc_ptr[ljb_j] = NULL;
+ Linv_bc_offset[ljb_j] = -1;
+ Lrowind_bc_offset[ljb_j]=-1;
+ Lindval_loc_bc_offset[ljb_j]=-1;
+ Lnzval_bc_offset[ljb_j]=-1;
Uinv_bc_ptr[ljb_j] = NULL;
+ Uinv_bc_offset[ljb_j] = -1;
Lindval_loc_bc_ptr[ljb_j] = NULL;
} /* if nrbl ... */
} /* if mycol == pc */
- } /* for jb ... */
+ } /* end for jb ... */
SUPERLU_FREE(ilsum_j);
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t.ddist_psymbtonum [[5]] memNLU %.2f, memTRS %.2f\n", memNLU*1e-6, memTRS*1e-6);
+ fflush(stdout);
+ }
+#endif
+
SUPERLU_FREE(Urb_marker);
SUPERLU_FREE(LUb_length);
SUPERLU_FREE(LUb_indptr);
@@ -2071,8 +2264,94 @@ doublecomplex *dense, *dense_col; /* SPA */
(*bsendx_plist)[k] = EMPTY;
}
}
- }
-
+ } /* end for jb ... */
+
+ Linv_bc_cnt +=1; // safe guard
+ Uinv_bc_cnt +=1;
+ Lrowind_bc_cnt +=1 ;
+ Lindval_loc_bc_cnt +=1;
+ Lnzval_bc_cnt +=1;
+ if ( !(Linv_bc_dat =
+ (doublecomplex*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(doublecomplex))) ) {
+ fprintf(stderr, "Malloc fails for Linv_bc_dat[].");
+ }
+ if ( !(Uinv_bc_dat =
+ (doublecomplex*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(doublecomplex))) ) {
+ fprintf(stderr, "Malloc fails for Uinv_bc_dat[].");
+ }
+ if ( !(Lrowind_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Lrowind_bc_dat[].");
+ }
+ if ( !(Lindval_loc_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Lindval_loc_bc_dat[].");
+ }
+ if ( !(Lnzval_bc_dat =
+ (doublecomplex*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(doublecomplex))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
+ }
+ /* use contiguous memory for Linv_bc_ptr, Uinv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr, Lindval_loc_bc_ptr */
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Linv_bc_cnt=0;
+ Uinv_bc_cnt=0;
+ Lrowind_bc_cnt=0;
+ Lnzval_bc_cnt=0;
+ Lindval_loc_bc_cnt=0;
+ long int tmp_cnt;
+ for (jb = 0; jb < k; ++jb) { /* for each block column ... */
+ if(Linv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) {
+ Linv_bc_dat[Linv_bc_cnt+jj]=Linv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Linv_bc_ptr[jb]);
+ Linv_bc_ptr[jb]=&Linv_bc_dat[Linv_bc_cnt];
+ tmp_cnt = Linv_bc_offset[jb];
+ Linv_bc_offset[jb]=Linv_bc_cnt;
+ Linv_bc_cnt+=tmp_cnt;
+ }
+ if(Uinv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) {
+ Uinv_bc_dat[Uinv_bc_cnt+jj]=Uinv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Uinv_bc_ptr[jb]);
+ Uinv_bc_ptr[jb]=&Uinv_bc_dat[Uinv_bc_cnt];
+ tmp_cnt = Uinv_bc_offset[jb];
+ Uinv_bc_offset[jb]=Uinv_bc_cnt;
+ Uinv_bc_cnt+=tmp_cnt;
+ }
+ if(Lrowind_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) {
+ Lrowind_bc_dat[Lrowind_bc_cnt+jj]=Lrowind_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lrowind_bc_ptr[jb]);
+ Lrowind_bc_ptr[jb]=&Lrowind_bc_dat[Lrowind_bc_cnt];
+ tmp_cnt = Lrowind_bc_offset[jb];
+ Lrowind_bc_offset[jb]=Lrowind_bc_cnt;
+ Lrowind_bc_cnt+=tmp_cnt;
+ }
+ if(Lnzval_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) {
+ Lnzval_bc_dat[Lnzval_bc_cnt+jj]=Lnzval_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lnzval_bc_ptr[jb]);
+ Lnzval_bc_ptr[jb]=&Lnzval_bc_dat[Lnzval_bc_cnt];
+ tmp_cnt = Lnzval_bc_offset[jb];
+ Lnzval_bc_offset[jb]=Lnzval_bc_cnt;
+ Lnzval_bc_cnt+=tmp_cnt;
+ }
+ if(Lindval_loc_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) {
+ Lindval_loc_bc_dat[Lindval_loc_bc_cnt+jj]=Lindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lindval_loc_bc_ptr[jb]);
+ Lindval_loc_bc_ptr[jb]=&Lindval_loc_bc_dat[Lindval_loc_bc_cnt];
+ tmp_cnt = Lindval_loc_bc_offset[jb];
+ Lindval_loc_bc_offset[jb]=Lindval_loc_bc_cnt;
+ Lindval_loc_bc_cnt+=tmp_cnt;
+ }
+ } /* end for jb ... */
+
/////////////////////////////////////////////////////////////////
/* Set up additional pointers for the index and value arrays of U.
@@ -2086,6 +2365,17 @@ doublecomplex *dense, *dense_col; /* SPA */
ABORT("Malloc fails for Ucb_indptr[]");
if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
ABORT("Malloc fails for Ucb_valptr[]");
+ if ( !(Ucb_valoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_valoffset[].");
+ }
+ Ucb_valoffset[nub-1] = -1;
+ if ( !(Ucb_indoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_indoffset[].");
+ }
+ Ucb_indoffset[nub-1] = -1;
+
nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
/* Count number of row blocks in a block column.
@@ -2107,11 +2397,21 @@ doublecomplex *dense, *dense_col; /* SPA */
One pass of the skeleton graph of U. */
for (lb = 0; lb < nub; ++lb) {
if ( Urbs[lb] ) { /* Not an empty block column. */
- if ( !(Ucb_indptr[lb]
- = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+ if ( !(Ucb_indptr[lb]
+ = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
ABORT("Malloc fails for Ucb_indptr[lb][]");
- if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
+ Ucb_indoffset[lb]=Urbs[lb];
+ Ucb_indcnt += Ucb_indoffset[lb];
+
+ if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
ABORT("Malloc fails for Ucb_valptr[lb][]");
+ Ucb_valoffset[lb]=Urbs[lb];
+ Ucb_valcnt += Ucb_valoffset[lb];
+ }else{
+ Ucb_valptr[lb]=NULL;
+ Ucb_valoffset[lb]=-1;
+ Ucb_indptr[lb]=NULL;
+ Ucb_indoffset[lb]=-1;
}
}
for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
@@ -2135,9 +2435,7 @@ doublecomplex *dense, *dense_col; /* SPA */
}
}
-
-
-/* Count the nnzs per block column */
+ /* Count the nnzs per block column */
for (lb = 0; lb < nub; ++lb) {
Unnz[lb] = 0;
k = lb * grid->npcol + mycol;/* Global block number, column-wise. */
@@ -2155,7 +2453,82 @@ doublecomplex *dense, *dense_col; /* SPA */
}
} /* for jj ... */
}
- }
+ } /* end for lb ... */
+
+ Unzval_br_cnt +=1; // safe guard
+ Ufstnz_br_cnt +=1;
+ Ucb_valcnt +=1;
+ Ucb_indcnt +=1;
+ if ( !(Unzval_br_dat =
+ (doublecomplex*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(doublecomplex))) ) {
+ fprintf(stderr, "Malloc fails for Lnzval_bc_dat[].");
+ }
+ if ( !(Ufstnz_br_dat =
+ (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Ufstnz_br_dat[].");
+ }
+ if ( !(Ucb_valdat =
+ (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_valdat[].");
+ }
+ if ( !(Ucb_inddat =
+ (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
+ fprintf(stderr, "Malloc fails for Ucb_inddat[].");
+ }
+
+ /* use contiguous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr, Ucb_indptr */
+ k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+ Unzval_br_cnt=0;
+ Ufstnz_br_cnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block row ... */
+ if(Unzval_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Unzval_br_offset[lb]; ++jj) {
+ Unzval_br_dat[Unzval_br_cnt+jj]=Unzval_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Unzval_br_ptr[lb]);
+ Unzval_br_ptr[lb]=&Unzval_br_dat[Unzval_br_cnt];
+ tmp_cnt = Unzval_br_offset[lb];
+ Unzval_br_offset[lb]=Unzval_br_cnt;
+ Unzval_br_cnt+=tmp_cnt;
+ }
+
+ if(Ufstnz_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Ufstnz_br_offset[lb]; ++jj) {
+ Ufstnz_br_dat[Ufstnz_br_cnt+jj]=Ufstnz_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Ufstnz_br_ptr[lb]);
+ Ufstnz_br_ptr[lb]=&Ufstnz_br_dat[Ufstnz_br_cnt];
+ tmp_cnt = Ufstnz_br_offset[lb];
+ Ufstnz_br_offset[lb]=Ufstnz_br_cnt;
+ Ufstnz_br_cnt+=tmp_cnt;
+ }
+ } /* end for lb ... */
+
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Ucb_valcnt=0;
+ Ucb_indcnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block column ... */
+ if(Ucb_valptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_valoffset[lb]; ++jj) {
+ Ucb_valdat[Ucb_valcnt+jj]=Ucb_valptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_valptr[lb]);
+ Ucb_valptr[lb]=&Ucb_valdat[Ucb_valcnt];
+ tmp_cnt = Ucb_valoffset[lb];
+ Ucb_valoffset[lb]=Ucb_valcnt;
+ Ucb_valcnt+=tmp_cnt;
+ }
+ if(Ucb_indptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_indoffset[lb]; ++jj) {
+ Ucb_inddat[Ucb_indcnt+jj]=Ucb_indptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_indptr[lb]);
+ Ucb_indptr[lb]=&Ucb_inddat[Ucb_indcnt];
+ tmp_cnt = Ucb_indoffset[lb];
+ Ucb_indoffset[lb]=Ucb_indcnt;
+ Ucb_indcnt+=tmp_cnt;
+ }
+ } /* end for lb ... */
/////////////////////////////////////////////////////////////////
@@ -2789,14 +3162,13 @@ doublecomplex *dense, *dense_col; /* SPA */
////////////////////////////////////////////////////////
- /* Free the memory used for storing L and U */
+ /* Free the memory used for storing symbolic structures of L and U */
SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub);
if (lsub != NULL)
SUPERLU_FREE(lsub);
if (usub != NULL)
SUPERLU_FREE(usub);
-
SUPERLU_FREE(nnzToRecv);
SUPERLU_FREE(ptrToRecv);
SUPERLU_FREE(nnzToSend);
@@ -2804,12 +3176,30 @@ doublecomplex *dense, *dense_col; /* SPA */
SUPERLU_FREE(recvBuf);
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+ Llu->Lrowind_bc_dat = Lrowind_bc_dat;
+ Llu->Lrowind_bc_offset = Lrowind_bc_offset;
+ Llu->Lrowind_bc_cnt = Lrowind_bc_cnt;
+
Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
+ Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat;
+ Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset;
+ Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt;
+
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
- Llu->Linv_bc_ptr = Linv_bc_ptr;
- Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+ Llu->Lnzval_bc_dat = Lnzval_bc_dat;
+ Llu->Lnzval_bc_offset = Lnzval_bc_offset;
+ Llu->Lnzval_bc_cnt = Lnzval_bc_cnt;
+
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+ Llu->Ufstnz_br_dat = Ufstnz_br_dat;
+ Llu->Ufstnz_br_offset = Ufstnz_br_offset;
+ Llu->Ufstnz_br_cnt = Ufstnz_br_cnt;
+
Llu->Unzval_br_ptr = Unzval_br_ptr;
+ Llu->Unzval_br_dat = Unzval_br_dat;
+ Llu->Unzval_br_offset = Unzval_br_offset;
+ Llu->Unzval_br_cnt = Unzval_br_cnt;
+
Llu->Unnz = Unnz;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
@@ -2829,9 +3219,72 @@ doublecomplex *dense, *dense_col; /* SPA */
Llu->LBtree_ptr = LBtree_ptr;
Llu->URtree_ptr = URtree_ptr;
Llu->UBtree_ptr = UBtree_ptr;
+
+ Llu->Linv_bc_ptr = Linv_bc_ptr;
+ Llu->Linv_bc_dat = Linv_bc_dat;
+ Llu->Linv_bc_offset = Linv_bc_offset;
+ Llu->Linv_bc_cnt = Linv_bc_cnt;
+
+ Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+ Llu->Uinv_bc_dat = Uinv_bc_dat;
+ Llu->Uinv_bc_offset = Uinv_bc_offset;
+ Llu->Uinv_bc_cnt = Uinv_bc_cnt;
+
Llu->Urbs = Urbs;
Llu->Ucb_indptr = Ucb_indptr;
+ Llu->Ucb_inddat = Ucb_inddat;
+ Llu->Ucb_indoffset = Ucb_indoffset;
+ Llu->Ucb_indcnt = Ucb_indcnt;
+
Llu->Ucb_valptr = Ucb_valptr;
+ Llu->Ucb_valdat = Ucb_valdat;
+ Llu->Ucb_valoffset = Ucb_valoffset;
+ Llu->Ucb_valcnt = Ucb_valcnt;
+
+#ifdef GPU_ACC
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_xsup, (n+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+
+ // some dummy allocation to avoid checking whether they are null pointers later
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(double)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t)));
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+
+ /* gpuMemcpy for the following is performed in pxgssvx; size each device buffer by its host array's element type */
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(*Llu->Lnzval_bc_dat)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(*Llu->Linv_bc_dat)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(*Llu->Uinv_bc_dat)));
+
+# endif /* end ifdef GPU_ACC */
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
@@ -2847,11 +3300,22 @@ doublecomplex *dense, *dense_col; /* SPA */
MPI_MAX, grid->comm);
#if ( DEBUGlevel>=1 )
- /* Memory allocated but not freed:
+ /* Memory allocated but not free'd:
ilsum, fmod, fsendx_plist, bmod, bsendx_plist,
ToRecv, ToSendR, ToSendD, mod_bit
*/
- CHECK_MALLOC(iam, "Exit dist_psymbtonum()");
+ CHECK_MALLOC(iam, "Exit zdist_psymbtonum()");
+#endif
+
+#if ( PRNTlevel>=1 )
+ if (iam==0) {
+ printf("\t. end zdist_psymbtonum: memDist %.4f, memNLU %.4f, memTRS %.2f\n",
+ memDist*1e-6, memNLU*1e-6, memTRS*1e-6);
+ printf("\t\t. dense[] SPA %.4f (MB), ldaspa %d, ldaspa_j %d\n",
+ SUPERLU_MAX(ldaspa, ldaspa_j) * sp_ienv_dist(3, options) * dword * 1e-6,
+ (int) ldaspa, (int) ldaspa_j);
+ fflush(stdout);
+ }
#endif
return (- (memDist+memNLU));
diff --git a/SRC/pzutil.c b/SRC/pzutil.c
index 1821c68e..18446005 100755
--- a/SRC/pzutil.c
+++ b/SRC/pzutil.c
@@ -16,11 +16,17 @@ at the top-level directory.
* -- Distributed SuperLU routine (version 2.0) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* March 15, 2003
+ *
+ * Last modified:
+ * December 28, 2022
*
*/
#include <math.h>
#include "superlu_zdefs.h"
+#ifdef GPU_ACC
+#include "gpu_api_utils.h"
+#endif
/*! \brief Gather A from the distributed compressed row format to global A in compressed column format.
*/
@@ -429,55 +435,6 @@ void zLUstructFree(zLUstruct_t *LUstruct)
#endif
}
-void
-zDestroy_Tree(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
-{
- int_t i, nb, nsupers;
- Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
- zLocalLU_t *Llu = LUstruct->Llu;
-#if ( DEBUGlevel>=1 )
- int iam;
- MPI_Comm_rank( MPI_COMM_WORLD, &iam );
- CHECK_MALLOC(iam, "Enter zDestroy_Tree()");
-#endif
-
- nsupers = Glu_persist->supno[n-1] + 1;
-
- nb = CEILING(nsupers, grid->npcol);
- for (i=0;i<nb;++i){
- if(Llu->LBtree_ptr[i].empty_==NO){
- // BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt);
- C_BcTree_Nullify(&Llu->LBtree_ptr[i]);
- }
- if(Llu->UBtree_ptr[i].empty_==NO){
- // BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt);
- C_BcTree_Nullify(&Llu->UBtree_ptr[i]);
- }
- }
- SUPERLU_FREE(Llu->LBtree_ptr);
- SUPERLU_FREE(Llu->UBtree_ptr);
-
- nb = CEILING(nsupers, grid->nprow);
- for (i=0;i<nb;++i){
- if(Llu->LRtree_ptr[i].empty_==NO){
- // RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt);
- C_RdTree_Nullify(&Llu->LRtree_ptr[i]);
- }
- if(Llu->URtree_ptr[i].empty_==NO){
- // RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt);
- C_RdTree_Nullify(&Llu->URtree_ptr[i]);
- }
- }
- SUPERLU_FREE(Llu->LRtree_ptr);
- SUPERLU_FREE(Llu->URtree_ptr);
-
-#if ( DEBUGlevel>=1 )
- CHECK_MALLOC(iam, "Exit zDestroy_Tree()");
-#endif
-}
-
-
-
/*! \brief Destroy distributed L & U matrices. */
void
zDestroy_LU(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
@@ -496,27 +453,38 @@ zDestroy_LU(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
nsupers = Glu_persist->supno[n-1] + 1;
- nb = CEILING(nsupers, grid->npcol);
- for (i = 0; i < nb; ++i)
- if ( Llu->Lrowind_bc_ptr[i] ) {
- SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
-#if 0 // Sherry: the following is not allocated with gpuHostAlloc
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->npcol);
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Lrowind_bc_ptr[i] ) {
+ // SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#if 0 // Sherry: the following is not allocated with cudaHostAlloc
//#ifdef GPU_ACC
checkGPU(gpuFreeHost(Llu->Lnzval_bc_ptr[i]));
#endif
- SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
- }
+ // SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+ // }
+
SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+ SUPERLU_FREE (Llu->Lrowind_bc_dat);
+ SUPERLU_FREE (Llu->Lrowind_bc_offset);
SUPERLU_FREE (Llu->Lnzval_bc_ptr);
-
- nb = CEILING(nsupers, grid->nprow);
- for (i = 0; i < nb; ++i)
- if ( Llu->Ufstnz_br_ptr[i] ) {
- SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
- SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
- }
+ SUPERLU_FREE (Llu->Lnzval_bc_dat);
+ SUPERLU_FREE (Llu->Lnzval_bc_offset);
+
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->nprow);
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Ufstnz_br_ptr[i] ) {
+ // SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+ // SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+ // }
SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+ SUPERLU_FREE (Llu->Ufstnz_br_dat);
+ SUPERLU_FREE (Llu->Ufstnz_br_offset);
SUPERLU_FREE (Llu->Unzval_br_ptr);
+ SUPERLU_FREE (Llu->Unzval_br_dat);
+ SUPERLU_FREE (Llu->Unzval_br_offset);
/* The following can be freed after factorization. */
SUPERLU_FREE(Llu->ToRecv);
@@ -527,166 +495,90 @@ zDestroy_LU(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
/* The following can be freed only after iterative refinement. */
SUPERLU_FREE(Llu->ilsum);
SUPERLU_FREE(Llu->fmod);
- SUPERLU_FREE(Llu->fsendx_plist[0]);
+ SUPERLU_FREE((Llu->fsendx_plist)[0]);
SUPERLU_FREE(Llu->fsendx_plist);
SUPERLU_FREE(Llu->bmod);
- SUPERLU_FREE(Llu->bsendx_plist[0]);
+ SUPERLU_FREE((Llu->bsendx_plist)[0]);
SUPERLU_FREE(Llu->bsendx_plist);
SUPERLU_FREE(Llu->mod_bit);
- nb = CEILING(nsupers, grid->npcol);
- for (i = 0; i < nb; ++i)
- if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
- SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
- }
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->npcol);
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
+ // SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
+ // }
SUPERLU_FREE(Llu->Lindval_loc_bc_ptr);
-
- nb = CEILING(nsupers, grid->npcol);
- for (i=0; i<nb; ++i) {
- if(Llu->Linv_bc_ptr[i]!=NULL) {
- SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
- }
- if(Llu->Uinv_bc_ptr[i]!=NULL){
- SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
- }
- }
+ SUPERLU_FREE(Llu->Lindval_loc_bc_dat);
+ SUPERLU_FREE(Llu->Lindval_loc_bc_offset);
+
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->npcol);
+ // for (i=0; i<nb; ++i) {
+ // if(Llu->Linv_bc_ptr[i]!=NULL) {
+ // SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
+ // }
+ // if(Llu->Uinv_bc_ptr[i]!=NULL){
+ // SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
+ // }
+ // }
SUPERLU_FREE(Llu->Linv_bc_ptr);
+ SUPERLU_FREE(Llu->Linv_bc_dat);
+ SUPERLU_FREE(Llu->Linv_bc_offset);
SUPERLU_FREE(Llu->Uinv_bc_ptr);
+ SUPERLU_FREE(Llu->Uinv_bc_dat);
+ SUPERLU_FREE(Llu->Uinv_bc_offset);
SUPERLU_FREE(Llu->Unnz);
-
- nb = CEILING(nsupers, grid->npcol);
- for (i = 0; i < nb; ++i)
- if ( Llu->Urbs[i] ) {
- SUPERLU_FREE(Llu->Ucb_indptr[i]);
- SUPERLU_FREE(Llu->Ucb_valptr[i]);
- }
+
+ /* Following are free'd in distribution routines */
+ // nb = CEILING(nsupers, grid->npcol);
+ // for (i = 0; i < nb; ++i)
+ // if ( Llu->Urbs[i] ) {
+ // SUPERLU_FREE(Llu->Ucb_indptr[i]);
+ // SUPERLU_FREE(Llu->Ucb_valptr[i]);
+ // }
SUPERLU_FREE(Llu->Ucb_indptr);
+ SUPERLU_FREE(Llu->Ucb_inddat);
+ SUPERLU_FREE(Llu->Ucb_indoffset);
SUPERLU_FREE(Llu->Ucb_valptr);
+ SUPERLU_FREE(Llu->Ucb_valdat);
+ SUPERLU_FREE(Llu->Ucb_valoffset);
SUPERLU_FREE(Llu->Urbs);
-
+
SUPERLU_FREE(Glu_persist->xsup);
SUPERLU_FREE(Glu_persist->supno);
+#ifdef GPU_ACC
+ checkGPU (gpuFree (Llu->d_xsup));
+ checkGPU (gpuFree (Llu->d_LRtree_ptr));
+ checkGPU (gpuFree (Llu->d_LBtree_ptr));
+ checkGPU (gpuFree (Llu->d_URtree_ptr));
+ checkGPU (gpuFree (Llu->d_UBtree_ptr));
+ checkGPU (gpuFree (Llu->d_ilsum));
+ checkGPU (gpuFree (Llu->d_Lrowind_bc_dat));
+ checkGPU (gpuFree (Llu->d_Lrowind_bc_offset));
+ checkGPU (gpuFree (Llu->d_Lnzval_bc_dat));
+ checkGPU (gpuFree (Llu->d_Lnzval_bc_offset));
+ checkGPU (gpuFree (Llu->d_Linv_bc_dat));
+ checkGPU (gpuFree (Llu->d_Uinv_bc_dat));
+ checkGPU (gpuFree (Llu->d_Linv_bc_offset));
+ checkGPU (gpuFree (Llu->d_Uinv_bc_offset));
+ checkGPU (gpuFree (Llu->d_Lindval_loc_bc_dat));
+ checkGPU (gpuFree (Llu->d_Lindval_loc_bc_offset));
+
+ checkGPU (gpuFree (Llu->d_Ucolind_bc_dat));
+ checkGPU (gpuFree (Llu->d_Ucolind_bc_offset));
+ checkGPU (gpuFree (Llu->d_Unzval_bc_dat));
+ checkGPU (gpuFree (Llu->d_Unzval_bc_offset));
+ checkGPU (gpuFree (Llu->d_Uindval_loc_bc_dat));
+ checkGPU (gpuFree (Llu->d_Uindval_loc_bc_offset));
+#endif
+
#if ( DEBUGlevel>=1 )
CHECK_MALLOC(iam, "Exit zDestroy_LU()");
#endif
}
-// /*! \brief Destroy distributed L & U matrices. */
-// void
-// zDestroy_LU(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
-// {
-// int_t i, nb, nsupers;
-// Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-// zLocalLU_t *Llu = LUstruct->Llu;
-
-// #if ( DEBUGlevel>=1 )
-// int iam;
-// MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-// CHECK_MALLOC(iam, "Enter zDestroy_LU()");
-// #endif
-
-// zDestroy_Tree(n, grid, LUstruct);
-
-// nsupers = Glu_persist->supno[n-1] + 1;
-
-// nb = CEILING(nsupers, grid->npcol);
-// // for (i = 0; i < nb; ++i)
-// // if ( Llu->Lrowind_bc_ptr[i] ) {
-// // SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
-// // SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
-// // }
-// SUPERLU_FREE (Llu->Lrowind_bc_ptr);
-// SUPERLU_FREE (Llu->Lrowind_bc_dat);
-// SUPERLU_FREE (Llu->Lrowind_bc_offset);
-// SUPERLU_FREE (Llu->Lnzval_bc_ptr);
-// SUPERLU_FREE (Llu->Lnzval_bc_dat);
-// SUPERLU_FREE (Llu->Lnzval_bc_offset);
-
-// nb = CEILING(nsupers, grid->nprow);
-// for (i = 0; i < nb; ++i)
-// if ( Llu->Ufstnz_br_ptr[i] ) {
-// SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
-// SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
-// }
-// SUPERLU_FREE (Llu->Ufstnz_br_ptr);
-// SUPERLU_FREE (Llu->Unzval_br_ptr);
-
-// /* The following can be freed after factorization. */
-// SUPERLU_FREE(Llu->ToRecv);
-// SUPERLU_FREE(Llu->ToSendD);
-// SUPERLU_FREE(Llu->ToSendR[0]);
-// SUPERLU_FREE(Llu->ToSendR);
-
-// /* The following can be freed only after iterative refinement. */
-// SUPERLU_FREE(Llu->ilsum);
-// SUPERLU_FREE(Llu->fmod);
-// SUPERLU_FREE(Llu->fsendx_plist[0]);
-// SUPERLU_FREE(Llu->fsendx_plist);
-// SUPERLU_FREE(Llu->bmod);
-// SUPERLU_FREE(Llu->bsendx_plist[0]);
-// SUPERLU_FREE(Llu->bsendx_plist);
-// SUPERLU_FREE(Llu->mod_bit);
-
-// // nb = CEILING(nsupers, grid->npcol);
-// // for (i = 0; i < nb; ++i)
-// // if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
-// // SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
-// // }
-// SUPERLU_FREE(Llu->Lindval_loc_bc_ptr);
-// SUPERLU_FREE(Llu->Lindval_loc_bc_dat);
-// SUPERLU_FREE(Llu->Lindval_loc_bc_offset);
-
-// nb = CEILING(nsupers, grid->npcol);
-// for (i=0; i<nb; ++i) {
-// // if(Llu->Linv_bc_ptr[i]!=NULL) {
-// // SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
-// // }
-
-// if(Llu->Uinv_bc_ptr[i]!=NULL){
-// SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
-// }
-// }
-// SUPERLU_FREE(Llu->Linv_bc_ptr);
-// SUPERLU_FREE(Llu->Linv_bc_dat);
-// SUPERLU_FREE(Llu->Linv_bc_offset);
-// SUPERLU_FREE(Llu->Uinv_bc_ptr);
-// SUPERLU_FREE(Llu->Unnz);
-
-// nb = CEILING(nsupers, grid->npcol);
-// for (i = 0; i < nb; ++i)
-// if ( Llu->Urbs[i] ) {
-// SUPERLU_FREE(Llu->Ucb_indptr[i]);
-// SUPERLU_FREE(Llu->Ucb_valptr[i]);
-// }
-// SUPERLU_FREE(Llu->Ucb_indptr);
-// SUPERLU_FREE(Llu->Ucb_valptr);
-// SUPERLU_FREE(Llu->Urbs);
-
-// SUPERLU_FREE(Glu_persist->xsup);
-// SUPERLU_FREE(Glu_persist->supno);
-
-// #ifdef GPU_ACC
-// checkGPU (gpuFree (Llu->d_xsup));
-// checkGPU (gpuFree (Llu->d_LRtree_ptr));
-// checkGPU (gpuFree (Llu->d_LBtree_ptr));
-// checkGPU (gpuFree (Llu->d_ilsum));
-// checkGPU (gpuFree (Llu->d_Lrowind_bc_dat));
-// checkGPU (gpuFree (Llu->d_Lrowind_bc_offset));
-// checkGPU (gpuFree (Llu->d_Lnzval_bc_dat));
-// checkGPU (gpuFree (Llu->d_Lnzval_bc_offset));
-// checkGPU (gpuFree (Llu->d_Linv_bc_dat));
-// checkGPU (gpuFree (Llu->d_Linv_bc_offset));
-// checkGPU (gpuFree (Llu->d_Lindval_loc_bc_dat));
-// checkGPU (gpuFree (Llu->d_Lindval_loc_bc_offset));
-// #endif
-
-
-// #if ( DEBUGlevel>=1 )
-// CHECK_MALLOC(iam, "Exit zDestroy_LU()");
-// #endif
-// }
-
/*! \brief
*
*
@@ -694,7 +586,7 @@ zDestroy_LU(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
* =======
* Set up the communication pattern for redistribution between B and X
* in the triangular solution.
- *
+ *
* Arguments
* =========
*
@@ -765,7 +657,7 @@ pzgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row,
p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */
++SendCnt[p];
}
-
+
/* Set up the displacements for alltoall. */
MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm);
sdispls[0] = rdispls[0] = 0;
@@ -968,7 +860,7 @@ void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid
SUPERLU_FREE( A2d->colind );
SUPERLU_FREE( A2d->nzval );
}
- SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts
+ SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts
SUPERLU_FREE(A3d->row_disp);
SUPERLU_FREE(A3d->nnz_counts_int);
SUPERLU_FREE(A3d->nnz_disp);
@@ -995,27 +887,82 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx
doublecomplex *x_work, *xtrue_work;
doublecomplex temp;
int i, j;
+ double errcomp; // componentwise error
+ double derr;
for (j = 0; j < nrhs; j++) {
x_work = &x[j*ldx];
xtrue_work = &xtrue[j*ldxtrue];
- err = xnorm = 0.0;
+ err = xnorm = errcomp = 0.0;
for (i = 0; i < n; i++) {
z_sub(&temp, &x_work[i], &xtrue_work[i]);
err = SUPERLU_MAX(err, slud_z_abs(&temp));
xnorm = SUPERLU_MAX(xnorm, slud_z_abs(&x_work[i]));
+ errcomp = SUPERLU_MAX(errcomp, slud_z_abs(&temp) / slud_z_abs(&x_work[i]) );
}
/* get the golbal max err & xnrom */
temperr = err;
- tempxnorm = xnorm;
MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm);
+ tempxnorm = xnorm;
MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm);
+ temperr = errcomp; /* reuse the send buffer for the componentwise error */
+ MPI_Allreduce( &temperr, &errcomp, 1, MPI_DOUBLE, MPI_MAX, slucomm); /* errcomp is double, so reduce as MPI_DOUBLE */
err = err / xnorm;
- if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
+ if ( !iam ) {
+ printf(".. Sol %2d: ||X - Xtrue|| / ||X|| = %e\t max_i |x - xtrue|_i / |x|_i = %e\n", j, err, errcomp);
+ fflush(stdout);
+ }
}
}
+/*! \brief Destroy broadcast and reduction trees used in triangular solve */
+void
+zDestroy_Tree(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
+{
+ int i, nb, nsupers;
+ Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+ zLocalLU_t *Llu = LUstruct->Llu;
+#if ( DEBUGlevel>=1 )
+ int iam;
+ MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+ CHECK_MALLOC(iam, "Enter zDestroy_Tree()");
+#endif
+
+ nsupers = Glu_persist->supno[n-1] + 1;
+
+ nb = CEILING(nsupers, grid->npcol); /* one broadcast-tree slot per local block column */
+ for (i=0;i<nb;++i){
+ if(Llu->LBtree_ptr[i].empty_==NO){
+ // BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt);
+ C_BcTree_Nullify(&Llu->LBtree_ptr[i]);
+ }
+ if(Llu->UBtree_ptr[i].empty_==NO){
+ // BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt);
+ C_BcTree_Nullify(&Llu->UBtree_ptr[i]);
+ }
+ }
+ SUPERLU_FREE(Llu->LBtree_ptr);
+ SUPERLU_FREE(Llu->UBtree_ptr);
+
+ nb = CEILING(nsupers, grid->nprow); /* one reduction-tree slot per local block row */
+ for (i=0;i<nb;++i){
+ if(Llu->LRtree_ptr[i].empty_==NO){
+ // RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt);
+ C_RdTree_Nullify(&Llu->LRtree_ptr[i]);
+ }
+ if(Llu->URtree_ptr[i].empty_==NO){
+ // RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt);
+ C_RdTree_Nullify(&Llu->URtree_ptr[i]);
+ }
+ }
+ SUPERLU_FREE(Llu->LRtree_ptr);
+ SUPERLU_FREE(Llu->URtree_ptr);
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC(iam, "Exit zDestroy_Tree()");
+#endif
+}
diff --git a/SRC/sdistribute.c b/SRC/sdistribute.c
index 66fdced7..63488284 100644
--- a/SRC/sdistribute.c
+++ b/SRC/sdistribute.c
@@ -17,10 +17,14 @@ at the top-level directory.
* -- Distributed SuperLU routine (version 2.3) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 15, 2008
+ *
+ * January 9, 2023
*
*/
#include "superlu_sdefs.h"
-
+#ifdef GPU_ACC
+#include "gpu_api_utils.h"
+#endif
/*! \brief
*
@@ -60,7 +64,8 @@ at the top-level directory.
*/
float
-sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
+sdistribute(superlu_dist_options_t *options,
+ int_t n, SuperMatrix *A,
Glu_freeable_t *Glu_freeable,
sLUstruct_t *LUstruct, gridinfo_t *grid)
{
@@ -92,21 +97,43 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int_t *index_srt; /* indices consist of headers and row subscripts */
int *index1; /* temporary pointer to array of int */
float *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
- float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */
+ long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
+
int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Unnz; /* size ceil(NSUPERS/Pc) */
+ int_t *Lrowind_bc_dat; /* size: sum of sizes of Lrowind_bc_ptr[lk]) */
+ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+
+ int_t *Unnz; /* size ceil(NSUPERS/Pc) */
float **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
- int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
- int msgsize;
+ float *Unzval_br_dat; /* size: sum of sizes of Unzval_br_ptr[lk]) */
+ long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Unzval_br_cnt=0;
+ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */
+ long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Ufstnz_br_cnt=0;
+ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
+
+ int msgsize;
int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+ Ucb_indptr_t *Ucb_inddat;
+ long int *Ucb_indoffset;
+ long int Ucb_indcnt=0;
+
int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+ int_t *Ucb_valdat;
+ long int *Ucb_valoffset;
+ long int Ucb_valcnt=0;
/*-- Counts to be used in factorization. --*/
int *ToRecv, *ToSendD, **ToSendR;
@@ -155,7 +182,12 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int *frecv, *brecv;
int_t *lloc;
float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */
+ long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */
+ long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */
+
double *SeedSTD_BC,*SeedSTD_RD;
int_t idx_indx,idx_lusup;
int_t nbrow;
@@ -360,8 +392,19 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(Unzval_br_ptr =
(float**)SUPERLU_MALLOC(k * sizeof(float*))) )
ABORT("Malloc fails for Unzval_br_ptr[].");
+ if ( !(Unzval_br_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Unzval_br_offset[]."); /* must not continue: the array is written on the next statement */
+ }
+ Unzval_br_offset[k-1] = -1;
+
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Ufstnz_br_ptr[].");
+ if ( !(Ufstnz_br_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Ufstnz_br_offset[]."); /* must not continue: the array is written on the next statement */
+ }
+ Ufstnz_br_offset[k-1] = -1;
if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
ABORT("Malloc fails for ToSendD[].");
@@ -450,8 +493,14 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(index = intMalloc_dist(len1+1)) )
ABORT("Malloc fails for Uindex[].");
Ufstnz_br_ptr[lb] = index;
+ Ufstnz_br_offset[lb] = len1+1;
+ Ufstnz_br_cnt += Ufstnz_br_offset[lb];
+
if ( !(Unzval_br_ptr[lb] = floatMalloc_dist(len)) )
ABORT("Malloc fails for Unzval_br_ptr[*][].");
+ Unzval_br_offset[lb]=len;
+ Unzval_br_cnt += Unzval_br_offset[lb];
+
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
index[0] = Ucbs[lb]; /* Number of column blocks */
@@ -461,6 +510,8 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} else {
Ufstnz_br_ptr[lb] = NULL;
Unzval_br_ptr[lb] = NULL;
+ Unzval_br_offset[lb]=-1;
+ Ufstnz_br_offset[lb]=-1;
}
Urb_length[lb] = 0; /* Reset block length. */
Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
@@ -504,28 +555,54 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
/* Pointers to the beginning of each block column of L. */
if ( !(Lnzval_bc_ptr = (float**)SUPERLU_MALLOC(k * sizeof(float*))) )
ABORT("Malloc fails for Lnzval_bc_ptr[].");
+ Lnzval_bc_ptr[k-1] = NULL;
if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lrowind_bc_ptr[].");
Lrowind_bc_ptr[k-1] = NULL;
+ if ( !(Lrowind_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Lrowind_bc_offset[]."); /* must not continue: the array is written on the next statement */
+ }
+ Lrowind_bc_offset[k-1] = -1;
+ if ( !(Lnzval_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Lnzval_bc_offset[]."); /* must not continue: the array is written on the next statement */
+ }
+ Lnzval_bc_offset[k-1] = -1;
if ( !(Lindval_loc_bc_ptr =
(int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
Lindval_loc_bc_ptr[k-1] = NULL;
+ if ( !(Lindval_loc_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Lindval_loc_bc_offset[]."); /* must not continue: the array is written on the next statement */
+ }
+ Lindval_loc_bc_offset[k-1] = -1;
if ( !(Linv_bc_ptr =
(float**)SUPERLU_MALLOC(k * sizeof(float*))) ) {
fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
}
+ if ( !(Linv_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Linv_bc_offset[]."); /* Linv_bc_offset[k-1] is written shortly after */
+ }
if ( !(Uinv_bc_ptr =
(float**)SUPERLU_MALLOC(k * sizeof(float*))) ) {
fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
}
+ if ( !(Uinv_bc_offset =
+ (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+ ABORT("Malloc fails for Uinv_bc_offset[]."); /* Uinv_bc_offset[k-1] is written shortly after */
+ }
Linv_bc_ptr[k-1] = NULL;
Uinv_bc_ptr[k-1] = NULL;
+ Linv_bc_offset[k-1] = -1;
+ Uinv_bc_offset[k-1] = -1;
if ( !(Unnz = (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
- ABORT("Malloc fails for Unnz[].");
+ ABORT("Malloc fails for Unnz[].");
/* These lists of processes will be used for triangular solves. */
if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
@@ -544,12 +621,17 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
bsendx_plist[i] = &index1[j];
- mem_use += 4.0*k*sizeof(int*) + 2.0*len*sizeof(int);
+ mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*sizeof(int);
/*------------------------------------------------------------
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
+ long int Linv_bc_cnt=0;
+ long int Uinv_bc_cnt=0;
+ long int Lrowind_bc_cnt=0;
+ long int Lnzval_bc_cnt=0;
+ long int Lindval_loc_bc_cnt=0;
for (jb = 0; jb < nsupers; ++jb) {
pc = PCOL( jb, grid );
@@ -691,16 +773,38 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
index[] and nzval[]. */
/* Add room for descriptors */
len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
- if ( !(index = intMalloc_dist(len1)) )
- ABORT("Malloc fails for index[]");
- if (!(lusup = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float))))
+ if ( !(index = intMalloc_dist(len1)) )
+ ABORT("Malloc fails for index[]");
+ Lrowind_bc_offset[ljb]=len1;
+ Lrowind_bc_cnt += Lrowind_bc_offset[ljb];
+ if (!(lusup = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float))))
ABORT("Malloc fails for lusup[]");
- if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) ))
+ Lnzval_bc_offset[ljb]=len*nsupc;
+ Lnzval_bc_cnt += Lnzval_bc_offset[ljb];
+
+ if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) ))
ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
- if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
- ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
+ Lindval_loc_bc_offset[ljb]=nrbl*3;
+ Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb];
+
+ myrow = MYROW( iam, grid );
+ krow = PROW( jb, grid );
+ if(myrow==krow){ /* diagonal block */
+ if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
+ ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
+ Linv_bc_offset[ljb]=nsupc*nsupc;
+ Linv_bc_cnt += Linv_bc_offset[ljb];
if (!(Uinv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
ABORT("Malloc fails for Uinv_bc_ptr[ljb][]");
+ Uinv_bc_offset[ljb]=nsupc*nsupc;
+ Uinv_bc_cnt += Uinv_bc_offset[ljb];
+ }else{
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ }
+
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
@@ -812,9 +916,14 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} else {
Lrowind_bc_ptr[ljb] = NULL;
Lnzval_bc_ptr[ljb] = NULL;
- Linv_bc_ptr[ljb] = NULL;
- Uinv_bc_ptr[ljb] = NULL;
- Lindval_loc_bc_ptr[ljb] = NULL;
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Lrowind_bc_offset[ljb]=-1;
+ Lindval_loc_bc_offset[ljb]=-1;
+ Lnzval_bc_offset[ljb]=-1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ Lindval_loc_bc_ptr[ljb] = NULL;
} /* if nrbl ... */
#if ( PROFlevel>=1 )
t_l += SuperLU_timer_() - t;
@@ -823,6 +932,98 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} /* for jb ... */
+ Linv_bc_cnt +=1; // safe guard
+ Uinv_bc_cnt +=1;
+ Lrowind_bc_cnt +=1;
+ Lindval_loc_bc_cnt +=1;
+ Lnzval_bc_cnt +=1;
+
+ if ( !(Linv_bc_dat =
+ (float*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(float))) ) {
+ ABORT("Malloc fails for Linv_bc_dat[]."); /* abort like the per-block mallocs: the copy loop below writes into these pooled buffers */
+ }
+ if ( !(Uinv_bc_dat =
+ (float*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(float))) ) {
+ ABORT("Malloc fails for Uinv_bc_dat[].");
+ }
+
+ if ( !(Lrowind_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
+ ABORT("Malloc fails for Lrowind_bc_dat[].");
+ }
+ if ( !(Lindval_loc_bc_dat =
+ (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
+ ABORT("Malloc fails for Lindval_loc_bc_dat[].");
+ }
+ if ( !(Lnzval_bc_dat =
+ (float*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(float))) ) {
+ ABORT("Malloc fails for Lnzval_bc_dat[].");
+ }
+
+ /* use contiguous memory for Linv_bc_ptr, Uinv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr, Lindval_loc_bc_ptr */
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Linv_bc_cnt=0;
+ Uinv_bc_cnt=0;
+ Lrowind_bc_cnt=0;
+ Lnzval_bc_cnt=0;
+ Lindval_loc_bc_cnt=0;
+ long int tmp_cnt;
+ for (jb = 0; jb < k; ++jb) { /* for each block column ... */
+ if(Linv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) {
+ Linv_bc_dat[Linv_bc_cnt+jj]=Linv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Linv_bc_ptr[jb]);
+ Linv_bc_ptr[jb]=&Linv_bc_dat[Linv_bc_cnt];
+ tmp_cnt = Linv_bc_offset[jb];
+ Linv_bc_offset[jb]=Linv_bc_cnt;
+ Linv_bc_cnt+=tmp_cnt;
+ }
+ if(Uinv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) {
+ Uinv_bc_dat[Uinv_bc_cnt+jj]=Uinv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Uinv_bc_ptr[jb]);
+ Uinv_bc_ptr[jb]=&Uinv_bc_dat[Uinv_bc_cnt];
+ tmp_cnt = Uinv_bc_offset[jb];
+ Uinv_bc_offset[jb]=Uinv_bc_cnt;
+ Uinv_bc_cnt+=tmp_cnt;
+ }
+ if(Lrowind_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) {
+ Lrowind_bc_dat[Lrowind_bc_cnt+jj]=Lrowind_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lrowind_bc_ptr[jb]);
+ Lrowind_bc_ptr[jb]=&Lrowind_bc_dat[Lrowind_bc_cnt];
+ tmp_cnt = Lrowind_bc_offset[jb];
+ Lrowind_bc_offset[jb]=Lrowind_bc_cnt;
+ Lrowind_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lnzval_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) {
+ Lnzval_bc_dat[Lnzval_bc_cnt+jj]=Lnzval_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lnzval_bc_ptr[jb]);
+ Lnzval_bc_ptr[jb]=&Lnzval_bc_dat[Lnzval_bc_cnt];
+ tmp_cnt = Lnzval_bc_offset[jb];
+ Lnzval_bc_offset[jb]=Lnzval_bc_cnt;
+ Lnzval_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lindval_loc_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) {
+ Lindval_loc_bc_dat[Lindval_loc_bc_cnt+jj]=Lindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lindval_loc_bc_ptr[jb]);
+ Lindval_loc_bc_ptr[jb]=&Lindval_loc_bc_dat[Lindval_loc_bc_cnt];
+ tmp_cnt = Lindval_loc_bc_offset[jb];
+ Lindval_loc_bc_offset[jb]=Lindval_loc_bc_cnt;
+ Lindval_loc_bc_cnt+=tmp_cnt;
+ }
+
+ } /* for jb ... */
+
/////////////////////////////////////////////////////////////////
/* Set up additional pointers for the index and value arrays of U.
@@ -836,6 +1037,17 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
ABORT("Malloc fails for Ucb_indptr[]");
if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
ABORT("Malloc fails for Ucb_valptr[]");
+ if ( !(Ucb_valoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ ABORT("Malloc fails for Ucb_valoffset[]."); /* must not fall through: the array is written on the next statement */
+ }
+ Ucb_valoffset[nub-1] = -1;
+ if ( !(Ucb_indoffset =
+ (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+ ABORT("Malloc fails for Ucb_indoffset[]."); /* same reasoning as Ucb_valoffset */
+ }
+ Ucb_indoffset[nub-1] = -1;
+
nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
/* Count number of row blocks in a block column.
@@ -858,10 +1070,19 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
for (lb = 0; lb < nub; ++lb) {
if ( Urbs[lb] ) { /* Not an empty block column. */
if ( !(Ucb_indptr[lb]
- = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+ = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
ABORT("Malloc fails for Ucb_indptr[lb][]");
+ Ucb_indoffset[lb]=Urbs[lb];
+ Ucb_indcnt += Ucb_indoffset[lb];
if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
ABORT("Malloc fails for Ucb_valptr[lb][]");
+ Ucb_valoffset[lb]=Urbs[lb];
+ Ucb_valcnt += Ucb_valoffset[lb];
+ }else{
+ Ucb_valptr[lb]=NULL;
+ Ucb_valoffset[lb]=-1;
+ Ucb_indptr[lb]=NULL;
+ Ucb_indoffset[lb]=-1;
}
}
for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
@@ -906,6 +1127,81 @@ sdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
}
}
+ Unzval_br_cnt +=1; // safe guard
+ Ufstnz_br_cnt +=1;
+ Ucb_valcnt +=1 ;
+ Ucb_indcnt +=1;
+ if ( !(Unzval_br_dat =
+ (float*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(float))) ) {
+ ABORT("Malloc fails for Unzval_br_dat[]."); /* abort: the copy loops below write into these pooled U buffers */
+ }
+ if ( !(Ufstnz_br_dat =
+ (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
+ ABORT("Malloc fails for Ufstnz_br_dat[].");
+ }
+ if ( !(Ucb_valdat =
+ (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
+ ABORT("Malloc fails for Ucb_valdat[].");
+ }
+ if ( !(Ucb_inddat =
+ (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
+ ABORT("Malloc fails for Ucb_inddat[].");
+ }
+
+ /* use contiguous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr, Ucb_indptr */
+ k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+ Unzval_br_cnt=0;
+ Ufstnz_br_cnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block row ... */
+ if(Unzval_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Unzval_br_offset[lb]; ++jj) {
+ Unzval_br_dat[Unzval_br_cnt+jj]=Unzval_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Unzval_br_ptr[lb]);
+ Unzval_br_ptr[lb]=&Unzval_br_dat[Unzval_br_cnt];
+ tmp_cnt = Unzval_br_offset[lb];
+ Unzval_br_offset[lb]=Unzval_br_cnt;
+ Unzval_br_cnt+=tmp_cnt;
+ }
+
+ if(Ufstnz_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Ufstnz_br_offset[lb]; ++jj) {
+ Ufstnz_br_dat[Ufstnz_br_cnt+jj]=Ufstnz_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Ufstnz_br_ptr[lb]);
+ Ufstnz_br_ptr[lb]=&Ufstnz_br_dat[Ufstnz_br_cnt];
+ tmp_cnt = Ufstnz_br_offset[lb];
+ Ufstnz_br_offset[lb]=Ufstnz_br_cnt;
+ Ufstnz_br_cnt+=tmp_cnt;
+ }
+ }
+
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Ucb_valcnt=0;
+ Ucb_indcnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block column ... */
+ if(Ucb_valptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_valoffset[lb]; ++jj) {
+ Ucb_valdat[Ucb_valcnt+jj]=Ucb_valptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_valptr[lb]);
+ Ucb_valptr[lb]=&Ucb_valdat[Ucb_valcnt];
+ tmp_cnt = Ucb_valoffset[lb];
+ Ucb_valoffset[lb]=Ucb_valcnt;
+ Ucb_valcnt+=tmp_cnt;
+ }
+ if(Ucb_indptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_indoffset[lb]; ++jj) {
+ Ucb_inddat[Ucb_indcnt+jj]=Ucb_indptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_indptr[lb]);
+ Ucb_indptr[lb]=&Ucb_inddat[Ucb_indcnt];
+ tmp_cnt = Ucb_indoffset[lb];
+ Ucb_indoffset[lb]=Ucb_indcnt;
+ Ucb_indcnt+=tmp_cnt;
+ }
+ }
+
/////////////////////////////////////////////////////////////////
#if ( PROFlevel>=1 )
@@ -1585,12 +1881,31 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
////////////////////////////////////////////////////////
-
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+ Llu->Lrowind_bc_dat = Lrowind_bc_dat;
+ Llu->Lrowind_bc_offset = Lrowind_bc_offset;
+ Llu->Lrowind_bc_cnt = Lrowind_bc_cnt;
+
Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
+ Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat;
+ Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset;
+ Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt;
+
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+ Llu->Lnzval_bc_dat = Lnzval_bc_dat;
+ Llu->Lnzval_bc_offset = Lnzval_bc_offset;
+ Llu->Lnzval_bc_cnt = Lnzval_bc_cnt;
+
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+ Llu->Ufstnz_br_dat = Ufstnz_br_dat;
+ Llu->Ufstnz_br_offset = Ufstnz_br_offset;
+ Llu->Ufstnz_br_cnt = Ufstnz_br_cnt;
+
Llu->Unzval_br_ptr = Unzval_br_ptr;
+ Llu->Unzval_br_dat = Unzval_br_dat;
+ Llu->Unzval_br_offset = Unzval_br_offset;
+ Llu->Unzval_br_cnt = Unzval_br_cnt;
+
Llu->Unnz = Unnz;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
@@ -1605,15 +1920,79 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
Llu->nbsendx = nbsendx;
Llu->ilsum = ilsum;
Llu->ldalsum = ldaspa;
+
Llu->LRtree_ptr = LRtree_ptr;
Llu->LBtree_ptr = LBtree_ptr;
Llu->URtree_ptr = URtree_ptr;
Llu->UBtree_ptr = UBtree_ptr;
+
Llu->Linv_bc_ptr = Linv_bc_ptr;
+ Llu->Linv_bc_dat = Linv_bc_dat;
+ Llu->Linv_bc_offset = Linv_bc_offset;
+ Llu->Linv_bc_cnt = Linv_bc_cnt;
+
Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+ Llu->Uinv_bc_dat = Uinv_bc_dat;
+ Llu->Uinv_bc_offset = Uinv_bc_offset;
+ Llu->Uinv_bc_cnt = Uinv_bc_cnt;
+
Llu->Urbs = Urbs;
Llu->Ucb_indptr = Ucb_indptr;
+ Llu->Ucb_inddat = Ucb_inddat;
+ Llu->Ucb_indoffset = Ucb_indoffset;
+ Llu->Ucb_indcnt = Ucb_indcnt;
Llu->Ucb_valptr = Ucb_valptr;
+ Llu->Ucb_valdat = Ucb_valdat;
+ Llu->Ucb_valoffset = Ucb_valoffset;
+ Llu->Ucb_valcnt = Ucb_valcnt;
+
+#ifdef GPU_ACC
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_xsup, (n+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+
+
+ // one-element dummy allocations so these device pointers are non-NULL and later code need not test them
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(float) ));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, sizeof(int64_t))); /* offset array holds int64_t, like the other *_offset dummies */
+
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+
+
+ /* gpuMemcpy for the following is performed in pxgssvx */
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(float) ));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(float) ));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(float) ));
+
+#endif /* match ifdef GPU_ACC */
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
diff --git a/SRC/slustruct_gpu.h b/SRC/slustruct_gpu.h
index bde5c7c0..7ea4145e 100644
--- a/SRC/slustruct_gpu.h
+++ b/SRC/slustruct_gpu.h
@@ -96,8 +96,6 @@ typedef struct //LUstruct_gpu_
local_u_blk_info_t *local_u_blk_infoVec;
int_t *local_u_blk_infoPtr;
- int_t *ijb_lookupVec;
- int_t *ijb_lookupPtr;
// GPU buffers for performing Schur Complement Update on GPU
sSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS];
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
index d16eb3f1..940be99e 100644
--- a/SRC/ssuperlu_gpu.cu
+++ b/SRC/ssuperlu_gpu.cu
@@ -799,8 +799,6 @@ int sfree_LUstruct_gpu (
checkGPU(gpuFree(A_gpu->jib_lookupPtr));
checkGPU(gpuFree(A_gpu->local_u_blk_infoVec));
checkGPU(gpuFree(A_gpu->local_u_blk_infoPtr));
- checkGPU(gpuFree(A_gpu->ijb_lookupVec));
- checkGPU(gpuFree(A_gpu->ijb_lookupPtr));
/* Destroy all the meta-structures associated with the streams. */
gpuStreamDestroy(sluGPU->CopyStream);
diff --git a/SRC/superlu_FortranCInterface.h b/SRC/superlu_FortranCInterface.h
index 467bfb65..13ca6aea 100644
--- a/SRC/superlu_FortranCInterface.h
+++ b/SRC/superlu_FortranCInterface.h
@@ -8,9 +8,9 @@
#define FC_GLOBAL_(name,NAME) name##_
/* Mangling for Fortran module symbols without underscores. */
-#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
+#define FC_MODULE(mod_name,name, mod_NAME,NAME) mod_name##_##name##_
/* Mangling for Fortran module symbols with underscores. */
-#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
+#define FC_MODULE_(mod_name,name, mod_NAME,NAME) mod_name##_##name##_
#endif
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index f8da9ac1..0290652c 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -680,9 +680,10 @@ extern void dGenCSRLblocks(int, int_t, gridinfo_t*,
Glu_persist_t*, dLocalLU_t *, double **, int_t **, int_t **, int_t*, int_t*);
/* multi-GPU */
-#ifdef GPU_ACC
+#ifdef GPU_SOLVE
extern void create_nv_buffer(int* , int*, int* , int* );
-extern void nv_init_wrapper(int*, char**, int*);
+extern void nv_init_wrapper(int*, char**, int*);
+extern void prepare_multiGPU_buffers(int,int,int,int,int,int);
#endif
/* BLAS */
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index 789d14fa..43f01aac 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -49,13 +49,6 @@ at the top-level directory.
#include
#endif
-#ifdef GPU_ACC
-
-#include "oneside.h"
-#include "gpu_wrapper.h"
-#endif
-
-
#include
#include
@@ -108,6 +101,7 @@ at the top-level directory.
#endif
#ifdef GPU_ACC
+#include "oneside.h"
#include "gpu_api_utils.h"
#endif
@@ -1249,19 +1243,16 @@ typedef struct
#endif
extern void C_RdTree_Create(C_Tree* tree, MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, char precision);
-extern void C_RdTree_Create_nv(C_Tree* tree, MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, char precision,int* needrecvrd,int* needsendrd);
extern void C_RdTree_Nullify(C_Tree* tree);
extern yes_no_t C_RdTree_IsRoot(C_Tree* tree);
extern void C_RdTree_forwardMessageSimple(C_Tree* Tree, void* localBuffer, int msgSize);
extern void C_RdTree_waitSendRequest(C_Tree* Tree);
extern void C_BcTree_Create(C_Tree* tree, MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, char precision);
-extern void C_BcTree_Create_nv(C_Tree* tree, MPI_Comm comm, int* ranks, int rank_cnt, int msgSize, char precision,int* needrecv);
extern void C_BcTree_Nullify(C_Tree* tree);
extern yes_no_t C_BcTree_IsRoot(C_Tree* tree);
extern void C_BcTree_forwardMessageSimple(C_Tree* tree, void* localBuffer, int msgSize);
extern void C_BcTree_waitSendRequest(C_Tree* tree);
-extern __global__ void schedule(int nrhs, C_Tree *LRtree_ptr,int_t maxrecvsz,int mype,int* flag_bc_q,int* flag_rd_q,double* ready_x,double* ready_lsum,int* my_flag_bc,int* my_flag_rd,int* d_nfrecv,int* d_status,int* d_launch_flag,int* d_colnum,int* d_mynum,int* d_mymaskstart,int* d_mymasklength,int* d_nfrecvmod,int* d_statusmod,int* d_colnummod,int* d_mynummod,int* d_mymaskstartmod,int* d_mymasklengthmod,int* d_recv_cnt,int* d_msgnum, int* d_flag_mod, double *lsum, int_t *fmod, gridinfo_t *grid,int_t *xsup,int_t *ilsum,int nbrow_loc,int_t nsupers);
/*==== For 3D code ====*/
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index d3cdf426..6aa086cc 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -1,7 +1,7 @@
/* superlu_dist_config.h.in */
/* Enable CUDA */
-/* #undef HAVE_CUDA */
+#define HAVE_CUDA TRUE
/* Enable HIP */
/* #undef HAVE_HIP */
@@ -13,13 +13,13 @@
/* #undef HAVE_COLAMD */
/* Enable LAPACK */
-/* #undef SLU_HAVE_LAPACK */
+#define SLU_HAVE_LAPACK TRUE
/* Enable CombBLAS */
/* #undef HAVE_COMBBLAS */
/* enable 64bit index mode */
-#define XSDK_INDEX_SIZE 64
+/* #undef XSDK_INDEX_SIZE */
#if (XSDK_INDEX_SIZE == 64)
#define _LONGINT 1
diff --git a/SRC/superlu_sdefs.h b/SRC/superlu_sdefs.h
index 82dbcb08..1193675c 100644
--- a/SRC/superlu_sdefs.h
+++ b/SRC/superlu_sdefs.h
@@ -95,25 +95,49 @@ typedef struct {
*/
#define MAX_LOOKAHEADS 50
typedef struct {
- int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */
- long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc);
+ freed in distribution routines */
+ int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */
+ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
long int Lrowind_bc_cnt;
- float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc);
+ freed in distribution routines */
float *Lnzval_bc_dat; /* size sum of sizes of Lnzval_bc_ptr[lk]) */
long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
long int Lnzval_bc_cnt;
- float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc);
+ freed in distribution routines */
float *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */
long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
long int Linv_bc_cnt;
- int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */
- int_t *Lindval_loc_bc_dat; /* size sum of sizes of Lindval_loc_bc_ptr[lk]) */
- long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
- long int Lindval_loc_bc_cnt;
+ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc);
+ pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr;
+ freed in distribution routines */
+
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+ long int Lindval_loc_bc_cnt;
+
+ /* for new U format -> */
+ int_t **Ucolind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Ucolind_bc_dat; /* size: sum of sizes of Ucolind_bc_ptr[lk]) */
+ int64_t *Ucolind_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int64_t Ucolind_bc_cnt;
+
+ float **Unzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ float *Unzval_bc_dat; /* size: sum of sizes of Unzval_bc_ptr[lk]) */
+ int64_t *Unzval_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int64_t Unzval_bc_cnt;
+
+ int_t **Uindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Ucolind_bc_ptr and Unzval_bc_ptr */
+ int_t *Uindval_loc_bc_dat; /* size: sum of sizes of Uindval_loc_bc_ptr[lk]) */
+ int64_t *Uindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int64_t Uindval_loc_bc_cnt;
+ /* end for new U format <- */
+
int_t *Unnz; /* number of nonzeros per block column in U*/
int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */
float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
@@ -216,18 +240,37 @@ typedef struct {
int_t nfrecvmod;
int_t inv; /* whether the diagonal block is inverted*/
- /* The following variables are used in GPU trisolve*/
#ifdef GPU_ACC
+ /* The following variables are used in GPU trisolve */
+
int_t *d_Lrowind_bc_dat;
long int *d_Lrowind_bc_offset;
float *d_Lnzval_bc_dat;
- long int *d_Lnzval_bc_offset;
+ long int *d_Lnzval_bc_offset;
+ int_t *d_Ucolind_bc_dat;
+ int64_t *d_Ucolind_bc_offset;
+ float *d_Unzval_bc_dat;
+ long int *d_Unzval_bc_offset;
+
float *d_Linv_bc_dat ;
float *d_Uinv_bc_dat ;
long int *d_Linv_bc_offset ;
long int *d_Uinv_bc_offset ;
- int_t *d_Lindval_loc_bc_dat ;
- long int *d_Lindval_loc_bc_offset ;
+ int_t *d_Lindval_loc_bc_dat ;
+ int64_t *d_Lindval_loc_bc_offset ;
+ int_t *d_Uindval_loc_bc_dat ;
+ int64_t *d_Uindval_loc_bc_offset ;
+
+ // long int *d_Lindval_loc_bc_offset ;
+ // int_t *d_Urbs;
+ // int_t *d_Ufstnz_br_dat;
+ // long int *d_Ufstnz_br_offset;
+ // float *d_Unzval_br_dat;
+ // long int *d_Unzval_br_offset;
+ // int_t *d_Ucb_valdat;
+ // long int *d_Ucb_valoffset;
+ // Ucb_indptr_t *d_Ucb_inddat;
+ // long int *d_Ucb_indoffset;
int_t *d_ilsum ;
int_t *d_xsup ;
@@ -261,8 +304,8 @@ typedef struct {
(also numbers of X values to be received) */
int *RecvCounts; /* Numbers of X indices to be received
(also numbers of X values to be sent) */
- void *val_tosend; /* X values to be sent to other processes */
- void *val_torecv; /* X values to be received from other processes */
+ void *val_tosend; /* X values to be sent to other processes */
+ void *val_torecv; /* X values to be received from other processes */
int_t TotalIndSend; /* Total number of indices to be sent
(also total number of values to be received) */
int_t TotalValSend; /* Total number of values to be sent.
@@ -558,9 +601,9 @@ extern void sComputeLevelsets(int , int_t , gridinfo_t *,
Glu_persist_t *, sLocalLU_t *, int_t *);
#ifdef GPU_ACC
+extern void psconvertU(superlu_dist_options_t *, gridinfo_t *, sLUstruct_t *, SuperLUStat_t *, int);
extern void slsum_fmod_inv_gpu_wrap(int_t, int_t, int_t, int_t, float *, float *, int, int, int_t , int *fmod, C_Tree *, C_Tree *, int_t *, int_t *, int64_t *, float *, int64_t *, float *, int64_t *, int_t *, int64_t *, int_t *, gridinfo_t *, float * , float * , int_t );
-extern void slsum_bmod_inv_gpu_wrap(superlu_dist_options_t *,
-int_t, int_t, int_t, int_t, float *, float *,int,int, int_t , int *bmod, C_Tree *, C_Tree *, int_t *, int_t *,int_t *, int64_t *, float *, int64_t *, int_t *, int64_t *, Ucb_indptr_t *, int64_t *, float *, int64_t *,int_t *,gridinfo_t *);
+extern void slsum_bmod_inv_gpu_wrap(superlu_dist_options_t *, int_t, int_t, int_t, int_t, float *, float *, int, int, int_t , int *bmod, C_Tree *, C_Tree *, int_t *, int_t *, int64_t *, float *, int64_t *, float *, int64_t *, int_t *, int64_t *,int_t *,gridinfo_t *);
#endif
extern void psgsrfs(superlu_dist_options_t *, int_t,
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index 02781a7f..3087df94 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -95,25 +95,49 @@ typedef struct {
*/
#define MAX_LOOKAHEADS 50
typedef struct {
- int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */
- long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc);
+ freed in distribution routines */
+ int_t *Lrowind_bc_dat; /* size sum of sizes of Lrowind_bc_ptr[lk]) */
+ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
long int Lrowind_bc_cnt;
- doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc);
+ freed in distribution routines */
doublecomplex *Lnzval_bc_dat; /* size sum of sizes of Lnzval_bc_ptr[lk]) */
long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
long int Lnzval_bc_cnt;
- doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc);
+ freed in distribution routines */
doublecomplex *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */
long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
long int Linv_bc_cnt;
- int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */
- int_t *Lindval_loc_bc_dat; /* size sum of sizes of Lindval_loc_bc_ptr[lk]) */
- long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
- long int Lindval_loc_bc_cnt;
+ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc);
+ pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr;
+ freed in distribution routines */
+
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+ long int Lindval_loc_bc_cnt;
+
+ /* for new U format -> */
+ int_t **Ucolind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Ucolind_bc_dat; /* size: sum of sizes of Ucolind_bc_ptr[lk]) */
+ int64_t *Ucolind_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int64_t Ucolind_bc_cnt;
+
+ doublecomplex **Unzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Unzval_bc_dat; /* size: sum of sizes of Unzval_bc_ptr[lk]) */
+ int64_t *Unzval_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int64_t Unzval_bc_cnt;
+
+ int_t **Uindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Ucolind_bc_ptr and Unzval_bc_ptr */
+ int_t *Uindval_loc_bc_dat; /* size: sum of sizes of Uindval_loc_bc_ptr[lk]) */
+ int64_t *Uindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int64_t Uindval_loc_bc_cnt;
+ /* end for new U format <- */
+
int_t *Unnz; /* number of nonzeros per block column in U*/
int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */
doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
@@ -216,18 +240,37 @@ typedef struct {
int_t nfrecvmod;
int_t inv; /* whether the diagonal block is inverted*/
- /* The following variables are used in GPU trisolve*/
#ifdef GPU_ACC
+ /* The following variables are used in GPU trisolve */
+
int_t *d_Lrowind_bc_dat;
long int *d_Lrowind_bc_offset;
doublecomplex *d_Lnzval_bc_dat;
- long int *d_Lnzval_bc_offset;
+ long int *d_Lnzval_bc_offset;
+ int_t *d_Ucolind_bc_dat;
+ int64_t *d_Ucolind_bc_offset;
+ doublecomplex *d_Unzval_bc_dat;
+ long int *d_Unzval_bc_offset;
+
doublecomplex *d_Linv_bc_dat ;
doublecomplex *d_Uinv_bc_dat ;
long int *d_Linv_bc_offset ;
long int *d_Uinv_bc_offset ;
- int_t *d_Lindval_loc_bc_dat ;
- long int *d_Lindval_loc_bc_offset ;
+ int_t *d_Lindval_loc_bc_dat ;
+ int64_t *d_Lindval_loc_bc_offset ;
+ int_t *d_Uindval_loc_bc_dat ;
+ int64_t *d_Uindval_loc_bc_offset ;
+
+ // long int *d_Lindval_loc_bc_offset ;
+ // int_t *d_Urbs;
+ // int_t *d_Ufstnz_br_dat;
+ // long int *d_Ufstnz_br_offset;
+ // doublecomplex *d_Unzval_br_dat;
+ // long int *d_Unzval_br_offset;
+ // int_t *d_Ucb_valdat;
+ // long int *d_Ucb_valoffset;
+ // Ucb_indptr_t *d_Ucb_inddat;
+ // long int *d_Ucb_indoffset;
int_t *d_ilsum ;
int_t *d_xsup ;
@@ -261,8 +304,8 @@ typedef struct {
(also numbers of X values to be received) */
int *RecvCounts; /* Numbers of X indices to be received
(also numbers of X values to be sent) */
- doublecomplex *val_tosend; /* X values to be sent to other processes */
- doublecomplex *val_torecv; /* X values to be received from other processes */
+ void *val_tosend; /* X values to be sent to other processes */
+ void *val_torecv; /* X values to be received from other processes */
int_t TotalIndSend; /* Total number of indices to be sent
(also total number of values to be received) */
int_t TotalValSend; /* Total number of values to be sent.
@@ -558,9 +601,9 @@ extern void zComputeLevelsets(int , int_t , gridinfo_t *,
Glu_persist_t *, zLocalLU_t *, int_t *);
#ifdef GPU_ACC
+extern void pzconvertU(superlu_dist_options_t *, gridinfo_t *, zLUstruct_t *, SuperLUStat_t *, int);
extern void zlsum_fmod_inv_gpu_wrap(int_t, int_t, int_t, int_t, doublecomplex *, doublecomplex *, int, int, int_t , int *fmod, C_Tree *, C_Tree *, int_t *, int_t *, int64_t *, doublecomplex *, int64_t *, doublecomplex *, int64_t *, int_t *, int64_t *, int_t *, gridinfo_t *, doublecomplex * , doublecomplex * , int_t );
-extern void zlsum_bmod_inv_gpu_wrap(superlu_dist_options_t *,
-int_t, int_t, int_t, int_t, doublecomplex *, doublecomplex *,int,int, int_t , int *bmod, C_Tree *, C_Tree *, int_t *, int_t *,int_t *, int64_t *, doublecomplex *, int64_t *, int_t *, int64_t *, Ucb_indptr_t *, int64_t *, doublecomplex *, int64_t *,int_t *,gridinfo_t *);
+extern void zlsum_bmod_inv_gpu_wrap(superlu_dist_options_t *, int_t, int_t, int_t, int_t, doublecomplex *, doublecomplex *, int, int, int_t , int *bmod, C_Tree *, C_Tree *, int_t *, int_t *, int64_t *, doublecomplex *, int64_t *, doublecomplex *, int64_t *, int_t *, int64_t *,int_t *,gridinfo_t *);
#endif
extern void pzgsrfs(superlu_dist_options_t *, int_t,
diff --git a/SRC/symbfact.c b/SRC/symbfact.c
index 43da0473..ac3553d8 100644
--- a/SRC/symbfact.c
+++ b/SRC/symbfact.c
@@ -111,6 +111,8 @@ int_t symbfact
info = symbfact_SubInit(options, DOFACT, NULL, 0, m, n,
((NCPformat*)A->Store)->nnz,
Glu_persist, Glu_freeable);
+ if ( info != 0 )
+ return info;
iwork = (int_t *) intMalloc_dist(6*m+2*n);
perm_r = iwork;
diff --git a/SRC/util.c b/SRC/util.c
index ac15ca07..45df8b71 100755
--- a/SRC/util.c
+++ b/SRC/util.c
@@ -154,9 +154,14 @@ void countnz_dist(const int_t n, int_t *xprune,
/*! \brief
*
*
- * Fix up the data storage lsub for L-subscripts. It removes the subscript
+ * Fix up the data storage lsub[] for L-subscripts. It removes the subscript
* sets for structural pruning, and applies permuation to the remaining
* subscripts.
+ *
+ * Return value:
+ * number of entries in lsub[], which includes the size of the pruned graph,
+ * which is interspersed in the supernodal graph in the lsub[] array.
+ *
*
*/
int64_t
@@ -196,7 +201,7 @@ fixupL_dist(const int_t n, const int_t *perm_r,
xlsub[n] = nextl;
return lsub_size;
-}
+} /* fixupL_dist */
/*! \brief Set the default values for the options argument.
*/
diff --git a/SRC/zdistribute.c b/SRC/zdistribute.c
index aa1206c9..18b75bc6 100644
--- a/SRC/zdistribute.c
+++ b/SRC/zdistribute.c
@@ -16,10 +16,14 @@ at the top-level directory.
* -- Distributed SuperLU routine (version 2.3) --
* Lawrence Berkeley National Lab, Univ. of California Berkeley.
* October 15, 2008
+ *
+ * January 9, 2023
*
*/
#include "superlu_zdefs.h"
-
+#ifdef GPU_ACC
+#include "gpu_api_utils.h"
+#endif
/*! \brief
*
@@ -59,7 +63,8 @@ at the top-level directory.
*/
float
-zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
+zdistribute(superlu_dist_options_t *options,
+ int_t n, SuperMatrix *A,
Glu_freeable_t *Glu_freeable,
zLUstruct_t *LUstruct, gridinfo_t *grid)
{
@@ -91,21 +96,43 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int_t *index_srt; /* indices consist of headers and row subscripts */
int *index1; /* temporary pointer to array of int */
doublecomplex *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
- doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Lnzval_bc_dat; /* size: sum of sizes of Lnzval_bc_ptr[lk]) */
+ long int *Lnzval_bc_offset; /* size ceil(NSUPERS/Pc) */
+
int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
- int_t *Unnz; /* size ceil(NSUPERS/Pc) */
+ int_t *Lrowind_bc_dat; /* size: sum of sizes of Lrowind_bc_ptr[lk]) */
+ long int *Lrowind_bc_offset; /* size ceil(NSUPERS/Pc) */
+ int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ int_t *Lindval_loc_bc_dat; /* size: sum of sizes of Lindval_loc_bc_ptr[lk]) */
+ long int *Lindval_loc_bc_offset; /* size ceil(NSUPERS/Pc) */
+
+ int_t *Unnz; /* size ceil(NSUPERS/Pc) */
doublecomplex **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */
- int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
- C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
- C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
- int msgsize;
+ doublecomplex *Unzval_br_dat; /* size: sum of sizes of Unzval_br_ptr[lk]) */
+ long int *Unzval_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Unzval_br_cnt=0;
+ int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */
+ int_t *Ufstnz_br_dat; /* size: sum of sizes of Ufstnz_br_ptr[lk]) */
+ long int *Ufstnz_br_offset; /* size ceil(NSUPERS/Pr) */
+ long int Ufstnz_br_cnt=0;
+ C_Tree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
+ C_Tree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
+ C_Tree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
+
+ int msgsize;
int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+ Ucb_indptr_t *Ucb_inddat; /* single buffer holding all Ucb_indptr[lb][] entries back to back */
+ long int *Ucb_indoffset; /* size nub; length of Ucb_indptr[lb], later its start in Ucb_inddat; -1 if empty */
+ long int Ucb_indcnt=0; /* running total of entries stored in Ucb_inddat */
+
int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+ int_t *Ucb_valdat; /* single buffer holding all Ucb_valptr[lb][] entries back to back */
+ long int *Ucb_valoffset; /* size nub; length of Ucb_valptr[lb], later its start in Ucb_valdat; -1 if empty */
+ long int Ucb_valcnt=0; /* running total of entries stored in Ucb_valdat */
/*-- Counts to be used in factorization. --*/
int *ToRecv, *ToSendD, **ToSendR;
@@ -154,7 +181,12 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
int *frecv, *brecv;
int_t *lloc;
doublecomplex **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Linv_bc_dat; /* size: sum of sizes of Linv_bc_ptr[lk]) */
+ long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
doublecomplex **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+ doublecomplex *Uinv_bc_dat; /* size: sum of sizes of Uinv_bc_ptr[lk]) */
+ long int *Uinv_bc_offset; /* size ceil(NSUPERS/Pc) */
+
double *SeedSTD_BC,*SeedSTD_RD;
int_t idx_indx,idx_lusup;
int_t nbrow;
@@ -359,8 +391,19 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(Unzval_br_ptr =
(doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
ABORT("Malloc fails for Unzval_br_ptr[].");
+ if ( !(Unzval_br_offset =
+       (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+     ABORT("Malloc fails for Unzval_br_offset[]."); /* array is written on the next statement; must not continue with NULL */
+ }
+ Unzval_br_offset[k-1] = -1; /* -1 marks "no data for this block row" */
+
if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Ufstnz_br_ptr[].");
+ if ( !(Ufstnz_br_offset =
+       (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+     ABORT("Malloc fails for Ufstnz_br_offset[]."); /* array is written on the next statement; must not continue with NULL */
+ }
+ Ufstnz_br_offset[k-1] = -1; /* -1 marks "no data for this block row" */
if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
ABORT("Malloc fails for ToSendD[].");
@@ -449,8 +492,14 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
if ( !(index = intMalloc_dist(len1+1)) )
ABORT("Malloc fails for Uindex[].");
Ufstnz_br_ptr[lb] = index;
+ Ufstnz_br_offset[lb] = len1+1;
+ Ufstnz_br_cnt += Ufstnz_br_offset[lb];
+
if ( !(Unzval_br_ptr[lb] = doublecomplexMalloc_dist(len)) )
ABORT("Malloc fails for Unzval_br_ptr[*][].");
+ Unzval_br_offset[lb]=len;
+ Unzval_br_cnt += Unzval_br_offset[lb];
+
mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
index[0] = Ucbs[lb]; /* Number of column blocks */
@@ -460,6 +509,8 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} else {
Ufstnz_br_ptr[lb] = NULL;
Unzval_br_ptr[lb] = NULL;
+ Unzval_br_offset[lb]=-1;
+ Ufstnz_br_offset[lb]=-1;
}
Urb_length[lb] = 0; /* Reset block length. */
Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
@@ -503,28 +554,54 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
/* Pointers to the beginning of each block column of L. */
if ( !(Lnzval_bc_ptr = (doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) )
ABORT("Malloc fails for Lnzval_bc_ptr[].");
+ Lnzval_bc_ptr[k-1] = NULL;
if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lrowind_bc_ptr[].");
Lrowind_bc_ptr[k-1] = NULL;
+ if ( !(Lrowind_bc_offset =
+       (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+     ABORT("Malloc fails for Lrowind_bc_offset[]."); /* written immediately below; cannot proceed with NULL */
+ }
+ Lrowind_bc_offset[k-1] = -1; /* -1 marks "no data for this block column" */
+ if ( !(Lnzval_bc_offset =
+       (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+     ABORT("Malloc fails for Lnzval_bc_offset[]."); /* written immediately below; cannot proceed with NULL */
+ }
+ Lnzval_bc_offset[k-1] = -1; /* -1 marks "no data for this block column" */
if ( !(Lindval_loc_bc_ptr =
(int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
Lindval_loc_bc_ptr[k-1] = NULL;
+ if ( !(Lindval_loc_bc_offset =
+       (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+     ABORT("Malloc fails for Lindval_loc_bc_offset[]."); /* written immediately below; cannot proceed with NULL */
+ }
+ Lindval_loc_bc_offset[k-1] = -1; /* -1 marks "no data for this block column" */
if ( !(Linv_bc_ptr =
(doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
}
+ if ( !(Linv_bc_offset =
+       (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+     ABORT("Malloc fails for Linv_bc_offset[]."); /* Linv_bc_offset[k-1] is written shortly after; NULL would crash */
+ }
if ( !(Uinv_bc_ptr =
(doublecomplex**)SUPERLU_MALLOC(k * sizeof(doublecomplex*))) ) {
fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
}
+ if ( !(Uinv_bc_offset =
+       (long int*)SUPERLU_MALLOC(k * sizeof(long int))) ) {
+     ABORT("Malloc fails for Uinv_bc_offset[]."); /* Uinv_bc_offset[k-1] is written shortly after; NULL would crash */
+ }
Linv_bc_ptr[k-1] = NULL;
Uinv_bc_ptr[k-1] = NULL;
+ Linv_bc_offset[k-1] = -1;
+ Uinv_bc_offset[k-1] = -1;
if ( !(Unnz = (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
- ABORT("Malloc fails for Unnz[].");
+ ABORT("Malloc fails for Unnz[].");
/* These lists of processes will be used for triangular solves. */
if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
@@ -543,12 +620,17 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
bsendx_plist[i] = &index1[j];
- mem_use += 4.0*k*sizeof(int*) + 2.0*len*sizeof(int);
+ mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*sizeof(int);
/*------------------------------------------------------------
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
------------------------------------------------------------*/
+ long int Linv_bc_cnt=0;
+ long int Uinv_bc_cnt=0;
+ long int Lrowind_bc_cnt=0;
+ long int Lnzval_bc_cnt=0;
+ long int Lindval_loc_bc_cnt=0;
for (jb = 0; jb < nsupers; ++jb) {
pc = PCOL( jb, grid );
@@ -690,16 +772,38 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
index[] and nzval[]. */
/* Add room for descriptors */
len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
- if ( !(index = intMalloc_dist(len1)) )
- ABORT("Malloc fails for index[]");
- if (!(lusup = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex))))
+ if ( !(index = intMalloc_dist(len1)) )
+ ABORT("Malloc fails for index[]");
+ Lrowind_bc_offset[ljb]=len1;
+ Lrowind_bc_cnt += Lrowind_bc_offset[ljb];
+ if (!(lusup = (doublecomplex*)SUPERLU_MALLOC(len*nsupc * sizeof(doublecomplex))))
ABORT("Malloc fails for lusup[]");
- if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) ))
+ Lnzval_bc_offset[ljb]=len*nsupc;
+ Lnzval_bc_cnt += Lnzval_bc_offset[ljb];
+
+ if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) ))
ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
- if (!(Linv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
- ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
+ Lindval_loc_bc_offset[ljb]=nrbl*3;
+ Lindval_loc_bc_cnt += Lindval_loc_bc_offset[ljb];
+
+ myrow = MYROW( iam, grid );
+ krow = PROW( jb, grid );
+ if(myrow==krow){ /* diagonal block */
+ if (!(Linv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
+ ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
+ Linv_bc_offset[ljb]=nsupc*nsupc;
+ Linv_bc_cnt += Linv_bc_offset[ljb];
if (!(Uinv_bc_ptr[ljb] = (doublecomplex*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(doublecomplex))))
ABORT("Malloc fails for Uinv_bc_ptr[ljb][]");
+ Uinv_bc_offset[ljb]=nsupc*nsupc;
+ Uinv_bc_cnt += Uinv_bc_offset[ljb];
+ }else{
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ }
+
mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
@@ -811,9 +915,14 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} else {
Lrowind_bc_ptr[ljb] = NULL;
Lnzval_bc_ptr[ljb] = NULL;
- Linv_bc_ptr[ljb] = NULL;
- Uinv_bc_ptr[ljb] = NULL;
- Lindval_loc_bc_ptr[ljb] = NULL;
+ Linv_bc_ptr[ljb] = NULL;
+ Linv_bc_offset[ljb] = -1;
+ Lrowind_bc_offset[ljb]=-1;
+ Lindval_loc_bc_offset[ljb]=-1;
+ Lnzval_bc_offset[ljb]=-1;
+ Uinv_bc_ptr[ljb] = NULL;
+ Uinv_bc_offset[ljb] = -1;
+ Lindval_loc_bc_ptr[ljb] = NULL;
} /* if nrbl ... */
#if ( PROFlevel>=1 )
t_l += SuperLU_timer_() - t;
@@ -822,6 +931,98 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
} /* for jb ... */
+ Linv_bc_cnt +=1; // safeguard: presumably keeps each allocation below non-empty even when nothing was counted
+ Uinv_bc_cnt +=1;
+ Lrowind_bc_cnt +=1;
+ Lindval_loc_bc_cnt +=1;
+ Lnzval_bc_cnt +=1;
+
+ if ( !(Linv_bc_dat =
+       (doublecomplex*)SUPERLU_MALLOC(Linv_bc_cnt * sizeof(doublecomplex))) ) {
+     ABORT("Malloc fails for Linv_bc_dat[]."); /* the packing loop below copies into this buffer */
+ }
+ if ( !(Uinv_bc_dat =
+       (doublecomplex*)SUPERLU_MALLOC(Uinv_bc_cnt * sizeof(doublecomplex))) ) {
+     ABORT("Malloc fails for Uinv_bc_dat[].");
+ }
+
+ if ( !(Lrowind_bc_dat =
+       (int_t*)SUPERLU_MALLOC(Lrowind_bc_cnt * sizeof(int_t))) ) {
+     ABORT("Malloc fails for Lrowind_bc_dat[].");
+ }
+ if ( !(Lindval_loc_bc_dat =
+       (int_t*)SUPERLU_MALLOC(Lindval_loc_bc_cnt * sizeof(int_t))) ) {
+     ABORT("Malloc fails for Lindval_loc_bc_dat[].");
+ }
+ if ( !(Lnzval_bc_dat =
+       (doublecomplex*)SUPERLU_MALLOC(Lnzval_bc_cnt * sizeof(doublecomplex))) ) {
+     ABORT("Malloc fails for Lnzval_bc_dat[].");
+ }
+
+ /* use contiguous memory for Linv_bc_ptr, Uinv_bc_ptr, Lrowind_bc_ptr, Lnzval_bc_ptr, Lindval_loc_bc_ptr */
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Linv_bc_cnt=0;
+ Uinv_bc_cnt=0;
+ Lrowind_bc_cnt=0;
+ Lnzval_bc_cnt=0;
+ Lindval_loc_bc_cnt=0;
+ long int tmp_cnt;
+ for (jb = 0; jb < k; ++jb) { /* for each block column ... */
+ if(Linv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Linv_bc_offset[jb]; ++jj) {
+ Linv_bc_dat[Linv_bc_cnt+jj]=Linv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Linv_bc_ptr[jb]);
+ Linv_bc_ptr[jb]=&Linv_bc_dat[Linv_bc_cnt];
+ tmp_cnt = Linv_bc_offset[jb];
+ Linv_bc_offset[jb]=Linv_bc_cnt;
+ Linv_bc_cnt+=tmp_cnt;
+ }
+ if(Uinv_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Uinv_bc_offset[jb]; ++jj) {
+ Uinv_bc_dat[Uinv_bc_cnt+jj]=Uinv_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Uinv_bc_ptr[jb]);
+ Uinv_bc_ptr[jb]=&Uinv_bc_dat[Uinv_bc_cnt];
+ tmp_cnt = Uinv_bc_offset[jb];
+ Uinv_bc_offset[jb]=Uinv_bc_cnt;
+ Uinv_bc_cnt+=tmp_cnt;
+ }
+ if(Lrowind_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lrowind_bc_offset[jb]; ++jj) {
+ Lrowind_bc_dat[Lrowind_bc_cnt+jj]=Lrowind_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lrowind_bc_ptr[jb]);
+ Lrowind_bc_ptr[jb]=&Lrowind_bc_dat[Lrowind_bc_cnt];
+ tmp_cnt = Lrowind_bc_offset[jb];
+ Lrowind_bc_offset[jb]=Lrowind_bc_cnt;
+ Lrowind_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lnzval_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lnzval_bc_offset[jb]; ++jj) {
+ Lnzval_bc_dat[Lnzval_bc_cnt+jj]=Lnzval_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lnzval_bc_ptr[jb]);
+ Lnzval_bc_ptr[jb]=&Lnzval_bc_dat[Lnzval_bc_cnt];
+ tmp_cnt = Lnzval_bc_offset[jb];
+ Lnzval_bc_offset[jb]=Lnzval_bc_cnt;
+ Lnzval_bc_cnt+=tmp_cnt;
+ }
+
+ if(Lindval_loc_bc_ptr[jb]!=NULL){
+ for (jj = 0; jj < Lindval_loc_bc_offset[jb]; ++jj) {
+ Lindval_loc_bc_dat[Lindval_loc_bc_cnt+jj]=Lindval_loc_bc_ptr[jb][jj];
+ }
+ SUPERLU_FREE(Lindval_loc_bc_ptr[jb]);
+ Lindval_loc_bc_ptr[jb]=&Lindval_loc_bc_dat[Lindval_loc_bc_cnt];
+ tmp_cnt = Lindval_loc_bc_offset[jb];
+ Lindval_loc_bc_offset[jb]=Lindval_loc_bc_cnt;
+ Lindval_loc_bc_cnt+=tmp_cnt;
+ }
+
+ } /* for jb ... */
+
/////////////////////////////////////////////////////////////////
/* Set up additional pointers for the index and value arrays of U.
@@ -835,6 +1036,17 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
ABORT("Malloc fails for Ucb_indptr[]");
if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) )
ABORT("Malloc fails for Ucb_valptr[]");
+ if ( !(Ucb_valoffset =
+       (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+     ABORT("Malloc fails for Ucb_valoffset[]."); /* written immediately below; cannot proceed with NULL */
+ }
+ Ucb_valoffset[nub-1] = -1; /* -1 marks "empty block column" */
+ if ( !(Ucb_indoffset =
+       (long int*)SUPERLU_MALLOC(nub * sizeof(long int))) ) {
+     ABORT("Malloc fails for Ucb_indoffset[]."); /* written immediately below; cannot proceed with NULL */
+ }
+ Ucb_indoffset[nub-1] = -1; /* -1 marks "empty block column" */
+
nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */
/* Count number of row blocks in a block column.
@@ -857,10 +1069,19 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
for (lb = 0; lb < nub; ++lb) {
if ( Urbs[lb] ) { /* Not an empty block column. */
if ( !(Ucb_indptr[lb]
- = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
+ = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) )
ABORT("Malloc fails for Ucb_indptr[lb][]");
+ Ucb_indoffset[lb]=Urbs[lb];
+ Ucb_indcnt += Ucb_indoffset[lb];
if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) )
ABORT("Malloc fails for Ucb_valptr[lb][]");
+ Ucb_valoffset[lb]=Urbs[lb];
+ Ucb_valcnt += Ucb_valoffset[lb];
+ }else{
+ Ucb_valptr[lb]=NULL;
+ Ucb_valoffset[lb]=-1;
+ Ucb_indptr[lb]=NULL;
+ Ucb_indoffset[lb]=-1;
}
}
for (lk = 0; lk < nlb; ++lk) { /* For each block row. */
@@ -905,6 +1126,81 @@ zdistribute(superlu_dist_options_t *options, int_t n, SuperMatrix *A,
}
}
+ Unzval_br_cnt +=1; // safeguard: presumably keeps each allocation below non-empty even when nothing was counted
+ Ufstnz_br_cnt +=1;
+ Ucb_valcnt +=1 ;
+ Ucb_indcnt +=1;
+ if ( !(Unzval_br_dat =
+       (doublecomplex*)SUPERLU_MALLOC(Unzval_br_cnt * sizeof(doublecomplex))) ) {
+     ABORT("Malloc fails for Unzval_br_dat[]."); /* the packing loops below copy into these buffers */
+ }
+ if ( !(Ufstnz_br_dat =
+       (int_t*)SUPERLU_MALLOC(Ufstnz_br_cnt * sizeof(int_t))) ) {
+     ABORT("Malloc fails for Ufstnz_br_dat[].");
+ }
+ if ( !(Ucb_valdat =
+       (int_t*)SUPERLU_MALLOC(Ucb_valcnt * sizeof(int_t))) ) {
+     ABORT("Malloc fails for Ucb_valdat[].");
+ }
+ if ( !(Ucb_inddat =
+       (Ucb_indptr_t*)SUPERLU_MALLOC(Ucb_indcnt * sizeof(Ucb_indptr_t))) ) {
+     ABORT("Malloc fails for Ucb_inddat[].");
+ }
+
+ /* use contiguous memory for Unzval_br_ptr, Ufstnz_br_ptr, Ucb_valptr, Ucb_indptr */
+ k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
+ Unzval_br_cnt=0;
+ Ufstnz_br_cnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block row ... */
+ if(Unzval_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Unzval_br_offset[lb]; ++jj) {
+ Unzval_br_dat[Unzval_br_cnt+jj]=Unzval_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Unzval_br_ptr[lb]);
+ Unzval_br_ptr[lb]=&Unzval_br_dat[Unzval_br_cnt];
+ tmp_cnt = Unzval_br_offset[lb];
+ Unzval_br_offset[lb]=Unzval_br_cnt;
+ Unzval_br_cnt+=tmp_cnt;
+ }
+
+ if(Ufstnz_br_ptr[lb]!=NULL){
+ for (jj = 0; jj < Ufstnz_br_offset[lb]; ++jj) {
+ Ufstnz_br_dat[Ufstnz_br_cnt+jj]=Ufstnz_br_ptr[lb][jj];
+ }
+ SUPERLU_FREE(Ufstnz_br_ptr[lb]);
+ Ufstnz_br_ptr[lb]=&Ufstnz_br_dat[Ufstnz_br_cnt];
+ tmp_cnt = Ufstnz_br_offset[lb];
+ Ufstnz_br_offset[lb]=Ufstnz_br_cnt;
+ Ufstnz_br_cnt+=tmp_cnt;
+ }
+ }
+
+ k = CEILING( nsupers, grid->npcol );/* Number of local block columns */
+ Ucb_valcnt=0; /* restart the counters: they now track the write position in Ucb_valdat / Ucb_inddat */
+ Ucb_indcnt=0;
+ for (lb = 0; lb < k; ++lb) { /* for each block column ... */
+ if(Ucb_valptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_valoffset[lb]; ++jj) {
+ Ucb_valdat[Ucb_valcnt+jj]=Ucb_valptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_valptr[lb]);
+ Ucb_valptr[lb]=&Ucb_valdat[Ucb_valcnt];
+ tmp_cnt = Ucb_valoffset[lb];
+ Ucb_valoffset[lb]=Ucb_valcnt;
+ Ucb_valcnt+=tmp_cnt;
+ }
+ if(Ucb_indptr[lb]!=NULL){
+ for (jj = 0; jj < Ucb_indoffset[lb]; ++jj) {
+ Ucb_inddat[Ucb_indcnt+jj]=Ucb_indptr[lb][jj];
+ }
+ SUPERLU_FREE(Ucb_indptr[lb]);
+ Ucb_indptr[lb]=&Ucb_inddat[Ucb_indcnt];
+ tmp_cnt = Ucb_indoffset[lb];
+ Ucb_indoffset[lb]=Ucb_indcnt;
+ Ucb_indcnt+=tmp_cnt;
+ }
+ }
+
/////////////////////////////////////////////////////////////////
#if ( PROFlevel>=1 )
@@ -1584,12 +1880,31 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
////////////////////////////////////////////////////////
-
Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+ Llu->Lrowind_bc_dat = Lrowind_bc_dat;
+ Llu->Lrowind_bc_offset = Lrowind_bc_offset;
+ Llu->Lrowind_bc_cnt = Lrowind_bc_cnt;
+
Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr;
+ Llu->Lindval_loc_bc_dat = Lindval_loc_bc_dat;
+ Llu->Lindval_loc_bc_offset = Lindval_loc_bc_offset;
+ Llu->Lindval_loc_bc_cnt = Lindval_loc_bc_cnt;
+
Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+ Llu->Lnzval_bc_dat = Lnzval_bc_dat;
+ Llu->Lnzval_bc_offset = Lnzval_bc_offset;
+ Llu->Lnzval_bc_cnt = Lnzval_bc_cnt;
+
Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+ Llu->Ufstnz_br_dat = Ufstnz_br_dat;
+ Llu->Ufstnz_br_offset = Ufstnz_br_offset;
+ Llu->Ufstnz_br_cnt = Ufstnz_br_cnt;
+
Llu->Unzval_br_ptr = Unzval_br_ptr;
+ Llu->Unzval_br_dat = Unzval_br_dat;
+ Llu->Unzval_br_offset = Unzval_br_offset;
+ Llu->Unzval_br_cnt = Unzval_br_cnt;
+
Llu->Unnz = Unnz;
Llu->ToRecv = ToRecv;
Llu->ToSendD = ToSendD;
@@ -1604,15 +1919,79 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
Llu->nbsendx = nbsendx;
Llu->ilsum = ilsum;
Llu->ldalsum = ldaspa;
+
Llu->LRtree_ptr = LRtree_ptr;
Llu->LBtree_ptr = LBtree_ptr;
Llu->URtree_ptr = URtree_ptr;
Llu->UBtree_ptr = UBtree_ptr;
+
Llu->Linv_bc_ptr = Linv_bc_ptr;
+ Llu->Linv_bc_dat = Linv_bc_dat;
+ Llu->Linv_bc_offset = Linv_bc_offset;
+ Llu->Linv_bc_cnt = Linv_bc_cnt;
+
Llu->Uinv_bc_ptr = Uinv_bc_ptr;
+ Llu->Uinv_bc_dat = Uinv_bc_dat;
+ Llu->Uinv_bc_offset = Uinv_bc_offset;
+ Llu->Uinv_bc_cnt = Uinv_bc_cnt;
+
Llu->Urbs = Urbs;
Llu->Ucb_indptr = Ucb_indptr;
+ Llu->Ucb_inddat = Ucb_inddat;
+ Llu->Ucb_indoffset = Ucb_indoffset;
+ Llu->Ucb_indcnt = Ucb_indcnt;
Llu->Ucb_valptr = Ucb_valptr;
+ Llu->Ucb_valdat = Ucb_valdat;
+ Llu->Ucb_valoffset = Ucb_valoffset;
+ Llu->Ucb_valcnt = Ucb_valcnt;
+
+#ifdef GPU_ACC
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_xsup, (n+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_xsup, xsup, (n+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree)));
+ checkGPU(gpuMemcpy(Llu->d_LRtree_ptr, Llu->LRtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_LBtree_ptr, Llu->LBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_URtree_ptr, Llu->URtree_ptr, CEILING( nsupers, grid->nprow ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMemcpy(Llu->d_UBtree_ptr, Llu->UBtree_ptr, CEILING( nsupers, grid->npcol ) * sizeof(C_Tree), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_dat, Llu->Lrowind_bc_dat, (Llu->Lrowind_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_dat, Llu->Lindval_loc_bc_dat, (Llu->Lindval_loc_bc_cnt) * sizeof(int_t), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lrowind_bc_offset, Llu->Lrowind_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lindval_loc_bc_offset, Llu->Lindval_loc_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Lnzval_bc_offset, Llu->Lnzval_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+
+
+ // some dummy allocation to avoid checking whether they are null pointers later
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Ucolind_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_dat, sizeof(doublecomplex) ));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Unzval_bc_offset, sizeof(int64_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_dat, sizeof(int_t)));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uindval_loc_bc_offset, sizeof(int_t)));
+
+
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Linv_bc_offset, Llu->Linv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int)));
+ checkGPU(gpuMemcpy(Llu->d_Uinv_bc_offset, Llu->Uinv_bc_offset, CEILING( nsupers, grid->npcol ) * sizeof(long int), gpuMemcpyHostToDevice));
+ checkGPU(gpuMalloc( (void**)&Llu->d_ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t)));
+ checkGPU(gpuMemcpy(Llu->d_ilsum, Llu->ilsum, (CEILING( nsupers, grid->nprow )+1) * sizeof(int_t), gpuMemcpyHostToDevice));
+
+
+ /* gpuMemcpy for the following is performed in pxgssvx */
+ checkGPU(gpuMalloc( (void**)&Llu->d_Lnzval_bc_dat, (Llu->Lnzval_bc_cnt) * sizeof(doublecomplex) ));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Linv_bc_dat, (Llu->Linv_bc_cnt) * sizeof(doublecomplex) ));
+ checkGPU(gpuMalloc( (void**)&Llu->d_Uinv_bc_dat, (Llu->Uinv_bc_cnt) * sizeof(doublecomplex) ));
+
+#endif /* match ifdef GPU_ACC */
#if ( PRNTlevel>=1 )
if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n",
diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h
index 7717296f..3bdc8b49 100644
--- a/SRC/zlustruct_gpu.h
+++ b/SRC/zlustruct_gpu.h
@@ -95,8 +95,6 @@ typedef struct //LUstruct_gpu_
local_u_blk_info_t *local_u_blk_infoVec;
int_t *local_u_blk_infoPtr;
- int_t *ijb_lookupVec;
- int_t *ijb_lookupPtr;
// GPU buffers for performing Schur Complement Update on GPU
zSCUbuf_gpu_t scubufs[MAX_NGPU_STREAMS];
diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu
index b4596330..3eecfe4e 100644
--- a/SRC/zsuperlu_gpu.cu
+++ b/SRC/zsuperlu_gpu.cu
@@ -807,8 +807,6 @@ int zfree_LUstruct_gpu (
checkGPU(gpuFree(A_gpu->jib_lookupPtr));
checkGPU(gpuFree(A_gpu->local_u_blk_infoVec));
checkGPU(gpuFree(A_gpu->local_u_blk_infoPtr));
- checkGPU(gpuFree(A_gpu->ijb_lookupVec));
- checkGPU(gpuFree(A_gpu->ijb_lookupPtr));
/* Destroy all the meta-structures associated with the streams. */
gpuStreamDestroy(sluGPU->CopyStream);
diff --git a/build_base/CMakeCache.txt b/build_base/CMakeCache.txt
new file mode 100644
index 00000000..ca839d50
--- /dev/null
+++ b/build_base/CMakeCache.txt
@@ -0,0 +1,1360 @@
+# This is the CMakeCache file.
+# For build in directory: /global/cfs/cdirs/m2956/nanding/myprojects/multi-GPU/superlu_dist/build_base
+# It was generated by CMake: /global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/bin/cmake
+# You can edit this file to change values found and used by cmake.
+# If you do not want to change any of the values, simply exit the editor.
+# If you do want to change a value, simply edit, save, and exit the editor.
+# The syntax for the file is as follows:
+# KEY:TYPE=VALUE
+# KEY is the name of a variable in the cache.
+# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
+# VALUE is the current value for the KEY.
+
+########################
+# EXTERNAL cache entries
+########################
+
+//No help, variable specified on the command line.
+BUILD_SHARED_LIBS:UNINITIALIZED=ON
+
+//Include static libs when building shared
+BUILD_STATIC_LIBS:BOOL=TRUE
+
+//Build the testing tree.
+BUILD_TESTING:BOOL=ON
+
+//Path to a program.
+CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
+
+//Path to a program.
+CMAKE_AR:FILEPATH=/usr/bin/ar
+
+//Choose the type of build, options are: None Debug Release RelWithDebInfo
+// MinSizeRel ...
+CMAKE_BUILD_TYPE:STRING=Release
+
+//Enable/Disable color output during build.
+CMAKE_COLOR_MAKEFILE:BOOL=ON
+
+//saved value of CRAYPE_LINK_TYPE environment variable
+CMAKE_CRAYPE_LINKTYPE:STRING=dynamic
+
+//saved value of LOADEDMODULES environment variable
+CMAKE_CRAYPE_LOADEDMODULES:STRING=craype-x86-milan:libfabric/1.15.2.0:craype-network-ofi:xpmem/2.5.2-2.4_3.20__gd0f7936.shasta:perftools-base/22.09.0:cpe/22.11:xalt/2.10.2:craype-accel-nvidia80:gpu/1.0:nvidia/22.7:craype/2.7.19:cray-dsmml/0.2.2:cray-mpich/8.1.22:PrgEnv-nvidia/8.3.3:Nsight-Compute/2022.1.1:Nsight-Systems/2022.2.1:cudatoolkit/11.7:cray-libsci/22.11.1.2:cmake/3.24.3
+
+//No help, variable specified on the command line.
+CMAKE_CUDA_ARCHITECTURES:UNINITIALIZED=80
+
+//CUDA compiler
+CMAKE_CUDA_COMPILER:STRING=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/nvcc
+
+//Flags used by the CUDA compiler during all build types.
+CMAKE_CUDA_FLAGS:STRING=-I/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/include -ccbin=/opt/cray/pe/craype/2.7.19/bin/CC
+
+CMAKE_CUDA_FLAGS_DDEBUG:STRING=-O0 --expt-relaxed-constexpr -DDEBUG -g
+
+//Flags used by the CUDA compiler during DEBUG builds.
+CMAKE_CUDA_FLAGS_DEBUG:STRING=-g
+
+//Flags used by the CUDA compiler during MINSIZEREL builds.
+CMAKE_CUDA_FLAGS_MINSIZEREL:STRING=-O1 -DNDEBUG
+
+//Flags used by the CUDA compiler during RELEASE builds.
+CMAKE_CUDA_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
+
+//Flags used by the CUDA compiler during RELWITHDEBINFO builds.
+CMAKE_CUDA_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
+
+//No help, variable specified on the command line.
+CMAKE_CXX_COMPILER:UNINITIALIZED=CC
+
+//Flags used by the CXX compiler during all build types.
+CMAKE_CXX_FLAGS:STRING=
+
+//Flags used by the CXX compiler during DEBUG builds.
+CMAKE_CXX_FLAGS_DEBUG:STRING=-g -O0
+
+//Flags used by the CXX compiler during MINSIZEREL builds.
+CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-O2 -s -DNDEBUG
+
+//Flags used by the CXX compiler during RELEASE builds.
+CMAKE_CXX_FLAGS_RELEASE:STRING=-fast -O3 -DNDEBUG
+
+//Flags used by the CXX compiler during RELWITHDEBINFO builds.
+CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -gopt
+
+//No help, variable specified on the command line.
+CMAKE_C_COMPILER:UNINITIALIZED=cc
+
+//Flags used by the C compiler during all build types.
+CMAKE_C_FLAGS:STRING=-DGPU_SOLVE -std=c11 -DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -DAdd_
+
+//Flags used by the C compiler during DEBUG builds.
+CMAKE_C_FLAGS_DEBUG:STRING=-g -O0
+
+//Flags used by the C compiler during MINSIZEREL builds.
+CMAKE_C_FLAGS_MINSIZEREL:STRING=-O2 -s -DNDEBUG
+
+//Flags used by the C compiler during RELEASE builds.
+CMAKE_C_FLAGS_RELEASE:STRING=-fast -O3 -DNDEBUG
+
+//Flags used by the C compiler during RELWITHDEBINFO builds.
+CMAKE_C_FLAGS_RELWITHDEBINFO:STRING=-O2 -gopt
+
+//Path to a program.
+CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
+
+//Flags used by the linker during all build types.
+CMAKE_EXE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during DEBUG builds.
+CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during MINSIZEREL builds.
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during RELEASE builds.
+CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during RELWITHDEBINFO builds.
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Enable/Disable output of compile commands during generation.
+CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
+
+//Value Computed by CMake.
+CMAKE_FIND_PACKAGE_REDIRECTS_DIR:STATIC=/global/cfs/cdirs/m2956/nanding/myprojects/multi-GPU/superlu_dist/build_base/CMakeFiles/pkgRedirects
+
+//No help, variable specified on the command line.
+CMAKE_Fortran_COMPILER:UNINITIALIZED=ftn
+
+//Flags used by the Fortran compiler during all build types.
+CMAKE_Fortran_FLAGS:STRING=
+
+//Flags used by the Fortran compiler during DEBUG builds.
+CMAKE_Fortran_FLAGS_DEBUG:STRING=-g -O0 -Mbounds
+
+//Flags used by the Fortran compiler during MINSIZEREL builds.
+CMAKE_Fortran_FLAGS_MINSIZEREL:STRING=-O2 -s
+
+//Flags used by the Fortran compiler during RELEASE builds.
+CMAKE_Fortran_FLAGS_RELEASE:STRING=-fast -O3
+
+//Flags used by the Fortran compiler during RELWITHDEBINFO builds.
+CMAKE_Fortran_FLAGS_RELWITHDEBINFO:STRING=-O2 -gopt
+
+//User executables (bin)
+CMAKE_INSTALL_BINDIR:PATH=bin
+
+//Read-only architecture-independent data (DATAROOTDIR)
+CMAKE_INSTALL_DATADIR:PATH=
+
+//Read-only architecture-independent data root (share)
+CMAKE_INSTALL_DATAROOTDIR:PATH=share
+
+//Documentation root (DATAROOTDIR/doc/PROJECT_NAME)
+CMAKE_INSTALL_DOCDIR:PATH=
+
+//C header files (include)
+CMAKE_INSTALL_INCLUDEDIR:PATH=include
+
+//Info documentation (DATAROOTDIR/info)
+CMAKE_INSTALL_INFODIR:PATH=
+
+//Object code libraries (lib)
+CMAKE_INSTALL_LIBDIR:PATH=./lib
+
+//Program executables (libexec)
+CMAKE_INSTALL_LIBEXECDIR:PATH=libexec
+
+//Locale-dependent data (DATAROOTDIR/locale)
+CMAKE_INSTALL_LOCALEDIR:PATH=
+
+//Modifiable single-machine data (var)
+CMAKE_INSTALL_LOCALSTATEDIR:PATH=var
+
+//Man documentation (DATAROOTDIR/man)
+CMAKE_INSTALL_MANDIR:PATH=
+
+//C header files for non-gcc (/usr/include)
+CMAKE_INSTALL_OLDINCLUDEDIR:PATH=/usr/include
+
+//Install path prefix, prepended onto install directories.
+CMAKE_INSTALL_PREFIX:PATH=/global/cfs/cdirs/m2956/nanding/myprojects/multi-GPU/superlu_dist/build_base
+
+//Run-time variable data (LOCALSTATEDIR/run)
+CMAKE_INSTALL_RUNSTATEDIR:PATH=
+
+//System admin executables (sbin)
+CMAKE_INSTALL_SBINDIR:PATH=sbin
+
+//Modifiable architecture-independent data (com)
+CMAKE_INSTALL_SHAREDSTATEDIR:PATH=com
+
+//Read-only single-machine data (etc)
+CMAKE_INSTALL_SYSCONFDIR:PATH=etc
+
+//Path to a program.
+CMAKE_LINKER:FILEPATH=/usr/bin/ld
+
+//Path to a program.
+CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/gmake
+
+//Flags used by the linker during the creation of modules during
+// all build types.
+CMAKE_MODULE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of modules during
+// DEBUG builds.
+CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of modules during
+// MINSIZEREL builds.
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of modules during
+// RELEASE builds.
+CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of modules during
+// RELWITHDEBINFO builds.
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_NM:FILEPATH=/usr/bin/nm
+
+//Path to a program.
+CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
+
+//Path to a program.
+CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
+
+//Value Computed by CMake
+CMAKE_PROJECT_DESCRIPTION:STATIC=
+
+//Value Computed by CMake
+CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
+
+//Value Computed by CMake
+CMAKE_PROJECT_NAME:STATIC=SuperLU_DIST
+
+//Path to a program.
+CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
+
+//Path to a program.
+CMAKE_READELF:FILEPATH=/usr/bin/readelf
+
+//Flags used by the linker during the creation of shared libraries
+// during all build types.
+CMAKE_SHARED_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during DEBUG builds.
+CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during MINSIZEREL builds.
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during RELEASE builds.
+CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during RELWITHDEBINFO builds.
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//If set, runtime paths are not added when installing shared libraries,
+// but are added when building.
+CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
+
+//If set, runtime paths are not added when using shared libraries.
+CMAKE_SKIP_RPATH:BOOL=NO
+
+//Flags used by the linker during the creation of static libraries
+// during all build types.
+CMAKE_STATIC_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during DEBUG builds.
+CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during MINSIZEREL builds.
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during RELEASE builds.
+CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during RELWITHDEBINFO builds.
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_STRIP:FILEPATH=/usr/bin/strip
+
+//No help, variable specified on the command line.
+CMAKE_VERBOSE_MAKEFILE:BOOL=ON
+
+//Path to the coverage program that CTest uses for performing coverage
+// inspection
+COVERAGE_COMMAND:FILEPATH=/usr/bin/gcov
+
+//Extra command line flags to pass to the coverage tool
+COVERAGE_EXTRA_FLAGS:STRING=-l
+
+//How many times to retry timed-out CTest submissions.
+CTEST_SUBMIT_RETRY_COUNT:STRING=3
+
+//How long to wait between timed-out CTest submissions.
+CTEST_SUBMIT_RETRY_DELAY:STRING=5
+
+//Path to a file.
+CUDAToolkit_CUPTI_INCLUDE_DIR:PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/include
+
+//Path to a library.
+CUDAToolkit_rt_LIBRARY:FILEPATH=/usr/lib64/librt.so
+
+//Compile device code in 64 bit mode
+CUDA_64_BIT_DEVICE_CODE:BOOL=ON
+
+//Attach the build rule to the CUDA source file. Enable only when
+// the CUDA source file is added to at most one target.
+CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE:BOOL=ON
+
+//Generate and parse .cubin files in Device mode.
+CUDA_BUILD_CUBIN:BOOL=OFF
+
+//Build in Emulation mode
+CUDA_BUILD_EMULATION:BOOL=OFF
+
+//Path to a library.
+CUDA_CUDART:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libcudart.so
+
+//"cudart" library
+CUDA_CUDART_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libcudart.so
+
+//"cuda" library (older versions only).
+CUDA_CUDA_LIBRARY:FILEPATH=/usr/lib64/libcuda.so
+
+//Directory to put all the output files. If blank it will default
+// to the CMAKE_CURRENT_BINARY_DIR
+CUDA_GENERATED_OUTPUT_DIR:PATH=
+
+//Generated file extension
+CUDA_HOST_COMPILATION_CPP:BOOL=ON
+
+//Host side compiler used by NVCC
+CUDA_HOST_COMPILER:FILEPATH=/opt/cray/pe/craype/2.7.19/bin/cc
+
+//Path to a program.
+CUDA_NVCC_EXECUTABLE:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/nvcc
+
+//Semi-colon delimit multiple arguments. during all build types.
+CUDA_NVCC_FLAGS:STRING=
+
+//Semi-colon delimit multiple arguments. during DEBUG builds.
+CUDA_NVCC_FLAGS_DEBUG:STRING=
+
+//Semi-colon delimit multiple arguments. during MINSIZEREL builds.
+CUDA_NVCC_FLAGS_MINSIZEREL:STRING=
+
+//Semi-colon delimit multiple arguments. during RELEASE builds.
+CUDA_NVCC_FLAGS_RELEASE:STRING=
+
+//Semi-colon delimit multiple arguments. during RELWITHDEBINFO
+// builds.
+CUDA_NVCC_FLAGS_RELWITHDEBINFO:STRING=
+
+//"OpenCL" library
+CUDA_OpenCL_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libOpenCL.so
+
+//Propagate C/CXX_FLAGS and friends to the host compiler via -Xcompile
+CUDA_PROPAGATE_HOST_FLAGS:BOOL=ON
+
+//Path to a file.
+CUDA_SDK_ROOT_DIR:PATH=CUDA_SDK_ROOT_DIR-NOTFOUND
+
+//Compile CUDA objects with separable compilation enabled. Requires
+// CUDA 5.0+
+CUDA_SEPARABLE_COMPILATION:BOOL=OFF
+
+//Path to a file.
+CUDA_TOOLKIT_INCLUDE:PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/include
+
+//Toolkit location.
+CUDA_TOOLKIT_ROOT_DIR:PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7
+
+//Use the static version of the CUDA runtime library if available
+CUDA_USE_STATIC_CUDA_RUNTIME:BOOL=ON
+
+//Print out the commands run while compiling the CUDA source file.
+// With the Makefile generator this defaults to VERBOSE variable
+// specified on the command line, but can be forced on with this
+// option.
+CUDA_VERBOSE_BUILD:BOOL=OFF
+
+//Version of CUDA as computed from nvcc.
+CUDA_VERSION:STRING=11.7
+
+//Path to a library.
+CUDA_cublasLt_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcublasLt.so
+
+//Path to a library.
+CUDA_cublasLt_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcublasLt_static.a
+
+//"cublas" library
+CUDA_cublas_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcublas.so
+
+//Path to a library.
+CUDA_cublas_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcublas_static.a
+
+//Path to a library.
+CUDA_cuda_driver_LIBRARY:FILEPATH=/usr/lib64/libcuda.so
+
+//"cudadevrt" library
+CUDA_cudadevrt_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libcudadevrt.a
+
+//Path to a library.
+CUDA_cudart_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libcudart.so
+
+//static CUDA runtime library
+CUDA_cudart_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libcudart_static.a
+
+//"cufft" library
+CUDA_cufft_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcufft.so
+
+//Path to a library.
+CUDA_cufft_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcufft_static.a
+
+//Path to a library.
+CUDA_cufft_static_nocallback_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcufft_static_nocallback.a
+
+//Path to a library.
+CUDA_cufftw_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcufftw.so
+
+//Path to a library.
+CUDA_cufftw_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcufftw_static.a
+
+//Path to a library.
+CUDA_culibos_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libculibos.a
+
+//"cupti" library
+CUDA_cupti_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/lib64/libcupti.so
+
+//Path to a library.
+CUDA_cupti_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../extras/CUPTI/lib64/libcupti_static.a
+
+//"curand" library
+CUDA_curand_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcurand.so
+
+//Path to a library.
+CUDA_curand_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcurand_static.a
+
+//"cusolver" library
+CUDA_cusolver_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcusolver.so
+
+//Path to a library.
+CUDA_cusolver_lapack_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcusolver_lapack_static.a
+
+//Path to a library.
+CUDA_cusolver_metis_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libmetis_static.a
+
+//Path to a library.
+CUDA_cusolver_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcusolver_static.a
+
+//"cusparse" library
+CUDA_cusparse_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcusparse.so
+
+//Path to a library.
+CUDA_cusparse_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/profilers/Nsight_Compute/../../math_libs/11.7/lib64/libcusparse_static.a
+
+//"nppc" library
+CUDA_nppc_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppc.so
+
+//Path to a library.
+CUDA_nppc_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppc_static.a
+
+//"nppial" library
+CUDA_nppial_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppial.so
+
+//Path to a library.
+CUDA_nppial_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppial_static.a
+
+//"nppicc" library
+CUDA_nppicc_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppicc.so
+
+//Path to a library.
+CUDA_nppicc_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppicc_static.a
+
+//Path to a library.
+CUDA_nppicom_LIBRARY:FILEPATH=CUDA_nppicom_LIBRARY-NOTFOUND
+
+//Path to a library.
+CUDA_nppicom_static_LIBRARY:FILEPATH=CUDA_nppicom_static_LIBRARY-NOTFOUND
+
+//"nppidei" library
+CUDA_nppidei_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppidei.so
+
+//Path to a library.
+CUDA_nppidei_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppidei_static.a
+
+//"nppif" library
+CUDA_nppif_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppif.so
+
+//Path to a library.
+CUDA_nppif_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppif_static.a
+
+//"nppig" library
+CUDA_nppig_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppig.so
+
+//Path to a library.
+CUDA_nppig_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppig_static.a
+
+//"nppim" library
+CUDA_nppim_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppim.so
+
+//Path to a library.
+CUDA_nppim_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppim_static.a
+
+//"nppist" library
+CUDA_nppist_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppist.so
+
+//Path to a library.
+CUDA_nppist_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppist_static.a
+
+//"nppisu" library
+CUDA_nppisu_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppisu.so
+
+//Path to a library.
+CUDA_nppisu_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppisu_static.a
+
+//"nppitc" library
+CUDA_nppitc_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppitc.so
+
+//Path to a library.
+CUDA_nppitc_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnppitc_static.a
+
+//"npps" library
+CUDA_npps_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnpps.so
+
+//Path to a library.
+CUDA_npps_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnpps_static.a
+
+//"nvToolsExt" library
+CUDA_nvToolsExt_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnvToolsExt.so
+
+//Path to a library.
+CUDA_nvgraph_LIBRARY:FILEPATH=CUDA_nvgraph_LIBRARY-NOTFOUND
+
+//Path to a library.
+CUDA_nvgraph_static_LIBRARY:FILEPATH=CUDA_nvgraph_static_LIBRARY-NOTFOUND
+
+//Path to a library.
+CUDA_nvjpeg_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnvjpeg.so
+
+//Path to a library.
+CUDA_nvjpeg_static_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnvjpeg_static.a
+
+//Path to a library.
+CUDA_nvml_LIBRARY:FILEPATH=/usr/lib64/libnvidia-ml.so
+
+//Path to a library.
+CUDA_nvrtc_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libnvrtc.so
+
+//Path to a library.
+CUDA_rt_LIBRARY:FILEPATH=/usr/lib64/librt.so
+
+//Maximum time allowed before CTest will kill the test.
+DART_TESTING_TIMEOUT:STRING=1500
+
+//Path to a program.
+GITCOMMAND:FILEPATH=/usr/bin/git
+
+//The folder where runtime files will be installed.
+INSTALL_BIN_DIR:STRING=bin
+
+//The folder where headers will be installed.
+INSTALL_INC_DIR:STRING=include
+
+//The folder where libraries will be installed.
+INSTALL_LIB_DIR:STRING=./lib
+
+//Command to build the project
+MAKECOMMAND:STRING=/global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/bin/cmake --build . --config "${CTEST_CONFIGURATION_TYPE}"
+
+//Path to the memory checking command, used for memory error detection.
+MEMORYCHECK_COMMAND:FILEPATH=/usr/bin/valgrind
+
+//File that contains suppressions for the memory checker
+MEMORYCHECK_SUPPRESSIONS_FILE:FILEPATH=
+
+//Executable for running MPI programs.
+MPIEXEC_EXECUTABLE:FILEPATH=/usr/bin/srun
+
+//Maximum number of processors available to run MPI applications.
+MPIEXEC_MAX_NUMPROCS:STRING=16
+
+//Flag used by MPI to specify the number of processes for mpiexec;
+// the next option will be the number of processes.
+MPIEXEC_NUMPROC_FLAG:STRING=-n
+
+//These flags will be placed after all flags passed to mpiexec.
+MPIEXEC_POSTFLAGS:STRING=
+
+//These flags will be directly before the executable that is being
+// run by mpiexec.
+MPIEXEC_PREFLAGS:STRING=
+
+//MPI CXX additional include directories
+MPI_CXX_ADDITIONAL_INCLUDE_DIRS:STRING=
+
+//MPI compiler for CXX
+MPI_CXX_COMPILER:FILEPATH=/opt/cray/pe/craype/2.7.19/bin/CC
+
+//MPI CXX compiler wrapper include directories
+MPI_CXX_COMPILER_INCLUDE_DIRS:STRING=/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/include;/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/include;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/include;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/include;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/include;/opt/cray/pe/dsmml/0.2.2/dsmml/include;/opt/cray/xpmem/2.5.2-2.4_3.20__gd0f7936.shasta/include
+
+//MPI CXX compilation definitions
+MPI_CXX_COMPILE_DEFINITIONS:STRING=
+
+//MPI CXX compilation options
+MPI_CXX_COMPILE_OPTIONS:STRING=
+
+//MPI CXX libraries to link against
+MPI_CXX_LIB_NAMES:STRING=cuda;sci_nvidia_mpi;sci_nvidia;dl;mpi_nvidia;mpi_gtl_cuda;dsmml;xpmem
+
+//MPI CXX linker flags
+MPI_CXX_LINK_FLAGS:STRING=-Wl,--as-needed,-lcupti,-lcudart,--no-as-needed -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/targets/x86_64-linux/lib -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/lib64 -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/lib64 -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/lib64 -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/math_libs/11.7/targets/x86_64-linux/lib
+
+//If true, the MPI-2 C++ bindings are disabled using definitions.
+MPI_CXX_SKIP_MPICXX:BOOL=OFF
+
+//MPI C additional include directories
+MPI_C_ADDITIONAL_INCLUDE_DIRS:STRING=
+
+//MPI compiler for C
+MPI_C_COMPILER:FILEPATH=/opt/cray/pe/craype/2.7.19/bin/cc
+
+//MPI C compiler wrapper include directories
+MPI_C_COMPILER_INCLUDE_DIRS:STRING=/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/include;/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/include;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/include;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/include;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/include;/opt/cray/pe/dsmml/0.2.2/dsmml/include;/opt/cray/xpmem/2.5.2-2.4_3.20__gd0f7936.shasta/include
+
+//MPI C compilation definitions
+MPI_C_COMPILE_DEFINITIONS:STRING=
+
+//MPI C compilation options
+MPI_C_COMPILE_OPTIONS:STRING=
+
+//MPI C libraries to link against
+MPI_C_LIB_NAMES:STRING=cuda;sci_nvidia_mpi;sci_nvidia;dl;mpi_nvidia;mpi_gtl_cuda;dsmml;xpmem
+
+//MPI C linker flags
+MPI_C_LINK_FLAGS:STRING=-Wl,--as-needed,-lcupti,-lcudart,--no-as-needed -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/targets/x86_64-linux/lib -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/lib64 -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/lib64 -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/lib64 -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/math_libs/11.7/targets/x86_64-linux/lib
+
+//MPI Fortran additional include directories
+MPI_Fortran_ADDITIONAL_INCLUDE_DIRS:STRING=
+
+//MPI compiler for Fortran
+MPI_Fortran_COMPILER:FILEPATH=/opt/cray/pe/craype/2.7.19/bin/ftn
+
+//MPI Fortran compiler wrapper include directories
+MPI_Fortran_COMPILER_INCLUDE_DIRS:STRING=/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/include;/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/include;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/include;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/include;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/include;/opt/cray/pe/dsmml/0.2.2/dsmml/include;/opt/cray/xpmem/2.5.2-2.4_3.20__gd0f7936.shasta/include
+
+//MPI Fortran compilation definitions
+MPI_Fortran_COMPILE_DEFINITIONS:STRING=
+
+//MPI Fortran compilation options
+MPI_Fortran_COMPILE_OPTIONS:STRING=
+
+//MPI Fortran libraries to link against
+MPI_Fortran_LIB_NAMES:STRING=cuda;sci_nvidia_mpi;sci_nvidia;dl;mpifort_nvidia;mpi_nvidia;mpi_gtl_cuda;dsmml;xpmem
+
+//MPI Fortran linker flags
+MPI_Fortran_LINK_FLAGS:STRING=-Wl,--as-needed,-lcupti,-lcudart,--no-as-needed -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/targets/x86_64-linux/lib -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/lib64 -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/lib64 -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/lib64 -L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/math_libs/11.7/targets/x86_64-linux/lib
+
+//Location of the cuda library for MPI
+MPI_cuda_LIBRARY:FILEPATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/targets/x86_64-linux/lib/stubs/libcuda.so
+
+//Location of the dl library for MPI
+MPI_dl_LIBRARY:FILEPATH=/usr/lib64/libdl.so
+
+//Location of the dsmml library for MPI
+MPI_dsmml_LIBRARY:FILEPATH=/opt/cray/pe/dsmml/0.2.2/dsmml/lib/libdsmml.so
+
+//Location of the mpi_gtl_cuda library for MPI
+MPI_mpi_gtl_cuda_LIBRARY:FILEPATH=/opt/cray/pe/mpich/8.1.22/gtl/lib/libmpi_gtl_cuda.so
+
+//Location of the mpi_nvidia library for MPI
+MPI_mpi_nvidia_LIBRARY:FILEPATH=/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/lib/libmpi_nvidia.so
+
+//Location of the mpifort_nvidia library for MPI
+MPI_mpifort_nvidia_LIBRARY:FILEPATH=/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/lib/libmpifort_nvidia.so
+
+//Location of the sci_nvidia library for MPI
+MPI_sci_nvidia_LIBRARY:FILEPATH=/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia.so
+
+//Location of the sci_nvidia_mpi library for MPI
+MPI_sci_nvidia_mpi_LIBRARY:FILEPATH=/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia_mpi.so
+
+//Location of the xpmem library for MPI
+MPI_xpmem_LIBRARY:FILEPATH=/opt/cray/xpmem/2.5.2-2.4_3.20__gd0f7936.shasta/lib64/libxpmem.so
+
+//CXX compiler flags for OpenMP parallelization
+OpenMP_CXX_FLAGS:STRING=-mp
+
+//CXX compiler libraries for OpenMP parallelization
+OpenMP_CXX_LIB_NAMES:STRING=sci_nvidia_mpi_mp;sci_nvidia_mp
+
+//C compiler flags for OpenMP parallelization
+OpenMP_C_FLAGS:STRING=-mp
+
+//C compiler libraries for OpenMP parallelization
+OpenMP_C_LIB_NAMES:STRING=sci_nvidia_mpi_mp;sci_nvidia_mp
+
+//Fortran compiler flags for OpenMP parallelization
+OpenMP_Fortran_FLAGS:STRING=-mp
+
+//Fortran compiler libraries for OpenMP parallelization
+OpenMP_Fortran_LIB_NAMES:STRING=sci_nvidia_mpi_mp;sci_nvidia_mp
+
+//Path to the sci_nvidia_mp library for OpenMP
+OpenMP_sci_nvidia_mp_LIBRARY:FILEPATH=/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia_mp.so
+
+//Path to the sci_nvidia_mpi_mp library for OpenMP
+OpenMP_sci_nvidia_mpi_mp_LIBRARY:FILEPATH=/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia_mpi_mp.so
+
+//Arguments to supply to pkg-config
+PKG_CONFIG_ARGN:STRING=
+
+//pkg-config executable
+PKG_CONFIG_EXECUTABLE:FILEPATH=/usr/bin/pkg-config
+
+//Name of the computer/site where compile is being run
+SITE:STRING=login31
+
+//Value Computed by CMake
+SuperLU_DIST_BINARY_DIR:STATIC=/global/cfs/cdirs/m2956/nanding/myprojects/multi-GPU/superlu_dist/build_base
+
+//Value Computed by CMake
+SuperLU_DIST_IS_TOP_LEVEL:STATIC=ON
+
+//Value Computed by CMake
+SuperLU_DIST_SOURCE_DIR:STATIC=/global/cfs/cdirs/m2956/nanding/myprojects/multi-GPU/superlu_dist
+
+//List of absolute paths to blas libraries [].
+TPL_BLAS_LIBRARIES:BOOL=/opt/cray/pe/libsci/22.11.1.2/nvidia/20.7/x86_64/lib/libsci_nvidia_mp.so
+
+//List of absolute paths to COLAMD include directories [].
+TPL_COLAMD_INCLUDE_DIRS:BOOL=OFF
+
+//List of absolute paths to COLAMD link libraries [].
+TPL_COLAMD_LIBRARIES:BOOL=OFF
+
+//List of absolute paths to CombBLAS include directories [].
+TPL_COMBBLAS_INCLUDE_DIRS:BOOL=OFF
+
+//List of absolute paths to CombBLAS link libraries [].
+TPL_COMBBLAS_LIBRARIES:BOOL=OFF
+
+//Build the COLAMD library
+TPL_ENABLE_COLAMDLIB:BOOL=OFF
+
+//Build the CombBLAS library
+TPL_ENABLE_COMBBLASLIB:BOOL=OFF
+
+//Enable the CUDA libraries
+TPL_ENABLE_CUDALIB:BOOL=ON
+
+//Enable the HIP libraries
+TPL_ENABLE_HIPLIB:BOOL=OFF
+
+//Build the CBLAS library
+TPL_ENABLE_INTERNAL_BLASLIB:BOOL=OFF
+
+//Enable LAPACK library
+TPL_ENABLE_LAPACKLIB:BOOL=ON
+
+//Build the ParMETIS library
+TPL_ENABLE_PARMETISLIB:BOOL=ON
+
+//List of absolute paths to LAPACK libraries [].
+TPL_LAPACK_LIBRARIES:BOOL=/opt/cray/pe/libsci/22.11.1.2/nvidia/20.7/x86_64/lib/libsci_nvidia_mp.so
+
+//List of absolute paths to ParMETIS include directories [].
+TPL_PARMETIS_INCLUDE_DIRS:BOOL=/global/cfs/cdirs/m2956/nanding/software/parmetis-4.0.3-perlmutter-32bit//include;/global/cfs/cdirs/m2956/nanding/software/parmetis-4.0.3-perlmutter-32bit//metis/include
+
+//List of absolute paths to ParMETIS link libraries [].
+TPL_PARMETIS_LIBRARIES:BOOL=/global/cfs/cdirs/m2956/nanding/software/parmetis-4.0.3-perlmutter-32bit//build/Linux-x86_64/libparmetis/libparmetis.so;/global/cfs/cdirs/m2956/nanding/software/parmetis-4.0.3-perlmutter-32bit//build/Linux-x86_64/libmetis/libmetis.so
+
+//Enable Fortran
+XSDK_ENABLE_Fortran:BOOL=ON
+
+//Enable complex16 precision library
+enable_complex16:BOOL=ON
+
+//Build doxygen documentation
+enable_doc:BOOL=OFF
+
+//Enable double precision library
+enable_double:BOOL=ON
+
+//Build examples
+enable_examples:BOOL=ON
+
+//Enable single precision library
+enable_single:BOOL=ON
+
+//Build tests
+enable_tests:BOOL=ON
+
+
+########################
+# INTERNAL cache entries
+########################
+
+//ADVANCED property for variable: CMAKE_ADDR2LINE
+CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_AR
+CMAKE_AR-ADVANCED:INTERNAL=1
+//This is the directory where this CMakeCache.txt was created
+CMAKE_CACHEFILE_DIR:INTERNAL=/global/cfs/cdirs/m2956/nanding/myprojects/multi-GPU/superlu_dist/build_base
+//Major version of cmake used to create the current loaded cache
+CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
+//Minor version of cmake used to create the current loaded cache
+CMAKE_CACHE_MINOR_VERSION:INTERNAL=24
+//Patch version of cmake used to create the current loaded cache
+CMAKE_CACHE_PATCH_VERSION:INTERNAL=3
+//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
+CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
+//Path to CMake executable.
+CMAKE_COMMAND:INTERNAL=/global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/bin/cmake
+//Path to cpack program executable.
+CMAKE_CPACK_COMMAND:INTERNAL=/global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/bin/cpack
+//ADVANCED property for variable: CMAKE_CRAYPE_LINKTYPE
+CMAKE_CRAYPE_LINKTYPE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CRAYPE_LOADEDMODULES
+CMAKE_CRAYPE_LOADEDMODULES-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CTEST_COMMAND
+CMAKE_CTEST_COMMAND-ADVANCED:INTERNAL=1
+//Path to ctest program executable.
+CMAKE_CTEST_COMMAND:INTERNAL=/global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/bin/ctest
+//ADVANCED property for variable: CMAKE_CUDA_COMPILER
+CMAKE_CUDA_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CUDA_FLAGS
+CMAKE_CUDA_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CUDA_FLAGS_DEBUG
+CMAKE_CUDA_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CUDA_FLAGS_MINSIZEREL
+CMAKE_CUDA_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CUDA_FLAGS_RELEASE
+CMAKE_CUDA_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CUDA_FLAGS_RELWITHDEBINFO
+CMAKE_CUDA_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_COMPILER
+CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS
+CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
+CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
+CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
+CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
+CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_COMPILER
+CMAKE_C_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS
+CMAKE_C_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS_DEBUG
+CMAKE_C_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS_MINSIZEREL
+CMAKE_C_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS_RELEASE
+CMAKE_C_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS_RELWITHDEBINFO
+CMAKE_C_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_DLLTOOL
+CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
+//Path to cache edit program executable.
+CMAKE_EDIT_COMMAND:INTERNAL=/global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/bin/ccmake
+//Executable file format
+CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
+CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
+CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
+CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
+CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
+//Name of external makefile project generator.
+CMAKE_EXTRA_GENERATOR:INTERNAL=
+//ADVANCED property for variable: CMAKE_Fortran_COMPILER
+CMAKE_Fortran_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_Fortran_FLAGS
+CMAKE_Fortran_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_Fortran_FLAGS_DEBUG
+CMAKE_Fortran_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_Fortran_FLAGS_MINSIZEREL
+CMAKE_Fortran_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_Fortran_FLAGS_RELEASE
+CMAKE_Fortran_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_Fortran_FLAGS_RELWITHDEBINFO
+CMAKE_Fortran_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//Name of generator.
+CMAKE_GENERATOR:INTERNAL=Unix Makefiles
+//Generator instance identifier.
+CMAKE_GENERATOR_INSTANCE:INTERNAL=
+//Name of generator platform.
+CMAKE_GENERATOR_PLATFORM:INTERNAL=
+//Name of generator toolset.
+CMAKE_GENERATOR_TOOLSET:INTERNAL=
+//Test CMAKE_HAVE_LIBC_PTHREAD
+CMAKE_HAVE_LIBC_PTHREAD:INTERNAL=1
+//Source directory with the top level CMakeLists.txt file for this
+// project
+CMAKE_HOME_DIRECTORY:INTERNAL=/global/cfs/cdirs/m2956/nanding/myprojects/multi-GPU/superlu_dist
+//ADVANCED property for variable: CMAKE_INSTALL_BINDIR
+CMAKE_INSTALL_BINDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_DATADIR
+CMAKE_INSTALL_DATADIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_DATAROOTDIR
+CMAKE_INSTALL_DATAROOTDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_DOCDIR
+CMAKE_INSTALL_DOCDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_INCLUDEDIR
+CMAKE_INSTALL_INCLUDEDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_INFODIR
+CMAKE_INSTALL_INFODIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_LIBDIR
+CMAKE_INSTALL_LIBDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_LIBEXECDIR
+CMAKE_INSTALL_LIBEXECDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_LOCALEDIR
+CMAKE_INSTALL_LOCALEDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_LOCALSTATEDIR
+CMAKE_INSTALL_LOCALSTATEDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_MANDIR
+CMAKE_INSTALL_MANDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_OLDINCLUDEDIR
+CMAKE_INSTALL_OLDINCLUDEDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_RUNSTATEDIR
+CMAKE_INSTALL_RUNSTATEDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_SBINDIR
+CMAKE_INSTALL_SBINDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_INSTALL_SHAREDSTATEDIR
+CMAKE_INSTALL_SHAREDSTATEDIR-ADVANCED:INTERNAL=1
+//Install .so files without execute permission.
+CMAKE_INSTALL_SO_NO_EXE:INTERNAL=0
+//ADVANCED property for variable: CMAKE_INSTALL_SYSCONFDIR
+CMAKE_INSTALL_SYSCONFDIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_LINKER
+CMAKE_LINKER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
+CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
+CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
+CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
+CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_NM
+CMAKE_NM-ADVANCED:INTERNAL=1
+//number of local generators
+CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=5
+//ADVANCED property for variable: CMAKE_OBJCOPY
+CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_OBJDUMP
+CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
+//Platform information initialized
+CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_RANLIB
+CMAKE_RANLIB-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_READELF
+CMAKE_READELF-ADVANCED:INTERNAL=1
+//Path to CMake installation.
+CMAKE_ROOT:INTERNAL=/global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/share/cmake-3.24
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
+CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
+CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
+CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
+CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_RPATH
+CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
+CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
+CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
+CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STRIP
+CMAKE_STRIP-ADVANCED:INTERNAL=1
+//uname command
+CMAKE_UNAME:INTERNAL=/usr/bin/uname
+//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
+CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: COVERAGE_COMMAND
+COVERAGE_COMMAND-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: COVERAGE_EXTRA_FLAGS
+COVERAGE_EXTRA_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CTEST_SUBMIT_RETRY_COUNT
+CTEST_SUBMIT_RETRY_COUNT-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CTEST_SUBMIT_RETRY_DELAY
+CTEST_SUBMIT_RETRY_DELAY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDAToolkit_rt_LIBRARY
+CUDAToolkit_rt_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_64_BIT_DEVICE_CODE
+CUDA_64_BIT_DEVICE_CODE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE
+CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_BUILD_CUBIN
+CUDA_BUILD_CUBIN-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_BUILD_EMULATION
+CUDA_BUILD_EMULATION-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_CUDART
+CUDA_CUDART-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_CUDART_LIBRARY
+CUDA_CUDART_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_CUDA_LIBRARY
+CUDA_CUDA_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_GENERATED_OUTPUT_DIR
+CUDA_GENERATED_OUTPUT_DIR-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_HOST_COMPILATION_CPP
+CUDA_HOST_COMPILATION_CPP-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_NVCC_EXECUTABLE
+CUDA_NVCC_EXECUTABLE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_NVCC_FLAGS
+CUDA_NVCC_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_NVCC_FLAGS_DEBUG
+CUDA_NVCC_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_NVCC_FLAGS_MINSIZEREL
+CUDA_NVCC_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_NVCC_FLAGS_RELEASE
+CUDA_NVCC_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_NVCC_FLAGS_RELWITHDEBINFO
+CUDA_NVCC_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_OpenCL_LIBRARY
+CUDA_OpenCL_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_PROPAGATE_HOST_FLAGS
+CUDA_PROPAGATE_HOST_FLAGS-ADVANCED:INTERNAL=1
+//This is the value of the last time CUDA_SDK_ROOT_DIR was set
+// successfully.
+CUDA_SDK_ROOT_DIR_INTERNAL:INTERNAL=CUDA_SDK_ROOT_DIR-NOTFOUND
+//ADVANCED property for variable: CUDA_SEPARABLE_COMPILATION
+CUDA_SEPARABLE_COMPILATION-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_TOOLKIT_INCLUDE
+CUDA_TOOLKIT_INCLUDE-ADVANCED:INTERNAL=1
+//This is the value of the last time CUDA_TOOLKIT_ROOT_DIR was
+// set successfully.
+CUDA_TOOLKIT_ROOT_DIR_INTERNAL:INTERNAL=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7
+//This is the value of the last time CUDA_TOOLKIT_TARGET_DIR was
+// set successfully.
+CUDA_TOOLKIT_TARGET_DIR_INTERNAL:INTERNAL=/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7
+//ADVANCED property for variable: CUDA_VERBOSE_BUILD
+CUDA_VERBOSE_BUILD-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_VERSION
+CUDA_VERSION-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cublasLt_LIBRARY
+CUDA_cublasLt_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cublasLt_static_LIBRARY
+CUDA_cublasLt_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cublas_LIBRARY
+CUDA_cublas_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cublas_static_LIBRARY
+CUDA_cublas_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cuda_driver_LIBRARY
+CUDA_cuda_driver_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cudadevrt_LIBRARY
+CUDA_cudadevrt_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cudart_LIBRARY
+CUDA_cudart_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cudart_static_LIBRARY
+CUDA_cudart_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cufft_LIBRARY
+CUDA_cufft_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cufft_static_LIBRARY
+CUDA_cufft_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cufft_static_nocallback_LIBRARY
+CUDA_cufft_static_nocallback_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cufftw_LIBRARY
+CUDA_cufftw_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cufftw_static_LIBRARY
+CUDA_cufftw_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_culibos_LIBRARY
+CUDA_culibos_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cupti_LIBRARY
+CUDA_cupti_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cupti_static_LIBRARY
+CUDA_cupti_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_curand_LIBRARY
+CUDA_curand_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_curand_static_LIBRARY
+CUDA_curand_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cusolver_LIBRARY
+CUDA_cusolver_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cusolver_lapack_static_LIBRARY
+CUDA_cusolver_lapack_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cusolver_metis_static_LIBRARY
+CUDA_cusolver_metis_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cusolver_static_LIBRARY
+CUDA_cusolver_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cusparse_LIBRARY
+CUDA_cusparse_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_cusparse_static_LIBRARY
+CUDA_cusparse_static_LIBRARY-ADVANCED:INTERNAL=1
+//Location of make2cmake.cmake
+CUDA_make2cmake:INTERNAL=/global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/share/cmake-3.24/Modules/FindCUDA/make2cmake.cmake
+//ADVANCED property for variable: CUDA_nppc_LIBRARY
+CUDA_nppc_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppc_static_LIBRARY
+CUDA_nppc_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppial_LIBRARY
+CUDA_nppial_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppial_static_LIBRARY
+CUDA_nppial_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppicc_LIBRARY
+CUDA_nppicc_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppicc_static_LIBRARY
+CUDA_nppicc_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppicom_LIBRARY
+CUDA_nppicom_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppicom_static_LIBRARY
+CUDA_nppicom_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppidei_LIBRARY
+CUDA_nppidei_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppidei_static_LIBRARY
+CUDA_nppidei_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppif_LIBRARY
+CUDA_nppif_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppif_static_LIBRARY
+CUDA_nppif_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppig_LIBRARY
+CUDA_nppig_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppig_static_LIBRARY
+CUDA_nppig_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppim_LIBRARY
+CUDA_nppim_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppim_static_LIBRARY
+CUDA_nppim_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppist_LIBRARY
+CUDA_nppist_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppist_static_LIBRARY
+CUDA_nppist_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppisu_LIBRARY
+CUDA_nppisu_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppisu_static_LIBRARY
+CUDA_nppisu_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppitc_LIBRARY
+CUDA_nppitc_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nppitc_static_LIBRARY
+CUDA_nppitc_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_npps_LIBRARY
+CUDA_npps_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_npps_static_LIBRARY
+CUDA_npps_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nvToolsExt_LIBRARY
+CUDA_nvToolsExt_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nvgraph_LIBRARY
+CUDA_nvgraph_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nvgraph_static_LIBRARY
+CUDA_nvgraph_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nvjpeg_LIBRARY
+CUDA_nvjpeg_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nvjpeg_static_LIBRARY
+CUDA_nvjpeg_static_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nvml_LIBRARY
+CUDA_nvml_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CUDA_nvrtc_LIBRARY
+CUDA_nvrtc_LIBRARY-ADVANCED:INTERNAL=1
+//Location of parse_cubin.cmake
+CUDA_parse_cubin:INTERNAL=/global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/share/cmake-3.24/Modules/FindCUDA/parse_cubin.cmake
+//Location of run_nvcc.cmake
+CUDA_run_nvcc:INTERNAL=/global/common/software/nersc/pm-2022q4/spack/linux-sles15-zen/cmake-3.24.3-k5msymx/share/cmake-3.24/Modules/FindCUDA/run_nvcc.cmake
+//ADVANCED property for variable: DART_TESTING_TIMEOUT
+DART_TESTING_TIMEOUT-ADVANCED:INTERNAL=1
+//Details about finding CUDA
+FIND_PACKAGE_MESSAGE_DETAILS_CUDA:INTERNAL=[/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7][/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/nvcc][/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/include][/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libcudart_static.a][v11.7()]
+//Details about finding CUDAToolkit
+FIND_PACKAGE_MESSAGE_DETAILS_CUDAToolkit:INTERNAL=[/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/include][/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/libcudart.so][/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin][v11.7.64()]
+//Details about finding MPI
+FIND_PACKAGE_MESSAGE_DETAILS_MPI:INTERNAL=[TRUE][TRUE][TRUE][c ][v3.1()]
+//Details about finding MPI_C
+FIND_PACKAGE_MESSAGE_DETAILS_MPI_C:INTERNAL=[/opt/cray/pe/craype/2.7.19/bin/cc][TRUE][v3.1()]
+//Details about finding MPI_CXX
+FIND_PACKAGE_MESSAGE_DETAILS_MPI_CXX:INTERNAL=[/opt/cray/pe/craype/2.7.19/bin/CC][TRUE][v3.1()]
+//Details about finding MPI_Fortran
+FIND_PACKAGE_MESSAGE_DETAILS_MPI_Fortran:INTERNAL=[/opt/cray/pe/craype/2.7.19/bin/ftn][TRUE][v3.1()]
+//Details about finding OpenMP
+FIND_PACKAGE_MESSAGE_DETAILS_OpenMP:INTERNAL=[TRUE][TRUE][TRUE][c ][v()]
+//Details about finding OpenMP_C
+FIND_PACKAGE_MESSAGE_DETAILS_OpenMP_C:INTERNAL=[-mp][/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia_mpi_mp.so][/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia_mp.so][v()]
+//Details about finding OpenMP_CXX
+FIND_PACKAGE_MESSAGE_DETAILS_OpenMP_CXX:INTERNAL=[-mp][/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia_mpi_mp.so][/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia_mp.so][v()]
+//Details about finding OpenMP_Fortran
+FIND_PACKAGE_MESSAGE_DETAILS_OpenMP_Fortran:INTERNAL=[-mp][/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia_mpi_mp.so][/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib/libsci_nvidia_mp.so][v()]
+//Details about finding Threads
+FIND_PACKAGE_MESSAGE_DETAILS_Threads:INTERNAL=[TRUE][v()]
+//Fortran/CXX compatibility
+FortranCInterface_VERIFIED_CXX:INTERNAL=1
+//ADVANCED property for variable: GITCOMMAND
+GITCOMMAND-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MAKECOMMAND
+MAKECOMMAND-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MEMORYCHECK_COMMAND
+MEMORYCHECK_COMMAND-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MEMORYCHECK_SUPPRESSIONS_FILE
+MEMORYCHECK_SUPPRESSIONS_FILE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPIEXEC_EXECUTABLE
+MPIEXEC_EXECUTABLE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPIEXEC_MAX_NUMPROCS
+MPIEXEC_MAX_NUMPROCS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPIEXEC_NUMPROC_FLAG
+MPIEXEC_NUMPROC_FLAG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPIEXEC_POSTFLAGS
+MPIEXEC_POSTFLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPIEXEC_PREFLAGS
+MPIEXEC_PREFLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_CXX_ADDITIONAL_INCLUDE_DIRS
+MPI_CXX_ADDITIONAL_INCLUDE_DIRS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_CXX_COMPILER
+MPI_CXX_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_CXX_COMPILER_INCLUDE_DIRS
+MPI_CXX_COMPILER_INCLUDE_DIRS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_CXX_COMPILE_DEFINITIONS
+MPI_CXX_COMPILE_DEFINITIONS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_CXX_COMPILE_OPTIONS
+MPI_CXX_COMPILE_OPTIONS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_CXX_LIB_NAMES
+MPI_CXX_LIB_NAMES-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_CXX_LINK_FLAGS
+MPI_CXX_LINK_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_CXX_SKIP_MPICXX
+MPI_CXX_SKIP_MPICXX-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_C_ADDITIONAL_INCLUDE_DIRS
+MPI_C_ADDITIONAL_INCLUDE_DIRS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_C_COMPILER
+MPI_C_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_C_COMPILER_INCLUDE_DIRS
+MPI_C_COMPILER_INCLUDE_DIRS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_C_COMPILE_DEFINITIONS
+MPI_C_COMPILE_DEFINITIONS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_C_COMPILE_OPTIONS
+MPI_C_COMPILE_OPTIONS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_C_LIB_NAMES
+MPI_C_LIB_NAMES-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_C_LINK_FLAGS
+MPI_C_LINK_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_Fortran_ADDITIONAL_INCLUDE_DIRS
+MPI_Fortran_ADDITIONAL_INCLUDE_DIRS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_Fortran_COMPILER
+MPI_Fortran_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_Fortran_COMPILER_INCLUDE_DIRS
+MPI_Fortran_COMPILER_INCLUDE_DIRS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_Fortran_COMPILE_DEFINITIONS
+MPI_Fortran_COMPILE_DEFINITIONS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_Fortran_COMPILE_OPTIONS
+MPI_Fortran_COMPILE_OPTIONS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_Fortran_LIB_NAMES
+MPI_Fortran_LIB_NAMES-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_Fortran_LINK_FLAGS
+MPI_Fortran_LINK_FLAGS-ADVANCED:INTERNAL=1
+//Result of TRY_COMPILE
+MPI_RESULT_CXX_test_mpi_MPICXX:INTERNAL=TRUE
+//Result of TRY_COMPILE
+MPI_RESULT_CXX_test_mpi_normal:INTERNAL=TRUE
+//Result of TRY_COMPILE
+MPI_RESULT_C_test_mpi_normal:INTERNAL=TRUE
+//Result of TRY_COMPILE
+MPI_RESULT_Fortran_mpiver_F90_MODULE:INTERNAL=TRUE
+//Result of TRY_COMPILE
+MPI_RESULT_Fortran_test_mpi_F08_MODULE:INTERNAL=FALSE
+//Result of TRY_COMPILE
+MPI_RESULT_Fortran_test_mpi_F77_HEADER:INTERNAL=TRUE
+//Result of TRY_COMPILE
+MPI_RESULT_Fortran_test_mpi_F90_MODULE:INTERNAL=TRUE
+//ADVANCED property for variable: MPI_cuda_LIBRARY
+MPI_cuda_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_dl_LIBRARY
+MPI_dl_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_dsmml_LIBRARY
+MPI_dsmml_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_mpi_gtl_cuda_LIBRARY
+MPI_mpi_gtl_cuda_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_mpi_nvidia_LIBRARY
+MPI_mpi_nvidia_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_mpifort_nvidia_LIBRARY
+MPI_mpifort_nvidia_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_sci_nvidia_LIBRARY
+MPI_sci_nvidia_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_sci_nvidia_mpi_LIBRARY
+MPI_sci_nvidia_mpi_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: MPI_xpmem_LIBRARY
+MPI_xpmem_LIBRARY-ADVANCED:INTERNAL=1
+//Result of TRY_COMPILE
+OpenMP_COMPILE_RESULT_CXX_mp:INTERNAL=TRUE
+//Result of TRY_COMPILE
+OpenMP_COMPILE_RESULT_C_mp:INTERNAL=TRUE
+//Result of TRY_COMPILE
+OpenMP_COMPILE_RESULT_FortranHeader_mp:INTERNAL=TRUE
+//Result of TRY_COMPILE
+OpenMP_COMPILE_RESULT_FortranModule_mp:INTERNAL=TRUE
+//ADVANCED property for variable: OpenMP_CXX_FLAGS
+OpenMP_CXX_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: OpenMP_CXX_LIB_NAMES
+OpenMP_CXX_LIB_NAMES-ADVANCED:INTERNAL=1
+//CXX compiler's OpenMP specification date
+OpenMP_CXX_SPEC_DATE:INTERNAL=202011
+//ADVANCED property for variable: OpenMP_C_FLAGS
+OpenMP_C_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: OpenMP_C_LIB_NAMES
+OpenMP_C_LIB_NAMES-ADVANCED:INTERNAL=1
+//C compiler's OpenMP specification date
+OpenMP_C_SPEC_DATE:INTERNAL=202011
+//ADVANCED property for variable: OpenMP_Fortran_FLAGS
+OpenMP_Fortran_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: OpenMP_Fortran_LIB_NAMES
+OpenMP_Fortran_LIB_NAMES-ADVANCED:INTERNAL=1
+//Fortran compiler's OpenMP specification date
+OpenMP_Fortran_SPEC_DATE:INTERNAL=202011
+//Result of TRY_COMPILE
+OpenMP_SPECTEST_CXX_:INTERNAL=TRUE
+//Result of TRY_COMPILE
+OpenMP_SPECTEST_C_:INTERNAL=TRUE
+//Result of TRY_COMPILE
+OpenMP_SPECTEST_Fortran_:INTERNAL=TRUE
+//ADVANCED property for variable: OpenMP_sci_nvidia_mp_LIBRARY
+OpenMP_sci_nvidia_mp_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: OpenMP_sci_nvidia_mpi_mp_LIBRARY
+OpenMP_sci_nvidia_mpi_mp_LIBRARY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: PKG_CONFIG_ARGN
+PKG_CONFIG_ARGN-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: PKG_CONFIG_EXECUTABLE
+PKG_CONFIG_EXECUTABLE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: SITE
+SITE-ADVANCED:INTERNAL=1
+//linker supports push/pop state
+_CMAKE_LINKER_PUSHPOP_STATE_SUPPORTED:INTERNAL=TRUE
+//CMAKE_INSTALL_PREFIX during last run
+_GNUInstallDirs_LAST_CMAKE_INSTALL_PREFIX:INTERNAL=/global/cfs/cdirs/m2956/nanding/myprojects/multi-GPU/superlu_dist/build_base
+
diff --git a/build_base/CMakeFiles/3.24.3/CMakeCCompiler.cmake b/build_base/CMakeFiles/3.24.3/CMakeCCompiler.cmake
new file mode 100644
index 00000000..9c079aad
--- /dev/null
+++ b/build_base/CMakeFiles/3.24.3/CMakeCCompiler.cmake
@@ -0,0 +1,72 @@
+set(CMAKE_C_COMPILER "/opt/cray/pe/craype/2.7.19/bin/cc")
+set(CMAKE_C_COMPILER_ARG1 "")
+set(CMAKE_C_COMPILER_ID "NVHPC")
+set(CMAKE_C_COMPILER_VERSION "22.7.0")
+set(CMAKE_C_COMPILER_VERSION_INTERNAL "")
+set(CMAKE_C_COMPILER_WRAPPER "CrayPrgEnv")
+set(CMAKE_C_STANDARD_COMPUTED_DEFAULT "11")
+set(CMAKE_C_EXTENSIONS_COMPUTED_DEFAULT "ON")
+set(CMAKE_C_COMPILE_FEATURES "c_std_90;c_function_prototypes;c_std_99;c_restrict;c_variadic_macros;c_std_11;c_static_assert;c_std_17")
+set(CMAKE_C90_COMPILE_FEATURES "c_std_90;c_function_prototypes")
+set(CMAKE_C99_COMPILE_FEATURES "c_std_99;c_restrict;c_variadic_macros")
+set(CMAKE_C11_COMPILE_FEATURES "c_std_11;c_static_assert")
+set(CMAKE_C17_COMPILE_FEATURES "c_std_17")
+set(CMAKE_C23_COMPILE_FEATURES "")
+
+set(CMAKE_C_PLATFORM_ID "Linux")
+set(CMAKE_C_SIMULATE_ID "")
+set(CMAKE_C_COMPILER_FRONTEND_VARIANT "")
+set(CMAKE_C_SIMULATE_VERSION "")
+
+
+
+
+set(CMAKE_AR "/usr/bin/ar")
+set(CMAKE_C_COMPILER_AR "")
+set(CMAKE_RANLIB "/usr/bin/ranlib")
+set(CMAKE_C_COMPILER_RANLIB "")
+set(CMAKE_LINKER "/usr/bin/ld")
+set(CMAKE_MT "")
+set(CMAKE_COMPILER_IS_GNUCC )
+set(CMAKE_C_COMPILER_LOADED 1)
+set(CMAKE_C_COMPILER_WORKS TRUE)
+set(CMAKE_C_ABI_COMPILED TRUE)
+
+set(CMAKE_C_COMPILER_ENV_VAR "CC")
+
+set(CMAKE_C_COMPILER_ID_RUN 1)
+set(CMAKE_C_SOURCE_FILE_EXTENSIONS c;m)
+set(CMAKE_C_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
+set(CMAKE_C_LINKER_PREFERENCE 10)
+
+# Save compiler ABI information.
+set(CMAKE_C_SIZEOF_DATA_PTR "8")
+set(CMAKE_C_COMPILER_ABI "")
+set(CMAKE_C_BYTE_ORDER "LITTLE_ENDIAN")
+set(CMAKE_C_LIBRARY_ARCHITECTURE "")
+
+if(CMAKE_C_SIZEOF_DATA_PTR)
+ set(CMAKE_SIZEOF_VOID_P "${CMAKE_C_SIZEOF_DATA_PTR}")
+endif()
+
+if(CMAKE_C_COMPILER_ABI)
+ set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_C_COMPILER_ABI}")
+endif()
+
+if(CMAKE_C_LIBRARY_ARCHITECTURE)
+ set(CMAKE_LIBRARY_ARCHITECTURE "")
+endif()
+
+set(CMAKE_C_CL_SHOWINCLUDES_PREFIX "")
+if(CMAKE_C_CL_SHOWINCLUDES_PREFIX)
+ set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_C_CL_SHOWINCLUDES_PREFIX}")
+endif()
+
+
+
+
+
+set(CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES "/usr/include")
+set(CMAKE_C_IMPLICIT_LINK_LIBRARIES "cupti;cudart;cuda;sci_nvidia_mpi;sci_nvidia;dl;mpi_nvidia;mpi_gtl_cuda;dsmml;xpmem;acchost;accdevaux;accdevice;dl;cudadevice;nvf;nvomp;dl;nvhpcatm;atomic;pthread;nvcpumath;nsnvc;nvc;rt;pthread;gcc;c;gcc_s;m")
+set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/lib;/opt/cray/pe/mpich/8.1.22/gtl/lib;/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/stubs;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/math_libs/11.7/lib64;/opt/cray/pe/dsmml/0.2.2/dsmml/lib;/opt/cray/xpmem/2.5.2-2.4_3.20__gd0f7936.shasta/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/lib;/usr/lib64;/usr/lib64/gcc/x86_64-suse-linux/7")
+set(CMAKE_C_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/build_base/CMakeFiles/3.24.3/CMakeCUDACompiler.cmake b/build_base/CMakeFiles/3.24.3/CMakeCUDACompiler.cmake
new file mode 100644
index 00000000..cd974dcf
--- /dev/null
+++ b/build_base/CMakeFiles/3.24.3/CMakeCUDACompiler.cmake
@@ -0,0 +1,75 @@
+set(CMAKE_CUDA_COMPILER "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/nvcc")
+set(CMAKE_CUDA_HOST_COMPILER "")
+set(CMAKE_CUDA_HOST_LINK_LAUNCHER "/opt/cray/pe/craype/2.7.19/bin/CC")
+set(CMAKE_CUDA_COMPILER_ID "NVIDIA")
+set(CMAKE_CUDA_COMPILER_VERSION "11.7.64")
+set(CMAKE_CUDA_DEVICE_LINKER "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/nvlink")
+set(CMAKE_CUDA_FATBINARY "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/fatbinary")
+set(CMAKE_CUDA_STANDARD_COMPUTED_DEFAULT "14")
+set(CMAKE_CUDA_EXTENSIONS_COMPUTED_DEFAULT "ON")
+set(CMAKE_CUDA_COMPILE_FEATURES "cuda_std_03;cuda_std_11;cuda_std_14;cuda_std_17")
+set(CMAKE_CUDA03_COMPILE_FEATURES "cuda_std_03")
+set(CMAKE_CUDA11_COMPILE_FEATURES "cuda_std_11")
+set(CMAKE_CUDA14_COMPILE_FEATURES "cuda_std_14")
+set(CMAKE_CUDA17_COMPILE_FEATURES "cuda_std_17")
+set(CMAKE_CUDA20_COMPILE_FEATURES "")
+set(CMAKE_CUDA23_COMPILE_FEATURES "")
+
+set(CMAKE_CUDA_PLATFORM_ID "Linux")
+set(CMAKE_CUDA_SIMULATE_ID "GNU")
+set(CMAKE_CUDA_COMPILER_FRONTEND_VARIANT "")
+set(CMAKE_CUDA_SIMULATE_VERSION "7.5")
+
+
+
+set(CMAKE_CUDA_COMPILER_ENV_VAR "CUDACXX")
+set(CMAKE_CUDA_HOST_COMPILER_ENV_VAR "CUDAHOSTCXX")
+
+set(CMAKE_CUDA_COMPILER_LOADED 1)
+set(CMAKE_CUDA_COMPILER_ID_RUN 1)
+set(CMAKE_CUDA_SOURCE_FILE_EXTENSIONS cu)
+set(CMAKE_CUDA_LINKER_PREFERENCE 15)
+set(CMAKE_CUDA_LINKER_PREFERENCE_PROPAGATES 1)
+
+set(CMAKE_CUDA_SIZEOF_DATA_PTR "8")
+set(CMAKE_CUDA_COMPILER_ABI "")
+set(CMAKE_CUDA_BYTE_ORDER "LITTLE_ENDIAN")
+set(CMAKE_CUDA_LIBRARY_ARCHITECTURE "")
+
+if(CMAKE_CUDA_SIZEOF_DATA_PTR)
+ set(CMAKE_SIZEOF_VOID_P "${CMAKE_CUDA_SIZEOF_DATA_PTR}")
+endif()
+
+if(CMAKE_CUDA_COMPILER_ABI)
+ set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CUDA_COMPILER_ABI}")
+endif()
+
+if(CMAKE_CUDA_LIBRARY_ARCHITECTURE)
+ set(CMAKE_LIBRARY_ARCHITECTURE "")
+endif()
+
+set(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7")
+set(CMAKE_CUDA_COMPILER_TOOLKIT_LIBRARY_ROOT "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7")
+set(CMAKE_CUDA_COMPILER_TOOLKIT_VERSION "11.7.64")
+set(CMAKE_CUDA_COMPILER_LIBRARY_ROOT "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7")
+
+set(CMAKE_CUDA_ARCHITECTURES_ALL "35-real;37-real;50-real;52-real;53-real;60-real;61-real;62-real;70-real;72-real;75-real;80-real;86-real;87")
+set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR "35-real;50-real;60-real;70-real;80")
+set(CMAKE_CUDA_ARCHITECTURES_NATIVE "80-real")
+
+set(CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/targets/x86_64-linux/include")
+
+set(CMAKE_CUDA_HOST_IMPLICIT_LINK_LIBRARIES "")
+set(CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/targets/x86_64-linux/lib/stubs;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/targets/x86_64-linux/lib")
+set(CMAKE_CUDA_HOST_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
+
+set(CMAKE_CUDA_IMPLICIT_INCLUDE_DIRECTORIES "/usr/include")
+set(CMAKE_CUDA_IMPLICIT_LINK_LIBRARIES "cupti;cuda;sci_nvidia_mpi;sci_nvidia;mpi_nvidia;mpi_gtl_cuda;dsmml;xpmem;acchost;accdevaux;accdevice;cudadevice;atomic;nvhpcatm;stdc++;nvf;nvomp;nvhpcatm;atomic;nvcpumath;nsnvc;nvc;gcc;c;gcc_s;m")
+set(CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/targets/x86_64-linux/lib/stubs;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/targets/x86_64-linux/lib;/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/lib;/opt/cray/pe/mpich/8.1.22/gtl/lib;/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/stubs;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/math_libs/11.7/lib64;/opt/cray/pe/dsmml/0.2.2/dsmml/lib;/opt/cray/xpmem/2.5.2-2.4_3.20__gd0f7936.shasta/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/lib;/usr/lib64;/usr/lib64/gcc/x86_64-suse-linux/7")
+set(CMAKE_CUDA_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
+
+set(CMAKE_CUDA_RUNTIME_LIBRARY_DEFAULT "STATIC")
+
+set(CMAKE_LINKER "/usr/bin/ld")
+set(CMAKE_AR "/usr/bin/ar")
+set(CMAKE_MT "")
diff --git a/build_base/CMakeFiles/3.24.3/CMakeCXXCompiler.cmake b/build_base/CMakeFiles/3.24.3/CMakeCXXCompiler.cmake
new file mode 100644
index 00000000..01f6b008
--- /dev/null
+++ b/build_base/CMakeFiles/3.24.3/CMakeCXXCompiler.cmake
@@ -0,0 +1,83 @@
+set(CMAKE_CXX_COMPILER "/opt/cray/pe/craype/2.7.19/bin/CC")
+set(CMAKE_CXX_COMPILER_ARG1 "")
+set(CMAKE_CXX_COMPILER_ID "NVHPC")
+set(CMAKE_CXX_COMPILER_VERSION "22.7.0")
+set(CMAKE_CXX_COMPILER_VERSION_INTERNAL "")
+set(CMAKE_CXX_COMPILER_WRAPPER "CrayPrgEnv")
+set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "14")
+set(CMAKE_CXX_EXTENSIONS_COMPUTED_DEFAULT "ON")
+set(CMAKE_CXX_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters;cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates;cxx_std_17;cxx_std_20")
+set(CMAKE_CXX98_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters")
+set(CMAKE_CXX11_COMPILE_FEATURES "cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
+set(CMAKE_CXX14_COMPILE_FEATURES "cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
+set(CMAKE_CXX17_COMPILE_FEATURES "cxx_std_17")
+set(CMAKE_CXX20_COMPILE_FEATURES "cxx_std_20")
+set(CMAKE_CXX23_COMPILE_FEATURES "")
+
+set(CMAKE_CXX_PLATFORM_ID "Linux")
+set(CMAKE_CXX_SIMULATE_ID "")
+set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "")
+set(CMAKE_CXX_SIMULATE_VERSION "")
+
+
+
+
+set(CMAKE_AR "/usr/bin/ar")
+set(CMAKE_CXX_COMPILER_AR "")
+set(CMAKE_RANLIB "/usr/bin/ranlib")
+set(CMAKE_CXX_COMPILER_RANLIB "")
+set(CMAKE_LINKER "/usr/bin/ld")
+set(CMAKE_MT "")
+set(CMAKE_COMPILER_IS_GNUCXX )
+set(CMAKE_CXX_COMPILER_LOADED 1)
+set(CMAKE_CXX_COMPILER_WORKS TRUE)
+set(CMAKE_CXX_ABI_COMPILED TRUE)
+
+set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
+
+set(CMAKE_CXX_COMPILER_ID_RUN 1)
+set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;m;mm;mpp;CPP;ixx;cppm)
+set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
+
+foreach (lang C OBJC OBJCXX)
+ if (CMAKE_${lang}_COMPILER_ID_RUN)
+ foreach(extension IN LISTS CMAKE_${lang}_SOURCE_FILE_EXTENSIONS)
+ list(REMOVE_ITEM CMAKE_CXX_SOURCE_FILE_EXTENSIONS ${extension})
+ endforeach()
+ endif()
+endforeach()
+
+set(CMAKE_CXX_LINKER_PREFERENCE 30)
+set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
+
+# Save compiler ABI information.
+set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
+set(CMAKE_CXX_COMPILER_ABI "")
+set(CMAKE_CXX_BYTE_ORDER "LITTLE_ENDIAN")
+set(CMAKE_CXX_LIBRARY_ARCHITECTURE "")
+
+if(CMAKE_CXX_SIZEOF_DATA_PTR)
+ set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
+endif()
+
+if(CMAKE_CXX_COMPILER_ABI)
+ set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
+endif()
+
+if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
+ set(CMAKE_LIBRARY_ARCHITECTURE "")
+endif()
+
+set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
+if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
+ set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
+endif()
+
+
+
+
+
+set(CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES "/usr/include")
+set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "cupti;cudart;cuda;sci_nvidia_mpi;sci_nvidia;dl;mpi_nvidia;mpi_gtl_cuda;dsmml;xpmem;acchost;accdevaux;accdevice;dl;cudadevice;atomic;nvhpcatm;stdc++;nvf;nvomp;dl;nvhpcatm;atomic;pthread;nvcpumath;nsnvc;nvc;rt;pthread;gcc;c;gcc_s;m")
+set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/lib;/opt/cray/pe/mpich/8.1.22/gtl/lib;/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/stubs;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/math_libs/11.7/lib64;/opt/cray/pe/dsmml/0.2.2/dsmml/lib;/opt/cray/xpmem/2.5.2-2.4_3.20__gd0f7936.shasta/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/lib;/usr/lib64;/usr/lib64/gcc/x86_64-suse-linux/7")
+set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_C.bin b/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_C.bin
new file mode 100755
index 00000000..4941fb48
Binary files /dev/null and b/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_C.bin differ
diff --git a/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_CUDA.bin b/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_CUDA.bin
new file mode 100755
index 00000000..baa1130e
Binary files /dev/null and b/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_CUDA.bin differ
diff --git a/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_CXX.bin b/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_CXX.bin
new file mode 100755
index 00000000..cdca0c71
Binary files /dev/null and b/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_CXX.bin differ
diff --git a/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_Fortran.bin b/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_Fortran.bin
new file mode 100755
index 00000000..446168a6
Binary files /dev/null and b/build_base/CMakeFiles/3.24.3/CMakeDetermineCompilerABI_Fortran.bin differ
diff --git a/build_base/CMakeFiles/3.24.3/CMakeFortranCompiler.cmake b/build_base/CMakeFiles/3.24.3/CMakeFortranCompiler.cmake
new file mode 100644
index 00000000..956e3bf9
--- /dev/null
+++ b/build_base/CMakeFiles/3.24.3/CMakeFortranCompiler.cmake
@@ -0,0 +1,61 @@
+set(CMAKE_Fortran_COMPILER "/opt/cray/pe/craype/2.7.19/bin/ftn")
+set(CMAKE_Fortran_COMPILER_ARG1 "")
+set(CMAKE_Fortran_COMPILER_ID "NVHPC")
+set(CMAKE_Fortran_COMPILER_VERSION "22.7.0")
+set(CMAKE_Fortran_COMPILER_WRAPPER "CrayPrgEnv")
+set(CMAKE_Fortran_PLATFORM_ID "Linux")
+set(CMAKE_Fortran_SIMULATE_ID "")
+set(CMAKE_Fortran_COMPILER_FRONTEND_VARIANT "")
+set(CMAKE_Fortran_SIMULATE_VERSION "")
+
+
+
+
+set(CMAKE_AR "/usr/bin/ar")
+set(CMAKE_Fortran_COMPILER_AR "")
+set(CMAKE_RANLIB "/usr/bin/ranlib")
+set(CMAKE_Fortran_COMPILER_RANLIB "")
+set(CMAKE_COMPILER_IS_GNUG77 )
+set(CMAKE_Fortran_COMPILER_LOADED 1)
+set(CMAKE_Fortran_COMPILER_WORKS TRUE)
+set(CMAKE_Fortran_ABI_COMPILED TRUE)
+
+set(CMAKE_Fortran_COMPILER_ENV_VAR "FC")
+
+set(CMAKE_Fortran_COMPILER_SUPPORTS_F90 1)
+
+set(CMAKE_Fortran_COMPILER_ID_RUN 1)
+set(CMAKE_Fortran_SOURCE_FILE_EXTENSIONS f;F;fpp;FPP;f77;F77;f90;F90;for;For;FOR;f95;F95)
+set(CMAKE_Fortran_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
+set(CMAKE_Fortran_LINKER_PREFERENCE 20)
+if(UNIX)
+ set(CMAKE_Fortran_OUTPUT_EXTENSION .o)
+else()
+ set(CMAKE_Fortran_OUTPUT_EXTENSION .obj)
+endif()
+
+# Save compiler ABI information.
+set(CMAKE_Fortran_SIZEOF_DATA_PTR "8")
+set(CMAKE_Fortran_COMPILER_ABI "")
+set(CMAKE_Fortran_LIBRARY_ARCHITECTURE "")
+
+if(CMAKE_Fortran_SIZEOF_DATA_PTR AND NOT CMAKE_SIZEOF_VOID_P)
+ set(CMAKE_SIZEOF_VOID_P "${CMAKE_Fortran_SIZEOF_DATA_PTR}")
+endif()
+
+if(CMAKE_Fortran_COMPILER_ABI)
+ set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_Fortran_COMPILER_ABI}")
+endif()
+
+if(CMAKE_Fortran_LIBRARY_ARCHITECTURE)
+ set(CMAKE_LIBRARY_ARCHITECTURE "")
+endif()
+
+
+
+
+
+set(CMAKE_Fortran_IMPLICIT_INCLUDE_DIRECTORIES "")
+set(CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "cupti;cudart;cuda;sci_nvidia_mpi;sci_nvidia;dl;mpifort_nvidia;mpi_nvidia;mpi_gtl_cuda;dsmml;xpmem;acchost;accdevaux;accdevice;dl;cudadevice;nvf;nvomp;dl;nvhpcatm;atomic;pthread;nvcpumath;nsnvc;nvc;rt;pthread;gcc;c;gcc_s;m")
+set(CMAKE_Fortran_IMPLICIT_LINK_DIRECTORIES "/opt/cray/pe/mpich/8.1.22/ofi/nvidia/20.7/lib;/opt/cray/pe/mpich/8.1.22/gtl/lib;/opt/cray/pe/libsci/22.11.1.2/NVIDIA/20.7/x86_64/lib;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64/stubs;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/nvvm/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/CUPTI/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/extras/Debugger/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/math_libs/11.7/lib64;/opt/cray/pe/dsmml/0.2.2/dsmml/lib;/opt/cray/xpmem/2.5.2-2.4_3.20__gd0f7936.shasta/lib64;/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/lib;/usr/lib64;/usr/lib64/gcc/x86_64-suse-linux/7")
+set(CMAKE_Fortran_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/build_base/CMakeFiles/3.24.3/CMakeSystem.cmake b/build_base/CMakeFiles/3.24.3/CMakeSystem.cmake
new file mode 100644
index 00000000..6c12dc61
--- /dev/null
+++ b/build_base/CMakeFiles/3.24.3/CMakeSystem.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_HOST_SYSTEM "Linux-5.14.21-150400.24.11_12.0.57-cray_shasta_c")
+set(CMAKE_HOST_SYSTEM_NAME "Linux")
+set(CMAKE_HOST_SYSTEM_VERSION "5.14.21-150400.24.11_12.0.57-cray_shasta_c")
+set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
+
+
+
+set(CMAKE_SYSTEM "Linux-5.14.21-150400.24.11_12.0.57-cray_shasta_c")
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_VERSION "5.14.21-150400.24.11_12.0.57-cray_shasta_c")
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+
+set(CMAKE_CROSSCOMPILING "FALSE")
+
+set(CMAKE_SYSTEM_LOADED 1)
diff --git a/build_base/CMakeFiles/3.24.3/CompilerIdC/CMakeCCompilerId.c b/build_base/CMakeFiles/3.24.3/CompilerIdC/CMakeCCompilerId.c
new file mode 100644
index 00000000..2b43aa69
--- /dev/null
+++ b/build_base/CMakeFiles/3.24.3/CompilerIdC/CMakeCCompilerId.c
@@ -0,0 +1,838 @@
+#ifdef __cplusplus
+# error "A C++ compiler has been selected for C."
+#endif
+
+#if defined(__18CXX)
+# define ID_VOID_MAIN
+#endif
+#if defined(__CLASSIC_C__)
+/* cv-qualifiers did not exist in K&R C */
+# define const
+# define volatile
+#endif
+
+#if !defined(__has_include)
+/* If the compiler does not have __has_include, pretend the answer is
+ always no. */
+# define __has_include(x) 0
+#endif
+
+
+/* Version number components: V=Version, R=Revision, P=Patch
+ Version date components: YYYY=Year, MM=Month, DD=Day */
+
+#if defined(__INTEL_COMPILER) || defined(__ICC)
+# define COMPILER_ID "Intel"
+# if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+# endif
+# if defined(__GNUC__)
+# define SIMULATE_ID "GNU"
+# endif
+ /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
+ except that a few beta releases use the old format with V=2021. */
+# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
+# if defined(__INTEL_COMPILER_UPDATE)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
+# else
+# define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10)
+# endif
+# else
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
+ /* The third version component from --version is an update index,
+ but no macro is provided for it. */
+# define COMPILER_VERSION_PATCH DEC(0)
+# endif
+# if defined(__INTEL_COMPILER_BUILD_DATE)
+ /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
+# define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
+# endif
+# if defined(_MSC_VER)
+ /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# if defined(__GNUC__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+# elif defined(__GNUG__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+# endif
+# if defined(__GNUC_MINOR__)
+# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
+# define COMPILER_ID "IntelLLVM"
+#if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_ID "GNU"
+#endif
+/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
+ * later. Look for 6 digit vs. 8 digit version number to decide encoding.
+ * VVVV is no smaller than the current year when a version is released.
+ */
+#if __INTEL_LLVM_COMPILER < 1000000L
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 10)
+#else
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER % 100)
+#endif
+#if defined(_MSC_VER)
+ /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+#elif defined(__GNUG__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+#endif
+#if defined(__GNUC_MINOR__)
+# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+#endif
+#if defined(__GNUC_PATCHLEVEL__)
+# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+#endif
+
+#elif defined(__PATHCC__)
+# define COMPILER_ID "PathScale"
+# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
+# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
+# if defined(__PATHCC_PATCHLEVEL__)
+# define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
+# endif
+
+#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
+# define COMPILER_ID "Embarcadero"
+# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
+# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
+# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF)
+
+#elif defined(__BORLANDC__)
+# define COMPILER_ID "Borland"
+ /* __BORLANDC__ = 0xVRR */
+# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
+# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
+
+#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
+# define COMPILER_ID "Watcom"
+ /* __WATCOMC__ = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__WATCOMC__)
+# define COMPILER_ID "OpenWatcom"
+ /* __WATCOMC__ = VVRP + 1100 */
+# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+# define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__SUNPRO_C)
+# define COMPILER_ID "SunPro"
+# if __SUNPRO_C >= 0x5100
+ /* __SUNPRO_C = 0xVRRP */
+# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>12)
+# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xFF)
+# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF)
+# else
+ /* __SUNPRO_CC = 0xVRP */
+# define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>8)
+# define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xF)
+# define COMPILER_VERSION_PATCH HEX(__SUNPRO_C & 0xF)
+# endif
+
+#elif defined(__HP_cc)
+# define COMPILER_ID "HP"
+ /* __HP_cc = VVRRPP */
+# define COMPILER_VERSION_MAJOR DEC(__HP_cc/10000)
+# define COMPILER_VERSION_MINOR DEC(__HP_cc/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__HP_cc % 100)
+
+#elif defined(__DECC)
+# define COMPILER_ID "Compaq"
+ /* __DECC_VER = VVRRTPPPP */
+# define COMPILER_VERSION_MAJOR DEC(__DECC_VER/10000000)
+# define COMPILER_VERSION_MINOR DEC(__DECC_VER/100000 % 100)
+# define COMPILER_VERSION_PATCH DEC(__DECC_VER % 10000)
+
+#elif defined(__IBMC__) && defined(__COMPILER_VER__)
+# define COMPILER_ID "zOS"
+ /* __IBMC__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
+
+#elif defined(__open_xl__) && defined(__clang__)
+# define COMPILER_ID "IBMClang"
+# define COMPILER_VERSION_MAJOR DEC(__open_xl_version__)
+# define COMPILER_VERSION_MINOR DEC(__open_xl_release__)
+# define COMPILER_VERSION_PATCH DEC(__open_xl_modification__)
+# define COMPILER_VERSION_TWEAK DEC(__open_xl_ptf_fix_level__)
+
+
+#elif defined(__ibmxl__) && defined(__clang__)
+# define COMPILER_ID "XLClang"
+# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
+# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
+# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
+# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
+
+
+#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ >= 800
+# define COMPILER_ID "XL"
+ /* __IBMC__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
+
+#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ < 800
+# define COMPILER_ID "VisualAge"
+ /* __IBMC__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMC__ % 10)
+
+#elif defined(__NVCOMPILER)
+# define COMPILER_ID "NVHPC"
+# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
+# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
+# if defined(__NVCOMPILER_PATCHLEVEL__)
+# define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
+# endif
+
+#elif defined(__PGI)
+# define COMPILER_ID "PGI"
+# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
+# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
+# if defined(__PGIC_PATCHLEVEL__)
+# define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
+# endif
+
+#elif defined(_CRAYC)
+# define COMPILER_ID "Cray"
+# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
+# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
+
+#elif defined(__TI_COMPILER_VERSION__)
+# define COMPILER_ID "TI"
+ /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
+# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
+# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000)
+# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000)
+
+#elif defined(__CLANG_FUJITSU)
+# define COMPILER_ID "FujitsuClang"
+# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# define COMPILER_VERSION_INTERNAL_STR __clang_version__
+
+
+#elif defined(__FUJITSU)
+# define COMPILER_ID "Fujitsu"
+# if defined(__FCC_version__)
+# define COMPILER_VERSION __FCC_version__
+# elif defined(__FCC_major__)
+# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# endif
+# if defined(__fcc_version)
+# define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
+# elif defined(__FCC_VERSION)
+# define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
+# endif
+
+
+#elif defined(__ghs__)
+# define COMPILER_ID "GHS"
+/* __GHS_VERSION_NUMBER = VVVVRP */
+# ifdef __GHS_VERSION_NUMBER
+# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
+# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER % 10)
+# endif
+
+#elif defined(__TINYC__)
+# define COMPILER_ID "TinyCC"
+
+#elif defined(__BCC__)
+# define COMPILER_ID "Bruce"
+
+#elif defined(__SCO_VERSION__)
+# define COMPILER_ID "SCO"
+
+#elif defined(__ARMCC_VERSION) && !defined(__clang__)
+# define COMPILER_ID "ARMCC"
+#if __ARMCC_VERSION >= 1000000
+ /* __ARMCC_VERSION = VRRPPPP */
+ # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
+ # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
+ # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
+#else
+ /* __ARMCC_VERSION = VRPPPP */
+ # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
+ # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
+ # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
+#endif
+
+
+#elif defined(__clang__) && defined(__apple_build_version__)
+# define COMPILER_ID "AppleClang"
+# if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+ /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
+
+#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
+# define COMPILER_ID "ARMClang"
+ # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
+ # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
+ # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION % 10000)
+# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
+
+#elif defined(__clang__)
+# define COMPILER_ID "Clang"
+# if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+ /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+
+#elif defined(__LCC__) && (defined(__GNUC__) || defined(__GNUG__) || defined(__MCST__))
+# define COMPILER_ID "LCC"
+# define COMPILER_VERSION_MAJOR DEC(1)
+# if defined(__LCC__)
+# define COMPILER_VERSION_MINOR DEC(__LCC__- 100)
+# endif
+# if defined(__LCC_MINOR__)
+# define COMPILER_VERSION_PATCH DEC(__LCC_MINOR__)
+# endif
+# if defined(__GNUC__) && defined(__GNUC_MINOR__)
+# define SIMULATE_ID "GNU"
+# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+# if defined(__GNUC_PATCHLEVEL__)
+# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+# endif
+
+#elif defined(__GNUC__)
+# define COMPILER_ID "GNU"
+# define COMPILER_VERSION_MAJOR DEC(__GNUC__)
+# if defined(__GNUC_MINOR__)
+# define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+# define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif defined(_MSC_VER)
+# define COMPILER_ID "MSVC"
+ /* _MSC_VER = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
+# if defined(_MSC_FULL_VER)
+# if _MSC_VER >= 1400
+ /* _MSC_FULL_VER = VVRRPPPPP */
+# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
+# else
+ /* _MSC_FULL_VER = VVRRPPPP */
+# define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
+# endif
+# endif
+# if defined(_MSC_BUILD)
+# define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
+# endif
+
+#elif defined(_ADI_COMPILER)
+# define COMPILER_ID "ADSP"
+#if defined(__VERSIONNUM__)
+ /* __VERSIONNUM__ = 0xVVRRPPTT */
+# define COMPILER_VERSION_MAJOR DEC(__VERSIONNUM__ >> 24 & 0xFF)
+# define COMPILER_VERSION_MINOR DEC(__VERSIONNUM__ >> 16 & 0xFF)
+# define COMPILER_VERSION_PATCH DEC(__VERSIONNUM__ >> 8 & 0xFF)
+# define COMPILER_VERSION_TWEAK DEC(__VERSIONNUM__ & 0xFF)
+#endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# define COMPILER_ID "IAR"
+# if defined(__VER__) && defined(__ICCARM__)
+# define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
+# define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
+# define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
+# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
+# define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
+# define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
+# define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
+# define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# endif
+
+#elif defined(__SDCC_VERSION_MAJOR) || defined(SDCC)
+# define COMPILER_ID "SDCC"
+# if defined(__SDCC_VERSION_MAJOR)
+# define COMPILER_VERSION_MAJOR DEC(__SDCC_VERSION_MAJOR)
+# define COMPILER_VERSION_MINOR DEC(__SDCC_VERSION_MINOR)
+# define COMPILER_VERSION_PATCH DEC(__SDCC_VERSION_PATCH)
+# else
+ /* SDCC = VRP */
+# define COMPILER_VERSION_MAJOR DEC(SDCC/100)
+# define COMPILER_VERSION_MINOR DEC(SDCC/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(SDCC % 10)
+# endif
+
+
+/* These compilers are either not known or too old to define an
+ identification macro. Try to identify the platform and guess that
+ it is the native compiler. */
+#elif defined(__hpux) || defined(__hpua)
+# define COMPILER_ID "HP"
+
+#else /* unknown compiler */
+# define COMPILER_ID ""
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+ getting matched. Store it in a pointer rather than an array
+ because some compilers will just produce instructions to fill the
+ array rather than assigning a pointer to a static array. */
+char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
+#ifdef SIMULATE_ID
+char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
+#endif
+
+#ifdef __QNXNTO__
+char const* qnxnto = "INFO" ":" "qnxnto[]";
+#endif
+
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
+#endif
+
+#define STRINGIFY_HELPER(X) #X
+#define STRINGIFY(X) STRINGIFY_HELPER(X)
+
+/* Identify known platforms by name. */
+#if defined(__linux) || defined(__linux__) || defined(linux)
+# define PLATFORM_ID "Linux"
+
+#elif defined(__MSYS__)
+# define PLATFORM_ID "MSYS"
+
+#elif defined(__CYGWIN__)
+# define PLATFORM_ID "Cygwin"
+
+#elif defined(__MINGW32__)
+# define PLATFORM_ID "MinGW"
+
+#elif defined(__APPLE__)
+# define PLATFORM_ID "Darwin"
+
+#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
+# define PLATFORM_ID "Windows"
+
+#elif defined(__FreeBSD__) || defined(__FreeBSD)
+# define PLATFORM_ID "FreeBSD"
+
+#elif defined(__NetBSD__) || defined(__NetBSD)
+# define PLATFORM_ID "NetBSD"
+
+#elif defined(__OpenBSD__) || defined(__OPENBSD)
+# define PLATFORM_ID "OpenBSD"
+
+#elif defined(__sun) || defined(sun)
+# define PLATFORM_ID "SunOS"
+
+#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
+# define PLATFORM_ID "AIX"
+
+#elif defined(__hpux) || defined(__hpux__)
+# define PLATFORM_ID "HP-UX"
+
+#elif defined(__HAIKU__)
+# define PLATFORM_ID "Haiku"
+
+#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
+# define PLATFORM_ID "BeOS"
+
+#elif defined(__QNX__) || defined(__QNXNTO__)
+# define PLATFORM_ID "QNX"
+
+#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
+# define PLATFORM_ID "Tru64"
+
+#elif defined(__riscos) || defined(__riscos__)
+# define PLATFORM_ID "RISCos"
+
+#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
+# define PLATFORM_ID "SINIX"
+
+#elif defined(__UNIX_SV__)
+# define PLATFORM_ID "UNIX_SV"
+
+#elif defined(__bsdos__)
+# define PLATFORM_ID "BSDOS"
+
+#elif defined(_MPRAS) || defined(MPRAS)
+# define PLATFORM_ID "MP-RAS"
+
+#elif defined(__osf) || defined(__osf__)
+# define PLATFORM_ID "OSF1"
+
+#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
+# define PLATFORM_ID "SCO_SV"
+
+#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
+# define PLATFORM_ID "ULTRIX"
+
+#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
+# define PLATFORM_ID "Xenix"
+
+#elif defined(__WATCOMC__)
+# if defined(__LINUX__)
+# define PLATFORM_ID "Linux"
+
+# elif defined(__DOS__)
+# define PLATFORM_ID "DOS"
+
+# elif defined(__OS2__)
+# define PLATFORM_ID "OS2"
+
+# elif defined(__WINDOWS__)
+# define PLATFORM_ID "Windows3x"
+
+# elif defined(__VXWORKS__)
+# define PLATFORM_ID "VxWorks"
+
+# else /* unknown platform */
+# define PLATFORM_ID
+# endif
+
+#elif defined(__INTEGRITY)
+# if defined(INT_178B)
+# define PLATFORM_ID "Integrity178"
+
+# else /* regular Integrity */
+# define PLATFORM_ID "Integrity"
+# endif
+
+# elif defined(_ADI_COMPILER)
+# define PLATFORM_ID "ADSP"
+
+#else /* unknown platform */
+# define PLATFORM_ID
+
+#endif
+
+/* For windows compilers MSVC and Intel we can determine
+ the architecture of the compiler being used. This is because
+ the compilers do not have flags that can change the architecture,
+ but rather depend on which compiler is being used
+*/
+#if defined(_WIN32) && defined(_MSC_VER)
+# if defined(_M_IA64)
+# define ARCHITECTURE_ID "IA64"
+
+# elif defined(_M_ARM64EC)
+# define ARCHITECTURE_ID "ARM64EC"
+
+# elif defined(_M_X64) || defined(_M_AMD64)
+# define ARCHITECTURE_ID "x64"
+
+# elif defined(_M_IX86)
+# define ARCHITECTURE_ID "X86"
+
+# elif defined(_M_ARM64)
+# define ARCHITECTURE_ID "ARM64"
+
+# elif defined(_M_ARM)
+# if _M_ARM == 4
+# define ARCHITECTURE_ID "ARMV4I"
+# elif _M_ARM == 5
+# define ARCHITECTURE_ID "ARMV5I"
+# else
+# define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
+# endif
+
+# elif defined(_M_MIPS)
+# define ARCHITECTURE_ID "MIPS"
+
+# elif defined(_M_SH)
+# define ARCHITECTURE_ID "SHx"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__WATCOMC__)
+# if defined(_M_I86)
+# define ARCHITECTURE_ID "I86"
+
+# elif defined(_M_IX86)
+# define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# if defined(__ICCARM__)
+# define ARCHITECTURE_ID "ARM"
+
+# elif defined(__ICCRX__)
+# define ARCHITECTURE_ID "RX"
+
+# elif defined(__ICCRH850__)
+# define ARCHITECTURE_ID "RH850"
+
+# elif defined(__ICCRL78__)
+# define ARCHITECTURE_ID "RL78"
+
+# elif defined(__ICCRISCV__)
+# define ARCHITECTURE_ID "RISCV"
+
+# elif defined(__ICCAVR__)
+# define ARCHITECTURE_ID "AVR"
+
+# elif defined(__ICC430__)
+# define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__ICCV850__)
+# define ARCHITECTURE_ID "V850"
+
+# elif defined(__ICC8051__)
+# define ARCHITECTURE_ID "8051"
+
+# elif defined(__ICCSTM8__)
+# define ARCHITECTURE_ID "STM8"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__ghs__)
+# if defined(__PPC64__)
+# define ARCHITECTURE_ID "PPC64"
+
+# elif defined(__ppc__)
+# define ARCHITECTURE_ID "PPC"
+
+# elif defined(__ARM__)
+# define ARCHITECTURE_ID "ARM"
+
+# elif defined(__x86_64__)
+# define ARCHITECTURE_ID "x64"
+
+# elif defined(__i386__)
+# define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__TI_COMPILER_VERSION__)
+# if defined(__TI_ARM__)
+# define ARCHITECTURE_ID "ARM"
+
+# elif defined(__MSP430__)
+# define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__TMS320C28XX__)
+# define ARCHITECTURE_ID "TMS320C28x"
+
+# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
+# define ARCHITECTURE_ID "TMS320C6x"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+# elif defined(__ADSPSHARC__)
+# define ARCHITECTURE_ID "SHARC"
+
+# elif defined(__ADSPBLACKFIN__)
+# define ARCHITECTURE_ID "Blackfin"
+
+#else
+# define ARCHITECTURE_ID
+#endif
+
+/* Convert integer to decimal digit literals. */
+#define DEC(n) \
+ ('0' + (((n) / 10000000)%10)), \
+ ('0' + (((n) / 1000000)%10)), \
+ ('0' + (((n) / 100000)%10)), \
+ ('0' + (((n) / 10000)%10)), \
+ ('0' + (((n) / 1000)%10)), \
+ ('0' + (((n) / 100)%10)), \
+ ('0' + (((n) / 10)%10)), \
+ ('0' + ((n) % 10))
+
+/* Convert integer to hex digit literals. */
+#define HEX(n) \
+ ('0' + ((n)>>28 & 0xF)), \
+ ('0' + ((n)>>24 & 0xF)), \
+ ('0' + ((n)>>20 & 0xF)), \
+ ('0' + ((n)>>16 & 0xF)), \
+ ('0' + ((n)>>12 & 0xF)), \
+ ('0' + ((n)>>8 & 0xF)), \
+ ('0' + ((n)>>4 & 0xF)), \
+ ('0' + ((n) & 0xF))
+
+/* Construct a string literal encoding the version number. */
+#ifdef COMPILER_VERSION
+char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
+
+/* Construct a string literal encoding the version number components. */
+#elif defined(COMPILER_VERSION_MAJOR)
+char const info_version[] = {
+ 'I', 'N', 'F', 'O', ':',
+ 'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
+ COMPILER_VERSION_MAJOR,
+# ifdef COMPILER_VERSION_MINOR
+ '.', COMPILER_VERSION_MINOR,
+# ifdef COMPILER_VERSION_PATCH
+ '.', COMPILER_VERSION_PATCH,
+# ifdef COMPILER_VERSION_TWEAK
+ '.', COMPILER_VERSION_TWEAK,
+# endif
+# endif
+# endif
+ ']','\0'};
+#endif
+
+/* Construct a string literal encoding the internal version number. */
+#ifdef COMPILER_VERSION_INTERNAL
+char const info_version_internal[] = {
+ 'I', 'N', 'F', 'O', ':',
+ 'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
+ 'i','n','t','e','r','n','a','l','[',
+ COMPILER_VERSION_INTERNAL,']','\0'};
+#elif defined(COMPILER_VERSION_INTERNAL_STR)
+char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
+#endif
+
+/* Construct a string literal encoding the version number components. */
+#ifdef SIMULATE_VERSION_MAJOR
+char const info_simulate_version[] = {
+ 'I', 'N', 'F', 'O', ':',
+ 's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
+ SIMULATE_VERSION_MAJOR,
+# ifdef SIMULATE_VERSION_MINOR
+ '.', SIMULATE_VERSION_MINOR,
+# ifdef SIMULATE_VERSION_PATCH
+ '.', SIMULATE_VERSION_PATCH,
+# ifdef SIMULATE_VERSION_TWEAK
+ '.', SIMULATE_VERSION_TWEAK,
+# endif
+# endif
+# endif
+ ']','\0'};
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+ getting matched. Store it in a pointer rather than an array
+ because some compilers will just produce instructions to fill the
+ array rather than assigning a pointer to a static array. */
+char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
+char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
+
+
+
+#if !defined(__STDC__) && !defined(__clang__)
+# if defined(_MSC_VER) || defined(__ibmxl__) || defined(__IBMC__)
+# define C_VERSION "90"
+# else
+# define C_VERSION
+# endif
+#elif __STDC_VERSION__ > 201710L
+# define C_VERSION "23"
+#elif __STDC_VERSION__ >= 201710L
+# define C_VERSION "17"
+#elif __STDC_VERSION__ >= 201000L
+# define C_VERSION "11"
+#elif __STDC_VERSION__ >= 199901L
+# define C_VERSION "99"
+#else
+# define C_VERSION "90"
+#endif
+const char* info_language_standard_default =
+ "INFO" ":" "standard_default[" C_VERSION "]";
+
+const char* info_language_extensions_default = "INFO" ":" "extensions_default["
+#if (defined(__clang__) || defined(__GNUC__) || defined(__xlC__) || \
+ defined(__TI_COMPILER_VERSION__)) && \
+ !defined(__STRICT_ANSI__)
+ "ON"
+#else
+ "OFF"
+#endif
+"]";
+
+/*--------------------------------------------------------------------------*/
+
+#ifdef ID_VOID_MAIN
+void main() {}
+#else
+# if defined(__CLASSIC_C__)
+int main(argc, argv) int argc; char *argv[];
+# else
+int main(int argc, char* argv[])
+# endif
+{
+ int require = 0;
+ require += info_compiler[argc];
+ require += info_platform[argc];
+ require += info_arch[argc];
+#ifdef COMPILER_VERSION_MAJOR
+ require += info_version[argc];
+#endif
+#ifdef COMPILER_VERSION_INTERNAL
+ require += info_version_internal[argc];
+#endif
+#ifdef SIMULATE_ID
+ require += info_simulate[argc];
+#endif
+#ifdef SIMULATE_VERSION_MAJOR
+ require += info_simulate_version[argc];
+#endif
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+ require += info_cray[argc];
+#endif
+ require += info_language_standard_default[argc];
+ require += info_language_extensions_default[argc];
+ (void)argv;
+ return require;
+}
+#endif
diff --git a/build_base/CMakeFiles/3.24.3/CompilerIdC/a.out b/build_base/CMakeFiles/3.24.3/CompilerIdC/a.out
new file mode 100755
index 00000000..2d8a9eaf
Binary files /dev/null and b/build_base/CMakeFiles/3.24.3/CompilerIdC/a.out differ
diff --git a/build_base/CMakeFiles/3.24.3/CompilerIdCUDA/CMakeCUDACompilerId.cu b/build_base/CMakeFiles/3.24.3/CompilerIdCUDA/CMakeCUDACompilerId.cu
new file mode 100644
index 00000000..acca5835
--- /dev/null
+++ b/build_base/CMakeFiles/3.24.3/CompilerIdCUDA/CMakeCUDACompilerId.cu
@@ -0,0 +1,444 @@
+#ifndef __CUDACC__
+# error "A C or C++ compiler has been selected for CUDA"
+#endif
+
+
+/* Version number components: V=Version, R=Revision, P=Patch
+ Version date components: YYYY=Year, MM=Month, DD=Day */
+
+#if defined(__NVCC__)
+# define COMPILER_ID "NVIDIA"
+# if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+# elif defined(__clang__)
+# define SIMULATE_ID "Clang"
+# elif defined(__GNUC__)
+# define SIMULATE_ID "GNU"
+# endif
+# if defined(__CUDACC_VER_MAJOR__)
+# define COMPILER_VERSION_MAJOR DEC(__CUDACC_VER_MAJOR__)
+# define COMPILER_VERSION_MINOR DEC(__CUDACC_VER_MINOR__)
+# define COMPILER_VERSION_PATCH DEC(__CUDACC_VER_BUILD__)
+# endif
+# if defined(_MSC_VER)
+ /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# elif defined(__clang__)
+# define SIMULATE_VERSION_MAJOR DEC(__clang_major__)
+# define SIMULATE_VERSION_MINOR DEC(__clang_minor__)
+# elif defined(__GNUC__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+
+#elif defined(__clang__)
+# define COMPILER_ID "Clang"
+# if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+ /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+
+
+/* These compilers are either not known or too old to define an
+ identification macro. Try to identify the platform and guess that
+ it is the native compiler. */
+#elif defined(__hpux) || defined(__hpua)
+# define COMPILER_ID "HP"
+
+#else /* unknown compiler */
+# define COMPILER_ID ""
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+ getting matched. Store it in a pointer rather than an array
+ because some compilers will just produce instructions to fill the
+ array rather than assigning a pointer to a static array. */
+char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
+#ifdef SIMULATE_ID
+char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
+#endif
+
+#define STRINGIFY_HELPER(X) #X
+#define STRINGIFY(X) STRINGIFY_HELPER(X)
+
+/* Identify known platforms by name. */
+#if defined(__linux) || defined(__linux__) || defined(linux)
+# define PLATFORM_ID "Linux"
+
+#elif defined(__MSYS__)
+# define PLATFORM_ID "MSYS"
+
+#elif defined(__CYGWIN__)
+# define PLATFORM_ID "Cygwin"
+
+#elif defined(__MINGW32__)
+# define PLATFORM_ID "MinGW"
+
+#elif defined(__APPLE__)
+# define PLATFORM_ID "Darwin"
+
+#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
+# define PLATFORM_ID "Windows"
+
+#elif defined(__FreeBSD__) || defined(__FreeBSD)
+# define PLATFORM_ID "FreeBSD"
+
+#elif defined(__NetBSD__) || defined(__NetBSD)
+# define PLATFORM_ID "NetBSD"
+
+#elif defined(__OpenBSD__) || defined(__OPENBSD)
+# define PLATFORM_ID "OpenBSD"
+
+#elif defined(__sun) || defined(sun)
+# define PLATFORM_ID "SunOS"
+
+#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
+# define PLATFORM_ID "AIX"
+
+#elif defined(__hpux) || defined(__hpux__)
+# define PLATFORM_ID "HP-UX"
+
+#elif defined(__HAIKU__)
+# define PLATFORM_ID "Haiku"
+
+#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
+# define PLATFORM_ID "BeOS"
+
+#elif defined(__QNX__) || defined(__QNXNTO__)
+# define PLATFORM_ID "QNX"
+
+#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
+# define PLATFORM_ID "Tru64"
+
+#elif defined(__riscos) || defined(__riscos__)
+# define PLATFORM_ID "RISCos"
+
+#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
+# define PLATFORM_ID "SINIX"
+
+#elif defined(__UNIX_SV__)
+# define PLATFORM_ID "UNIX_SV"
+
+#elif defined(__bsdos__)
+# define PLATFORM_ID "BSDOS"
+
+#elif defined(_MPRAS) || defined(MPRAS)
+# define PLATFORM_ID "MP-RAS"
+
+#elif defined(__osf) || defined(__osf__)
+# define PLATFORM_ID "OSF1"
+
+#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
+# define PLATFORM_ID "SCO_SV"
+
+#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
+# define PLATFORM_ID "ULTRIX"
+
+#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
+# define PLATFORM_ID "Xenix"
+
+#elif defined(__WATCOMC__)
+# if defined(__LINUX__)
+# define PLATFORM_ID "Linux"
+
+# elif defined(__DOS__)
+# define PLATFORM_ID "DOS"
+
+# elif defined(__OS2__)
+# define PLATFORM_ID "OS2"
+
+# elif defined(__WINDOWS__)
+# define PLATFORM_ID "Windows3x"
+
+# elif defined(__VXWORKS__)
+# define PLATFORM_ID "VxWorks"
+
+# else /* unknown platform */
+# define PLATFORM_ID
+# endif
+
+#elif defined(__INTEGRITY)
+# if defined(INT_178B)
+# define PLATFORM_ID "Integrity178"
+
+# else /* regular Integrity */
+# define PLATFORM_ID "Integrity"
+# endif
+
+# elif defined(_ADI_COMPILER)
+# define PLATFORM_ID "ADSP"
+
+#else /* unknown platform */
+# define PLATFORM_ID
+
+#endif
+
+/* For windows compilers MSVC and Intel we can determine
+ the architecture of the compiler being used. This is because
+ the compilers do not have flags that can change the architecture,
+ but rather depend on which compiler is being used
+*/
+#if defined(_WIN32) && defined(_MSC_VER)
+# if defined(_M_IA64)
+# define ARCHITECTURE_ID "IA64"
+
+# elif defined(_M_ARM64EC)
+# define ARCHITECTURE_ID "ARM64EC"
+
+# elif defined(_M_X64) || defined(_M_AMD64)
+# define ARCHITECTURE_ID "x64"
+
+# elif defined(_M_IX86)
+# define ARCHITECTURE_ID "X86"
+
+# elif defined(_M_ARM64)
+# define ARCHITECTURE_ID "ARM64"
+
+# elif defined(_M_ARM)
+# if _M_ARM == 4
+# define ARCHITECTURE_ID "ARMV4I"
+# elif _M_ARM == 5
+# define ARCHITECTURE_ID "ARMV5I"
+# else
+# define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
+# endif
+
+# elif defined(_M_MIPS)
+# define ARCHITECTURE_ID "MIPS"
+
+# elif defined(_M_SH)
+# define ARCHITECTURE_ID "SHx"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__WATCOMC__)
+# if defined(_M_I86)
+# define ARCHITECTURE_ID "I86"
+
+# elif defined(_M_IX86)
+# define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# if defined(__ICCARM__)
+# define ARCHITECTURE_ID "ARM"
+
+# elif defined(__ICCRX__)
+# define ARCHITECTURE_ID "RX"
+
+# elif defined(__ICCRH850__)
+# define ARCHITECTURE_ID "RH850"
+
+# elif defined(__ICCRL78__)
+# define ARCHITECTURE_ID "RL78"
+
+# elif defined(__ICCRISCV__)
+# define ARCHITECTURE_ID "RISCV"
+
+# elif defined(__ICCAVR__)
+# define ARCHITECTURE_ID "AVR"
+
+# elif defined(__ICC430__)
+# define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__ICCV850__)
+# define ARCHITECTURE_ID "V850"
+
+# elif defined(__ICC8051__)
+# define ARCHITECTURE_ID "8051"
+
+# elif defined(__ICCSTM8__)
+# define ARCHITECTURE_ID "STM8"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__ghs__)
+# if defined(__PPC64__)
+# define ARCHITECTURE_ID "PPC64"
+
+# elif defined(__ppc__)
+# define ARCHITECTURE_ID "PPC"
+
+# elif defined(__ARM__)
+# define ARCHITECTURE_ID "ARM"
+
+# elif defined(__x86_64__)
+# define ARCHITECTURE_ID "x64"
+
+# elif defined(__i386__)
+# define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__TI_COMPILER_VERSION__)
+# if defined(__TI_ARM__)
+# define ARCHITECTURE_ID "ARM"
+
+# elif defined(__MSP430__)
+# define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__TMS320C28XX__)
+# define ARCHITECTURE_ID "TMS320C28x"
+
+# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
+# define ARCHITECTURE_ID "TMS320C6x"
+
+# else /* unknown architecture */
+# define ARCHITECTURE_ID ""
+# endif
+
+# elif defined(__ADSPSHARC__)
+# define ARCHITECTURE_ID "SHARC"
+
+# elif defined(__ADSPBLACKFIN__)
+# define ARCHITECTURE_ID "Blackfin"
+
+#else
+# define ARCHITECTURE_ID
+#endif
+
+/* Convert integer to decimal digit literals. */
+#define DEC(n) \
+ ('0' + (((n) / 10000000)%10)), \
+ ('0' + (((n) / 1000000)%10)), \
+ ('0' + (((n) / 100000)%10)), \
+ ('0' + (((n) / 10000)%10)), \
+ ('0' + (((n) / 1000)%10)), \
+ ('0' + (((n) / 100)%10)), \
+ ('0' + (((n) / 10)%10)), \
+ ('0' + ((n) % 10))
+
+/* Convert integer to hex digit literals. */
+#define HEX(n) \
+ ('0' + ((n)>>28 & 0xF)), \
+ ('0' + ((n)>>24 & 0xF)), \
+ ('0' + ((n)>>20 & 0xF)), \
+ ('0' + ((n)>>16 & 0xF)), \
+ ('0' + ((n)>>12 & 0xF)), \
+ ('0' + ((n)>>8 & 0xF)), \
+ ('0' + ((n)>>4 & 0xF)), \
+ ('0' + ((n) & 0xF))
+
+/* Construct a string literal encoding the version number. */
+#ifdef COMPILER_VERSION
+char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
+
+/* Construct a string literal encoding the version number components. */
+#elif defined(COMPILER_VERSION_MAJOR)
+char const info_version[] = {
+ 'I', 'N', 'F', 'O', ':',
+ 'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
+ COMPILER_VERSION_MAJOR,
+# ifdef COMPILER_VERSION_MINOR
+ '.', COMPILER_VERSION_MINOR,
+# ifdef COMPILER_VERSION_PATCH
+ '.', COMPILER_VERSION_PATCH,
+# ifdef COMPILER_VERSION_TWEAK
+ '.', COMPILER_VERSION_TWEAK,
+# endif
+# endif
+# endif
+ ']','\0'};
+#endif
+
+/* Construct a string literal encoding the internal version number. */
+#ifdef COMPILER_VERSION_INTERNAL
+char const info_version_internal[] = {
+ 'I', 'N', 'F', 'O', ':',
+ 'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
+ 'i','n','t','e','r','n','a','l','[',
+ COMPILER_VERSION_INTERNAL,']','\0'};
+#elif defined(COMPILER_VERSION_INTERNAL_STR)
+char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
+#endif
+
+/* Construct a string literal encoding the version number components. */
+#ifdef SIMULATE_VERSION_MAJOR
+char const info_simulate_version[] = {
+ 'I', 'N', 'F', 'O', ':',
+ 's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
+ SIMULATE_VERSION_MAJOR,
+# ifdef SIMULATE_VERSION_MINOR
+ '.', SIMULATE_VERSION_MINOR,
+# ifdef SIMULATE_VERSION_PATCH
+ '.', SIMULATE_VERSION_PATCH,
+# ifdef SIMULATE_VERSION_TWEAK
+ '.', SIMULATE_VERSION_TWEAK,
+# endif
+# endif
+# endif
+ ']','\0'};
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+ getting matched. Store it in a pointer rather than an array
+ because some compilers will just produce instructions to fill the
+ array rather than assigning a pointer to a static array. */
+char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
+char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
+
+
+
+const char* info_language_standard_default = "INFO" ":" "standard_default["
+#if __cplusplus > 202002L
+ "23"
+#elif __cplusplus > 201703L
+ "20"
+#elif __cplusplus >= 201703L
+ "17"
+#elif __cplusplus >= 201402L
+ "14"
+#elif __cplusplus >= 201103L
+ "11"
+#else
+ "03"
+#endif
+"]";
+
+const char* info_language_extensions_default = "INFO" ":" "extensions_default["
+#if (defined(__clang__) || defined(__GNUC__)) && \
+ !defined(__STRICT_ANSI__)
+ "ON"
+#else
+ "OFF"
+#endif
+"]";
+
+/*--------------------------------------------------------------------------*/
+
+int main(int argc, char* argv[])
+{
+ int require = 0;
+ require += info_compiler[argc];
+ require += info_platform[argc];
+#ifdef COMPILER_VERSION_MAJOR
+ require += info_version[argc];
+#endif
+#ifdef SIMULATE_ID
+ require += info_simulate[argc];
+#endif
+#ifdef SIMULATE_VERSION_MAJOR
+ require += info_simulate_version[argc];
+#endif
+ require += info_language_standard_default[argc];
+ require += info_language_extensions_default[argc];
+ (void)argv;
+ return require;
+}
diff --git a/build_base/CMakeFiles/3.24.3/CompilerIdCUDA/a.out b/build_base/CMakeFiles/3.24.3/CompilerIdCUDA/a.out
new file mode 100755
index 00000000..55d31ed3
Binary files /dev/null and b/build_base/CMakeFiles/3.24.3/CompilerIdCUDA/a.out differ
diff --git a/build_base/CMakeFiles/3.24.3/CompilerIdCUDA/tmp/CMakeCUDACompilerId.cpp1.ii b/build_base/CMakeFiles/3.24.3/CompilerIdCUDA/tmp/CMakeCUDACompilerId.cpp1.ii
new file mode 100644
index 00000000..d63ea9b5
--- /dev/null
+++ b/build_base/CMakeFiles/3.24.3/CompilerIdCUDA/tmp/CMakeCUDACompilerId.cpp1.ii
@@ -0,0 +1,64473 @@
+# 1 "CMakeCUDACompilerId.cu"
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/_cplus_macros.h" 1 3
+# 1 "CMakeCUDACompilerId.cu" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/_cplus_preinclude.h" 1 3
+
+
+
+
+
+
+
+struct __va_list_tag {
+ unsigned int gp_offset;
+ unsigned int fp_offset;
+ char *overflow_arg_area;
+ char *reg_save_area;
+};
+
+# 27 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/_cplus_preinclude.h" 3
+
+typedef struct __va_list_tag __pgi_va_list[1];
+
+
+
+# 41 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/_cplus_preinclude.h" 3
+
+
+
+
+
+
+
+
+
+
+
+# 1 "CMakeCUDACompilerId.cu" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/_cplus_nvcc.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "CMakeCUDACompilerId.cu" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/openacc_predef.h" 1 3
+
+
+
+
+
+
+
+
+#pragma acc routine seq
+extern "C" void __cxa_vec_ctor(void *__array_address,
+ unsigned long int __element_count,
+ unsigned long int __element_size,
+ void (*__constructor)(void *),
+ void (*__destructor)(void *));
+
+#pragma acc routine seq
+extern "C" void __cxa_vec_cctor(void *__destination_array,
+ void *__source_array,
+ unsigned long int __element_count,
+ unsigned long int __element_size,
+ void (*__constructor)(void *, void *),
+ void (*__destructor)(void *));
+
+#pragma acc routine seq
+extern "C" void __cxa_vec_dtor(void *__array_address,
+ unsigned long int __element_count,
+ unsigned long int __element_size,
+ void (*__destructor)(void *));
+
+#pragma acc routine seq
+extern "C" void *__cxa_vec_new(unsigned long int __element_count,
+ unsigned long int __element_size,
+ unsigned long int __padding_size,
+ void (*__constructor)(void *),
+ void (*__destructor)(void *));
+
+#pragma acc routine seq
+extern "C" void *__cxa_vec_new2(unsigned long int __element_count,
+ unsigned long int __element_size,
+ unsigned long int __padding_size,
+ void (*__constructor)(void *),
+ void (*__destructor)(void *),
+ void *(*__allocator)(unsigned long int),
+ void (*__deallocator)(void *));
+
+#pragma acc routine seq
+extern "C" void *__cxa_vec_new3(unsigned long int __element_count,
+ unsigned long int __element_size,
+ unsigned long int __padding_size,
+ void (*__constructor)(void *),
+ void (*__destructor)(void *),
+ void *(*__allocator)(unsigned long int),
+ void (*__deallocator)(void *, unsigned long int));
+
+#pragma acc routine seq
+extern "C" void __cxa_vec_delete(void *__array_address,
+ unsigned long int __element_size,
+ unsigned long int __padding_size,
+ void (*__destructor)(void *));
+
+#pragma acc routine seq
+extern "C" void __cxa_vec_delete2(void *__array_address,
+ unsigned long int __element_size,
+ unsigned long int __padding_size,
+ void (*__destructor)(void *),
+ void (*__deallocator)(void *));
+
+#pragma acc routine seq
+extern "C" void __cxa_vec_delete3(void *__array_address,
+ unsigned long int __element_size,
+ unsigned long int __padding_size,
+ void (*__destructor)(void *),
+ void (*__deallocator)(void *, unsigned long int));
+
+# 1 "CMakeCUDACompilerId.cu" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 71 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime.h"
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h"
+
+
+
+
+
+
+
+
+
+
+
+
+# 77 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 115 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h"
+
+# 127 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h"
+
+
+
+
+
+
+
+
+
+
+# 145 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h"
+
+
+
+
+# 163 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h"
+
+
+
+# 197 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h"
+
+
+
+
+# 1 "/usr/include/features.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 146 "/usr/include/features.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 166 "/usr/include/features.h" 3
+
+
+
+
+
+# 177 "/usr/include/features.h" 3
+
+
+
+
+
+
+
+
+# 191 "/usr/include/features.h" 3
+
+
+# 217 "/usr/include/features.h" 3
+
+
+
+# 229 "/usr/include/features.h" 3
+
+
+# 237 "/usr/include/features.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 283 "/usr/include/features.h" 3
+
+# 299 "/usr/include/features.h" 3
+
+
+
+
+
+
+
+# 312 "/usr/include/features.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 338 "/usr/include/features.h" 3
+
+
+
+
+
+
+
+# 370 "/usr/include/features.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 409 "/usr/include/features.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 440 "/usr/include/features.h" 3
+
+
+
+# 1 "/usr/include/stdc-predef.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 43 "/usr/include/stdc-predef.h" 3
+
+# 51 "/usr/include/stdc-predef.h" 3
+
+
+
+
+
+
+
+
+
+# 444 "/usr/include/features.h" 2 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/sys/cdefs.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 48 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+# 72 "/usr/include/sys/cdefs.h" 3
+
+# 87 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 120 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+# 137 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+# 160 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 188 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 221 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 248 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+# 266 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 289 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+# 312 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+# 324 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 354 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 382 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+# 400 "/usr/include/sys/cdefs.h" 3
+
+# 408 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+# 424 "/usr/include/sys/cdefs.h" 3
+
+# 433 "/usr/include/sys/cdefs.h" 3
+
+
+# 443 "/usr/include/sys/cdefs.h" 3
+
+# 451 "/usr/include/sys/cdefs.h" 3
+
+# 1 "/usr/include/bits/wordsize.h" 1 3
+
+
+# 10 "/usr/include/bits/wordsize.h" 3
+
+
+
+
+# 453 "/usr/include/sys/cdefs.h" 2 3
+# 1 "/usr/include/bits/long-double.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 454 "/usr/include/sys/cdefs.h" 2 3
+
+# 486 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+# 499 "/usr/include/sys/cdefs.h" 3
+
+
+
+
+
+
+
+
+# 516 "/usr/include/sys/cdefs.h" 3
+
+# 466 "/usr/include/features.h" 2 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/gnu/stubs.h" 1 3
+
+
+
+
+
+# 1 "/usr/include/gnu/stubs-64.h" 1 3
+
+
+
+
+
+
+
+
+
+# 11 "/usr/include/gnu/stubs.h" 2 3
+# 490 "/usr/include/features.h" 2 3
+
+
+# 202 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h" 2
+
+# 244 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h"
+
+
+
+
+
+
+# 1 "/usr/include/c++/7/cstdarg" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 1 3
+# 1 "/usr/include/bits/wordsize.h" 1 3
+
+
+# 10 "/usr/include/bits/wordsize.h" 3
+
+
+
+
+# 4 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 2 3
+# 1964 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 2039 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 2071 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+# 2079 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+# 2087 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+# 2095 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+# 2110 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+# 2122 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+# 2130 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+namespace std
+{
+ typedef unsigned long int size_t;
+ typedef long int ptrdiff_t;
+
+
+ typedef decltype(nullptr) nullptr_t;
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+namespace std
+{
+ inline namespace __cxx11 __attribute__((__abi_tag__ ("cxx11"))) { }
+}
+namespace __gnu_cxx
+{
+ inline namespace __cxx11 __attribute__((__abi_tag__ ("cxx11"))) { }
+}
+# 2232 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+# 2296 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+# 2348 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+
+
+
+
+
+# 2370 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+# 2382 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+# 2410 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 2446 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 2480 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/c++/7/x86_64-suse-linux/bits/os_defines.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 2497 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 2 3
+
+
+# 1 "/usr/include/c++/7/x86_64-suse-linux/bits/cpu_defines.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 2500 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 2 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 2535 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+# 2543 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 2599 "/usr/include/c++/7/x86_64-suse-linux/bits/c++config.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 19 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 2 3
+
+
+
+
+
+
+
+
+
+# 43 "/usr/include/c++/7/cstdarg" 2 3
+# 1 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stdarg.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 35 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stdarg.h" 3
+
+
+
+
+
+typedef __pgi_va_list __gnuc_va_list;
+
+
+
+
+
+
+# 55 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stdarg.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+# 85 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stdarg.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+typedef __gnuc_va_list va_list;
+# 119 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stdarg.h" 3
+
+
+
+
+
+
+
+# 44 "/usr/include/c++/7/cstdarg" 2 3
+
+
+
+
+
+
+
+
+
+namespace std
+{
+ using ::va_list;
+}
+
+# 254 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h" 2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 286 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_config.h"
+
+
+
+
+
+
+
+
+
+# 84 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime.h" 2
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/builtin_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/device_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 105 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 161 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 172 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+
+
+
+
+
+
+# 185 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 232 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 252 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+
+
+
+# 60 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/device_types.h" 2
+
+
+
+
+
+
+
+
+enum __attribute__((device_builtin)) cudaRoundMode
+{
+ cudaRoundNearest,
+ cudaRoundZero,
+ cudaRoundPosInf,
+ cudaRoundMinInf
+};
+
+
+
+
+
+
+# 57 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/builtin_types.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 255 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 60 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_types.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/vector_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 255 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 66 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/vector_types.h" 2
+
+
+
+
+
+
+
+
+# 91 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/vector_types.h"
+
+
+
+
+
+
+
+
+
+struct __attribute__((device_builtin)) char1
+{
+ signed char x;
+};
+
+struct __attribute__((device_builtin)) uchar1
+{
+ unsigned char x;
+};
+
+
+struct __attribute__((device_builtin)) __attribute__((aligned(2))) char2
+{
+ signed char x, y;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(2))) uchar2
+{
+ unsigned char x, y;
+};
+
+struct __attribute__((device_builtin)) char3
+{
+ signed char x, y, z;
+};
+
+struct __attribute__((device_builtin)) uchar3
+{
+ unsigned char x, y, z;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(4))) char4
+{
+ signed char x, y, z, w;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(4))) uchar4
+{
+ unsigned char x, y, z, w;
+};
+
+struct __attribute__((device_builtin)) short1
+{
+ short x;
+};
+
+struct __attribute__((device_builtin)) ushort1
+{
+ unsigned short x;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(4))) short2
+{
+ short x, y;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(4))) ushort2
+{
+ unsigned short x, y;
+};
+
+struct __attribute__((device_builtin)) short3
+{
+ short x, y, z;
+};
+
+struct __attribute__((device_builtin)) ushort3
+{
+ unsigned short x, y, z;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(8))) short4 { short x; short y; short z; short w; };
+struct __attribute__((device_builtin)) __attribute__((aligned(8))) ushort4 { unsigned short x; unsigned short y; unsigned short z; unsigned short w; };
+
+struct __attribute__((device_builtin)) int1
+{
+ int x;
+};
+
+struct __attribute__((device_builtin)) uint1
+{
+ unsigned int x;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(8))) int2 { int x; int y; };
+struct __attribute__((device_builtin)) __attribute__((aligned(8))) uint2 { unsigned int x; unsigned int y; };
+
+struct __attribute__((device_builtin)) int3
+{
+ int x, y, z;
+};
+
+struct __attribute__((device_builtin)) uint3
+{
+ unsigned int x, y, z;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) int4
+{
+ int x, y, z, w;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) uint4
+{
+ unsigned int x, y, z, w;
+};
+
+struct __attribute__((device_builtin)) long1
+{
+ long int x;
+};
+
+struct __attribute__((device_builtin)) ulong1
+{
+ unsigned long x;
+};
+
+
+
+
+
+
+struct __attribute__((device_builtin)) __attribute__((aligned(2*sizeof(long int)))) long2
+{
+ long int x, y;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(2*sizeof(unsigned long int)))) ulong2
+{
+ unsigned long int x, y;
+};
+
+
+
+struct __attribute__((device_builtin)) long3
+{
+ long int x, y, z;
+};
+
+struct __attribute__((device_builtin)) ulong3
+{
+ unsigned long int x, y, z;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) long4
+{
+ long int x, y, z, w;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) ulong4
+{
+ unsigned long int x, y, z, w;
+};
+
+struct __attribute__((device_builtin)) float1
+{
+ float x;
+};
+
+# 275 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/vector_types.h"
+
+struct __attribute__((device_builtin)) __attribute__((aligned(8))) float2 { float x; float y; };
+
+
+
+
+struct __attribute__((device_builtin)) float3
+{
+ float x, y, z;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) float4
+{
+ float x, y, z, w;
+};
+
+struct __attribute__((device_builtin)) longlong1
+{
+ long long int x;
+};
+
+struct __attribute__((device_builtin)) ulonglong1
+{
+ unsigned long long int x;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) longlong2
+{
+ long long int x, y;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) ulonglong2
+{
+ unsigned long long int x, y;
+};
+
+struct __attribute__((device_builtin)) longlong3
+{
+ long long int x, y, z;
+};
+
+struct __attribute__((device_builtin)) ulonglong3
+{
+ unsigned long long int x, y, z;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) longlong4
+{
+ long long int x, y, z ,w;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) ulonglong4
+{
+ unsigned long long int x, y, z, w;
+};
+
+struct __attribute__((device_builtin)) double1
+{
+ double x;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) double2
+{
+ double x, y;
+};
+
+struct __attribute__((device_builtin)) double3
+{
+ double x, y, z;
+};
+
+struct __attribute__((device_builtin)) __attribute__((aligned(16))) double4
+{
+ double x, y, z, w;
+};
+
+
+
+
+
+
+
+
+
+
+
+
+
+typedef __attribute__((device_builtin)) struct char1 char1;
+typedef __attribute__((device_builtin)) struct uchar1 uchar1;
+typedef __attribute__((device_builtin)) struct char2 char2;
+typedef __attribute__((device_builtin)) struct uchar2 uchar2;
+typedef __attribute__((device_builtin)) struct char3 char3;
+typedef __attribute__((device_builtin)) struct uchar3 uchar3;
+typedef __attribute__((device_builtin)) struct char4 char4;
+typedef __attribute__((device_builtin)) struct uchar4 uchar4;
+typedef __attribute__((device_builtin)) struct short1 short1;
+typedef __attribute__((device_builtin)) struct ushort1 ushort1;
+typedef __attribute__((device_builtin)) struct short2 short2;
+typedef __attribute__((device_builtin)) struct ushort2 ushort2;
+typedef __attribute__((device_builtin)) struct short3 short3;
+typedef __attribute__((device_builtin)) struct ushort3 ushort3;
+typedef __attribute__((device_builtin)) struct short4 short4;
+typedef __attribute__((device_builtin)) struct ushort4 ushort4;
+typedef __attribute__((device_builtin)) struct int1 int1;
+typedef __attribute__((device_builtin)) struct uint1 uint1;
+typedef __attribute__((device_builtin)) struct int2 int2;
+typedef __attribute__((device_builtin)) struct uint2 uint2;
+typedef __attribute__((device_builtin)) struct int3 int3;
+typedef __attribute__((device_builtin)) struct uint3 uint3;
+typedef __attribute__((device_builtin)) struct int4 int4;
+typedef __attribute__((device_builtin)) struct uint4 uint4;
+typedef __attribute__((device_builtin)) struct long1 long1;
+typedef __attribute__((device_builtin)) struct ulong1 ulong1;
+typedef __attribute__((device_builtin)) struct long2 long2;
+typedef __attribute__((device_builtin)) struct ulong2 ulong2;
+typedef __attribute__((device_builtin)) struct long3 long3;
+typedef __attribute__((device_builtin)) struct ulong3 ulong3;
+typedef __attribute__((device_builtin)) struct long4 long4;
+typedef __attribute__((device_builtin)) struct ulong4 ulong4;
+typedef __attribute__((device_builtin)) struct float1 float1;
+typedef __attribute__((device_builtin)) struct float2 float2;
+typedef __attribute__((device_builtin)) struct float3 float3;
+typedef __attribute__((device_builtin)) struct float4 float4;
+typedef __attribute__((device_builtin)) struct longlong1 longlong1;
+typedef __attribute__((device_builtin)) struct ulonglong1 ulonglong1;
+typedef __attribute__((device_builtin)) struct longlong2 longlong2;
+typedef __attribute__((device_builtin)) struct ulonglong2 ulonglong2;
+typedef __attribute__((device_builtin)) struct longlong3 longlong3;
+typedef __attribute__((device_builtin)) struct ulonglong3 ulonglong3;
+typedef __attribute__((device_builtin)) struct longlong4 longlong4;
+typedef __attribute__((device_builtin)) struct ulonglong4 ulonglong4;
+typedef __attribute__((device_builtin)) struct double1 double1;
+typedef __attribute__((device_builtin)) struct double2 double2;
+typedef __attribute__((device_builtin)) struct double3 double3;
+typedef __attribute__((device_builtin)) struct double4 double4;
+
+
+
+
+
+
+
+struct __attribute__((device_builtin)) dim3
+{
+ unsigned int x, y, z;
+
+
+ __attribute__((host)) __attribute__((device)) constexpr dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
+ __attribute__((host)) __attribute__((device)) constexpr dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
+ __attribute__((host)) __attribute__((device)) constexpr operator uint3(void) const { return uint3{x, y, z}; }
+# 432 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/vector_types.h"
+};
+
+typedef __attribute__((device_builtin)) struct dim3 dim3;
+
+
+
+
+
+
+
+
+# 62 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_types.h" 2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/syslimits.h" 1 3
+
+
+
+
+
+# 1 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 192 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 3
+
+# 1 "/usr/include/limits.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/libc-header-start.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 44 "/usr/include/bits/libc-header-start.h" 3
+
+
+
+
+
+# 61 "/usr/include/bits/libc-header-start.h" 3
+
+
+
+
+
+# 78 "/usr/include/bits/libc-header-start.h" 3
+
+
+
+# 27 "/usr/include/limits.h" 2 3
+
+
+
+
+
+
+
+
+
+
+# 114 "/usr/include/limits.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 141 "/usr/include/limits.h" 3
+
+
+
+
+# 180 "/usr/include/limits.h" 3
+
+
+
+# 1 "/usr/include/bits/posix1_lim.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/wordsize.h" 1 3
+
+
+# 10 "/usr/include/bits/wordsize.h" 3
+
+
+
+
+# 28 "/usr/include/bits/posix1_lim.h" 2 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/local_lim.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 36 "/usr/include/bits/local_lim.h" 3
+
+
+# 1 "/usr/include/linux/limits.h" 1 3
+
+
+
+
+
+
+# 18 "/usr/include/linux/limits.h" 3
+
+
+
+# 39 "/usr/include/bits/local_lim.h" 2 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 162 "/usr/include/bits/posix1_lim.h" 2 3
+
+
+
+
+
+
+# 174 "/usr/include/bits/posix1_lim.h" 3
+
+
+
+
+
+
+
+
+
+# 184 "/usr/include/limits.h" 2 3
+
+
+# 1 "/usr/include/bits/posix2_lim.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 86 "/usr/include/bits/posix2_lim.h" 3
+
+
+
+
+# 188 "/usr/include/limits.h" 2 3
+
+
+# 1 "/usr/include/bits/xopen_lim.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/uio_lim.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 65 "/usr/include/bits/xopen_lim.h" 2 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 123 "/usr/include/bits/xopen_lim.h" 3
+
+
+# 136 "/usr/include/bits/xopen_lim.h" 3
+
+# 147 "/usr/include/bits/xopen_lim.h" 3
+
+# 192 "/usr/include/limits.h" 2 3
+# 195 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 2 3
+
+
+# 8 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/syslimits.h" 2 3
+# 35 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 2 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 84 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 3
+
+
+# 101 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 3
+
+
+
+
+
+
+
+
+# 115 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 148 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 186 "/usr/lib64/gcc/x86_64-suse-linux/7/include-fixed/limits.h" 3
+
+
+
+
+
+# 82 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_types.h" 2
+# 1 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 91 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 116 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+# 149 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+typedef long int ptrdiff_t;
+# 159 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+# 216 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+typedef unsigned long int size_t;
+# 240 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+# 349 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+# 361 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+# 395 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+# 413 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+typedef struct {
+ long long __max_align_ll __attribute__((__aligned__(__alignof__(long long))));
+ long double __max_align_ld __attribute__((__aligned__(__alignof__(long double))));
+
+
+
+
+
+
+
+
+} max_align_t;
+
+
+
+
+
+
+ typedef decltype(nullptr) nullptr_t;
+
+
+
+
+
+# 83 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_types.h" 2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 143 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_types.h"
+
+# 151 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_types.h"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 193 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_types.h"
+
+
+
+
+
+
+
+
+
+
+
+enum __attribute__((device_builtin)) cudaError
+{
+
+
+
+
+
+ cudaSuccess = 0,
+
+
+
+
+
+ cudaErrorInvalidValue = 1,
+
+
+
+
+
+ cudaErrorMemoryAllocation = 2,
+
+
+
+
+
+ cudaErrorInitializationError = 3,
+
+
+
+
+
+
+ cudaErrorCudartUnloading = 4,
+
+
+
+
+
+
+ cudaErrorProfilerDisabled = 5,
+
+
+
+
+
+
+
+ cudaErrorProfilerNotInitialized = 6,
+
+
+
+
+
+
+ cudaErrorProfilerAlreadyStarted = 7,
+
+
+
+
+
+
+ cudaErrorProfilerAlreadyStopped = 8,
+
+
+
+
+
+
+
+
+ cudaErrorInvalidConfiguration = 9,
+
+
+
+
+
+ cudaErrorInvalidPitchValue = 12,
+
+
+
+
+
+ cudaErrorInvalidSymbol = 13,
+
+
+
+
+
+
+
+ cudaErrorInvalidHostPointer = 16,
+
+
+
+
+
+
+
+ cudaErrorInvalidDevicePointer = 17,
+
+
+
+
+
+ cudaErrorInvalidTexture = 18,
+
+
+
+
+
+ cudaErrorInvalidTextureBinding = 19,
+
+
+
+
+
+
+ cudaErrorInvalidChannelDescriptor = 20,
+
+
+
+
+
+ cudaErrorInvalidMemcpyDirection = 21,
+
+
+
+
+
+
+
+
+
+ cudaErrorAddressOfConstant = 22,
+
+
+
+
+
+
+
+
+ cudaErrorTextureFetchFailed = 23,
+
+
+
+
+
+
+
+
+ cudaErrorTextureNotBound = 24,
+
+
+
+
+
+
+
+
+ cudaErrorSynchronizationError = 25,
+
+
+
+
+
+ cudaErrorInvalidFilterSetting = 26,
+
+
+
+
+
+ cudaErrorInvalidNormSetting = 27,
+
+
+
+
+
+
+
+ cudaErrorMixedDeviceExecution = 28,
+
+
+
+
+
+
+
+ cudaErrorNotYetImplemented = 31,
+
+
+
+
+
+
+
+
+ cudaErrorMemoryValueTooLarge = 32,
+
+
+
+
+
+
+ cudaErrorStubLibrary = 34,
+
+
+
+
+
+
+ cudaErrorInsufficientDriver = 35,
+
+
+
+
+
+
+ cudaErrorCallRequiresNewerDriver = 36,
+
+
+
+
+
+ cudaErrorInvalidSurface = 37,
+
+
+
+
+
+ cudaErrorDuplicateVariableName = 43,
+
+
+
+
+
+ cudaErrorDuplicateTextureName = 44,
+
+
+
+
+
+ cudaErrorDuplicateSurfaceName = 45,
+
+
+
+
+
+
+
+
+
+ cudaErrorDevicesUnavailable = 46,
+
+
+
+
+
+
+
+
+
+
+
+
+ cudaErrorIncompatibleDriverContext = 49,
+
+
+
+
+
+ cudaErrorMissingConfiguration = 52,
+
+
+
+
+
+
+
+
+ cudaErrorPriorLaunchFailure = 53,
+
+
+
+
+
+
+ cudaErrorLaunchMaxDepthExceeded = 65,
+
+
+
+
+
+
+
+ cudaErrorLaunchFileScopedTex = 66,
+
+
+
+
+
+
+
+ cudaErrorLaunchFileScopedSurf = 67,
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ cudaErrorSyncDepthExceeded = 68,
+
+
+
+
+
+
+
+
+
+
+
+ cudaErrorLaunchPendingCountExceeded = 69,
+
+
+
+
+
+ cudaErrorInvalidDeviceFunction = 98,
+
+
+
+
+
+ cudaErrorNoDevice = 100,
+
+
+
+
+
+
+ cudaErrorInvalidDevice = 101,
+
+
+
+
+ cudaErrorDeviceNotLicensed = 102,
+
+
+
+
+
+
+
+
+ cudaErrorSoftwareValidityNotEstablished = 103,
+
+
+
+
+ cudaErrorStartupFailure = 127,
+
+
+
+
+ cudaErrorInvalidKernelImage = 200,
+
+
+
+
+
+
+
+
+
+ cudaErrorDeviceUninitialized = 201,
+
+
+
+
+ cudaErrorMapBufferObjectFailed = 205,
+
+
+
+
+ cudaErrorUnmapBufferObjectFailed = 206,
+
+
+
+
+
+ cudaErrorArrayIsMapped = 207,
+
+
+
+
+ cudaErrorAlreadyMapped = 208,
+
+
+
+
+
+
+
+ cudaErrorNoKernelImageForDevice = 209,
+
+
+
+
+ cudaErrorAlreadyAcquired = 210,
+
+
+
+
+ cudaErrorNotMapped = 211,
+
+
+
+
+
+ cudaErrorNotMappedAsArray = 212,
+
+
+
+
+
+ cudaErrorNotMappedAsPointer = 213,
+
+
+
+
+
+ cudaErrorECCUncorrectable = 214,
+
+
+
+
+
+ cudaErrorUnsupportedLimit = 215,
+
+
+
+
+
+ cudaErrorDeviceAlreadyInUse = 216,
+
+
+
+
+
+ cudaErrorPeerAccessUnsupported = 217,
+
+
+
+
+
+ cudaErrorInvalidPtx = 218,
+
+
+
+
+ cudaErrorInvalidGraphicsContext = 219,
+
+
+
+
+
+ cudaErrorNvlinkUncorrectable = 220,
+
+
+
+
+
+
+ cudaErrorJitCompilerNotFound = 221,
+
+
+
+
+
+
+ cudaErrorUnsupportedPtxVersion = 222,
+
+
+
+
+
+
+ cudaErrorJitCompilationDisabled = 223,
+
+
+
+
+ cudaErrorUnsupportedExecAffinity = 224,
+
+
+
+
+ cudaErrorInvalidSource = 300,
+
+
+
+
+ cudaErrorFileNotFound = 301,
+
+
+
+
+ cudaErrorSharedObjectSymbolNotFound = 302,
+
+
+
+
+ cudaErrorSharedObjectInitFailed = 303,
+
+
+
+
+ cudaErrorOperatingSystem = 304,
+
+
+
+
+
+
+ cudaErrorInvalidResourceHandle = 400,
+
+
+
+
+
+ cudaErrorIllegalState = 401,
+
+
+
+
+
+
+ cudaErrorSymbolNotFound = 500,
+
+
+
+
+
+
+
+ cudaErrorNotReady = 600,
+
+
+
+
+
+
+
+ cudaErrorIllegalAddress = 700,
+
+
+
+
+
+
+
+
+ cudaErrorLaunchOutOfResources = 701,
+
+
+
+
+
+
+
+
+
+
+ cudaErrorLaunchTimeout = 702,
+
+
+
+
+
+ cudaErrorLaunchIncompatibleTexturing = 703,
+
+
+
+
+
+
+ cudaErrorPeerAccessAlreadyEnabled = 704,
+
+
+
+
+
+
+ cudaErrorPeerAccessNotEnabled = 705,
+
+
+
+
+
+
+
+
+
+
+
+
+ cudaErrorSetOnActiveProcess = 708,
+
+
+
+
+
+
+ cudaErrorContextIsDestroyed = 709,
+
+
+
+
+
+
+ cudaErrorAssert = 710,
+
+
+
+
+
+
+ cudaErrorTooManyPeers = 711,
+
+
+
+
+
+ cudaErrorHostMemoryAlreadyRegistered = 712,
+
+
+
+
+
+ cudaErrorHostMemoryNotRegistered = 713,
+
+
+
+
+
+
+
+
+ cudaErrorHardwareStackError = 714,
+
+
+
+
+
+
+
+ cudaErrorIllegalInstruction = 715,
+
+
+
+
+
+
+
+
+ cudaErrorMisalignedAddress = 716,
+
+
+
+
+
+
+
+
+
+
+ cudaErrorInvalidAddressSpace = 717,
+
+
+
+
+
+
+
+ cudaErrorInvalidPc = 718,
+
+
+
+
+
+
+
+
+
+
+ cudaErrorLaunchFailure = 719,
+
+
+
+
+
+
+
+
+ cudaErrorCooperativeLaunchTooLarge = 720,
+
+
+
+
+ cudaErrorNotPermitted = 800,
+
+
+
+
+
+ cudaErrorNotSupported = 801,
+
+
+
+
+
+
+
+
+ cudaErrorSystemNotReady = 802,
+
+
+
+
+
+
+ cudaErrorSystemDriverMismatch = 803,
+
+
+
+
+
+
+
+
+ cudaErrorCompatNotSupportedOnDevice = 804,
+
+
+
+
+ cudaErrorMpsConnectionFailed = 805,
+
+
+
+
+ cudaErrorMpsRpcFailure = 806,
+
+
+
+
+
+ cudaErrorMpsServerNotReady = 807,
+
+
+
+
+ cudaErrorMpsMaxClientsReached = 808,
+
+
+
+
+ cudaErrorMpsMaxConnectionsReached = 809,
+
+
+
+
+ cudaErrorStreamCaptureUnsupported = 900,
+
+
+
+
+
+ cudaErrorStreamCaptureInvalidated = 901,
+
+
+
+
+
+ cudaErrorStreamCaptureMerge = 902,
+
+
+
+
+ cudaErrorStreamCaptureUnmatched = 903,
+
+
+
+
+
+ cudaErrorStreamCaptureUnjoined = 904,
+
+
+
+
+
+
+ cudaErrorStreamCaptureIsolation = 905,
+
+
+
+
+
+ cudaErrorStreamCaptureImplicit = 906,
+
+
+
+
+
+ cudaErrorCapturedEvent = 907,
+
+
+
+
+
+
+ cudaErrorStreamCaptureWrongThread = 908,
+
+
+
+
+ cudaErrorTimeout = 909,
+
+
+
+
+
+ cudaErrorGraphExecUpdateFailure = 910,
+
+
+
+
+
+
+
+
+
+ cudaErrorExternalDevice = 911,
+
+
+
+
+
+
+
+
+
+
+
+
+ cudaErrorUnknown = 999,
+
+
+
+
+
+
+
+ cudaErrorApiFailureBase = 10000
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaChannelFormatKind
+{
+ cudaChannelFormatKindSigned = 0,
+ cudaChannelFormatKindUnsigned = 1,
+ cudaChannelFormatKindFloat = 2,
+ cudaChannelFormatKindNone = 3,
+ cudaChannelFormatKindNV12 = 4,
+ cudaChannelFormatKindUnsignedNormalized8X1 = 5,
+ cudaChannelFormatKindUnsignedNormalized8X2 = 6,
+ cudaChannelFormatKindUnsignedNormalized8X4 = 7,
+ cudaChannelFormatKindUnsignedNormalized16X1 = 8,
+ cudaChannelFormatKindUnsignedNormalized16X2 = 9,
+ cudaChannelFormatKindUnsignedNormalized16X4 = 10,
+ cudaChannelFormatKindSignedNormalized8X1 = 11,
+ cudaChannelFormatKindSignedNormalized8X2 = 12,
+ cudaChannelFormatKindSignedNormalized8X4 = 13,
+ cudaChannelFormatKindSignedNormalized16X1 = 14,
+ cudaChannelFormatKindSignedNormalized16X2 = 15,
+ cudaChannelFormatKindSignedNormalized16X4 = 16,
+ cudaChannelFormatKindUnsignedBlockCompressed1 = 17,
+ cudaChannelFormatKindUnsignedBlockCompressed1SRGB = 18,
+ cudaChannelFormatKindUnsignedBlockCompressed2 = 19,
+ cudaChannelFormatKindUnsignedBlockCompressed2SRGB = 20,
+ cudaChannelFormatKindUnsignedBlockCompressed3 = 21,
+ cudaChannelFormatKindUnsignedBlockCompressed3SRGB = 22,
+ cudaChannelFormatKindUnsignedBlockCompressed4 = 23,
+ cudaChannelFormatKindSignedBlockCompressed4 = 24,
+ cudaChannelFormatKindUnsignedBlockCompressed5 = 25,
+ cudaChannelFormatKindSignedBlockCompressed5 = 26,
+ cudaChannelFormatKindUnsignedBlockCompressed6H = 27,
+ cudaChannelFormatKindSignedBlockCompressed6H = 28,
+ cudaChannelFormatKindUnsignedBlockCompressed7 = 29,
+ cudaChannelFormatKindUnsignedBlockCompressed7SRGB = 30
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaChannelFormatDesc
+{
+ int x;
+ int y;
+ int z;
+ int w;
+ enum cudaChannelFormatKind f;
+};
+
+
+
+
+typedef struct cudaArray *cudaArray_t;
+
+
+
+
+typedef const struct cudaArray *cudaArray_const_t;
+
+struct cudaArray;
+
+
+
+
+typedef struct cudaMipmappedArray *cudaMipmappedArray_t;
+
+
+
+
+typedef const struct cudaMipmappedArray *cudaMipmappedArray_const_t;
+
+struct cudaMipmappedArray;
+
+
+
+
+
+
+
+
+
+struct __attribute__((device_builtin)) cudaArraySparseProperties {
+ struct {
+ unsigned int width;
+ unsigned int height;
+ unsigned int depth;
+ } tileExtent;
+ unsigned int miptailFirstLevel;
+ unsigned long long miptailSize;
+ unsigned int flags;
+ unsigned int reserved[4];
+};
+
+
+
+
+
+struct __attribute__((device_builtin)) cudaArrayMemoryRequirements {
+ size_t size;
+ size_t alignment;
+ unsigned int reserved[4];
+};
+
+
+
+
+
+enum __attribute__((device_builtin)) cudaMemoryType
+{
+ cudaMemoryTypeUnregistered = 0,
+ cudaMemoryTypeHost = 1,
+ cudaMemoryTypeDevice = 2,
+ cudaMemoryTypeManaged = 3
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaMemcpyKind
+{
+ cudaMemcpyHostToHost = 0,
+ cudaMemcpyHostToDevice = 1,
+ cudaMemcpyDeviceToHost = 2,
+ cudaMemcpyDeviceToDevice = 3,
+ cudaMemcpyDefault = 4
+};
+
+
+
+
+
+
+struct __attribute__((device_builtin)) cudaPitchedPtr
+{
+ void *ptr;
+ size_t pitch;
+ size_t xsize;
+ size_t ysize;
+};
+
+
+
+
+
+
+struct __attribute__((device_builtin)) cudaExtent
+{
+ size_t width;
+ size_t height;
+ size_t depth;
+};
+
+
+
+
+
+
+struct __attribute__((device_builtin)) cudaPos
+{
+ size_t x;
+ size_t y;
+ size_t z;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaMemcpy3DParms
+{
+ cudaArray_t srcArray;
+ struct cudaPos srcPos;
+ struct cudaPitchedPtr srcPtr;
+
+ cudaArray_t dstArray;
+ struct cudaPos dstPos;
+ struct cudaPitchedPtr dstPtr;
+
+ struct cudaExtent extent;
+ enum cudaMemcpyKind kind;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaMemcpy3DPeerParms
+{
+ cudaArray_t srcArray;
+ struct cudaPos srcPos;
+ struct cudaPitchedPtr srcPtr;
+ int srcDevice;
+
+ cudaArray_t dstArray;
+ struct cudaPos dstPos;
+ struct cudaPitchedPtr dstPtr;
+ int dstDevice;
+
+ struct cudaExtent extent;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaMemsetParams {
+ void *dst;
+ size_t pitch;
+ unsigned int value;
+ unsigned int elementSize;
+ size_t width;
+ size_t height;
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaAccessProperty {
+ cudaAccessPropertyNormal = 0,
+ cudaAccessPropertyStreaming = 1,
+ cudaAccessPropertyPersisting = 2
+};
+
+
+
+
+
+
+
+
+
+
+
+
+struct __attribute__((device_builtin)) cudaAccessPolicyWindow {
+ void *base_ptr;
+ size_t num_bytes;
+ float hitRatio;
+ enum cudaAccessProperty hitProp;
+ enum cudaAccessProperty missProp;
+};
+
+
+
+
+
+
+
+
+
+
+
+typedef void ( *cudaHostFn_t)(void *userData);
+
+
+
+
+struct __attribute__((device_builtin)) cudaHostNodeParams {
+ cudaHostFn_t fn;
+ void* userData;
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaStreamCaptureStatus {
+ cudaStreamCaptureStatusNone = 0,
+ cudaStreamCaptureStatusActive = 1,
+ cudaStreamCaptureStatusInvalidated = 2
+
+};
+
+
+
+
+
+enum __attribute__((device_builtin)) cudaStreamCaptureMode {
+ cudaStreamCaptureModeGlobal = 0,
+ cudaStreamCaptureModeThreadLocal = 1,
+ cudaStreamCaptureModeRelaxed = 2
+};
+
+enum __attribute__((device_builtin)) cudaSynchronizationPolicy {
+ cudaSyncPolicyAuto = 1,
+ cudaSyncPolicySpin = 2,
+ cudaSyncPolicyYield = 3,
+ cudaSyncPolicyBlockingSync = 4
+};
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+enum __attribute__((device_builtin)) cudaStreamUpdateCaptureDependenciesFlags {
+ cudaStreamAddCaptureDependencies = 0x0,
+ cudaStreamSetCaptureDependencies = 0x1
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaUserObjectFlags {
+ cudaUserObjectNoDestructorSync = 0x1
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaUserObjectRetainFlags {
+ cudaGraphUserObjectMove = 0x1
+};
+
+
+
+
+struct cudaGraphicsResource;
+
+
+
+
+enum __attribute__((device_builtin)) cudaGraphicsRegisterFlags
+{
+ cudaGraphicsRegisterFlagsNone = 0,
+ cudaGraphicsRegisterFlagsReadOnly = 1,
+ cudaGraphicsRegisterFlagsWriteDiscard = 2,
+ cudaGraphicsRegisterFlagsSurfaceLoadStore = 4,
+ cudaGraphicsRegisterFlagsTextureGather = 8
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaGraphicsMapFlags
+{
+ cudaGraphicsMapFlagsNone = 0,
+ cudaGraphicsMapFlagsReadOnly = 1,
+ cudaGraphicsMapFlagsWriteDiscard = 2
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaGraphicsCubeFace
+{
+ cudaGraphicsCubeFacePositiveX = 0x00,
+ cudaGraphicsCubeFaceNegativeX = 0x01,
+ cudaGraphicsCubeFacePositiveY = 0x02,
+ cudaGraphicsCubeFaceNegativeY = 0x03,
+ cudaGraphicsCubeFacePositiveZ = 0x04,
+ cudaGraphicsCubeFaceNegativeZ = 0x05
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaResourceType
+{
+ cudaResourceTypeArray = 0x00,
+ cudaResourceTypeMipmappedArray = 0x01,
+ cudaResourceTypeLinear = 0x02,
+ cudaResourceTypePitch2D = 0x03
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaResourceViewFormat
+{
+ cudaResViewFormatNone = 0x00,
+ cudaResViewFormatUnsignedChar1 = 0x01,
+ cudaResViewFormatUnsignedChar2 = 0x02,
+ cudaResViewFormatUnsignedChar4 = 0x03,
+ cudaResViewFormatSignedChar1 = 0x04,
+ cudaResViewFormatSignedChar2 = 0x05,
+ cudaResViewFormatSignedChar4 = 0x06,
+ cudaResViewFormatUnsignedShort1 = 0x07,
+ cudaResViewFormatUnsignedShort2 = 0x08,
+ cudaResViewFormatUnsignedShort4 = 0x09,
+ cudaResViewFormatSignedShort1 = 0x0a,
+ cudaResViewFormatSignedShort2 = 0x0b,
+ cudaResViewFormatSignedShort4 = 0x0c,
+ cudaResViewFormatUnsignedInt1 = 0x0d,
+ cudaResViewFormatUnsignedInt2 = 0x0e,
+ cudaResViewFormatUnsignedInt4 = 0x0f,
+ cudaResViewFormatSignedInt1 = 0x10,
+ cudaResViewFormatSignedInt2 = 0x11,
+ cudaResViewFormatSignedInt4 = 0x12,
+ cudaResViewFormatHalf1 = 0x13,
+ cudaResViewFormatHalf2 = 0x14,
+ cudaResViewFormatHalf4 = 0x15,
+ cudaResViewFormatFloat1 = 0x16,
+ cudaResViewFormatFloat2 = 0x17,
+ cudaResViewFormatFloat4 = 0x18,
+ cudaResViewFormatUnsignedBlockCompressed1 = 0x19,
+ cudaResViewFormatUnsignedBlockCompressed2 = 0x1a,
+ cudaResViewFormatUnsignedBlockCompressed3 = 0x1b,
+ cudaResViewFormatUnsignedBlockCompressed4 = 0x1c,
+ cudaResViewFormatSignedBlockCompressed4 = 0x1d,
+ cudaResViewFormatUnsignedBlockCompressed5 = 0x1e,
+ cudaResViewFormatSignedBlockCompressed5 = 0x1f,
+ cudaResViewFormatUnsignedBlockCompressed6H = 0x20,
+ cudaResViewFormatSignedBlockCompressed6H = 0x21,
+ cudaResViewFormatUnsignedBlockCompressed7 = 0x22
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaResourceDesc {
+ enum cudaResourceType resType;
+
+ union {
+ struct {
+ cudaArray_t array;
+ } array;
+ struct {
+ cudaMipmappedArray_t mipmap;
+ } mipmap;
+ struct {
+ void *devPtr;
+ struct cudaChannelFormatDesc desc;
+ size_t sizeInBytes;
+ } linear;
+ struct {
+ void *devPtr;
+ struct cudaChannelFormatDesc desc;
+ size_t width;
+ size_t height;
+ size_t pitchInBytes;
+ } pitch2D;
+ } res;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaResourceViewDesc
+{
+ enum cudaResourceViewFormat format;
+ size_t width;
+ size_t height;
+ size_t depth;
+ unsigned int firstMipmapLevel;
+ unsigned int lastMipmapLevel;
+ unsigned int firstLayer;
+ unsigned int lastLayer;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaPointerAttributes
+{
+
+
+
+
+ enum cudaMemoryType type;
+
+
+
+
+
+
+
+
+
+
+ int device;
+
+
+
+
+
+ void *devicePointer;
+
+
+
+
+
+
+
+
+ void *hostPointer;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaFuncAttributes
+{
+
+
+
+
+
+ size_t sharedSizeBytes;
+
+
+
+
+
+ size_t constSizeBytes;
+
+
+
+
+ size_t localSizeBytes;
+
+
+
+
+
+
+ int maxThreadsPerBlock;
+
+
+
+
+ int numRegs;
+
+
+
+
+
+
+ int ptxVersion;
+
+
+
+
+
+
+ int binaryVersion;
+
+
+
+
+
+ int cacheModeCA;
+
+
+
+
+
+
+ int maxDynamicSharedSizeBytes;
+
+
+
+
+
+
+
+
+ int preferredShmemCarveout;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaFuncAttribute
+{
+ cudaFuncAttributeMaxDynamicSharedMemorySize = 8,
+ cudaFuncAttributePreferredSharedMemoryCarveout = 9,
+
+
+
+
+
+
+
+
+ cudaFuncAttributeMax
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaFuncCache
+{
+ cudaFuncCachePreferNone = 0,
+ cudaFuncCachePreferShared = 1,
+ cudaFuncCachePreferL1 = 2,
+ cudaFuncCachePreferEqual = 3
+};
+
+
+
+
+
+enum __attribute__((device_builtin)) cudaSharedMemConfig
+{
+ cudaSharedMemBankSizeDefault = 0,
+ cudaSharedMemBankSizeFourByte = 1,
+ cudaSharedMemBankSizeEightByte = 2
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaSharedCarveout {
+ cudaSharedmemCarveoutDefault = -1,
+ cudaSharedmemCarveoutMaxShared = 100,
+ cudaSharedmemCarveoutMaxL1 = 0
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaComputeMode
+{
+ cudaComputeModeDefault = 0,
+ cudaComputeModeExclusive = 1,
+ cudaComputeModeProhibited = 2,
+ cudaComputeModeExclusiveProcess = 3
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaLimit
+{
+ cudaLimitStackSize = 0x00,
+ cudaLimitPrintfFifoSize = 0x01,
+ cudaLimitMallocHeapSize = 0x02,
+ cudaLimitDevRuntimeSyncDepth = 0x03,
+ cudaLimitDevRuntimePendingLaunchCount = 0x04,
+ cudaLimitMaxL2FetchGranularity = 0x05,
+ cudaLimitPersistingL2CacheSize = 0x06
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaMemoryAdvise
+{
+ cudaMemAdviseSetReadMostly = 1,
+ cudaMemAdviseUnsetReadMostly = 2,
+ cudaMemAdviseSetPreferredLocation = 3,
+ cudaMemAdviseUnsetPreferredLocation = 4,
+ cudaMemAdviseSetAccessedBy = 5,
+ cudaMemAdviseUnsetAccessedBy = 6
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaMemRangeAttribute
+{
+ cudaMemRangeAttributeReadMostly = 1,
+ cudaMemRangeAttributePreferredLocation = 2,
+ cudaMemRangeAttributeAccessedBy = 3,
+ cudaMemRangeAttributeLastPrefetchLocation = 4
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaOutputMode
+{
+ cudaKeyValuePair = 0x00,
+ cudaCSV = 0x01
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaFlushGPUDirectRDMAWritesOptions {
+ cudaFlushGPUDirectRDMAWritesOptionHost = 1<<0,
+ cudaFlushGPUDirectRDMAWritesOptionMemOps = 1<<1
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaGPUDirectRDMAWritesOrdering {
+ cudaGPUDirectRDMAWritesOrderingNone = 0,
+ cudaGPUDirectRDMAWritesOrderingOwner = 100,
+ cudaGPUDirectRDMAWritesOrderingAllDevices = 200
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaFlushGPUDirectRDMAWritesScope {
+ cudaFlushGPUDirectRDMAWritesToOwner = 100,
+ cudaFlushGPUDirectRDMAWritesToAllDevices = 200
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaFlushGPUDirectRDMAWritesTarget {
+ cudaFlushGPUDirectRDMAWritesTargetCurrentDevice
+};
+
+
+
+
+
+enum __attribute__((device_builtin)) cudaDeviceAttr
+{
+ cudaDevAttrMaxThreadsPerBlock = 1,
+ cudaDevAttrMaxBlockDimX = 2,
+ cudaDevAttrMaxBlockDimY = 3,
+ cudaDevAttrMaxBlockDimZ = 4,
+ cudaDevAttrMaxGridDimX = 5,
+ cudaDevAttrMaxGridDimY = 6,
+ cudaDevAttrMaxGridDimZ = 7,
+ cudaDevAttrMaxSharedMemoryPerBlock = 8,
+ cudaDevAttrTotalConstantMemory = 9,
+ cudaDevAttrWarpSize = 10,
+ cudaDevAttrMaxPitch = 11,
+ cudaDevAttrMaxRegistersPerBlock = 12,
+ cudaDevAttrClockRate = 13,
+ cudaDevAttrTextureAlignment = 14,
+ cudaDevAttrGpuOverlap = 15,
+ cudaDevAttrMultiProcessorCount = 16,
+ cudaDevAttrKernelExecTimeout = 17,
+ cudaDevAttrIntegrated = 18,
+ cudaDevAttrCanMapHostMemory = 19,
+ cudaDevAttrComputeMode = 20,
+ cudaDevAttrMaxTexture1DWidth = 21,
+ cudaDevAttrMaxTexture2DWidth = 22,
+ cudaDevAttrMaxTexture2DHeight = 23,
+ cudaDevAttrMaxTexture3DWidth = 24,
+ cudaDevAttrMaxTexture3DHeight = 25,
+ cudaDevAttrMaxTexture3DDepth = 26,
+ cudaDevAttrMaxTexture2DLayeredWidth = 27,
+ cudaDevAttrMaxTexture2DLayeredHeight = 28,
+ cudaDevAttrMaxTexture2DLayeredLayers = 29,
+ cudaDevAttrSurfaceAlignment = 30,
+ cudaDevAttrConcurrentKernels = 31,
+ cudaDevAttrEccEnabled = 32,
+ cudaDevAttrPciBusId = 33,
+ cudaDevAttrPciDeviceId = 34,
+ cudaDevAttrTccDriver = 35,
+ cudaDevAttrMemoryClockRate = 36,
+ cudaDevAttrGlobalMemoryBusWidth = 37,
+ cudaDevAttrL2CacheSize = 38,
+ cudaDevAttrMaxThreadsPerMultiProcessor = 39,
+ cudaDevAttrAsyncEngineCount = 40,
+ cudaDevAttrUnifiedAddressing = 41,
+ cudaDevAttrMaxTexture1DLayeredWidth = 42,
+ cudaDevAttrMaxTexture1DLayeredLayers = 43,
+ cudaDevAttrMaxTexture2DGatherWidth = 45,
+ cudaDevAttrMaxTexture2DGatherHeight = 46,
+ cudaDevAttrMaxTexture3DWidthAlt = 47,
+ cudaDevAttrMaxTexture3DHeightAlt = 48,
+ cudaDevAttrMaxTexture3DDepthAlt = 49,
+ cudaDevAttrPciDomainId = 50,
+ cudaDevAttrTexturePitchAlignment = 51,
+ cudaDevAttrMaxTextureCubemapWidth = 52,
+ cudaDevAttrMaxTextureCubemapLayeredWidth = 53,
+ cudaDevAttrMaxTextureCubemapLayeredLayers = 54,
+ cudaDevAttrMaxSurface1DWidth = 55,
+ cudaDevAttrMaxSurface2DWidth = 56,
+ cudaDevAttrMaxSurface2DHeight = 57,
+ cudaDevAttrMaxSurface3DWidth = 58,
+ cudaDevAttrMaxSurface3DHeight = 59,
+ cudaDevAttrMaxSurface3DDepth = 60,
+ cudaDevAttrMaxSurface1DLayeredWidth = 61,
+ cudaDevAttrMaxSurface1DLayeredLayers = 62,
+ cudaDevAttrMaxSurface2DLayeredWidth = 63,
+ cudaDevAttrMaxSurface2DLayeredHeight = 64,
+ cudaDevAttrMaxSurface2DLayeredLayers = 65,
+ cudaDevAttrMaxSurfaceCubemapWidth = 66,
+ cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67,
+ cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68,
+ cudaDevAttrMaxTexture1DLinearWidth = 69,
+ cudaDevAttrMaxTexture2DLinearWidth = 70,
+ cudaDevAttrMaxTexture2DLinearHeight = 71,
+ cudaDevAttrMaxTexture2DLinearPitch = 72,
+ cudaDevAttrMaxTexture2DMipmappedWidth = 73,
+ cudaDevAttrMaxTexture2DMipmappedHeight = 74,
+ cudaDevAttrComputeCapabilityMajor = 75,
+ cudaDevAttrComputeCapabilityMinor = 76,
+ cudaDevAttrMaxTexture1DMipmappedWidth = 77,
+ cudaDevAttrStreamPrioritiesSupported = 78,
+ cudaDevAttrGlobalL1CacheSupported = 79,
+ cudaDevAttrLocalL1CacheSupported = 80,
+ cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81,
+ cudaDevAttrMaxRegistersPerMultiprocessor = 82,
+ cudaDevAttrManagedMemory = 83,
+ cudaDevAttrIsMultiGpuBoard = 84,
+ cudaDevAttrMultiGpuBoardGroupID = 85,
+ cudaDevAttrHostNativeAtomicSupported = 86,
+ cudaDevAttrSingleToDoublePrecisionPerfRatio = 87,
+ cudaDevAttrPageableMemoryAccess = 88,
+ cudaDevAttrConcurrentManagedAccess = 89,
+ cudaDevAttrComputePreemptionSupported = 90,
+ cudaDevAttrCanUseHostPointerForRegisteredMem = 91,
+ cudaDevAttrReserved92 = 92,
+ cudaDevAttrReserved93 = 93,
+ cudaDevAttrReserved94 = 94,
+ cudaDevAttrCooperativeLaunch = 95,
+ cudaDevAttrCooperativeMultiDeviceLaunch = 96,
+ cudaDevAttrMaxSharedMemoryPerBlockOptin = 97,
+ cudaDevAttrCanFlushRemoteWrites = 98,
+ cudaDevAttrHostRegisterSupported = 99,
+ cudaDevAttrPageableMemoryAccessUsesHostPageTables = 100,
+ cudaDevAttrDirectManagedMemAccessFromHost = 101,
+ cudaDevAttrMaxBlocksPerMultiprocessor = 106,
+ cudaDevAttrMaxPersistingL2CacheSize = 108,
+ cudaDevAttrMaxAccessPolicyWindowSize = 109,
+ cudaDevAttrReservedSharedMemoryPerBlock = 111,
+ cudaDevAttrSparseCudaArraySupported = 112,
+ cudaDevAttrHostRegisterReadOnlySupported = 113,
+ cudaDevAttrTimelineSemaphoreInteropSupported = 114,
+ cudaDevAttrMaxTimelineSemaphoreInteropSupported = 114,
+ cudaDevAttrMemoryPoolsSupported = 115,
+ cudaDevAttrGPUDirectRDMASupported = 116,
+ cudaDevAttrGPUDirectRDMAFlushWritesOptions = 117,
+ cudaDevAttrGPUDirectRDMAWritesOrdering = 118,
+ cudaDevAttrMemoryPoolSupportedHandleTypes = 119,
+
+
+
+
+ cudaDevAttrDeferredMappingCudaArraySupported = 121,
+
+ cudaDevAttrMax
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaMemPoolAttr
+{
+
+
+
+
+
+
+
+
+ cudaMemPoolReuseFollowEventDependencies = 0x1,
+
+
+
+
+
+
+ cudaMemPoolReuseAllowOpportunistic = 0x2,
+
+
+
+
+
+
+
+ cudaMemPoolReuseAllowInternalDependencies = 0x3,
+
+
+
+
+
+
+
+
+
+
+ cudaMemPoolAttrReleaseThreshold = 0x4,
+
+
+
+
+
+ cudaMemPoolAttrReservedMemCurrent = 0x5,
+
+
+
+
+
+
+ cudaMemPoolAttrReservedMemHigh = 0x6,
+
+
+
+
+
+ cudaMemPoolAttrUsedMemCurrent = 0x7,
+
+
+
+
+
+
+ cudaMemPoolAttrUsedMemHigh = 0x8
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaMemLocationType {
+ cudaMemLocationTypeInvalid = 0,
+ cudaMemLocationTypeDevice = 1
+};
+
+
+
+
+
+
+struct __attribute__((device_builtin)) cudaMemLocation {
+ enum cudaMemLocationType type;
+ int id;
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaMemAccessFlags {
+ cudaMemAccessFlagsProtNone = 0,
+ cudaMemAccessFlagsProtRead = 1,
+ cudaMemAccessFlagsProtReadWrite = 3
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaMemAccessDesc {
+ struct cudaMemLocation location;
+ enum cudaMemAccessFlags flags;
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaMemAllocationType {
+ cudaMemAllocationTypeInvalid = 0x0,
+
+
+
+ cudaMemAllocationTypePinned = 0x1,
+ cudaMemAllocationTypeMax = 0x7FFFFFFF
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaMemAllocationHandleType {
+ cudaMemHandleTypeNone = 0x0,
+ cudaMemHandleTypePosixFileDescriptor = 0x1,
+ cudaMemHandleTypeWin32 = 0x2,
+ cudaMemHandleTypeWin32Kmt = 0x4
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaMemPoolProps {
+ enum cudaMemAllocationType allocType;
+ enum cudaMemAllocationHandleType handleTypes;
+ struct cudaMemLocation location;
+
+
+
+
+
+
+ void *win32SecurityAttributes;
+ unsigned char reserved[64];
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaMemPoolPtrExportData {
+ unsigned char reserved[64];
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaMemAllocNodeParams {
+
+
+
+
+ struct cudaMemPoolProps poolProps;
+ const struct cudaMemAccessDesc *accessDescs;
+ size_t accessDescCount;
+ size_t bytesize;
+ void *dptr;
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaGraphMemAttributeType {
+
+
+
+
+ cudaGraphMemAttrUsedMemCurrent = 0x0,
+
+
+
+
+
+
+ cudaGraphMemAttrUsedMemHigh = 0x1,
+
+
+
+
+
+
+ cudaGraphMemAttrReservedMemCurrent = 0x2,
+
+
+
+
+
+
+ cudaGraphMemAttrReservedMemHigh = 0x3
+};
+
+
+
+
+
+enum __attribute__((device_builtin)) cudaDeviceP2PAttr {
+ cudaDevP2PAttrPerformanceRank = 1,
+ cudaDevP2PAttrAccessSupported = 2,
+ cudaDevP2PAttrNativeAtomicSupported = 3,
+ cudaDevP2PAttrCudaArrayAccessSupported = 4
+};
+
+
+
+
+
+
+struct __attribute__((device_builtin)) CUuuid_st {
+ char bytes[16];
+};
+typedef __attribute__((device_builtin)) struct CUuuid_st CUuuid;
+
+typedef __attribute__((device_builtin)) struct CUuuid_st cudaUUID_t;
+
+
+
+
+struct __attribute__((device_builtin)) cudaDeviceProp
+{
+ char name[256];
+ cudaUUID_t uuid;
+ char luid[8];
+ unsigned int luidDeviceNodeMask;
+ size_t totalGlobalMem;
+ size_t sharedMemPerBlock;
+ int regsPerBlock;
+ int warpSize;
+ size_t memPitch;
+ int maxThreadsPerBlock;
+ int maxThreadsDim[3];
+ int maxGridSize[3];
+ int clockRate;
+ size_t totalConstMem;
+ int major;
+ int minor;
+ size_t textureAlignment;
+ size_t texturePitchAlignment;
+ int deviceOverlap;
+ int multiProcessorCount;
+ int kernelExecTimeoutEnabled;
+ int integrated;
+ int canMapHostMemory;
+ int computeMode;
+ int maxTexture1D;
+ int maxTexture1DMipmap;
+ int maxTexture1DLinear;
+ int maxTexture2D[2];
+ int maxTexture2DMipmap[2];
+ int maxTexture2DLinear[3];
+ int maxTexture2DGather[2];
+ int maxTexture3D[3];
+ int maxTexture3DAlt[3];
+ int maxTextureCubemap;
+ int maxTexture1DLayered[2];
+ int maxTexture2DLayered[3];
+ int maxTextureCubemapLayered[2];
+ int maxSurface1D;
+ int maxSurface2D[2];
+ int maxSurface3D[3];
+ int maxSurface1DLayered[2];
+ int maxSurface2DLayered[3];
+ int maxSurfaceCubemap;
+ int maxSurfaceCubemapLayered[2];
+ size_t surfaceAlignment;
+ int concurrentKernels;
+ int ECCEnabled;
+ int pciBusID;
+ int pciDeviceID;
+ int pciDomainID;
+ int tccDriver;
+ int asyncEngineCount;
+ int unifiedAddressing;
+ int memoryClockRate;
+ int memoryBusWidth;
+ int l2CacheSize;
+ int persistingL2CacheMaxSize;
+ int maxThreadsPerMultiProcessor;
+ int streamPrioritiesSupported;
+ int globalL1CacheSupported;
+ int localL1CacheSupported;
+ size_t sharedMemPerMultiprocessor;
+ int regsPerMultiprocessor;
+ int managedMemory;
+ int isMultiGpuBoard;
+ int multiGpuBoardGroupID;
+ int hostNativeAtomicSupported;
+ int singleToDoublePrecisionPerfRatio;
+ int pageableMemoryAccess;
+ int concurrentManagedAccess;
+ int computePreemptionSupported;
+ int canUseHostPointerForRegisteredMem;
+ int cooperativeLaunch;
+ int cooperativeMultiDeviceLaunch;
+ size_t sharedMemPerBlockOptin;
+ int pageableMemoryAccessUsesHostPageTables;
+ int directManagedMemAccessFromHost;
+ int maxBlocksPerMultiProcessor;
+ int accessPolicyMaxWindowSize;
+ size_t reservedSharedMemPerBlock;
+};
+
+# 2348 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_types.h"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+typedef __attribute__((device_builtin)) struct __attribute__((device_builtin)) cudaIpcEventHandle_st
+{
+ char reserved[64];
+}cudaIpcEventHandle_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct __attribute__((device_builtin)) cudaIpcMemHandle_st
+{
+ char reserved[64];
+}cudaIpcMemHandle_t;
+
+
+
+
+enum __attribute__((device_builtin)) cudaExternalMemoryHandleType {
+
+
+
+ cudaExternalMemoryHandleTypeOpaqueFd = 1,
+
+
+
+ cudaExternalMemoryHandleTypeOpaqueWin32 = 2,
+
+
+
+ cudaExternalMemoryHandleTypeOpaqueWin32Kmt = 3,
+
+
+
+ cudaExternalMemoryHandleTypeD3D12Heap = 4,
+
+
+
+ cudaExternalMemoryHandleTypeD3D12Resource = 5,
+
+
+
+ cudaExternalMemoryHandleTypeD3D11Resource = 6,
+
+
+
+ cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7,
+
+
+
+ cudaExternalMemoryHandleTypeNvSciBuf = 8
+};
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalMemoryHandleDesc {
+
+
+
+ enum cudaExternalMemoryHandleType type;
+ union {
+
+
+
+
+
+ int fd;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ struct {
+
+
+
+ void *handle;
+
+
+
+
+ const void *name;
+ } win32;
+
+
+
+
+ const void *nvSciBufObject;
+ } handle;
+
+
+
+ unsigned long long size;
+
+
+
+ unsigned int flags;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalMemoryBufferDesc {
+
+
+
+ unsigned long long offset;
+
+
+
+ unsigned long long size;
+
+
+
+ unsigned int flags;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalMemoryMipmappedArrayDesc {
+
+
+
+
+ unsigned long long offset;
+
+
+
+ struct cudaChannelFormatDesc formatDesc;
+
+
+
+ struct cudaExtent extent;
+
+
+
+
+ unsigned int flags;
+
+
+
+ unsigned int numLevels;
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaExternalSemaphoreHandleType {
+
+
+
+ cudaExternalSemaphoreHandleTypeOpaqueFd = 1,
+
+
+
+ cudaExternalSemaphoreHandleTypeOpaqueWin32 = 2,
+
+
+
+ cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3,
+
+
+
+ cudaExternalSemaphoreHandleTypeD3D12Fence = 4,
+
+
+
+ cudaExternalSemaphoreHandleTypeD3D11Fence = 5,
+
+
+
+ cudaExternalSemaphoreHandleTypeNvSciSync = 6,
+
+
+
+ cudaExternalSemaphoreHandleTypeKeyedMutex = 7,
+
+
+
+ cudaExternalSemaphoreHandleTypeKeyedMutexKmt = 8,
+
+
+
+ cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd = 9,
+
+
+
+ cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 = 10
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalSemaphoreHandleDesc {
+
+
+
+ enum cudaExternalSemaphoreHandleType type;
+ union {
+
+
+
+
+
+
+ int fd;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ struct {
+
+
+
+ void *handle;
+
+
+
+
+ const void *name;
+ } win32;
+
+
+
+ const void* nvSciSyncObj;
+ } handle;
+
+
+
+ unsigned int flags;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalSemaphoreSignalParams_v1 {
+ struct {
+
+
+
+ struct {
+
+
+
+ unsigned long long value;
+ } fence;
+ union {
+
+
+
+
+ void *fence;
+ unsigned long long reserved;
+ } nvSciSync;
+
+
+
+ struct {
+
+
+
+ unsigned long long key;
+ } keyedMutex;
+ } params;
+
+
+
+
+
+
+
+
+
+
+ unsigned int flags;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalSemaphoreWaitParams_v1 {
+ struct {
+
+
+
+ struct {
+
+
+
+ unsigned long long value;
+ } fence;
+ union {
+
+
+
+
+ void *fence;
+ unsigned long long reserved;
+ } nvSciSync;
+
+
+
+ struct {
+
+
+
+ unsigned long long key;
+
+
+
+ unsigned int timeoutMs;
+ } keyedMutex;
+ } params;
+
+
+
+
+
+
+
+
+
+
+ unsigned int flags;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalSemaphoreSignalParams{
+ struct {
+
+
+
+ struct {
+
+
+
+ unsigned long long value;
+ } fence;
+ union {
+
+
+
+
+ void *fence;
+ unsigned long long reserved;
+ } nvSciSync;
+
+
+
+ struct {
+
+
+
+ unsigned long long key;
+ } keyedMutex;
+ unsigned int reserved[12];
+ } params;
+
+
+
+
+
+
+
+
+
+
+ unsigned int flags;
+ unsigned int reserved[16];
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalSemaphoreWaitParams {
+ struct {
+
+
+
+ struct {
+
+
+
+ unsigned long long value;
+ } fence;
+ union {
+
+
+
+
+ void *fence;
+ unsigned long long reserved;
+ } nvSciSync;
+
+
+
+ struct {
+
+
+
+ unsigned long long key;
+
+
+
+ unsigned int timeoutMs;
+ } keyedMutex;
+ unsigned int reserved[10];
+ } params;
+
+
+
+
+
+
+
+
+
+
+ unsigned int flags;
+ unsigned int reserved[16];
+};
+
+
+
+
+
+
+
+
+
+
+typedef __attribute__((device_builtin)) enum cudaError cudaError_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct CUstream_st *cudaStream_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct CUevent_st *cudaEvent_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct cudaGraphicsResource *cudaGraphicsResource_t;
+
+
+
+
+typedef __attribute__((device_builtin)) enum cudaOutputMode cudaOutputMode_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct CUexternalMemory_st *cudaExternalMemory_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct CUexternalSemaphore_st *cudaExternalSemaphore_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct CUgraph_st *cudaGraph_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct CUgraphNode_st *cudaGraphNode_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct CUuserObject_st *cudaUserObject_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct CUfunc_st *cudaFunction_t;
+
+
+
+
+typedef __attribute__((device_builtin)) struct CUmemPoolHandle_st *cudaMemPool_t;
+
+
+
+
+enum __attribute__((device_builtin)) cudaCGScope {
+ cudaCGScopeInvalid = 0,
+ cudaCGScopeGrid = 1,
+ cudaCGScopeMultiGrid = 2
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaLaunchParams
+{
+ void *func;
+ dim3 gridDim;
+ dim3 blockDim;
+ void **args;
+ size_t sharedMem;
+ cudaStream_t stream;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaKernelNodeParams {
+ void* func;
+ dim3 gridDim;
+ dim3 blockDim;
+ unsigned int sharedMemBytes;
+ void **kernelParams;
+ void **extra;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalSemaphoreSignalNodeParams {
+ cudaExternalSemaphore_t* extSemArray;
+ const struct cudaExternalSemaphoreSignalParams* paramsArray;
+ unsigned int numExtSems;
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaExternalSemaphoreWaitNodeParams {
+ cudaExternalSemaphore_t* extSemArray;
+ const struct cudaExternalSemaphoreWaitParams* paramsArray;
+ unsigned int numExtSems;
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaGraphNodeType {
+ cudaGraphNodeTypeKernel = 0x00,
+ cudaGraphNodeTypeMemcpy = 0x01,
+ cudaGraphNodeTypeMemset = 0x02,
+ cudaGraphNodeTypeHost = 0x03,
+ cudaGraphNodeTypeGraph = 0x04,
+ cudaGraphNodeTypeEmpty = 0x05,
+ cudaGraphNodeTypeWaitEvent = 0x06,
+ cudaGraphNodeTypeEventRecord = 0x07,
+ cudaGraphNodeTypeExtSemaphoreSignal = 0x08,
+ cudaGraphNodeTypeExtSemaphoreWait = 0x09,
+ cudaGraphNodeTypeMemAlloc = 0x0a,
+ cudaGraphNodeTypeMemFree = 0x0b,
+ cudaGraphNodeTypeCount
+};
+
+
+
+
+typedef struct CUgraphExec_st* cudaGraphExec_t;
+
+
+
+
+enum __attribute__((device_builtin)) cudaGraphExecUpdateResult {
+ cudaGraphExecUpdateSuccess = 0x0,
+ cudaGraphExecUpdateError = 0x1,
+ cudaGraphExecUpdateErrorTopologyChanged = 0x2,
+ cudaGraphExecUpdateErrorNodeTypeChanged = 0x3,
+ cudaGraphExecUpdateErrorFunctionChanged = 0x4,
+ cudaGraphExecUpdateErrorParametersChanged = 0x5,
+ cudaGraphExecUpdateErrorNotSupported = 0x6,
+ cudaGraphExecUpdateErrorUnsupportedFunctionChange = 0x7,
+ cudaGraphExecUpdateErrorAttributesChanged = 0x8
+};
+
+
+
+
+
+enum __attribute__((device_builtin)) cudaGetDriverEntryPointFlags {
+ cudaEnableDefault = 0x0,
+ cudaEnableLegacyStream = 0x1,
+ cudaEnablePerThreadDefaultStream = 0x2
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaGraphDebugDotFlags {
+ cudaGraphDebugDotFlagsVerbose = 1<<0,
+ cudaGraphDebugDotFlagsKernelNodeParams = 1<<2,
+ cudaGraphDebugDotFlagsMemcpyNodeParams = 1<<3,
+ cudaGraphDebugDotFlagsMemsetNodeParams = 1<<4,
+ cudaGraphDebugDotFlagsHostNodeParams = 1<<5,
+ cudaGraphDebugDotFlagsEventNodeParams = 1<<6,
+ cudaGraphDebugDotFlagsExtSemasSignalNodeParams = 1<<7,
+ cudaGraphDebugDotFlagsExtSemasWaitNodeParams = 1<<8,
+ cudaGraphDebugDotFlagsKernelNodeAttributes = 1<<9,
+ cudaGraphDebugDotFlagsHandles = 1<<10
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaGraphInstantiateFlags {
+ cudaGraphInstantiateFlagAutoFreeOnLaunch = 1
+
+ , cudaGraphInstantiateFlagUseNodePriority = 8
+
+
+};
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+typedef __attribute__((device_builtin)) enum cudaStreamAttrID {
+ cudaStreamAttributeAccessPolicyWindow = 1,
+ cudaStreamAttributeSynchronizationPolicy = 3
+} cudaStreamAttrID;
+
+
+
+
+
+
+
+
+
+
+typedef __attribute__((device_builtin)) union cudaStreamAttrValue {
+ struct cudaAccessPolicyWindow accessPolicyWindow;
+ enum cudaSynchronizationPolicy syncPolicy;
+} cudaStreamAttrValue;
+
+
+
+
+
+
+
+
+typedef __attribute__((device_builtin)) enum cudaKernelNodeAttrID {
+ cudaKernelNodeAttributeAccessPolicyWindow = 1
+ , cudaKernelNodeAttributeCooperative = 2
+
+ , cudaKernelNodeAttributePriority = 8
+
+} cudaKernelNodeAttrID;
+
+
+
+
+
+
+
+
+
+
+
+typedef __attribute__((device_builtin)) union cudaKernelNodeAttrValue {
+ struct cudaAccessPolicyWindow accessPolicyWindow;
+ int cooperative;
+
+ int priority;
+
+} cudaKernelNodeAttrValue;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 60 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/builtin_types.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/surface_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 80 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/surface_types.h"
+
+
+
+
+enum __attribute__((device_builtin)) cudaSurfaceBoundaryMode
+{
+ cudaBoundaryModeZero = 0,
+ cudaBoundaryModeClamp = 1,
+ cudaBoundaryModeTrap = 2
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaSurfaceFormatMode
+{
+ cudaFormatModeForced = 0,
+ cudaFormatModeAuto = 1
+};
+
+
+
+
+struct __attribute__((device_builtin)) surfaceReference
+{
+
+
+
+ struct cudaChannelFormatDesc channelDesc;
+};
+
+
+
+
+typedef __attribute__((device_builtin)) unsigned long long cudaSurfaceObject_t;
+
+
+
+
+# 63 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/builtin_types.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/texture_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 80 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/texture_types.h"
+
+
+
+
+enum __attribute__((device_builtin)) cudaTextureAddressMode
+{
+ cudaAddressModeWrap = 0,
+ cudaAddressModeClamp = 1,
+ cudaAddressModeMirror = 2,
+ cudaAddressModeBorder = 3
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaTextureFilterMode
+{
+ cudaFilterModePoint = 0,
+ cudaFilterModeLinear = 1
+};
+
+
+
+
+enum __attribute__((device_builtin)) cudaTextureReadMode
+{
+ cudaReadModeElementType = 0,
+ cudaReadModeNormalizedFloat = 1
+};
+
+
+
+
+struct __attribute__((device_builtin)) textureReference
+{
+
+
+
+ int normalized;
+
+
+
+ enum cudaTextureFilterMode filterMode;
+
+
+
+ enum cudaTextureAddressMode addressMode[3];
+
+
+
+ struct cudaChannelFormatDesc channelDesc;
+
+
+
+ int sRGB;
+
+
+
+ unsigned int maxAnisotropy;
+
+
+
+ enum cudaTextureFilterMode mipmapFilterMode;
+
+
+
+ float mipmapLevelBias;
+
+
+
+ float minMipmapLevelClamp;
+
+
+
+ float maxMipmapLevelClamp;
+
+
+
+ int disableTrilinearOptimization;
+ int __cudaReserved[14];
+};
+
+
+
+
+struct __attribute__((device_builtin)) cudaTextureDesc /* Texture descriptor: same kind of sampling state as textureReference, plus readMode, borderColor and seamlessCubemap; no reserved tail. */
+{
+
+
+
+ enum cudaTextureAddressMode addressMode[3]; /* three address modes; presumably one per dimension -- confirm */
+
+
+
+ enum cudaTextureFilterMode filterMode; /* filter mode enumerator */
+
+
+
+ enum cudaTextureReadMode readMode; /* read mode enumerator */
+
+
+
+ int sRGB; /* int flag related to sRGB handling (per name) */
+
+
+
+ float borderColor[4]; /* four floats; presumably one per colour channel -- confirm */
+
+
+
+ int normalizedCoords; /* int flag; presumably "coordinates are normalized" (per name) */
+
+
+
+ unsigned int maxAnisotropy; /* unsigned anisotropy limit (per name) */
+
+
+
+ enum cudaTextureFilterMode mipmapFilterMode; /* filter mode applied to mipmaps (per name) */
+
+
+
+ float mipmapLevelBias; /* float mipmap level bias (per name) */
+
+
+
+ float minMipmapLevelClamp; /* float lower clamp for the mipmap level (per name) */
+
+
+
+ float maxMipmapLevelClamp; /* float upper clamp for the mipmap level (per name) */
+
+
+
+ int disableTrilinearOptimization; /* int flag (per name); semantics not visible here */
+
+
+
+ int seamlessCubemap; /* int flag (per name); semantics not visible here */
+};
+
+
+
+
+typedef __attribute__((device_builtin)) unsigned long long cudaTextureObject_t; /* Texture object handle, represented as unsigned long long. */
+
+
+
+
+# 64 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/builtin_types.h" 2
+# 92 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/library_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+typedef enum cudaDataType_t /* Data-type tags with explicit, non-sequential values (0..27); the numeric values are fixed, so never renumber. */
+{
+ CUDA_R_16F = 2,
+ CUDA_C_16F = 6,
+ CUDA_R_16BF = 14,
+ CUDA_C_16BF = 15,
+ CUDA_R_32F = 0,
+ CUDA_C_32F = 4,
+ CUDA_R_64F = 1,
+ CUDA_C_64F = 5,
+ CUDA_R_4I = 16,
+ CUDA_C_4I = 17,
+ CUDA_R_4U = 18,
+ CUDA_C_4U = 19,
+ CUDA_R_8I = 3,
+ CUDA_C_8I = 7,
+ CUDA_R_8U = 8,
+ CUDA_C_8U = 9,
+ CUDA_R_16I = 20,
+ CUDA_C_16I = 21,
+ CUDA_R_16U = 22,
+ CUDA_C_16U = 23,
+ CUDA_R_32I = 10,
+ CUDA_C_32I = 11,
+ CUDA_R_32U = 12,
+ CUDA_C_32U = 13,
+ CUDA_R_64I = 24,
+ CUDA_C_64I = 25,
+ CUDA_R_64U = 26,
+ CUDA_C_64U = 27,
+/* NOTE(review): names appear to encode R/C (real/complex), bit width, and F/BF/I/U element kind -- confirm against the CUDA library_types docs. */
+/* The trailing comma after the last enumerator is intentional in the generated text (valid in C99 and C++11 onward). */
+
+
+} cudaDataType;
+
+
+typedef enum libraryPropertyType_t /* Selector for a library version component; enumerators take the implicit values 0, 1, 2. */
+{
+ MAJOR_VERSION, /* 0 */
+ MINOR_VERSION, /* 1 */
+ PATCH_LEVEL /* 2 */
+} libraryPropertyType;
+
+
+
+
+
+
+
+# 93 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/channel_descriptor.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime_api.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 255 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 148 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime_api.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/builtin_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 150 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime_api.h" 2
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern "C" { /* Inline device-side stubs with C linkage: every body below returns cudaErrorUnknown unconditionally. */
+
+
+struct cudaFuncAttributes; /* forward declaration so the stub below can take a pointer to it */
+
+
+inline __attribute__((device)) cudaError_t cudaMalloc(void **p, size_t s) /* stub: ignores p and s */
+{
+ return cudaErrorUnknown;
+}
+
+inline __attribute__((device)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c) /* stub: ignores both arguments */
+{
+ return cudaErrorUnknown;
+}
+
+inline __attribute__((device)) cudaError_t cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) /* stub: *value is never written */
+{
+ return cudaErrorUnknown;
+}
+
+inline __attribute__((device)) cudaError_t cudaGetDevice(int *device) /* stub: *device is never written */
+{
+ return cudaErrorUnknown;
+}
+
+inline __attribute__((device)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize) /* stub: *numBlocks is never written */
+{
+ return cudaErrorUnknown;
+}
+
+inline __attribute__((device)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags) /* stub: *numBlocks is never written */
+{
+ return cudaErrorUnknown;
+}
+
+/* NOTE(review): presumably fallbacks for builds without the device runtime; the guarding #if was consumed by the preprocessor -- confirm in cuda_device_runtime_api.h. */
+
+}
+
+
+
+
+
+
+# 118 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h"
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 255 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 130 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h" 2
+
+
+
+
+
+
+extern "C" /* Device-side (__attribute__((device))) prototypes with C linkage; declarations only, no bodies in this block. */
+{
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+extern __attribute__((device)) __attribute__((cudart_builtin)) __attribute__((deprecated("Use of " "cudaDeviceSynchronize" " from device code is deprecated and will not be supported in a future release. Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING."))) cudaError_t cudaDeviceSynchronize(void); /* device-side use is flagged deprecated; the attribute text names the macro that silences the warning */
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t __cudaDeviceSynchronizeDeprecationAvoidance(void); /* reserved-name internal entry point; not tagged deprecated */
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaGetLastError(void);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaPeekAtLastError(void);
+extern __attribute__((device)) __attribute__((cudart_builtin)) const char* cudaGetErrorString(cudaError_t error);
+extern __attribute__((device)) __attribute__((cudart_builtin)) const char* cudaGetErrorName(cudaError_t error);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaGetDeviceCount(int *count);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaGetDevice(int *device);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaStreamDestroy(cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaEventDestroy(cudaEvent_t event);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaFree(void *devPtr);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMalloc(void **devPtr, size_t size);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaRuntimeGetVersion(int *runtimeVersion);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/* Parameter-buffer entry point: returns void*, takes an alignment and a size in bytes (both size_t). */
+extern __attribute__((device)) __attribute__((cudart_builtin)) void * cudaGetParameterBuffer(size_t alignment, size_t size);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+/* Device-side launch entry points. The _ptsz-suffixed names are distinct symbols with identical parameter lists; the meaning of the suffix is not visible here. */
+extern __attribute__((device)) __attribute__((cudart_builtin)) void * cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
+
+# 244 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_device_runtime_api.h"
+ extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+ extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
+
+/* Occupancy queries: same parameter lists as the inline stubs of the same names earlier in this translation unit. */
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+/* cudaCG* entry points operate on an opaque unsigned long long handle obtained from cudaCGGetIntrinsicHandle. */
+extern __attribute__((device)) __attribute__((cudart_builtin)) unsigned long long cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaCGSynchronize(unsigned long long handle, unsigned int flags);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaCGSynchronizeGrid(unsigned long long handle, unsigned int flags);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
+extern __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
+}
+
+template <typename T> static __inline__ __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaMalloc(T **devPtr, size_t size); /* typed overload of device cudaMalloc: devPtr is T** instead of void**; declaration only */
+template <typename T> static __inline__ __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry); /* typed overload: entry is T* instead of const void*; declaration only */
+template <typename T> static __inline__ __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize); /* typed overload: func passed as T by value; declaration only */
+template <typename T> static __inline__ __attribute__((device)) __attribute__((cudart_builtin)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags); /* as above, with an extra flags word; the <typename T> list was restored because T is otherwise undeclared */
+
+
+
+
+
+
+
+
+# 152 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime_api.h" 2
+
+# 161 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime_api.h"
+
+
+
+
+# 225 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime_api.h"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 266 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime_api.h"
+
+
+
+extern "C" {
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceReset(void); /* Host-side prototype: no arguments, status returned as cudaError_t. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceSynchronize(void); /* Host-side prototype (also tagged cudart_builtin): no arguments; unlike the device-side declaration of the same name, this one carries no deprecated attribute. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceSetLimit(enum cudaLimit limit, size_t value); /* Host-side prototype: a cudaLimit selector plus a size_t value, presumably the new setting for that limit. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit); /* Host-side prototype: pValue is presumably the output slot for the selected limit. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, const struct cudaChannelFormatDesc *fmtDesc, int device); /* Host-side prototype: size_t output pointer, read-only channel-format descriptor, and an int device ordinal. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig); /* Host-side prototype: pCacheConfig is presumably written with the current cache configuration. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority); /* Host-side prototype: two int pointers, presumably outputs for the least and greatest stream priority. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig); /* Host-side prototype: takes the cudaFuncCache enumerator by value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig); /* Host-side prototype: pConfig is presumably written with the current shared-memory configuration. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config); /* Host-side prototype: takes the cudaSharedMemConfig enumerator by value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceGetByPCIBusId(int *device, const char *pciBusId); /* Host-side prototype: read-only PCI bus id string in, int device pointer presumably out. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceGetPCIBusId(char *pciBusId, int len, int device); /* Host-side prototype: writable char buffer with an int length (assumed to be the buffer capacity -- confirm) and a device ordinal. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event); /* Host-side prototype: IPC event handle pointer (presumably output) for an existing event. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle); /* Host-side prototype: the IPC handle is passed by value; event pointer is presumably the output. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr); /* Host-side prototype: IPC memory handle pointer (presumably output) for the allocation at devPtr. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags); /* Host-side prototype: handle by value, flags word, and a void** presumably receiving the mapped pointer. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaIpcCloseMemHandle(void *devPtr); /* Host-side prototype: takes the device pointer; presumably the counterpart of cudaIpcOpenMemHandle (per name). */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceFlushGPUDirectRDMAWrites(enum cudaFlushGPUDirectRDMAWritesTarget target, enum cudaFlushGPUDirectRDMAWritesScope scope); /* Host-side prototype: both arguments are enumerators (a target and a scope). */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaThreadExit(void); /* Host-side prototype tagged deprecated: callers will get a deprecation diagnostic. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaThreadSynchronize(void); /* Host-side prototype tagged deprecated: callers will get a deprecation diagnostic. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaThreadSetLimit(enum cudaLimit limit, size_t value); /* Deprecated host-side prototype; same parameter list as cudaDeviceSetLimit. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit); /* Deprecated host-side prototype; same parameter list as cudaDeviceGetLimit. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig); /* Deprecated host-side prototype; same parameter list as cudaDeviceGetCacheConfig. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig); /* Deprecated host-side prototype; same parameter list as cudaDeviceSetCacheConfig. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaGetLastError(void); /* Host-side prototype: no arguments; the result is itself a cudaError_t. Whether the call clears the stored error is not visible here. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaPeekAtLastError(void); /* Host-side prototype: no arguments; same signature as cudaGetLastError. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) const char* cudaGetErrorName(cudaError_t error); /* Host-side prototype: maps an error code to a const char pointer; the caller must not modify the string. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) const char* cudaGetErrorString(cudaError_t error); /* Host-side prototype: maps an error code to a const char pointer; the caller must not modify the string. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaGetDeviceCount(int *count); /* Host-side prototype: count is presumably the output for the number of devices. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device); /* Host-side prototype: non-const cudaDeviceProp pointer (presumably filled in) for the given device ordinal. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device); /* Host-side prototype: same parameter list as the device-side declaration of this name; value is presumably the output. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceGetDefaultMemPool(cudaMemPool_t *memPool, int device); /* Host-side prototype: memPool pointer presumably receives the device's default pool. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceSetMemPool(int device, cudaMemPool_t memPool); /* Host-side prototype: note the argument order is (device, pool), the reverse of the Get variants. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceGetMemPool(cudaMemPool_t *memPool, int device); /* Host-side prototype: memPool pointer presumably receives the device's current pool. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, int device, int flags); /* Host-side prototype: the attribute list is an untyped void pointer; flags is a signed int here. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice); /* Host-side prototype: attribute query over a (source, destination) device pair; value is presumably the output. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaChooseDevice(int *device, const struct cudaDeviceProp *prop); /* Host-side prototype: read-only property set in, int device pointer presumably out. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaSetDevice(int device); /* Host-side prototype: takes a device ordinal by value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaGetDevice(int *device); /* Host-side prototype: device pointer is presumably the output for the current device ordinal. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaSetValidDevices(int *device_arr, int len); /* Host-side prototype: int array plus an int length (assumed to be the element count -- confirm). */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaSetDeviceFlags( unsigned int flags ); /* Host-side prototype: takes an unsigned flags word by value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetDeviceFlags( unsigned int *flags ); /* Host-side prototype: flags pointer is presumably the output. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamCreate(cudaStream_t *pStream); /* Host-side prototype: pStream presumably receives the new stream handle. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags); /* Host-side prototype: as cudaStreamCreate plus an unsigned flags word. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority); /* Host-side prototype: as cudaStreamCreateWithFlags plus a signed int priority. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamGetPriority(cudaStream_t hStream, int *priority); /* Host-side prototype: priority pointer is presumably the output for the given stream. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags); /* Host-side prototype: flags pointer is presumably the output for the given stream. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaCtxResetPersistingL2Cache(void); /* Host-side prototype: no arguments, status returned as cudaError_t. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src); /* Host-side prototype: destination stream comes first, then source. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamGetAttribute(
+ cudaStream_t hStream, cudaStreamAttrID attr,
+ cudaStreamAttrValue *value_out); /* Host-side prototype: value_out is a non-const pointer, presumably written with the attribute selected by attr. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamSetAttribute(
+ cudaStream_t hStream, cudaStreamAttrID attr,
+ const cudaStreamAttrValue *value); /* Host-side prototype: value is a pointer to const, so it is read but not modified through this parameter. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamDestroy(cudaStream_t stream); /* Host-side prototype: takes the stream handle by value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags = 0); /* Host-side prototype: flags defaults to 0; the default argument makes this declaration C++-only. */
+
+
+
+
+
+
+
+typedef void ( *cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void *userData); /* Callback pointer type: receives a stream, a status code and an opaque user pointer; returns nothing. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamAddCallback(cudaStream_t stream,
+ cudaStreamCallback_t callback, void *userData, unsigned int flags); /* Host-side prototype: registers a cudaStreamCallback_t with an opaque userData pointer (per name); flags has no default here. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamSynchronize(cudaStream_t stream); /* Host-side prototype: takes the stream handle by value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamQuery(cudaStream_t stream); /* Host-side prototype: the only output channel is the cudaError_t return value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length = 0, unsigned int flags = 0x04); /* Host-side prototype: length defaults to 0 and flags to 0x04 (a macro already expanded by the preprocessor); C++-only because of the defaults. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode); /* Host-side prototype: stream handle plus a capture-mode enumerator. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode); /* Host-side prototype: mode is a non-const pointer; "exchange" suggests it is both read and written -- confirm in CUDA docs. Not tagged deprecated, unlike the other cudaThread* prototypes. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph); /* Host-side prototype: pGraph presumably receives the captured graph. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus); /* Host-side prototype: the answer comes back through pCaptureStatus, not the return value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus, unsigned long long *pId); /* Host-side prototype: two pointer parameters (status and id), neither with a default value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out = 0, cudaGraph_t *graph_out = 0, const cudaGraphNode_t **dependencies_out = 0, size_t *numDependencies_out = 0); /* Host-side prototype: the last four pointer parameters default to 0 (null), so callers may omit them; C++-only because of the defaults. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies, unsigned int flags = 0); /* Host-side prototype: node array with its size_t count; flags defaults to 0 (C++-only). */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaEventCreate(cudaEvent_t *event); /* Host-side prototype: event pointer presumably receives the new event handle. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags); /* Host-side prototype: as cudaEventCreate plus an unsigned flags word. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream = 0); /* Host-side prototype: stream defaults to 0; C++-only because of the default argument. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream = 0, unsigned int flags = 0); /* Host-side prototype: stream and flags both default to 0; C++-only because of the defaults. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaEventQuery(cudaEvent_t event); /* Host-side prototype: the only output channel is the cudaError_t return value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaEventSynchronize(cudaEvent_t event); /* Host-side prototype: takes the event handle by value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaEventDestroy(cudaEvent_t event); /* Host-side prototype: takes the event handle by value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end); /* Host-side prototype: float output named ms (presumably milliseconds, per the name) between the start and end events. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaImportExternalMemory(cudaExternalMemory_t *extMem_out, const struct cudaExternalMemoryHandleDesc *memHandleDesc); /* Host-side prototype: read-only handle descriptor in, extMem_out presumably receives the imported object. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaExternalMemoryGetMappedBuffer(void **devPtr, cudaExternalMemory_t extMem, const struct cudaExternalMemoryBufferDesc *bufferDesc); /* Host-side prototype: read-only buffer descriptor in, void** presumably receives the mapped pointer. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem, const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc); /* Host-side prototype: read-only mipmapped-array descriptor in, mipmap pointer presumably out. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDestroyExternalMemory(cudaExternalMemory_t extMem); /* Host-side prototype: takes the external-memory object by value. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaImportExternalSemaphore(cudaExternalSemaphore_t *extSem_out, const struct cudaExternalSemaphoreHandleDesc *semHandleDesc); /* Host-side prototype: read-only handle descriptor in, extSem_out presumably receives the imported semaphore. */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream = 0); /* Host-side prototype: two read-only arrays sharing the count numExtSems (assumed from the names); stream defaults to 0 (C++-only). */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream = 0); /* Host-side prototype: mirrors the Signal variant but takes wait parameters; stream defaults to 0 (C++-only). */
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaSetDoubleForDevice(double *d);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaSetDoubleForHost(double *d);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, const void *func, int numBlocks, int blockSize);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flags = 0x01);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaMalloc(void **devPtr, size_t size);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMallocHost(void **ptr, size_t size);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height = 0, unsigned int flags = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaFree(void *devPtr);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaFreeHost(void *ptr);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaFreeArray(cudaArray_t array);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaHostAlloc(void **pHost, size_t size, unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaHostRegister(void *ptr, size_t size, unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaHostUnregister(void *ptr);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaHostGetFlags(unsigned int *pFlags, void *pHost);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemGetInfo(size_t *free, size_t *total);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaArrayGetPlane(cudaArray_t *pPlaneArray, cudaArray_t hArray, unsigned int planeIdx);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements *memoryRequirements, cudaArray_t array, int device);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMipmappedArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements *memoryRequirements, cudaMipmappedArray_t mipmap, int device);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaArray_t array);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaMipmappedArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaMipmappedArray_t mipmap);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind = cudaMemcpyDeviceToDevice);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset = 0, enum cudaMemcpyKind kind = cudaMemcpyHostToDevice);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset = 0, enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemset(void *devPtr, int value, size_t count);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetSymbolAddress(void **devPtr, const void *symbol);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetSymbolSize(size_t *size, const void *symbol);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind = cudaMemcpyDeviceToDevice);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMallocAsync(void **devPtr, size_t size, cudaStream_t hStream);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaFreeAsync(void *devPtr, cudaStream_t hStream);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolTrimTo(cudaMemPool_t memPool, size_t minBytesToKeep);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolSetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value );
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolGetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value );
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolSetAccess(cudaMemPool_t memPool, const struct cudaMemAccessDesc *descList, size_t count);
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolGetAccess(enum cudaMemAccessFlags *flags, cudaMemPool_t memPool, struct cudaMemLocation *location);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolCreate(cudaMemPool_t *memPool, const struct cudaMemPoolProps *poolProps);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolDestroy(cudaMemPool_t memPool);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMallocFromPoolAsync(void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolExportToShareableHandle(
+ void *shareableHandle,
+ cudaMemPool_t memPool,
+ enum cudaMemAllocationHandleType handleType,
+ unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolImportFromShareableHandle(
+ cudaMemPool_t *memPool,
+ void *shareableHandle,
+ enum cudaMemAllocationHandleType handleType,
+ unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolExportPointer(struct cudaMemPoolPtrExportData *exportData, void *ptr);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaMemPoolImportPointer(void **ptr, cudaMemPool_t memPool, struct cudaMemPoolPtrExportData *exportData);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceDisablePeerAccess(int peerDevice);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size = (0x7fffffff * 2U + 1U));
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaUnbindTexture(const struct textureReference *texref);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaGetTextureReference(const struct textureReference **texref, const void *symbol);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((deprecated)) __attribute__((host)) cudaError_t cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) struct cudaChannelFormatDesc cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDestroyTextureObject(cudaTextureObject_t texObject);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDriverGetVersion(int *driverVersion);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) __attribute__((cudart_builtin)) cudaError_t cudaRuntimeGetVersion(int *runtimeVersion);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphCreate(cudaGraph_t *pGraph, unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddKernelNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaKernelNodeParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphKernelNodeCopyAttributes(
+ cudaGraphNode_t hSrc,
+ cudaGraphNode_t hDst);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphKernelNodeGetAttribute(
+ cudaGraphNode_t hNode,
+ cudaKernelNodeAttrID attr,
+ cudaKernelNodeAttrValue *value_out);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphKernelNodeSetAttribute(
+ cudaGraphNode_t hNode,
+ cudaKernelNodeAttrID attr,
+ const cudaKernelNodeAttrValue *value);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddMemcpyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemcpy3DParms *pCopyParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphAddMemcpyNodeToSymbol(
+ cudaGraphNode_t *pGraphNode,
+ cudaGraph_t graph,
+ const cudaGraphNode_t *pDependencies,
+ size_t numDependencies,
+ const void* symbol,
+ const void* src,
+ size_t count,
+ size_t offset,
+ enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphAddMemcpyNodeFromSymbol(
+ cudaGraphNode_t* pGraphNode,
+ cudaGraph_t graph,
+ const cudaGraphNode_t* pDependencies,
+ size_t numDependencies,
+ void* dst,
+ const void* symbol,
+ size_t count,
+ size_t offset,
+ enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphAddMemcpyNode1D(
+ cudaGraphNode_t *pGraphNode,
+ cudaGraph_t graph,
+ const cudaGraphNode_t *pDependencies,
+ size_t numDependencies,
+ void* dst,
+ const void* src,
+ size_t count,
+ enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol(
+ cudaGraphNode_t node,
+ const void* symbol,
+ const void* src,
+ size_t count,
+ size_t offset,
+ enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphMemcpyNodeSetParamsFromSymbol(
+ cudaGraphNode_t node,
+ void* dst,
+ const void* symbol,
+ size_t count,
+ size_t offset,
+ enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphMemcpyNodeSetParams1D(
+ cudaGraphNode_t node,
+ void* dst,
+ const void* src,
+ size_t count,
+ enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddMemsetNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemsetParams *pMemsetParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaHostNodeParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaGraph_t childGraph);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddEmptyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddExternalSemaphoresSignalNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExternalSemaphoresSignalNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreSignalNodeParams *params_out);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExternalSemaphoresSignalNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddExternalSemaphoresWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExternalSemaphoresWaitNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreWaitNodeParams *params_out);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExternalSemaphoresWaitNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddMemAllocNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, struct cudaMemAllocNodeParams *nodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphMemAllocNodeGetParams(cudaGraphNode_t node, struct cudaMemAllocNodeParams *params_out);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddMemFreeNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dptr);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphMemFreeNodeGetParams(cudaGraphNode_t node, void *dptr_out);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceGraphMemTrim(int device);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceGetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaDeviceSetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode, cudaGraph_t clonedGraph);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t *nodes, size_t *numNodes);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to, size_t *numEdges);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t *pDependencies, size_t *pNumDependencies);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, size_t *pNumDependentNodes);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphDestroyNode(cudaGraphNode_t node);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphInstantiate(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode, char *pLogBuffer, size_t bufferSize);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphInstantiateWithFlags(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, unsigned long long flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol(
+ cudaGraphExec_t hGraphExec,
+ cudaGraphNode_t node,
+ const void* symbol,
+ const void* src,
+ size_t count,
+ size_t offset,
+ enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphExecMemcpyNodeSetParamsFromSymbol(
+ cudaGraphExec_t hGraphExec,
+ cudaGraphNode_t node,
+ void* dst,
+ const void* symbol,
+ size_t count,
+ size_t offset,
+ enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphExecMemcpyNodeSetParams1D(
+ cudaGraphExec_t hGraphExec,
+ cudaGraphNode_t node,
+ void* dst,
+ const void* src,
+ size_t count,
+ enum cudaMemcpyKind kind);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExecExternalSemaphoresSignalNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExecExternalSemaphoresWaitNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphNodeSetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int isEnabled);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int *isEnabled);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphNode_t *hErrorNode_out, enum cudaGraphExecUpdateResult *updateResult_out);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ extern __attribute__((host)) cudaError_t cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphExecDestroy(cudaGraphExec_t graphExec);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphDestroy(cudaGraph_t graph);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphDebugDotPrint(cudaGraph_t graph, const char *path, unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaUserObjectCreate(cudaUserObject_t *object_out, void *ptr, cudaHostFn_t destroy, unsigned int initialRefcount, unsigned int flags);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaUserObjectRetain(cudaUserObject_t object, unsigned int count = 1);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaUserObjectRelease(cudaUserObject_t object, unsigned int count = 1);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphRetainUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count = 1, unsigned int flags = 0);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count = 1);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags);
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern __attribute__((host)) cudaError_t cudaGetFuncBySymbol(cudaFunction_t* functionPtr, const void* symbolPtr);
+
+
+
+# 13173 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime_api.h"
+
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 62 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/channel_descriptor.h" 2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+template<class T> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc(void)
+{ /* Generic fallback for any T without an explicit specialization: all widths 0, kind None. */
+  return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
+}
+
+static __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
+{
+  /* One float-kind component whose width is the bit size of an unsigned short. */
+  const int bits = (int)(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
+{
+  /* Same descriptor as the plain Half variant: a single float-kind component. */
+  const int bits = (int)(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
+{
+  /* Two float-kind components, each the bit size of an unsigned short. */
+  const int bits = (int)(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, bits, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
+{
+  /* Four float-kind components, each the bit size of an unsigned short. */
+  const int bits = (int)(sizeof(unsigned short) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, bits, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
+{
+  /* Plain char: one signed-kind component, sizeof(char) * 8 bits wide. */
+  int e = (int)sizeof(char) * 8;
+
+
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
+{
+  /* signed char: one signed-kind component, sizeof(signed char) * 8 bits wide. */
+  int e = (int)sizeof(signed char) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
+{
+  /* unsigned char: one unsigned-kind component, sizeof(unsigned char) * 8 bits wide. */
+  int e = (int)sizeof(unsigned char) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
+{
+  /* char1: one signed-kind component, sizeof(signed char) * 8 bits wide. */
+  int e = (int)sizeof(signed char) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
+{
+  /* uchar1: one unsigned-kind component, sizeof(unsigned char) * 8 bits wide. */
+  int e = (int)sizeof(unsigned char) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
+{
+  /* char2: two signed-kind components, each sizeof(signed char) * 8 bits wide. */
+  int e = (int)sizeof(signed char) * 8;
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
+{
+  /* uchar2: two unsigned-kind components, each sizeof(unsigned char) * 8 bits wide. */
+  int e = (int)sizeof(unsigned char) * 8;
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
+{
+  /* char4: four signed-kind components, each sizeof(signed char) * 8 bits wide. */
+  int e = (int)sizeof(signed char) * 8;
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
+{
+  /* uchar4: four unsigned-kind components, each sizeof(unsigned char) * 8 bits wide. */
+  int e = (int)sizeof(unsigned char) * 8;
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
+{
+  /* short: one signed-kind component, sizeof(short) * 8 bits wide. */
+  int e = (int)sizeof(short) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
+{
+  /* unsigned short: one unsigned-kind component, sizeof(unsigned short) * 8 bits wide. */
+  int e = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
+{
+  /* short1: one signed-kind component, sizeof(short) * 8 bits wide. */
+  int e = (int)sizeof(short) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
+{
+  /* ushort1: one unsigned-kind component, sizeof(unsigned short) * 8 bits wide. */
+  int e = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
+{
+  /* short2: two signed-kind components, each sizeof(short) * 8 bits wide. */
+  int e = (int)sizeof(short) * 8;
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
+{
+  /* ushort2: two unsigned-kind components, each sizeof(unsigned short) * 8 bits wide. */
+  int e = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
+{
+  /* short4: four signed-kind components, each sizeof(short) * 8 bits wide. */
+  int e = (int)sizeof(short) * 8;
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
+{
+  /* ushort4: four unsigned-kind components, each sizeof(unsigned short) * 8 bits wide. */
+  int e = (int)sizeof(unsigned short) * 8;
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
+{
+  /* int: one signed-kind component, sizeof(int) * 8 bits wide. */
+  int e = (int)sizeof(int) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
+{
+  /* unsigned int: one unsigned-kind component, sizeof(unsigned int) * 8 bits wide. */
+  int e = (int)sizeof(unsigned int) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
+{
+  /* int1: one signed-kind component, sizeof(int) * 8 bits wide. */
+  int e = (int)sizeof(int) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
+{
+  /* uint1: one unsigned-kind component, sizeof(unsigned int) * 8 bits wide. */
+  int e = (int)sizeof(unsigned int) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
+{
+  /* int2: two signed-kind components, each sizeof(int) * 8 bits wide. */
+  int e = (int)sizeof(int) * 8;
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
+{
+  /* uint2: two unsigned-kind components, each sizeof(unsigned int) * 8 bits wide. */
+  int e = (int)sizeof(unsigned int) * 8;
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
+{
+  /* int4: four signed-kind components, each sizeof(int) * 8 bits wide. */
+  int e = (int)sizeof(int) * 8;
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
+{
+  /* uint4: four unsigned-kind components, each sizeof(unsigned int) * 8 bits wide. */
+  int e = (int)sizeof(unsigned int) * 8;
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+# 395 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/channel_descriptor.h"
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
+{
+  /* float: one float-kind component, sizeof(float) * 8 bits wide. */
+  int e = (int)sizeof(float) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
+{
+  /* float1: one float-kind component, sizeof(float) * 8 bits wide. */
+  int e = (int)sizeof(float) * 8;
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
+{
+  /* float2: two float-kind components, each sizeof(float) * 8 bits wide. */
+  int e = (int)sizeof(float) * 8;
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
+{
+  /* float4: four float-kind components, each sizeof(float) * 8 bits wide. */
+  int e = (int)sizeof(float) * 8;
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
+{
+  /* Three components of sizeof(char) * 8 bits, fourth width 0, NV12 format kind. */
+  const int bits = (int)(sizeof(char) * 8);
+  return cudaCreateChannelDesc(bits, bits, bits, 0, cudaChannelFormatKindNV12);
+}
+
+template<cudaChannelFormatKind> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc(void)
+{ /* Generic fallback for a format kind without an explicit specialization: all widths 0, kind None. */
+  return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
+{ /* Widths (8, 0, 0, 0) with the SignedNormalized8X1 kind. */
+  return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
+{ /* Widths (8, 8, 0, 0) with the SignedNormalized8X2 kind. */
+  return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
+{ /* Widths (8, 8, 8, 8) with the SignedNormalized8X4 kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
+{ /* Widths (8, 0, 0, 0) with the UnsignedNormalized8X1 kind. */
+  return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
+{ /* Widths (8, 8, 0, 0) with the UnsignedNormalized8X2 kind. */
+  return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
+{ /* Widths (8, 8, 8, 8) with the UnsignedNormalized8X4 kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
+{ /* Widths (16, 0, 0, 0) with the SignedNormalized16X1 kind. */
+  return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
+{ /* Widths (16, 16, 0, 0) with the SignedNormalized16X2 kind. */
+  return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
+{ /* Widths (16, 16, 16, 16) with the SignedNormalized16X4 kind. */
+  return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
+{ /* Widths (16, 0, 0, 0) with the UnsignedNormalized16X1 kind. */
+  return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
+{ /* Widths (16, 16, 0, 0) with the UnsignedNormalized16X2 kind. */
+  return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
+}
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
+{ /* Widths (16, 16, 16, 16) with the UnsignedNormalized16X4 kind. */
+  return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
+{ /* Widths (8, 8, 8, 0) with the NV12 kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
+{ /* Widths (8, 8, 8, 8) with the UnsignedBlockCompressed1 kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
+{ /* Widths (8, 8, 8, 8) with the UnsignedBlockCompressed1SRGB kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
+{ /* Widths (8, 8, 8, 8) with the UnsignedBlockCompressed2 kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
+{ /* Widths (8, 8, 8, 8) with the UnsignedBlockCompressed2SRGB kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
+{ /* Widths (8, 8, 8, 8) with the UnsignedBlockCompressed3 kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
+{ /* Widths (8, 8, 8, 8) with the UnsignedBlockCompressed3SRGB kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
+{ /* Widths (8, 0, 0, 0) with the UnsignedBlockCompressed4 kind. */
+  return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
+{ /* Widths (8, 0, 0, 0) with the SignedBlockCompressed4 kind. */
+  return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
+{ /* Widths (8, 8, 0, 0) with the UnsignedBlockCompressed5 kind. */
+  return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
+{ /* Widths (8, 8, 0, 0) with the SignedBlockCompressed5 kind. */
+  return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
+{ /* Widths (16, 16, 16, 0) with the UnsignedBlockCompressed6H kind. */
+  return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
+{ /* Widths (16, 16, 16, 0) with the SignedBlockCompressed6H kind. */
+  return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
+{ /* Widths (8, 8, 8, 8) with the UnsignedBlockCompressed7 kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
+}
+
+
+template<> __inline__ __attribute__((host)) cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
+{ /* Widths (8, 8, 8, 8) with the UnsignedBlockCompressed7SRGB kind. */
+  return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
+}
+
+
+
+
+
+
+# 96 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_functions.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/builtin_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 54 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_functions.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 255 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 55 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/driver_functions.h" 2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+static __inline__ __attribute__((host)) struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
+{
+ struct cudaPitchedPtr s;
+
+ s.ptr = d;
+ s.pitch = p;
+ s.xsize = xsz;
+ s.ysize = ysz;
+
+ return s;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+static __inline__ __attribute__((host)) struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
+{
+ struct cudaPos p;
+
+ p.x = x;
+ p.y = y;
+ p.z = z;
+
+ return p;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+static __inline__ __attribute__((host)) struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
+{
+ struct cudaExtent e;
+
+ e.width = w;
+ e.height = h;
+ e.depth = d;
+
+ return e;
+}
+
+
+
+# 98 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 255 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 101 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/vector_functions.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+static __inline__ __attribute__((host)) __attribute__((device)) char1 make_char1(signed char x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) uchar1 make_uchar1(unsigned char x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) char2 make_char2(signed char x, signed char y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) uchar2 make_uchar2(unsigned char x, unsigned char y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) char3 make_char3(signed char x, signed char y, signed char z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) char4 make_char4(signed char x, signed char y, signed char z, signed char w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) short1 make_short1(short x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ushort1 make_ushort1(unsigned short x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) short2 make_short2(short x, short y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ushort2 make_ushort2(unsigned short x, unsigned short y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) short3 make_short3(short x,short y, short z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) short4 make_short4(short x, short y, short z, short w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) int1 make_int1(int x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) uint1 make_uint1(unsigned int x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) int2 make_int2(int x, int y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) uint2 make_uint2(unsigned int x, unsigned int y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) int3 make_int3(int x, int y, int z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) int4 make_int4(int x, int y, int z, int w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) long1 make_long1(long int x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulong1 make_ulong1(unsigned long int x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) long2 make_long2(long int x, long int y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulong2 make_ulong2(unsigned long int x, unsigned long int y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) long3 make_long3(long int x, long int y, long int z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) long4 make_long4(long int x, long int y, long int z, long int w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) float1 make_float1(float x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) float2 make_float2(float x, float y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) float3 make_float3(float x, float y, float z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) float4 make_float4(float x, float y, float z, float w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) longlong1 make_longlong1(long long int x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulonglong1 make_ulonglong1(unsigned long long int x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) longlong2 make_longlong2(long long int x, long long int y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) longlong3 make_longlong3(long long int x, long long int y, long long int z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
+
+static __inline__ __attribute__((host)) __attribute__((device)) double1 make_double1(double x);
+
+static __inline__ __attribute__((host)) __attribute__((device)) double2 make_double2(double x, double y);
+
+static __inline__ __attribute__((host)) __attribute__((device)) double3 make_double3(double x, double y, double z);
+
+static __inline__ __attribute__((host)) __attribute__((device)) double4 make_double4(double x, double y, double z, double w);
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/vector_functions.hpp" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+static __inline__ __attribute__((host)) __attribute__((device)) char1 make_char1(signed char x)
+{
+ char1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) uchar1 make_uchar1(unsigned char x)
+{
+ uchar1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) char2 make_char2(signed char x, signed char y)
+{
+ char2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) uchar2 make_uchar2(unsigned char x, unsigned char y)
+{
+ uchar2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) char3 make_char3(signed char x, signed char y, signed char z)
+{
+ char3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
+{
+ uchar3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) char4 make_char4(signed char x, signed char y, signed char z, signed char w)
+{
+ char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
+{
+ uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) short1 make_short1(short x)
+{
+ short1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ushort1 make_ushort1(unsigned short x)
+{
+ ushort1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) short2 make_short2(short x, short y)
+{
+ short2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ushort2 make_ushort2(unsigned short x, unsigned short y)
+{
+ ushort2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) short3 make_short3(short x,short y, short z)
+{
+ short3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
+{
+ ushort3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) short4 make_short4(short x, short y, short z, short w)
+{
+ short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+{
+ ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) int1 make_int1(int x)
+{
+ int1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) uint1 make_uint1(unsigned int x)
+{
+ uint1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) int2 make_int2(int x, int y)
+{
+ int2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) uint2 make_uint2(unsigned int x, unsigned int y)
+{
+ uint2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) int3 make_int3(int x, int y, int z)
+{
+ int3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
+{
+ uint3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) int4 make_int4(int x, int y, int z, int w)
+{
+ int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
+{
+ uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) long1 make_long1(long int x)
+{
+ long1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulong1 make_ulong1(unsigned long int x)
+{
+ ulong1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) long2 make_long2(long int x, long int y)
+{
+ long2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulong2 make_ulong2(unsigned long int x, unsigned long int y)
+{
+ ulong2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) long3 make_long3(long int x, long int y, long int z)
+{
+ long3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
+{
+ ulong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) long4 make_long4(long int x, long int y, long int z, long int w)
+{
+ long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
+{
+ ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) float1 make_float1(float x)
+{
+ float1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) float2 make_float2(float x, float y)
+{
+ float2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) float3 make_float3(float x, float y, float z)
+{
+ float3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) float4 make_float4(float x, float y, float z, float w)
+{
+ float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) longlong1 make_longlong1(long long int x)
+{
+ longlong1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulonglong1 make_ulonglong1(unsigned long long int x)
+{
+ ulonglong1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) longlong2 make_longlong2(long long int x, long long int y)
+{
+ longlong2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
+{
+ ulonglong2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) longlong3 make_longlong3(long long int x, long long int y, long long int z)
+{
+ longlong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
+{
+ ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
+{
+ longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
+{
+ ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) double1 make_double1(double x)
+{
+ double1 t; t.x = x; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) double2 make_double2(double x, double y)
+{
+ double2 t; t.x = x; t.y = y; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) double3 make_double3(double x, double y, double z)
+{
+ double3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+static __inline__ __attribute__((host)) __attribute__((device)) double4 make_double4(double x, double y, double z, double w)
+{
+ double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+
+
+
+
+# 173 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/vector_functions.h" 2
+
+
+# 102 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/cuda_runtime.h" 2
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/common_functions.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/common_functions.h"
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/builtin_types.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 72 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/common_functions.h" 2
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h" 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 59 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 255 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/host_defines.h"
+
+# 73 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/common_functions.h" 2
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/string.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/libc-header-start.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 44 "/usr/include/bits/libc-header-start.h" 3
+
+
+
+
+
+# 61 "/usr/include/bits/libc-header-start.h" 3
+
+
+
+
+
+# 78 "/usr/include/bits/libc-header-start.h" 3
+
+
+
+# 27 "/usr/include/string.h" 2 3
+
+extern "C" {
+
+
+# 1 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 44 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 91 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 116 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+# 159 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+# 240 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+# 349 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+# 361 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+# 395 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+# 413 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+# 440 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+# 447 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+# 34 "/usr/include/string.h" 2 3
+
+
+
+
+
+
+
+
+
+extern void *memcpy (void *__restrict __dest, const void *__restrict __src,
+ size_t __n) throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern void *memmove (void *__dest, const void *__src, size_t __n)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+
+
+
+extern void *memccpy (void *__restrict __dest, const void *__restrict __src,
+ int __c, size_t __n)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+
+
+extern void *memset (void *__s, int __c, size_t __n) throw () __attribute__ ((__nonnull__ (1)));
+
+
+extern int memcmp (const void *__s1, const void *__s2, size_t __n)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+
+
+extern "C++"
+{
+extern void *memchr (void *__s, int __c, size_t __n)
+ throw () __asm ("memchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+extern const void *memchr (const void *__s, int __c, size_t __n)
+ throw () __asm ("memchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+# 89 "/usr/include/string.h" 3
+}
+
+
+
+
+
+
+
+
+
+extern "C++" void *rawmemchr (void *__s, int __c)
+ throw () __asm ("rawmemchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+extern "C++" const void *rawmemchr (const void *__s, int __c)
+ throw () __asm ("rawmemchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+
+
+
+
+
+
+extern "C++" void *memrchr (void *__s, int __c, size_t __n)
+ throw () __asm ("memrchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+extern "C++" const void *memrchr (const void *__s, int __c, size_t __n)
+ throw () __asm ("memrchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+
+
+
+
+
+
+
+extern char *strcpy (char *__restrict __dest, const char *__restrict __src)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+extern char *strncpy (char *__restrict __dest,
+ const char *__restrict __src, size_t __n)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern char *strcat (char *__restrict __dest, const char *__restrict __src)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+extern char *strncat (char *__restrict __dest, const char *__restrict __src,
+ size_t __n) throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern int strcmp (const char *__s1, const char *__s2)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+extern int strncmp (const char *__s1, const char *__s2, size_t __n)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern int strcoll (const char *__s1, const char *__s2)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+extern size_t strxfrm (char *__restrict __dest,
+ const char *__restrict __src, size_t __n)
+ throw () __attribute__ ((__nonnull__ (2)));
+
+
+
+# 1 "/usr/include/bits/types/locale_t.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/types/__locale_t.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+struct __locale_struct
+{
+
+ struct __locale_data *__locales[13];
+
+
+ const unsigned short int *__ctype_b;
+ const int *__ctype_tolower;
+ const int *__ctype_toupper;
+
+
+ const char *__names[13];
+};
+
+typedef struct __locale_struct *__locale_t;
+
+# 23 "/usr/include/bits/types/locale_t.h" 2 3
+
+typedef __locale_t locale_t;
+
+# 154 "/usr/include/string.h" 2 3
+
+
+extern int strcoll_l (const char *__s1, const char *__s2, locale_t __l)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2, 3)));
+
+
+extern size_t strxfrm_l (char *__dest, const char *__src, size_t __n,
+ locale_t __l) throw () __attribute__ ((__nonnull__ (2, 4)));
+
+
+
+
+
+extern char *strdup (const char *__s)
+ throw () __attribute__ ((__malloc__)) __attribute__ ((__nonnull__ (1)));
+
+
+
+
+
+
+extern char *strndup (const char *__string, size_t __n)
+ throw () __attribute__ ((__malloc__)) __attribute__ ((__nonnull__ (1)));
+
+
+
+
+# 189 "/usr/include/string.h" 3
+
+
+# 201 "/usr/include/string.h" 3
+
+
+
+extern "C++"
+{
+extern char *strchr (char *__s, int __c)
+ throw () __asm ("strchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+extern const char *strchr (const char *__s, int __c)
+ throw () __asm ("strchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+# 224 "/usr/include/string.h" 3
+}
+
+
+
+
+
+
+extern "C++"
+{
+extern char *strrchr (char *__s, int __c)
+ throw () __asm ("strrchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+extern const char *strrchr (const char *__s, int __c)
+ throw () __asm ("strrchr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+# 251 "/usr/include/string.h" 3
+}
+
+
+
+
+
+
+
+
+
+extern "C++" char *strchrnul (char *__s, int __c)
+ throw () __asm ("strchrnul") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+extern "C++" const char *strchrnul (const char *__s, int __c)
+ throw () __asm ("strchrnul") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+
+
+
+
+
+
+
+extern size_t strcspn (const char *__s, const char *__reject)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern size_t strspn (const char *__s, const char *__accept)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern "C++"
+{
+extern char *strpbrk (char *__s, const char *__accept)
+ throw () __asm ("strpbrk") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+extern const char *strpbrk (const char *__s, const char *__accept)
+ throw () __asm ("strpbrk") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+# 301 "/usr/include/string.h" 3
+}
+
+
+
+
+
+
+extern "C++"
+{
+extern char *strstr (char *__haystack, const char *__needle)
+ throw () __asm ("strstr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+extern const char *strstr (const char *__haystack, const char *__needle)
+ throw () __asm ("strstr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+# 328 "/usr/include/string.h" 3
+}
+
+
+
+
+
+
+
+extern char *strtok (char *__restrict __s, const char *__restrict __delim)
+ throw () __attribute__ ((__nonnull__ (2)));
+
+
+
+extern char *__strtok_r (char *__restrict __s,
+ const char *__restrict __delim,
+ char **__restrict __save_ptr)
+ throw () __attribute__ ((__nonnull__ (2, 3)));
+
+extern char *strtok_r (char *__restrict __s, const char *__restrict __delim,
+ char **__restrict __save_ptr)
+ throw () __attribute__ ((__nonnull__ (2, 3)));
+
+
+
+
+
+extern "C++" char *strcasestr (char *__haystack, const char *__needle)
+ throw () __asm ("strcasestr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+extern "C++" const char *strcasestr (const char *__haystack,
+ const char *__needle)
+ throw () __asm ("strcasestr") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+
+
+
+
+
+
+
+
+
+extern void *memmem (const void *__haystack, size_t __haystacklen,
+ const void *__needle, size_t __needlelen)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 3)));
+
+
+
+extern void *__mempcpy (void *__restrict __dest,
+ const void *__restrict __src, size_t __n)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+extern void *mempcpy (void *__restrict __dest,
+ const void *__restrict __src, size_t __n)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+
+
+extern size_t strlen (const char *__s)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+
+
+
+extern size_t strnlen (const char *__string, size_t __maxlen)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+
+
+
+extern char *strerror (int __errnum) throw ();
+
+
+
+
+
+
+
+
+# 419 "/usr/include/string.h" 3
+
+
+extern char *strerror_r (int __errnum, char *__buf, size_t __buflen)
+ throw () __attribute__ ((__nonnull__ (2))) ;
+
+
+
+
+
+extern char *strerror_l (int __errnum, locale_t __l) throw ();
+
+
+# 1 "/usr/include/strings.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 44 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 91 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 116 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+# 159 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+# 240 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+# 349 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+# 361 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+# 395 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+# 413 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+# 440 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+# 447 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+# 24 "/usr/include/strings.h" 2 3
+
+
+
+
+
+
+extern "C" {
+
+
+
+extern int bcmp (const void *__s1, const void *__s2, size_t __n)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern void bcopy (const void *__src, void *__dest, size_t __n)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern void bzero (void *__s, size_t __n) throw () __attribute__ ((__nonnull__ (1)));
+
+
+
+extern "C++"
+{
+extern char *index (char *__s, int __c)
+ throw () __asm ("index") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+extern const char *index (const char *__s, int __c)
+ throw () __asm ("index") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+# 66 "/usr/include/strings.h" 3
+}
+
+
+
+
+
+
+
+extern "C++"
+{
+extern char *rindex (char *__s, int __c)
+ throw () __asm ("rindex") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+extern const char *rindex (const char *__s, int __c)
+ throw () __asm ("rindex") __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1)));
+
+# 94 "/usr/include/strings.h" 3
+}
+
+
+
+
+
+
+
+
+
+extern int ffs (int __i) throw () __attribute__ ((__const__));
+
+
+
+
+
+extern int ffsl (long int __l) throw () __attribute__ ((__const__));
+ extern int ffsll (long long int __ll)
+ throw () __attribute__ ((__const__));
+
+
+
+extern int strcasecmp (const char *__s1, const char *__s2)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern int strncasecmp (const char *__s1, const char *__s2, size_t __n)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+
+
+
+
+
+extern int strcasecmp_l (const char *__s1, const char *__s2, locale_t __loc)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2, 3)));
+
+
+
+extern int strncasecmp_l (const char *__s1, const char *__s2,
+ size_t __n, locale_t __loc)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2, 4)));
+
+
+}
+
+# 147 "/usr/include/strings.h" 3
+
+# 433 "/usr/include/string.h" 2 3
+
+
+
+extern void explicit_bzero (void *__s, size_t __n) throw () __attribute__ ((__nonnull__ (1)));
+
+
+
+extern char *strsep (char **__restrict __stringp,
+ const char *__restrict __delim)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+
+
+extern char *strsignal (int __sig) throw ();
+
+
+extern char *__stpcpy (char *__restrict __dest, const char *__restrict __src)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+extern char *stpcpy (char *__restrict __dest, const char *__restrict __src)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+
+extern char *__stpncpy (char *__restrict __dest,
+ const char *__restrict __src, size_t __n)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+extern char *stpncpy (char *__restrict __dest,
+ const char *__restrict __src, size_t __n)
+ throw () __attribute__ ((__nonnull__ (1, 2)));
+
+
+
+
+extern int strverscmp (const char *__s1, const char *__s2)
+ throw () __attribute__ ((__pure__)) __attribute__ ((__nonnull__ (1, 2)));
+
+
+extern char *strfry (char *__string) throw () __attribute__ ((__nonnull__ (1)));
+
+
+extern void *memfrob (void *__s, size_t __n) throw () __attribute__ ((__nonnull__ (1)));
+
+
+
+
+
+
+
+extern "C++" char *basename (char *__filename)
+ throw () __asm ("basename") __attribute__ ((__nonnull__ (1)));
+extern "C++" const char *basename (const char *__filename)
+ throw () __asm ("basename") __attribute__ ((__nonnull__ (1)));
+
+
+
+
+
+
+# 498 "/usr/include/string.h" 3
+
+}
+
+# 86 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/common_functions.h" 2
+# 1 "/usr/include/time.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 44 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 91 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 116 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+
+# 159 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+
+# 240 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+
+
+
+# 349 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+# 361 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+# 395 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+# 413 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+
+
+
+# 440 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+# 447 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 3
+
+
+
+# 30 "/usr/include/time.h" 2 3
+
+
+
+# 1 "/usr/include/bits/time.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/types.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/wordsize.h" 1 3
+
+
+# 10 "/usr/include/bits/wordsize.h" 3
+
+
+
+
+# 28 "/usr/include/bits/types.h" 2 3
+# 1 "/usr/include/bits/timesize.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 29 "/usr/include/bits/types.h" 2 3
+
+
+typedef unsigned char __u_char;
+typedef unsigned short int __u_short;
+typedef unsigned int __u_int;
+typedef unsigned long int __u_long;
+
+
+typedef signed char __int8_t;
+typedef unsigned char __uint8_t;
+typedef signed short int __int16_t;
+typedef unsigned short int __uint16_t;
+typedef signed int __int32_t;
+typedef unsigned int __uint32_t;
+
+typedef signed long int __int64_t;
+typedef unsigned long int __uint64_t;
+
+
+
+
+
+
+typedef __int8_t __int_least8_t;
+typedef __uint8_t __uint_least8_t;
+typedef __int16_t __int_least16_t;
+typedef __uint16_t __uint_least16_t;
+typedef __int32_t __int_least32_t;
+typedef __uint32_t __uint_least32_t;
+typedef __int64_t __int_least64_t;
+typedef __uint64_t __uint_least64_t;
+
+
+
+typedef long int __quad_t;
+typedef unsigned long int __u_quad_t;
+
+
+
+
+
+
+
+typedef long int __intmax_t;
+typedef unsigned long int __uintmax_t;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 136 "/usr/include/bits/types.h" 3
+
+# 1 "/usr/include/bits/typesizes.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 37 "/usr/include/bits/typesizes.h" 3
+
+# 75 "/usr/include/bits/typesizes.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 95 "/usr/include/bits/typesizes.h" 3
+
+
+
+
+
+# 142 "/usr/include/bits/types.h" 2 3
+# 1 "/usr/include/bits/time64.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 143 "/usr/include/bits/types.h" 2 3
+
+
+typedef unsigned long int __dev_t;
+typedef unsigned int __uid_t;
+typedef unsigned int __gid_t;
+typedef unsigned long int __ino_t;
+typedef unsigned long int __ino64_t;
+typedef unsigned int __mode_t;
+typedef unsigned long int __nlink_t;
+typedef long int __off_t;
+typedef long int __off64_t;
+typedef int __pid_t;
+typedef struct { int __val[2]; } __fsid_t;
+typedef long int __clock_t;
+typedef unsigned long int __rlim_t;
+typedef unsigned long int __rlim64_t;
+typedef unsigned int __id_t;
+typedef long int __time_t;
+typedef unsigned int __useconds_t;
+typedef long int __suseconds_t;
+
+typedef int __daddr_t;
+typedef int __key_t;
+
+
+typedef int __clockid_t;
+
+
+typedef void * __timer_t;
+
+
+typedef long int __blksize_t;
+
+
+
+
+typedef long int __blkcnt_t;
+typedef long int __blkcnt64_t;
+
+
+typedef unsigned long int __fsblkcnt_t;
+typedef unsigned long int __fsblkcnt64_t;
+
+
+typedef unsigned long int __fsfilcnt_t;
+typedef unsigned long int __fsfilcnt64_t;
+
+
+typedef long int __fsword_t;
+
+typedef long int __ssize_t;
+
+
+typedef long int __syscall_slong_t;
+
+typedef unsigned long int __syscall_ulong_t;
+
+
+
+typedef __off64_t __loff_t;
+typedef char *__caddr_t;
+
+
+typedef long int __intptr_t;
+
+
+typedef unsigned int __socklen_t;
+
+
+
+
+typedef int __sig_atomic_t;
+
+
+
+
+
+
+
+
+
+
+
+
+# 27 "/usr/include/bits/time.h" 2 3
+
+
+
+
+
+
+
+
+
+# 43 "/usr/include/bits/time.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/timex.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/types/struct_timeval.h" 1 3
+
+
+
+
+
+
+
+struct timeval
+{
+ __time_t tv_sec;
+ __suseconds_t tv_usec;
+};
+# 23 "/usr/include/bits/timex.h" 2 3
+
+
+
+struct timex
+{
+ unsigned int modes;
+ __syscall_slong_t offset;
+ __syscall_slong_t freq;
+ __syscall_slong_t maxerror;
+ __syscall_slong_t esterror;
+ int status;
+ __syscall_slong_t constant;
+ __syscall_slong_t precision;
+ __syscall_slong_t tolerance;
+ struct timeval time;
+ __syscall_slong_t tick;
+ __syscall_slong_t ppsfreq;
+ __syscall_slong_t jitter;
+ int shift;
+ __syscall_slong_t stabil;
+ __syscall_slong_t jitcnt;
+ __syscall_slong_t calcnt;
+ __syscall_slong_t errcnt;
+ __syscall_slong_t stbcnt;
+
+ int tai;
+
+
+ int :32; int :32; int :32; int :32;
+ int :32; int :32; int :32; int :32;
+ int :32; int :32; int :32;
+};
+
+
+# 70 "/usr/include/bits/timex.h" 3
+
+
+# 83 "/usr/include/bits/timex.h" 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 74 "/usr/include/bits/time.h" 2 3
+
+extern "C" {
+
+
+extern int clock_adjtime (__clockid_t __clock_id, struct timex *__utx) throw ();
+
+}
+
+
+# 34 "/usr/include/time.h" 2 3
+
+
+
+# 1 "/usr/include/bits/types/clock_t.h" 1 3
+
+
+
+
+
+
+typedef __clock_t clock_t;
+
+# 38 "/usr/include/time.h" 2 3
+# 1 "/usr/include/bits/types/time_t.h" 1 3
+
+
+
+
+
+
+typedef __time_t time_t;
+
+# 39 "/usr/include/time.h" 2 3
+# 1 "/usr/include/bits/types/struct_tm.h" 1 3
+
+
+
+
+
+
+struct tm
+{
+ int tm_sec;
+ int tm_min;
+ int tm_hour;
+ int tm_mday;
+ int tm_mon;
+ int tm_year;
+ int tm_wday;
+ int tm_yday;
+ int tm_isdst;
+
+
+ long int tm_gmtoff;
+ const char *tm_zone;
+
+
+
+
+};
+
+# 40 "/usr/include/time.h" 2 3
+
+# 1 "/usr/include/bits/types/struct_timespec.h" 1 3
+
+
+
+
+# 1 "/usr/include/bits/endian.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/bits/endianness.h" 1 3
+
+
+
+
+
+
+
+
+
+
+# 36 "/usr/include/bits/endian.h" 2 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 7 "/usr/include/bits/types/struct_timespec.h" 2 3
+
+
+
+struct timespec
+{
+ __time_t tv_sec;
+
+
+
+ __syscall_slong_t tv_nsec;
+# 26 "/usr/include/bits/types/struct_timespec.h" 3
+};
+
+# 43 "/usr/include/time.h" 2 3
+
+
+# 1 "/usr/include/bits/types/clockid_t.h" 1 3
+
+
+
+
+
+
+typedef __clockid_t clockid_t;
+
+# 47 "/usr/include/time.h" 2 3
+# 1 "/usr/include/bits/types/timer_t.h" 1 3
+
+
+
+
+
+
+typedef __timer_t timer_t;
+
+# 48 "/usr/include/time.h" 2 3
+# 1 "/usr/include/bits/types/struct_itimerspec.h" 1 3
+
+
+
+
+
+
+
+struct itimerspec
+ {
+ struct timespec it_interval;
+ struct timespec it_value;
+ };
+
+# 49 "/usr/include/time.h" 2 3
+struct sigevent;
+
+
+
+
+typedef __pid_t pid_t;
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern "C" {
+
+
+
+extern clock_t clock (void) throw ();
+
+
+extern time_t time (time_t *__timer) throw ();
+
+
+extern double difftime (time_t __time1, time_t __time0)
+ throw () __attribute__ ((__const__));
+
+
+extern time_t mktime (struct tm *__tp) throw ();
+
+
+
+
+
+extern size_t strftime (char *__restrict __s, size_t __maxsize,
+ const char *__restrict __format,
+ const struct tm *__restrict __tp) throw ();
+
+
+
+
+extern char *strptime (const char *__restrict __s,
+ const char *__restrict __fmt, struct tm *__tp)
+ throw ();
+
+
+
+
+
+
+extern size_t strftime_l (char *__restrict __s, size_t __maxsize,
+ const char *__restrict __format,
+ const struct tm *__restrict __tp,
+ locale_t __loc) throw ();
+
+
+
+extern char *strptime_l (const char *__restrict __s,
+ const char *__restrict __fmt, struct tm *__tp,
+ locale_t __loc) throw ();
+
+
+
+
+
+extern struct tm *gmtime (const time_t *__timer) throw ();
+
+
+
+extern struct tm *localtime (const time_t *__timer) throw ();
+
+
+
+
+extern struct tm *gmtime_r (const time_t *__restrict __timer,
+ struct tm *__restrict __tp) throw ();
+
+
+
+extern struct tm *localtime_r (const time_t *__restrict __timer,
+ struct tm *__restrict __tp) throw ();
+
+
+
+
+extern char *asctime (const struct tm *__tp) throw ();
+
+
+extern char *ctime (const time_t *__timer) throw ();
+
+
+
+
+
+
+extern char *asctime_r (const struct tm *__restrict __tp,
+ char *__restrict __buf) throw ();
+
+
+extern char *ctime_r (const time_t *__restrict __timer,
+ char *__restrict __buf) throw ();
+
+
+
+
+extern char *__tzname[2];
+extern int __daylight;
+extern long int __timezone;
+
+
+
+
+extern char *tzname[2];
+
+
+
+extern void tzset (void) throw ();
+
+
+
+extern int daylight;
+extern long int timezone;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern time_t timegm (struct tm *__tp) throw ();
+
+
+extern time_t timelocal (struct tm *__tp) throw ();
+
+
+extern int dysize (int __year) throw () __attribute__ ((__const__));
+
+
+
+
+
+
+
+
+extern int nanosleep (const struct timespec *__requested_time,
+ struct timespec *__remaining);
+
+
+
+extern int clock_getres (clockid_t __clock_id, struct timespec *__res) throw ();
+
+
+extern int clock_gettime (clockid_t __clock_id, struct timespec *__tp) throw ();
+
+
+extern int clock_settime (clockid_t __clock_id, const struct timespec *__tp)
+ throw ();
+
+
+
+
+
+
+extern int clock_nanosleep (clockid_t __clock_id, int __flags,
+ const struct timespec *__req,
+ struct timespec *__rem);
+
+
+extern int clock_getcpuclockid (pid_t __pid, clockid_t *__clock_id) throw ();
+
+
+
+
+extern int timer_create (clockid_t __clock_id,
+ struct sigevent *__restrict __evp,
+ timer_t *__restrict __timerid) throw ();
+
+
+extern int timer_delete (timer_t __timerid) throw ();
+
+
+extern int timer_settime (timer_t __timerid, int __flags,
+ const struct itimerspec *__restrict __value,
+ struct itimerspec *__restrict __ovalue) throw ();
+
+
+extern int timer_gettime (timer_t __timerid, struct itimerspec *__value)
+ throw ();
+
+
+extern int timer_getoverrun (timer_t __timerid) throw ();
+
+
+
+
+
+extern int timespec_get (struct timespec *__ts, int __base)
+ throw () __attribute__ ((__nonnull__ (1)));
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern int getdate_err;
+
+
+
+
+
+
+
+
+extern struct tm *getdate (const char *__string);
+
+
+
+
+
+
+
+
+
+
+
+
+
+extern int getdate_r (const char *__restrict __string,
+ struct tm *__restrict __resbufp);
+
+
+}
+
+# 87 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/11.7/bin/../targets/x86_64-linux/include/crt/common_functions.h" 2
+
+extern "C"
+{
+
+extern __attribute__((host)) __attribute__((device)) __attribute__((device_builtin)) __attribute__((cudart_builtin)) clock_t clock(void)
+
+
+
+throw ();
+extern __attribute__((host)) __attribute__((device)) __attribute__((device_builtin)) __attribute__((cudart_builtin)) void* memset(void*, int, size_t) throw ();
+extern __attribute__((host)) __attribute__((device)) __attribute__((device_builtin)) __attribute__((cudart_builtin)) void* memcpy(void*, const void*, size_t) throw ();
+
+}
+
+
+
+
+# 1 "/usr/include/c++/7/new" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 40 "/usr/include/c++/7/new" 2 3
+# 1 "/usr/include/c++/7/exception" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#pragma GCC visibility push(default)
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 38 "/usr/include/c++/7/exception" 2 3
+# 1 "/usr/include/c++/7/bits/exception.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#pragma GCC visibility push(default)
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 39 "/usr/include/c++/7/bits/exception.h" 2 3
+
+extern "C++" {
+
+namespace std
+{
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ class exception
+ {
+ public:
+ exception() noexcept { }
+ virtual ~exception() noexcept;
+
+
+
+ virtual const char*
+ what() const noexcept;
+ };
+
+}
+
+}
+
+#pragma GCC visibility pop
+
+# 39 "/usr/include/c++/7/exception" 2 3
+
+extern "C++" {
+
+namespace std
+{
+
+
+ class bad_exception : public exception
+ {
+ public:
+ bad_exception() noexcept { }
+
+
+
+ virtual ~bad_exception() noexcept;
+
+
+ virtual const char*
+ what() const noexcept;
+ };
+
+
+ typedef void (*terminate_handler) ();
+
+
+ typedef void (*unexpected_handler) ();
+
+
+ terminate_handler set_terminate(terminate_handler) noexcept;
+
+
+
+ terminate_handler get_terminate() noexcept;
+
+
+
+
+ void terminate() noexcept __attribute__ ((__noreturn__));
+
+
+ unexpected_handler set_unexpected(unexpected_handler) noexcept;
+
+
+
+ unexpected_handler get_unexpected() noexcept;
+
+
+
+
+ void unexpected() __attribute__ ((__noreturn__));
+
+
+
+
+
+
+
+
+
+
+
+
+ bool uncaught_exception() noexcept __attribute__ ((__pure__));
+
+
+
+
+ int uncaught_exceptions() noexcept __attribute__ ((__pure__));
+
+
+
+}
+
+namespace __gnu_cxx
+{
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ void __verbose_terminate_handler();
+
+
+}
+
+}
+
+#pragma GCC visibility pop
+
+# 1 "/usr/include/c++/7/bits/exception_ptr.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#pragma GCC visibility push(default)
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 37 "/usr/include/c++/7/bits/exception_ptr.h" 2 3
+# 1 "/usr/include/c++/7/bits/exception_defines.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 39 "/usr/include/c++/7/bits/exception_defines.h" 3
+
+
+
+
+
+
+# 38 "/usr/include/c++/7/bits/exception_ptr.h" 2 3
+# 1 "/usr/include/c++/7/bits/cxxabi_init_exception.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#pragma GCC visibility push(default)
+
+# 1 "/usr/lib64/gcc/x86_64-suse-linux/7/include/stddef.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 39 "/usr/include/c++/7/bits/cxxabi_init_exception.h" 2 3
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 40 "/usr/include/c++/7/bits/cxxabi_init_exception.h" 2 3
+
+# 47 "/usr/include/c++/7/bits/cxxabi_init_exception.h" 3
+
+
+
+namespace std
+{
+ class type_info;
+}
+
+namespace __cxxabiv1
+{
+ struct __cxa_refcounted_exception;
+
+ extern "C"
+ {
+
+ void*
+ __cxa_allocate_exception(size_t) noexcept;
+
+ void
+ __cxa_free_exception(void*) noexcept;
+
+
+ __cxa_refcounted_exception*
+ __cxa_init_primary_exception(void *object, std::type_info *tinfo,
+ void ( *dest) (void *)) noexcept;
+
+ }
+}
+
+
+
+#pragma GCC visibility pop
+
+# 39 "/usr/include/c++/7/bits/exception_ptr.h" 2 3
+# 1 "/usr/include/c++/7/typeinfo" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/usr/include/c++/7/bits/hash_bytes.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 36 "/usr/include/c++/7/bits/hash_bytes.h" 2 3
+
+namespace std
+{
+
+
+
+
+
+
+
+ size_t
+ _Hash_bytes(const void* __ptr, size_t __len, size_t __seed);
+
+
+
+
+
+ size_t
+ _Fnv_hash_bytes(const void* __ptr, size_t __len, size_t __seed);
+
+
+}
+
+# 37 "/usr/include/c++/7/typeinfo" 2 3
+
+
+#pragma GCC visibility push(default)
+
+extern "C++" {
+
+namespace __cxxabiv1
+{
+ class __class_type_info;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 79 "/usr/include/c++/7/typeinfo" 3
+
+namespace std
+{
+
+
+
+
+
+
+ class type_info
+ {
+ public:
+
+
+
+
+ virtual ~type_info();
+
+
+
+ const char* name() const noexcept
+ { return __name[0] == '*' ? __name + 1 : __name; }
+
+# 110 "/usr/include/c++/7/typeinfo" 3
+
+
+
+
+
+ bool before(const type_info& __arg) const noexcept
+ { return (__name[0] == '*' && __arg.__name[0] == '*')
+ ? __name < __arg.__name
+ : __builtin_strcmp (__name, __arg.__name) < 0; }
+
+ bool operator==(const type_info& __arg) const noexcept
+ {
+ return ((__name == __arg.__name)
+ || (__name[0] != '*' &&
+ __builtin_strcmp (__name, __arg.__name) == 0));
+ }
+# 136 "/usr/include/c++/7/typeinfo" 3
+ bool operator!=(const type_info& __arg) const noexcept
+ { return !operator==(__arg); }
+
+
+ size_t hash_code() const noexcept
+ {
+
+ return _Hash_bytes(name(), __builtin_strlen(name()),
+ static_cast(0xc70f6907UL));
+
+
+
+ }
+
+
+
+ virtual bool __is_pointer_p() const;
+
+
+ virtual bool __is_function_p() const;
+
+
+
+
+
+
+
+ virtual bool __do_catch(const type_info *__thr_type, void **__thr_obj,
+ unsigned __outer) const;
+
+
+ virtual bool __do_upcast(const __cxxabiv1::__class_type_info *__target,
+ void **__obj_ptr) const;
+
+ protected:
+ const char *__name;
+
+ explicit type_info(const char *__n): __name(__n) { }
+
+ private:
+
+ type_info& operator=(const type_info&);
+ type_info(const type_info&);
+ };
+
+
+
+
+
+
+
+ class bad_cast : public exception
+ {
+ public:
+ bad_cast() noexcept { }
+
+
+
+ virtual ~bad_cast() noexcept;
+
+
+ virtual const char* what() const noexcept;
+ };
+
+
+
+
+
+ class bad_typeid : public exception
+ {
+ public:
+ bad_typeid () noexcept { }
+
+
+
+ virtual ~bad_typeid() noexcept;
+
+
+ virtual const char* what() const noexcept;
+ };
+}
+
+}
+
+#pragma GCC visibility pop
+
+# 40 "/usr/include/c++/7/bits/exception_ptr.h" 2 3
+# 1 "/usr/include/c++/7/new" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 41 "/usr/include/c++/7/bits/exception_ptr.h" 2 3
+
+extern "C++" {
+
+namespace std
+{
+ class type_info;
+
+
+
+
+
+ namespace __exception_ptr
+ {
+ class exception_ptr;
+ }
+
+ using __exception_ptr::exception_ptr;
+
+
+
+
+
+ exception_ptr current_exception() noexcept;
+
+ template
+ exception_ptr make_exception_ptr(_Ex) noexcept;
+
+
+ void rethrow_exception(exception_ptr) __attribute__ ((__noreturn__));
+
+ namespace __exception_ptr
+ {
+ using std::rethrow_exception;
+
+
+
+
+
+ class exception_ptr
+ {
+ void* _M_exception_object;
+
+ explicit exception_ptr(void* __e) noexcept;
+
+ void _M_addref() noexcept;
+ void _M_release() noexcept;
+
+ void *_M_get() const noexcept __attribute__ ((__pure__));
+
+ friend exception_ptr std::current_exception() noexcept;
+ friend void std::rethrow_exception(exception_ptr);
+ template
+ friend exception_ptr std::make_exception_ptr(_Ex) noexcept;
+
+ public:
+ exception_ptr() noexcept;
+
+ exception_ptr(const exception_ptr&) noexcept;
+
+
+ exception_ptr(nullptr_t) noexcept
+ : _M_exception_object(0)
+ { }
+
+ exception_ptr(exception_ptr&& __o) noexcept
+ : _M_exception_object(__o._M_exception_object)
+ { __o._M_exception_object = 0; }
+
+
+# 116 "/usr/include/c++/7/bits/exception_ptr.h" 3
+
+ exception_ptr&
+ operator=(const exception_ptr&) noexcept;
+
+
+ exception_ptr&
+ operator=(exception_ptr&& __o) noexcept
+ {
+ exception_ptr(static_cast(__o)).swap(*this);
+ return *this;
+ }
+
+
+ ~exception_ptr() noexcept;
+
+ void
+ swap(exception_ptr&) noexcept;
+
+# 142 "/usr/include/c++/7/bits/exception_ptr.h" 3
+
+
+ explicit operator bool() const
+ { return _M_exception_object; }
+
+
+ friend bool
+ operator==(const exception_ptr&, const exception_ptr&)
+ noexcept __attribute__ ((__pure__));
+
+ const class std::type_info*
+ __cxa_exception_type() const noexcept
+ __attribute__ ((__pure__));
+ };
+
+ bool
+ operator==(const exception_ptr&, const exception_ptr&)
+ noexcept __attribute__ ((__pure__));
+
+ bool
+ operator!=(const exception_ptr&, const exception_ptr&)
+ noexcept __attribute__ ((__pure__));
+
+ inline void
+ swap(exception_ptr& __lhs, exception_ptr& __rhs)
+ { __lhs.swap(__rhs); }
+
+ template
+ inline void
+ __dest_thunk(void* __x)
+ { static_cast<_Ex*>(__x)->~_Ex(); }
+
+ }
+
+
+ template
+ exception_ptr
+ make_exception_ptr(_Ex __ex) noexcept
+ {
+
+ void* __e = __cxxabiv1::__cxa_allocate_exception(sizeof(_Ex));
+ (void) __cxxabiv1::__cxa_init_primary_exception(
+ __e, const_cast(&typeid(__ex)),
+ __exception_ptr::__dest_thunk<_Ex>);
+ try
+ {
+ ::new (__e) _Ex(__ex);
+ return exception_ptr(__e);
+ }
+ catch(...)
+ {
+ __cxxabiv1::__cxa_free_exception(__e);
+ return current_exception();
+ }
+# 208 "/usr/include/c++/7/bits/exception_ptr.h" 3
+ }
+
+
+
+
+
+ template
+ exception_ptr
+ copy_exception(_Ex __ex) noexcept ;
+
+ template
+ exception_ptr
+ copy_exception(_Ex __ex) noexcept
+ { return std::make_exception_ptr<_Ex>(__ex); }
+
+
+}
+
+}
+
+#pragma GCC visibility pop
+
+# 143 "/usr/include/c++/7/exception" 2 3
+# 1 "/usr/include/c++/7/bits/nested_exception.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#pragma GCC visibility push(default)
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 40 "/usr/include/c++/7/bits/nested_exception.h" 2 3
+# 1 "/usr/include/c++/7/bits/move.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 34 "/usr/include/c++/7/bits/move.h" 2 3
+# 1 "/usr/include/c++/7/bits/concept_check.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 36 "/usr/include/c++/7/bits/concept_check.h" 2 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 80 "/usr/include/c++/7/bits/concept_check.h" 3
+
+# 35 "/usr/include/c++/7/bits/move.h" 2 3
+
+namespace std __attribute__ ((__visibility__ ("default")))
+{
+
+
+
+
+
+
+
+ template
+ inline constexpr _Tp*
+ __addressof(_Tp& __r) noexcept
+ { return __builtin_addressof(__r); }
+
+
+}
+
+# 1 "/usr/include/c++/7/type_traits" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 1 "/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/include/bits/c++config.h" 1 3
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# 39 "/usr/include/c++/7/type_traits" 2 3
+
+
+
+namespace std
+{
+ typedef short unsigned int uint_least16_t;
+ typedef unsigned int uint_least32_t;
+}
+
+
+
+
+
+namespace std __attribute__ ((__visibility__ ("default")))
+{
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ template
+ struct integral_constant
+ {
+ static constexpr _Tp value = __v;
+ typedef _Tp value_type;
+ typedef integral_constant<_Tp, __v> type;
+ constexpr operator value_type() const noexcept { return value; }
+
+
+
+
+ constexpr value_type operator()() const noexcept { return value; }
+
+ };
+
+ template
+ constexpr _Tp integral_constant<_Tp, __v>::value;
+
+
+ typedef integral_constant true_type;
+
+
+ typedef integral_constant false_type;
+
+ template
+ using __bool_constant = integral_constant;
+
+
+
+
+
+
+
+
+
+ template
+ struct conditional;
+
+ template
+ struct __or_;
+
+ template<>
+ struct __or_<>
+ : public false_type
+ { };
+
+ template
+ struct __or_<_B1>
+ : public _B1
+ { };
+
+ template
+ struct __or_<_B1, _B2>
+ : public conditional<_B1::value, _B1, _B2>::type
+ { };
+
+ template
+ struct __or_<_B1, _B2, _B3, _Bn...>
+ : public conditional<_B1::value, _B1, __or_<_B2, _B3, _Bn...>>::type
+ { };
+
+ template
+ struct __and_;
+
+ template<>
+ struct __and_<>
+ : public true_type
+ { };
+
+ template
+ struct __and_<_B1>
+ : public _B1
+ { };
+
+ template
+ struct __and_<_B1, _B2>
+ : public conditional<_B1::value, _B2, _B1>::type
+ { };
+
+ template
+ struct __and_<_B1, _B2, _B3, _Bn...>
+ : public conditional<_B1::value, __and_<_B2, _B3, _Bn...>, _B1>::type
+ { };
+
+ template
+ struct __not_
+ : public __bool_constant
+ { };
+
+# 186 "/usr/include/c++/7/type_traits" 3
+
+
+
+
+
+
+ template
+ struct __success_type
+ { typedef _Tp type; };
+
+ struct __failure_type
+ { };
+
+
+
+ template
+ struct remove_cv;
+
+ template
+ struct __is_void_helper
+ : public false_type { };
+
+ template<>
+ struct __is_void_helper
+ : public true_type { };
+
+
+ template
+ struct is_void
+ : public __is_void_helper::type>::type
+ { };
+
+ template
+ struct __is_integral_helper
+ : public false_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+
+
+
+
+ template<>
+ struct __is_integral_helper<__int128>
+ : public true_type { };
+
+ template<>
+ struct __is_integral_helper
+ : public true_type { };
+# 322 "/usr/include/c++/7/type_traits" 3
+
+
+ template
+ struct is_integral
+ : public __is_integral_helper::type>::type
+ { };
+
+ template
+ struct __is_floating_point_helper
+ : public false_type { };
+
+ template<>
+ struct __is_floating_point_helper
+ : public true_type { };
+
+ template<>
+ struct __is_floating_point_helper
+ : public true_type { };
+
+ template<>
+ struct __is_floating_point_helper
+ : public true_type { };
+
+
+
+
+
+
+
+
+ template
+ struct is_floating_point
+ : public __is_floating_point_helper::type>::type
+ { };
+
+
+ template
+ struct is_array
+ : public false_type { };
+
+ template
+ struct is_array<_Tp[_Size]>
+ : public true_type { };
+
+ template
+ struct is_array<_Tp[]>
+ : public true_type { };
+
+ template
+ struct __is_pointer_helper
+ : public false_type { };
+
+ template
+ struct __is_pointer_helper<_Tp*>
+ : public true_type { };
+
+
+ template
+ struct is_pointer
+ : public __is_pointer_helper::type>::type
+ { };
+
+
+ template
+ struct is_lvalue_reference
+ : public false_type { };
+
+ template
+ struct is_lvalue_reference<_Tp&>
+ : public true_type { };
+
+
+ template
+ struct is_rvalue_reference
+ : public false_type { };
+
+ template
+ struct is_rvalue_reference<_Tp&&>
+ : public true_type { };
+
+ template
+ struct is_function;
+
+ template
+ struct __is_member_object_pointer_helper
+ : public false_type { };
+
+ template
+ struct __is_member_object_pointer_helper<_Tp _Cp::*>
+ : public integral_constant::value> { };
+
+
+ template
+ struct is_member_object_pointer
+ : public __is_member_object_pointer_helper<
+ typename remove_cv<_Tp>::type>::type
+ { };
+
+ template
+ struct __is_member_function_pointer_helper
+ : public false_type { };
+
+ template
+ struct __is_member_function_pointer_helper<_Tp _Cp::*>
+ : public integral_constant::value> { };
+
+
+ template
+ struct is_member_function_pointer
+ : public __is_member_function_pointer_helper<
+ typename remove_cv<_Tp>::type>::type
+ { };
+
+
+ template
+ struct is_enum
+ : public integral_constant
+ { };
+
+
+ template
+ struct is_union
+ : public integral_constant
+ { };
+
+
+ template
+ struct is_class
+ : public integral_constant
+ { };
+
+
+ template
+ struct is_function
+ : public false_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) & >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) && >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) & >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) && >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) const >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) const & >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) const && >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) const >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) const & >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) const && >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) volatile >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) volatile & >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) volatile && >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) volatile >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) volatile & >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) volatile && >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) const volatile >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) const volatile & >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes...) const volatile && >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) const volatile >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) const volatile & >
+ : public true_type { };
+
+ template
+ struct is_function<_Res(_ArgTypes......) const volatile && >
+ : public true_type { };
+
+
+
+ template
+ struct __is_null_pointer_helper
+ : public false_type { };
+
+ template<>
+ struct __is_null_pointer_helper
+ : public true_type { };
+
+
+ template
+ struct is_null_pointer
+ : public __is_null_pointer_helper::type>::type
+ { };
+
+
+ template
+ struct __is_nullptr_t
+ : public is_null_pointer<_Tp>
+ { };
+
+
+
+
+ template
+ struct is_reference
+ : public __or_,
+ is_rvalue_reference<_Tp>>::type
+ { };
+
+
+ template
+ struct is_arithmetic
+ : public __or_, is_floating_point<_Tp>>::type
+ { };
+
+
+ template
+ struct is_fundamental
+ : public __or_, is_void<_Tp>,
+ is_null_pointer<_Tp>>::type
+ { };
+
+
+ template
+ struct is_object
+ : public __not_<__or_, is_reference<_Tp>,
+ is_void<_Tp>>>::type
+ { };
+
+ template
+ struct is_member_pointer;
+
+
+ template
+ struct is_scalar
+ : public __or_, is_enum<_Tp>, is_pointer<_Tp>,
+ is_member_pointer<_Tp>, is_null_pointer<_Tp>>::type
+ { };
+
+
+ template
+ struct is_compound
+ : public integral_constant::value> { };
+
+ template
+ struct __is_member_pointer_helper
+ : public false_type { };
+
+ template
+ struct __is_member_pointer_helper<_Tp _Cp::*>
+ : public true_type { };
+
+
+ template
+ struct is_member_pointer
+ : public __is_member_pointer_helper::type>::type
+ { };
+
+
+
+ template
+ struct __is_referenceable
+ : public __or_, is_reference<_Tp>>::type
+ { };
+
+ template
+ struct __is_referenceable<_Res(_Args...) >
+ : public true_type
+ { };
+
+ template
+ struct __is_referenceable<_Res(_Args......) >
+ : public true_type
+ { };
+
+
+
+
+ template
+ struct is_const
+ : public false_type { };
+
+ template
+ struct is_const<_Tp const>
+ : public true_type { };
+
+
+ template
+ struct is_volatile
+ : public false_type { };
+
+ template
+ struct is_volatile<_Tp volatile>
+ : public true_type { };
+
+
+ template
+ struct is_trivial
+ : public integral_constant
+ { };
+
+
+ template
+ struct is_trivially_copyable
+ : public integral_constant
+ { };
+
+
+ template
+ struct is_standard_layout
+ : public integral_constant
+ { };
+
+
+
+ template
+ struct is_pod
+ : public integral_constant
+ { };
+
+
+ template
+ struct is_literal_type
+ : public integral_constant
+ { };
+
+
+ template