Skip to content

Commit

Permalink
Change the counters used in TriSolve communication from 'int_t' to 'int': fmod[], frecv[], bmod[], brecv[], etc. in xLocalLU_t{..} structure.
Browse files Browse the repository at this point in the history

Change from 'int_t' to 'int' for several variable/arrays in mc64ad_dist.c.
  • Loading branch information
xiaoyeli committed Jan 24, 2022
1 parent becc2d9 commit 3681656
Show file tree
Hide file tree
Showing 71 changed files with 2,327 additions and 2,159 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ jobs:
-DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE" \
-DTPL_BLAS_LIBRARIES="$BLAS_LIB" \
-DTPL_LAPACK_LIBRARIES="$LAPACK_LIB" \
-Denable_blaslib=OFF \
-DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
-DBUILD_SHARED_LIBS=OFF \
-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_CXX_COMPILER=mpic++ \
Expand Down
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ install:
-DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE" \
-DTPL_BLAS_LIBRARIES="$BLAS_LIB" \
-DTPL_LAPACK_LIBRARIES="$LAPACK_LIB" \
-Denable_blaslib=OFF \
-DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
-DBUILD_SHARED_LIBS=OFF \
-DCMAKE_C_COMPILER=mpicc \
-DCMAKE_CXX_COMPILER=mpic++ \
Expand Down
7 changes: 4 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -194,9 +194,10 @@ if (TPL_ENABLE_CUDALIB) ## want to use cuda
# find_package(CUB REQUIRED)

find_package(CUDAToolkit REQUIRED)
if(CUDAToolkit_FOUND)
target_link_libraries(superlu PUBLIC CUDA::cudart CUDA::cusolver CUDA::cublas)
endif()
# The following appears in SRC/CMakeLists.txt
# if(CUDAToolkit_FOUND)
# target_link_libraries(superlu_dist PUBLIC CUDA::cudart CUDA::cusolver CUDA::cublas)
# endif()
message("-- CUDAToolkit_LIBRARY_ROOT='${CUDAToolkit_LIBRARY_ROOT}'")
if (NOT "${CUDAToolkit_LIBRARY_ROOT}" STREQUAL "")
set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so")
Expand Down
10 changes: 3 additions & 7 deletions EXAMPLE/pddrive.c
Original file line number Diff line number Diff line change
Expand Up @@ -197,14 +197,10 @@ int main(int argc, char *argv[])
options.DiagInv = NO;
*/
set_default_options_dist(&options);
// options.IterRefine = NOREFINE;
// options.DiagInv = YES;
// options.ReplaceTinyPivot = YES;

// options.Equil = NO;
// options.ColPerm = NATURAL;
// options.RowPerm = NOROWPERM;
options.ParSymbFact = NO;
options.ColPerm = PARMETIS;
#if 0
options.ReplaceTinyPivot = YES;
options.RowPerm = LargeDiag_HWPM;
options.RowPerm = NOROWPERM;
options.IterRefine = NOREFINE;
Expand Down
2 changes: 2 additions & 0 deletions EXAMPLE/pzdrive.c
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ int main(int argc, char *argv[])
options.DiagInv = NO;
*/
set_default_options_dist(&options);
options.ParSymbFact = YES;
options.ColPerm = PARMETIS;
#if 0
options.RowPerm = NOROWPERM;
options.IterRefine = NOREFINE;
Expand Down
6 changes: 6 additions & 0 deletions SRC/comm_tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,16 @@
tree->empty_= NO; // non-empty if rank_cnt>1
if(precision=='d'){
MPI_Type_contiguous( sizeof(double), MPI_BYTE, &tree->type_ );
}
if(precision=='s'){
MPI_Type_contiguous( sizeof(float), MPI_BYTE, &tree->type_ );
}
if(precision=='z'){
MPI_Type_contiguous( sizeof(doublecomplex), MPI_BYTE, &tree->type_ );
}
//if(precision=='c'){
//MPI_Type_contiguous( sizeof(complex), MPI_BYTE, &tree->type_ );
//}
MPI_Type_commit( &tree->type_ );

int myIdx = 0;
Expand Down
6 changes: 3 additions & 3 deletions SRC/communication_aux.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,17 +192,17 @@ int_t Test_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT)
/*
* The following are from trfCommWrapper.c.
*/
int_t Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req,
int Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req,
MPI_Request *L_diag_blk_send_req,
gridinfo_t *grid, SCT_t *SCT)
{
// Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
// LocalLU_t *Llu = LUstruct->Llu;
// int_t* xsup = Glu_persist->xsup;

int_t iam = grid->iam;
int iam = grid->iam;

int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
int pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);

if (iam == pkk)
{
Expand Down
66 changes: 34 additions & 32 deletions SRC/ddistribute.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,19 +135,20 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A,
int *ToRecv, *ToSendD, **ToSendR;

/*-- Counts to be used in lower triangular solve. --*/
int_t *fmod; /* Modification count for L-solve. */
int_t **fsendx_plist; /* Column process list to send down Xk. */
int_t nfrecvx = 0; /* Number of Xk I will receive. */
int_t nfsendx = 0; /* Number of Xk I will send */
int_t kseen;
int *fmod; /* Modification count for L-solve. */
int **fsendx_plist; /* Column process list to send down Xk. */
int nfrecvx = 0; /* Number of Xk I will receive. */
int nfsendx = 0; /* Number of Xk I will send */
int kseen;

/*-- Counts to be used in upper triangular solve. --*/
int_t *bmod; /* Modification count for U-solve. */
int_t **bsendx_plist; /* Column process list to send down Xk. */
int_t nbrecvx = 0; /* Number of Xk I will receive. */
int_t nbsendx = 0; /* Number of Xk I will send */
int_t *ilsum; /* starting position of each supernode in
the full array (local) */
int *bmod; /* Modification count for U-solve. */
int **bsendx_plist; /* Column process list to send down Xk. */
int nbrecvx = 0; /* Number of Xk I will receive. */
int nbsendx = 0; /* Number of Xk I will send */

int_t *ilsum; /* starting position of each supernode in
the full array (local) */

/*-- Auxiliary arrays; freed on return --*/
int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */
Expand All @@ -173,8 +174,9 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A,
int_t iword, dword;
float mem_use = 0.0;

int_t *mod_bit;
int_t *frecv, *brecv, *lloc;
int *mod_bit;
int *frecv, *brecv;
int_t *lloc;
double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
double *Linv_bc_dat; /* size sum of sizes of Linv_bc_ptr[lk]) */
long int *Linv_bc_offset; /* size ceil(NSUPERS/Pc) */
Expand Down Expand Up @@ -374,7 +376,7 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A,
ABORT("Malloc fails for ToSendR[].");
j = k * grid->npcol;
if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) )
ABORT("Malloc fails for index[].");
ABORT("Malloc fails for index1[].");

mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword;

Expand Down Expand Up @@ -534,9 +536,9 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A,
ABORT("Calloc fails for SPA dense[].");

/* These counts will be used for triangular solves. */
if ( !(fmod = intCalloc_dist(k)) )
if ( !(fmod = int32Calloc_dist(k)) )
ABORT("Calloc fails for fmod[].");
if ( !(bmod = intCalloc_dist(k)) )
if ( !(bmod = int32Calloc_dist(k)) )
ABORT("Calloc fails for bmod[].");
#if ( PRNTlevel>=1 )
mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword;
Expand Down Expand Up @@ -598,23 +600,23 @@ ddistribute(fact_t fact, int_t n, SuperMatrix *A,
ABORT("Malloc fails for Unnz[].");

/* These lists of processes will be used for triangular solves. */
if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
if ( !(fsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
ABORT("Malloc fails for fsendx_plist[].");
len = k * grid->nprow;
if ( !(index = intMalloc_dist(len)) )
if ( !(index1 = int32Malloc_dist(len)) )
ABORT("Malloc fails for fsendx_plist[0]");
for (i = 0; i < len; ++i) index[i] = EMPTY;
for (i = 0; i < len; ++i) index1[i] = EMPTY;
for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
fsendx_plist[i] = &index[j];
if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
fsendx_plist[i] = &index1[j];
if ( !(bsendx_plist = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
ABORT("Malloc fails for bsendx_plist[].");
if ( !(index = intMalloc_dist(len)) )
if ( !(index1 = int32Malloc_dist(len)) )
ABORT("Malloc fails for bsendx_plist[0]");
for (i = 0; i < len; ++i) index[i] = EMPTY;
for (i = 0; i < len; ++i) index1[i] = EMPTY;
for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
bsendx_plist[i] = &index[j];
bsendx_plist[i] = &index1[j];

mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*sizeof(int);

/*------------------------------------------------------------
PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
Expand Down Expand Up @@ -1346,9 +1348,9 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
/* construct the Reduce tree for L ... */
/* the following is used as reference */
nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(mod_bit = intMalloc_dist(nlb)) )
if ( !(mod_bit = int32Malloc_dist(nlb)) )
ABORT("Malloc fails for mod_bit[].");
if ( !(frecv = intMalloc_dist(nlb)) )
if ( !(frecv = int32Malloc_dist(nlb)) )
ABORT("Malloc fails for frecv[].");

for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
Expand All @@ -1363,7 +1365,7 @@ if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t);
}
/* Every process receives the count, but it is only useful on the
diagonal processes. */
MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
MPI_Allreduce( mod_bit, frecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);



Expand Down Expand Up @@ -1678,9 +1680,9 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
/* construct the Reduce tree for U ... */
/* the following is used as reference */
nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(mod_bit = intMalloc_dist(nlb)) )
if ( !(mod_bit = int32Malloc_dist(nlb)) )
ABORT("Malloc fails for mod_bit[].");
if ( !(brecv = intMalloc_dist(nlb)) )
if ( !(brecv = int32Malloc_dist(nlb)) )
ABORT("Malloc fails for brecv[].");

for (k = 0; k < nlb; ++k) mod_bit[k] = 0;
Expand All @@ -1695,7 +1697,7 @@ if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t);
}
/* Every process receives the count, but it is only useful on the
diagonal processes. */
MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm);
MPI_Allreduce( mod_bit, brecv, nlb, MPI_INT, MPI_SUM, grid->rscp.comm);



Expand Down Expand Up @@ -2022,7 +2024,7 @@ if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t);
SUPERLU_FREE(dense);

k = CEILING( nsupers, grid->nprow );/* Number of local block rows */
if ( !(Llu->mod_bit = intMalloc_dist(k)) )
if ( !(Llu->mod_bit = int32Malloc_dist(k)) )
ABORT("Malloc fails for mod_bit[].");

/* Find the maximum buffer size. */
Expand Down
16 changes: 9 additions & 7 deletions SRC/dldperm_dist.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ at the top-level directory.

#include "superlu_ddefs.h"

extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [],
int_t*, int_t [], int_t*, int_t[], int_t*, double [],
int_t [], int_t []);
extern int mc64ad_dist(int *job, int *n, int_t *ne, int_t *ip,
int_t *irn, double *a, int *num, int_t *cperm,
int_t *liw, int_t *iw, int_t *ldw, double *dw,
int * icntl, int *info);

/*! \brief
*
Expand Down Expand Up @@ -92,11 +93,12 @@ extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [],
*/

int
dldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[],
dldperm_dist(int job, int n, int_t nnz, int_t colptr[], int_t adjncy[],
double nzval[], int_t *perm, double u[], double v[])
{
int_t i, liw, ldw, num;
int_t *iw, icntl[10], info[10];
int i, num, icntl[10], info[10];
int_t liw, ldw;
int_t *iw;
double *dw;
extern double *doubleMalloc_dist(int_t);

Expand Down Expand Up @@ -147,7 +149,7 @@ dldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[],
printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num);
#endif
if ( info[0] == 1 ) { /* Structurally singular */
printf(".. The last " IFMT " permutations:\n", n-num);
printf(".. The last %d permutations:\n", n-num);
PrintInt10("perm", n-num, &perm[num]);
}

Expand Down
5 changes: 2 additions & 3 deletions SRC/dlustruct_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

#ifdef GPU_ACC // enable GPU
#include "gpu_api_utils.h"

// #include "mkl.h"
// #include "sec_structs.h"
// #include "supernodal_etree.h"
Expand Down Expand Up @@ -120,9 +119,9 @@ typedef struct //LUstruct_gpu_
double tHost_PCIeH2D;
double tHost_PCIeD2H;

/*gpu events to measure DGEMM and SCATTER timing */
/*GPU events to measure DGEMM and SCATTER timing */
int *isOffloaded; /*stores if any iteration is offloaded or not*/
gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*gpu events to store gemm and scatter's begin and end*/
gpuEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*GPU events to store gemm and scatter's begin and end*/
gpuEvent_t *ePCIeH2D;
gpuEvent_t *ePCIeD2H_Start;
gpuEvent_t *ePCIeD2H_End;
Expand Down
Loading

0 comments on commit 3681656

Please sign in to comment.