Skip to content

Commit

Permalink
Updated zScatter_B3d() to support uneven block row partition of {A,B}…
Browse files Browse the repository at this point in the history
… in 3D interface. Only complex16 is implemented.
  • Loading branch information
xiaoyeli committed Sep 21, 2021
1 parent 7375880 commit fd71062
Show file tree
Hide file tree
Showing 36 changed files with 386 additions and 74 deletions.
8 changes: 4 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
option(enable_doc "Build doxygen documentation" OFF)
option(enable_double "Enable double precision library" ON)
option(enable_single "Enable single precision library" OFF)
option(enable_complex16 "Enable complex16 precision library" OFF)
option(enable_complex16 "Enable complex16 precision library" ON)
option(enable_tests "Build tests" ON)
option(enable_examples "Build examples" ON)
option(XSDK_ENABLE_Fortran "Enable Fortran" ON)
Expand Down Expand Up @@ -224,9 +224,9 @@ if (enable_openmp)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}")
# On edison, OpenMP_EXE_LINKER_FLAGS is empty
# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
# The following causes problems with cmake/3.20.+
# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
message("-- OpenMP_EXE_LINKER_FLAGS='${OpenMP_EXE_LINKER_FLAGS}'")
message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'")
endif()
Expand Down
2 changes: 1 addition & 1 deletion EXAMPLE/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o
ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o
ZEXM3 = pzdrive3.o zcreate_matrix.o
ZEXM4 = pzdrive4.o zcreate_matrix.o
ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o
ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o #znrformat_loc3d.o
ZEXM3D1 = pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o
ZEXM3D2 = pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d.o
ZEXM3D3 = pzdrive3d3.o zcreate_matrix.o zcreate_matrix3d.o
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pddrive1.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
Expand Down
6 changes: 6 additions & 0 deletions EXAMPLE/pddrive1_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );
#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif

/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pddrive2_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pddrive3_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pddrive4.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
if ( nprocs < 10 ) {
fprintf(stderr, "Requires at least 10 processes\n");
Expand Down
6 changes: 6 additions & 0 deletions EXAMPLE/pddrive4_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );
#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
if ( nprocs < 10 ) {
fprintf(stderr, "Requires at least 10 processes\n");
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pddrive_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pddrive_spawn.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ int main(int argc, char *argv[])
//MPI_Init( &argc, &argv );
MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
MPI_Comm_get_parent(&parent);

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif


#if ( VAMPIR>=1 )
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pzdrive.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,12 @@ int main(int argc, char *argv[])
------------------------------------------------------------*/
//MPI_Init( &argc, &argv );
MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif

#if ( VAMPIR>=1 )
VT_traceoff();
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pzdrive1.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pzdrive1_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pzdrive2_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pzdrive3_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
Expand Down
1 change: 1 addition & 0 deletions EXAMPLE/pzdrive3d.c
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ main (int argc, char *argv[])
INITIALIZE THE SUPERLU PROCESS GRID.
------------------------------------------------------------ */
superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
// grid.rankorder = 1;

if(grid.iam==0) {
MPI_Query_thread(&omp_mpi_level);
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pzdrive4.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
if ( nprocs < 10 ) {
fprintf(stderr, "Requires at least 10 processes\n");
Expand Down
6 changes: 6 additions & 0 deletions EXAMPLE/pzdrive4_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );
#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
if ( nprocs < 10 ) {
fprintf(stderr, "Requires at least 10 processes\n");
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pzdrive_ABglobal.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,12 @@ int main(int argc, char *argv[])
INITIALIZE MPI ENVIRONMENT.
------------------------------------------------------------*/
MPI_Init( &argc, &argv );

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif
/* Parse command line argv[]. */
for (cpp = argv+1; *cpp; ++cpp) {
if ( **cpp == '-' ) {
Expand Down
7 changes: 6 additions & 1 deletion EXAMPLE/pzdrive_spawn.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ int main(int argc, char *argv[])
//MPI_Init( &argc, &argv );
MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
MPI_Comm_get_parent(&parent);

#ifdef GPU_ACC
int rank, devs;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
cudaGetDeviceCount(&devs);
cudaSetDevice(rank % devs);
#endif


#if ( VAMPIR>=1 )
Expand Down
51 changes: 39 additions & 12 deletions EXAMPLE/zcreate_matrix3d.c
Original file line number Diff line number Diff line change
Expand Up @@ -341,17 +341,44 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs,
nzval[0] = 0.1;
#endif

/* Compute the number of rows to be distributed to local process */
m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep);
m_loc_fst = m_loc;
/* When m / procs is not an integer */
if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m)
{
/*m_loc = m_loc+1;
m_loc_fst = m_loc;*/
if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/
m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1);
}
// /* Compute the number of rows to be distributed to local process */
// m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep);
// m_loc_fst = m_loc;
// /* When m / procs is not an integer */
// if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m)
// {
// /*m_loc = m_loc+1;
// m_loc_fst = m_loc;*/
// if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/
// m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1);
// }

switch(iam) {
case 0:
m_loc=111; fst_row=0;
break;
case 1:
m_loc=84; fst_row=111;
break;
case 2:
m_loc=108; fst_row=195;
break;
case 3:
m_loc=84; fst_row=303;
break;
case 4:
m_loc=108; fst_row=387;
break;
case 5:
m_loc=84; fst_row=495;
break;
case 6:
m_loc=108; fst_row=579;
break;
case 7:
m_loc=84; fst_row=687;
break;
}

/* Create compressed column matrix for GA. */
zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
Expand Down Expand Up @@ -379,7 +406,7 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs,
for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]];
/* Set up row pointers */
rowptr[0] = 0;
fst_row = iam * m_loc_fst;
// fst_row = iam * m_loc_fst;
nnz_loc = 0;
for (j = 0; j < m_loc; ++j)
{
Expand Down
6 changes: 6 additions & 0 deletions SRC/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,12 @@ if(CUDAToolkit_FOUND) # this is found in top-level CMakeLists.txt
target_link_libraries(superlu_dist CUDA::cudart CUDA::cublas)
endif()

# This is recommended by modern cmake:
# https://cliutils.gitlab.io/modern-cmake/chapters/packages/OpenMP.html
if(OpenMP_FOUND) # this is found in top-level CMakeLists.txt
target_link_libraries(superlu_dist OpenMP::OpenMP_C)
endif()

target_compile_definitions(superlu_dist PRIVATE SUPERLU_DIST_EXPORTS)
if(MSVC AND BUILD_SHARED_LIBS)
set_target_properties(superlu_dist PROPERTIES
Expand Down
8 changes: 4 additions & 4 deletions SRC/dlustruct_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,17 @@ typedef struct //LUstruct_gpu_
int_t *LrowindPtr; /* A single vector */

double *LnzvalVec; /* A single vector */
int_t *LnzvalPtr; /* A single vector */
int_t *LnzvalPtr_host; /* A single vector */
int_t *LnzvalPtr; /* A single vector */
int_t *LnzvalPtr_host; /* A single vector */

int_t *UrowindVec; /* A single vector */
int_t *UrowindPtr; /* A single vector */
int_t *UrowindPtr_host; /* A single vector */
int_t *UnzvalPtr_host;

double *UnzvalVec; /* A single vector */
int_t *UnzvalPtr; /* A single vector */
int_t *UnzvalPtr; /* A single vector */

/*gpu pointers for easy block accesses */
local_l_blk_info_t *local_l_blk_infoVec;
int_t *local_l_blk_infoPtr;
Expand All @@ -109,7 +110,6 @@ typedef struct //LUstruct_gpu_
int_t *xsup;
gridinfo_t *grid;


double ScatterMOPCounter;
double ScatterMOPTimer;
double GemmFLOPCounter;
Expand Down
2 changes: 2 additions & 0 deletions SRC/dnrformat_loc3d.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ void dGatherNRformat_loc3d
nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));

/* Gathered to layer 0. Other procs do not have these counts */
MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
1, mpi_int_t, 0, grid3d->zscp.comm);
MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
Expand Down
Loading

0 comments on commit fd71062

Please sign in to comment.