Skip to content

Commit

Permalink
add SHMEM C transpose with alltoall
Browse files Browse the repository at this point in the history
  • Loading branch information
mrogowski authored and jeffhammond committed Jul 31, 2023
1 parent f9adef3 commit 2ed3324
Show file tree
Hide file tree
Showing 2 changed files with 355 additions and 1 deletion.
3 changes: 2 additions & 1 deletion MPI1/Transpose/transpose-a2a.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
Copyright (c) 2013, Intel Corporation
Copyright (c) 2023
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -136,7 +137,7 @@ int main(int argc, char ** argv)
int error; /* error flag */
double * RESTRICT A_p; /* original matrix column block */
double * RESTRICT B_p; /* transposed matrix column block */
double * RESTRICT T_p; /* original matrix column block */
double * RESTRICT T_p;
double abserr, /* absolute error */
abserr_tot; /* aggregate absolute error */
double epsilon = 1.e-8; /* error tolerance */
Expand Down
353 changes: 353 additions & 0 deletions SHMEM/Transpose/transpose_a2a.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,353 @@
/*
Copyright (c) 2013, Intel Corporation
Copyright (c) 2023
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/

/*******************************************************************
NAME: transpose
PURPOSE: This program tests the efficiency with which a square matrix
can be transposed and stored in another matrix. The matrices
are distributed identically.
USAGE: Program inputs are the matrix order, the number of times to
repeat the operation, and the communication mode
transpose <# iterations> <matrix order> [tile size]
An optional parameter specifies the tile size used to divide the
individual matrix blocks for improved cache and TLB performance.
The output consists of diagnostics to make sure the
transpose worked and timing statistics.
FUNCTIONS CALLED:
Other than SHMEM or standard C functions, the following
functions are used in this program:
wtime() Portable wall-timer interface.
bail_out() Determine global error and exit if nonzero.
HISTORY: Written by Tom St. John, July 2015.
Rob vdW: Fixed race condition on synchronization flags, August 2015
Marcin Rogowski: shmem_alltoall, July 2023
*******************************************************************/

/******************************************************************
Layout nomenclature
-------------------
o Each rank owns one block of columns (Colblock) of the overall
matrix to be transposed, as well as of the transposed matrix.
o Colblock is stored contiguously in the memory of the rank.
The stored format is column major, which means that matrix
elements (i,j) and (i+1,j) are adjacent, and (i,j) and (i,j+1)
are "order" words apart
o Colblock is logically composed of #ranks Blocks, but a Block is
not stored contiguously in memory. Conceptually, the Block is
the unit of data that gets communicated between ranks. Block i of
rank j is locally transposed and gathered into a buffer called Work,
which is sent to rank i, where it is scattered into Block j of the
transposed matrix.
o The original and transposed matrices are called A and B
-----------------------------------------------------------------
| | | | |
| Colblock | | | |
| | | | |
| | | | |
| | | | |
| ------------------------------- |
| | | | |
| | Block | | |
| | | | |
| | | | |
| | | | |
| ------------------------------- |
| | | | |
| | | | Overall Matrix |
| | | | |
| | | | |
| | | | |
| ------------------------------- |
| | | | |
| | | | |
| | | | |
| | | | |
| | | | |
-----------------------------------------------------------------*/

#include <par-res-kern_general.h>
#include <par-res-kern_shmem.h>

#include <math.h>

int main(int argc, char ** argv)
{
long Block_order; /* number of columns owned by rank */
int Block_size; /* size of a single block */
int Colblock_size; /* size of column block */
int Num_procs; /* number of ranks */
int order; /* order of overall matrix */
int send_to, recv_from; /* ranks with which to communicate */
long bytes; /* combined size of matrices */
int my_ID; /* rank */
int root=0; /* rank of root */
int iterations; /* number of times to do the transpose */
long i, j, it, jt, istart;/* dummies */
int iter; /* index of iteration */
int phase; /* phase inside staged communication */
int colstart; /* starting column for owning rank */
int error; /* error flag */
double * RESTRICT A_p; /* original matrix column block */
double * RESTRICT B_p; /* transposed matrix column block */
double * RESTRICT T_p;
double epsilon = 1.e-8; /* error tolerance */
double avgtime; /* timing parameters */
long *pSync_bcast; /* work space for collectives */
long *pSync_reduce; /* work space for collectives */
double *pWrk; /* work space for SHMEM collectives */
double *local_trans_time,
*trans_time; /* timing parameters */
double *abserr,
*abserr_tot; /* local and aggregate error */
#if !BARRIER_SYNCH
int *recv_flag; /* synchronization flags: data received */
int *send_flag; /* synchronization flags: receiver ready */
#endif
int *arguments; /* command line arguments */

/*********************************************************************
** Initialize the SHMEM environment
*********************************************************************/

prk_shmem_init();
my_ID=prk_shmem_my_pe();
Num_procs=prk_shmem_n_pes();

if (my_ID == root) {
printf("Parallel Research Kernels version %s\n", PRKVERSION);
printf("SHMEM matrix transpose: B = A^T\n");
}

// initialize sync variables for error checks
pSync_bcast = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long));
pSync_reduce = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long));
pWrk = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
arguments = (int *) prk_shmem_align(prk_get_alignment(),3*sizeof(int));
abserr = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double));
if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time ||
!trans_time || !arguments || !abserr) {
printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID);
error = 1;
goto ENDOFTESTS;
}

for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++)
pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE;

for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++)
pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE;

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
error = 0;
if (my_ID == root) {
if (argc != 3){
printf("Usage: %s <# iterations> <matrix order>\n",
*argv);
error = 1; goto ENDOFTESTS;
}

iterations = atoi(*++argv);
arguments[0]=iterations;
if(iterations < 1){
printf("ERROR: iterations must be >= 1 : %d \n",iterations);
error = 1; goto ENDOFTESTS;
}

order = atoi(*++argv);
arguments[1]=order;
if (order < Num_procs) {
printf("ERROR: matrix order %d should at least # procs %d\n",
order, Num_procs);
error = 1; goto ENDOFTESTS;
}
if (order%Num_procs) {
printf("ERROR: matrix order %d should be divisible by # procs %d\n",
order, Num_procs);
error = 1; goto ENDOFTESTS;
}

ENDOFTESTS:;
}
bail_out(error);

if (my_ID == root) {
printf("Number of ranks = %d\n", Num_procs);
printf("Matrix order = %d\n", order);
printf("Number of iterations = %d\n", iterations);
}

/* Broadcast input data to all ranks */
shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast);
shmem_barrier_all();

iterations=arguments[0];
order=arguments[1];

shmem_barrier_all();
prk_shmem_free(arguments);

bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a
** rank. Each column block is made up of Num_procs smaller square
** blocks of order block_order.
*********************************************************************/

Block_order = order/Num_procs;
colstart = Block_order * my_ID;
Colblock_size = order * Block_order;
Block_size = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the row block of the
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
T_p = (double *)prk_malloc(Colblock_size*sizeof(double));
if (A_p == NULL){
printf(" Error allocating space for original matrix on node %d\n",my_ID);
error = 1;
}
bail_out(error);

B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
if (B_p == NULL){
printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
error = 1;
}
bail_out(error);

/* Fill the original column matrices */
for (i=0;i<order; i++)
for (j=0;j<Block_order;j++)
{
A_p[i*Block_order+j] = (double) (my_ID*Block_order + i*order +j);
B_p[i*Block_order+j] = 0.0;
T_p[i*Block_order+j] = 0.0;
}

shmem_barrier_all();

for (iter = 0; iter<=iterations; iter++){

/* start timer after a warmup iteration */
if (iter == 1) {
shmem_barrier_all();
local_trans_time[0] = wtime();
}

shmem_barrier_all();
shmem_double_alltoall(SHMEM_TEAM_WORLD, T_p, A_p, Block_order*Block_order);

for (phase=0; phase<Num_procs; phase++){
int lo = Block_order*Block_order*phase;

for (i = 0; i < Block_order; i++)
for (j = 0; j < Block_order; j++)
B_p[lo + i + Block_order * j] += T_p[lo + j + Block_order * i];
} /* end of phase loop */

for (j=0; j<Colblock_size; j++)
{
A_p[j] += 1.0;
}

} /* end of iterations */

shmem_barrier_all();
local_trans_time[0] = wtime() - local_trans_time[0];

shmem_barrier_all();
shmem_double_max_to_all(trans_time, local_trans_time, 1, 0, 0, Num_procs, pWrk, pSync_reduce);

abserr[0] = 0.0;
double addit = 0.5 * ( (iterations+1.) * (double)iterations );
for (i=0;i<order; i++)
{
for (j=0;j<Block_order;j++)
{
const double temp = (order*(my_ID*Block_order+j)+i) * (iterations+1) + addit;
abserr[0] += ABS(B_p[i*Block_order+j] - temp);
}
}

shmem_barrier_all();
shmem_double_sum_to_all(&(abserr[1]), &(abserr[0]), 1, 0, 0, Num_procs, pWrk, pSync_reduce);

if (abserr[1] <= epsilon) {
avgtime = trans_time[0]/(double)iterations;
if (my_ID == root) {
printf("Solution validates\n");
printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
printf("Summed errors: %30.15lf \n", abserr[1]);
#endif
}
} else {
error = 1;
if (my_ID == root) {
printf("ERROR: Aggregate squared error %30.15lf exceeds threshold %30.15lf\n", abserr[1], epsilon);
}
fflush(stdout);
printf("ERROR: PE=%d, error = %30.15lf\n", my_ID, abserr[0]);
}

bail_out(error);

prk_shmem_free(pSync_bcast);
prk_shmem_free(pSync_reduce);
prk_shmem_free(pWrk);

prk_shmem_finalize();
exit(EXIT_SUCCESS);

} /* end of main */

0 comments on commit 2ed3324

Please sign in to comment.