From c7fc50f0179ed660ab71131924c1a78a7035d6df Mon Sep 17 00:00:00 2001 From: Steffen Christgau Date: Mon, 28 Feb 2022 13:49:50 +0100 Subject: [PATCH 1/3] Add support for stride and linked list initialization Currently limited to integer type initializations. --- bench/includes/allocator.h | 14 +++- bench/includes/test_types.h | 8 ++ bench/likwid-bench.c | 23 ++++++ bench/perl/generatePas.pl | 19 ++++- bench/perl/templates/testcases.tt | 2 +- bench/src/allocator.c | 123 +++++++++++++++++++++--------- bench/src/bench.c | 118 ++-------------------------- bench/src/ptt2asm.c | 60 +++++++++++++++ 8 files changed, 216 insertions(+), 151 deletions(-) diff --git a/bench/includes/allocator.h b/bench/includes/allocator.h index 658693d25..b8ff76b8a 100644 --- a/bench/includes/allocator.h +++ b/bench/includes/allocator.h @@ -31,6 +31,7 @@ #define ALLOCATOR_H #include +#include #include #include @@ -42,10 +43,21 @@ extern size_t allocator_dataTypeLength(DataType type); extern void allocator_allocateVector(void** ptr, int alignment, uint64_t size, - int offset, + off_t offset, DataType type, int stride, bstring domain, + InitMethod init_method, + uint64_t init_method_arg, int init_per_thread); +extern void allocator_initVector(void** ptr, + uint64_t size, + off_t offset, + DataType type, + int stride, + InitMethod init_method, + uint64_t init_method_arg, + bool fill); + #endif /*ALLOCATOR_H*/ diff --git a/bench/includes/test_types.h b/bench/includes/test_types.h index 652a7b6b3..aa356c8f7 100644 --- a/bench/includes/test_types.h +++ b/bench/includes/test_types.h @@ -40,6 +40,12 @@ typedef enum { DOUBLE, INT} DataType; +typedef enum { + CONSTANT_ONE = 0, /* fill stream with 1 of the employed data type (default) */ + INDEX_STRIDE, /* fill stream with the index + stride inside the stream modulo the stream size using the employed data type */ + LINKED_LIST /* create linked list for pointer chasing */ +} InitMethod; + typedef enum { STREAM_0 = 0, STREAM_1 = 1, @@ -97,6 +103,8 @@ 
typedef struct { int instr_const; int instr_loop; int uops; + InitMethod init_method; + uint64_t init_arg; int loadstores; void* dlhandle; } TestCase; diff --git a/bench/likwid-bench.c b/bench/likwid-bench.c index 0105a9c3b..8d4d902c0 100644 --- a/bench/likwid-bench.c +++ b/bench/likwid-bench.c @@ -333,6 +333,19 @@ int main(int argc, char** argv) { ownprintf("Loop micro Ops (\u03BCOPs): %d\n",test->uops); } + ownprintf("Initialization: "); + switch (test->init_method) + { + case CONSTANT_ONE: + ownprintf("constant, all ones\n"); + break; + case INDEX_STRIDE: + ownprintf("(index + stride) %% size\n"); + break; + case LINKED_LIST: + ownprintf("linked list\n"); + break; + } } bdestroy(testcase); if (!builtin) @@ -524,6 +537,8 @@ int main(int argc, char** argv) test->type, test->stride, currentWorkgroup->streams[i].domain, + test->init_method, + test->init_arg, currentWorkgroup->init_per_thread && nrThreads > 1); } tmp++; @@ -776,6 +791,14 @@ int main(int argc, char** argv) ownprintf("UOPs:\t\t\t%" PRIu64 "\n", LLU_CAST ((double)realSize/test->stride)*test->uops*threads_data[0].data.iter); } + if (test->init_method == INDEX_STRIDE) + { + ownprintf("Initialization stride:\t%" PRIu64 "\n",test->init_arg); + } + else if (test->init_method == LINKED_LIST) + { + ownprintf("Linked list elem. 
size:\t%" PRIu64 "\n",test->init_arg); + } ownprintf(bdata(HLINE)); threads_destroy(numberOfWorkgroups, test->streams); diff --git a/bench/perl/generatePas.pl b/bench/perl/generatePas.pl index 9ce2240a1..3d0b841d5 100755 --- a/bench/perl/generatePas.pl +++ b/bench/perl/generatePas.pl @@ -40,6 +40,8 @@ my $name; my $streams; my $type; +my $init_method; +my $init_arg; my $flops; my $bytes; my $desc; @@ -122,6 +124,8 @@ $prolog=''; $loop=''; $desc=''; + $init_method='CONSTANT_ONE'; + $init_arg=''; $streams=1; my $loads=-1; my $stores=-1; @@ -140,6 +144,13 @@ } } elsif ($line =~ /TYPE[ ]+(SINGLE|DOUBLE|INT)/) { $type = $1; + } elsif ($line =~ /INIT[ ]+(CONST|INDEX_STRIDE|LINKED_LIST)?/p) { + $init_method = $1; + # translate to actual enum items + if ($init_method eq "CONST") { $init_method = "CONSTANT_ONE"; } + if (${^POSTMATCH} =~ /[ ]+([0-9]+)/) { + $init_arg = $1; + } } elsif ($line =~ /FLOPS[ ]+([0-9]+)/) { $flops = $1; } elsif ($line =~ /BYTES[ ]+([0-9]+)/) { @@ -156,7 +167,7 @@ $loop_instr = $1; } elsif ($line =~ /UOPS[ ]+([0-9]+)/) { $uops = $1; - } elsif ($line =~ /DESC[ ]+([a-zA-z ,.\-_\(\)\+\*\/=]+)/) { + } elsif ($line =~ /DESC[ ]+(.*)/) { $desc = $1; } elsif ($line =~ /INC[ ]+([0-9]+)/) { $increment = $1; @@ -196,6 +207,8 @@ $Vars->{skip} = $skip; $Vars->{multi} = $multi; $Vars->{desc} = $desc; + $Vars->{init_method} = $init_method; + $Vars->{init_arg} = $init_arg; #print Dumper($Vars); @@ -212,7 +225,9 @@ branches => $branches, instr_const => $instr, instr_loop => $loop_instr, - uops => $uops}); + uops => $uops, + init_method => $init_method, + init_arg => $init_arg}); } } #print Dumper(@Testcases); diff --git a/bench/perl/templates/testcases.tt b/bench/perl/templates/testcases.tt index ceaa23b3c..ed7efdc49 100644 --- a/bench/perl/templates/testcases.tt +++ b/bench/perl/templates/testcases.tt @@ -12,7 +12,7 @@ extern void [% test.name %](); static const TestCase kernels[NUMKERNELS] = { [% FOREACH test IN Testcases %] - {"[% test.name %]" , [% test.streams 
%], [% test.type %], [% test.stride %], &[% test.name %], [% test.flops %], [% test.bytes %], "[% test.desc %]", [% test.loads %], [% test.stores %], [% test.branches %], [% test.instr_const %], [% test.instr_loop %], [% test.uops %]}, + {"[% test.name %]" , [% test.streams %], [% test.type %], [% test.stride %], &[% test.name %], [% test.flops %], [% test.bytes %], "[% test.desc %]", [% test.loads %], [% test.stores %], [% test.branches %], [% test.instr_const %], [% test.instr_loop %], [% test.uops %], [% test.init_method %], [% test.init_arg %] }, [% END %] }; diff --git a/bench/src/allocator.c b/bench/src/allocator.c index 4771f3ccc..d78b90506 100644 --- a/bench/src/allocator.c +++ b/bench/src/allocator.c @@ -93,10 +93,12 @@ allocator_allocateVector( void** ptr, int alignment, uint64_t size, - int offset, + off_t offset, DataType type, int stride, bstring domainString, + InitMethod init_method, + uint64_t init_method_arg, int init_per_thread) { int i; @@ -155,7 +157,7 @@ allocator_allocateVector( numberOfAllocatedVectors++; affinity_pinProcess(domain->processorList[0]); - printf("Allocate: Process running on hwthread %d (Domain %s) - Vector length %llu/%llu Offset %d Alignment %llu\n", + printf("Allocate: Process running on hwthread %d (Domain %s) - Vector length %llu/%llu Offset %llu Alignment %llu\n", affinity_processGetProcessorId(), bdata(domain->tag), LLU_CAST size, @@ -165,48 +167,99 @@ allocator_allocateVector( if (!init_per_thread) { - switch ( type ) - { - case INT: - { - int* sptr = (int*) (*ptr); - sptr += offset; + allocator_initVector(ptr, size, offset, type, stride, init_method, init_method_arg, true); + } +} + +void allocator_initVector(void** ptr, + uint64_t size, + off_t offset, + DataType type, + int stride, + InitMethod init_method, + uint64_t init_method_arg, + bool fill) +{ + switch ( type ) + { + case INT: + { + int* iptr = (int*) (*ptr); + iptr += offset; + + switch ( init_method ) { + case CONSTANT_ONE: + for ( uint64_t i=0; fill && i < 
size; i++ ) + { + iptr[i] = 1; + } + break; + case INDEX_STRIDE: + for ( int64_t i=0; fill && i < size; i++ ) + { + iptr[i] = (int) ((i + stride) % size); + } + break; + case LINKED_LIST: + ; + /* init_method_arg is guaranteed to be a non-zero multiple of sizeof(int) for linked list items */ + const int64_t ll_int_item_size = init_method_arg / sizeof(int); + const int64_t ll_items = size / init_method_arg; - for ( uint64_t i=0; i < size; i++ ) - { - sptr[i] = 1; - } - *ptr = (void*) sptr; + for ( int64_t i=0; fill && i < ll_items; i++ ) + { + iptr[i * ll_int_item_size] = i * init_method_arg; + } + /* Use Sattolo's algorithm to create single-cycle permutation */ + struct drand48_data rng_state; + srand48_r(0, &rng_state); + + int64_t i = ll_items; + while ( i > 1 && fill ) + { + i--; + + long j; + mrand48_r(&rng_state, &j); + j = abs(j) % i; + + /* swap */ + const int tmp = iptr[i * ll_int_item_size]; + iptr[i * ll_int_item_size] = iptr[j * ll_int_item_size]; + iptr[j * ll_int_item_size] = tmp; + } + break; } - break; - case SINGLE: - { - float* sptr = (float*) (*ptr); - sptr += offset; + *ptr = (void*) iptr; + } + break; - for ( uint64_t i=0; i < size; i++ ) - { - sptr[i] = 1.0; - } - *ptr = (void*) sptr; + case SINGLE: + { + float* sptr = (float*) (*ptr); + sptr += offset; + for ( uint64_t i=0; fill && i < size; i++ ) + { + sptr[i] = 1.0; } - break; + *ptr = (void*) sptr; + } + break; + + case DOUBLE: + { + double* dptr = (double*) (*ptr); + dptr += offset; - case DOUBLE: + for ( uint64_t i=0; fill && i < size; i++ ) { - double* dptr = (double*) (*ptr); - dptr += offset; - - for ( uint64_t i=0; i < size; i++ ) - { - dptr[i] = 1.0; - } - *ptr = (void*) dptr; + dptr[i] = 1.0; } - break; - } + *ptr = (void*) dptr; + } + break; } } diff --git a/bench/src/bench.c b/bench/src/bench.c index 4a2c66674..c9bb0a1d1 100644 --- a/bench/src/bench.c +++ b/bench/src/bench.c @@ -110,62 +110,9 @@ runTest(void* arg) offset); BARRIER; - switch ( myData->test->type ) - { - case 
SINGLE: - { - float* sptr; - for (i=0; i < myData->test->streams; i++) - { - sptr = (float*) myData->streams[i]; - sptr += offset; - if (myData->init_per_thread) - { - for (j = 0; j < vecsize; j++) - { - sptr[j] = 1.0; - } - } - myData->streams[i] = (float*) sptr; - } - } - break; - case INT: - { - int* sptr; - for (i=0; i < myData->test->streams; i++) - { - sptr = (int*) myData->streams[i]; - sptr += offset; - if (myData->init_per_thread) - { - for (j = 0; j < vecsize; j++) - { - sptr[j] = 1; - } - } - myData->streams[i] = (int*) sptr; - } - } - break; - case DOUBLE: - { - double* dptr; - for (i=0; i < myData->test->streams; i++) - { - dptr = (double*) myData->streams[i]; - dptr += offset; - if (myData->init_per_thread) - { - for (j = 0; j < vecsize; j++) - { - dptr[j] = 1.0; - } - } - myData->streams[i] = (double*) dptr; - } - } - break; + for (i=0; i < myData->test->streams; i++) { + allocator_initVector(&myData->streams[i], vecsize, offset, + myData->test->type, myData->test->stride, myData->test->init_method, myData->test->init_arg, myData->init_per_thread); } BARRIER; @@ -509,62 +456,9 @@ getIterSingle(void* arg) printf("Automatic iteration count detection:"); #endif - switch ( myData->test->type ) - { - case SINGLE: - { - float* sptr; - for (i=0; i < myData->test->streams; i++) - { - sptr = (float*) myData->streams[i]; - sptr += offset; - if (myData->init_per_thread) - { - for (j = 0; j < vecsize; j++) - { - sptr[j] = 1.0; - } - } - myData->streams[i] = (float*) sptr; - } - } - break; - case INT: - { - int* sptr; - for (i=0; i < myData->test->streams; i++) - { - sptr = (int*) myData->streams[i]; - sptr += offset; - if (myData->init_per_thread) - { - for (j = 0; j < vecsize; j++) - { - sptr[j] = 1; - } - } - myData->streams[i] = (int*) sptr; - } - } - break; - case DOUBLE: - { - double* dptr; - for (i=0; i < myData->test->streams; i++) - { - dptr = (double*) myData->streams[i]; - dptr += offset; - if (myData->init_per_thread) - { - for (j = 0; j < vecsize; 
j++) - { - dptr[j] = 1.0; - } - } - myData->streams[i] = (double*) dptr; - } - } - break; + for (i=0; i < myData->test->streams; i++) { + allocator_initVector(&myData->streams[i], vecsize, offset, + myData->test->type, myData->test->stride, myData->test->init_method, myData->test->init_arg, myData->init_per_thread); } switch ( myData->test->streams ) { diff --git a/bench/src/ptt2asm.c b/bench/src/ptt2asm.c index bacd3c2b1..bcecc2f9f 100644 --- a/bench/src/ptt2asm.c +++ b/bench/src/ptt2asm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -173,6 +174,10 @@ static struct bstrList* analyse_ptt(bstring pttfile, TestCase** testcase) bstring bUOPS = bformat("UOPS"); bstring bBRANCHES = bformat("BRANCHES"); bstring bLOOP = bformat("LOOP"); + bstring bINITMETHOD = bformat("INIT"); + bstring bINITMETHOD_ONES = bformat("CONST"); + bstring bINITMETHOD_INDEX = bformat("INDEX_STRIDE"); + bstring bINITMETHOD_LINKED_LIST = bformat("LINKED_LIST"); int (*ownatoi)(const char*) = &atoi; ptt = read_ptt(pttfile); @@ -189,6 +194,7 @@ static struct bstrList* analyse_ptt(bstring pttfile, TestCase** testcase) test->instr_const = -1; test->instr_loop = -1; test->uops = -1; + test->init_method = CONSTANT_ONE; /* compat: set default to "all 1" initialization */ code = bstrListCreate(); for (int i = 0; i < ptt->qty; i++) { @@ -249,6 +255,56 @@ static struct bstrList* analyse_ptt(bstring pttfile, TestCase** testcase) } } } + else if (bstrncmp(ptt->entry[i], bINITMETHOD, blength(bINITMETHOD)) == BSTR_OK) + { + bstring binit_method = bmidstr(ptt->entry[i], blength(bINITMETHOD)+1, blength(ptt->entry[i])-blength(bINITMETHOD)); + btrimws(binit_method); + + int skip = 0; + if (bstrncmp(binit_method, bINITMETHOD_ONES, blength(bINITMETHOD_ONES)) == BSTR_OK) + { + test->init_method = CONSTANT_ONE; + } + else if (bstrncmp(binit_method, bINITMETHOD_INDEX, blength(bINITMETHOD_INDEX)) == BSTR_OK) + { + test->init_method = INDEX_STRIDE; + skip = 
blength(bINITMETHOD_INDEX); + } + else if (bstrncmp(binit_method, bINITMETHOD_LINKED_LIST, blength(bINITMETHOD_LINKED_LIST)) == BSTR_OK) + { + test->init_method = LINKED_LIST; + skip = blength(bINITMETHOD_LINKED_LIST); + } + else + { + fprintf(stderr, "Unknown initialization type (\"%s\"). Falling back to CONSTANT_ONE\n", binit_method->data); + test->init_method = CONSTANT_ONE; + } + + test->init_arg = 0; + if (test->init_method == INDEX_STRIDE || test->init_method == LINKED_LIST) + { + ANALYSE_PTT_GET_INT(binit_method, + test->init_method == INDEX_STRIDE ? bINITMETHOD_INDEX : bINITMETHOD_LINKED_LIST, + test->init_arg); + + } + + if (test->init_method == LINKED_LIST) + { + if ( test->init_arg % sizeof(int) != 0 ) + { + test->init_arg = (test->init_arg / sizeof(int)) * sizeof(int); + fprintf(stderr, "Warning: Size of linked list item not a multiple of int size (%zu), truncated to %" PRIu64 ".\n", + sizeof(int), test->init_arg); + } + if ( test->init_arg == 0 ) { + test->init_arg = sizeof(int); + } + } + + bdestroy(binit_method); + } else if (bstrncmp(ptt->entry[i], bTYPE, blength(bTYPE)) == BSTR_OK) { bstring btype = bmidstr(ptt->entry[i], blength(bTYPE)+1, blength(ptt->entry[i])-blength(bTYPE)); @@ -303,6 +359,10 @@ static struct bstrList* analyse_ptt(bstring pttfile, TestCase** testcase) bdestroy(bUOPS); bdestroy(bBRANCHES); bdestroy(bLOOP); + bdestroy(bINITMETHOD); + bdestroy(bINITMETHOD_ONES); + bdestroy(bINITMETHOD_INDEX); + bdestroy(bINITMETHOD_LINKED_LIST); return code; } From 8acbc1445456bab2887ad191268f04eb8da8d6e3 Mon Sep 17 00:00:00 2001 From: Steffen Christgau Date: Thu, 1 Sep 2022 11:03:45 +0200 Subject: [PATCH 2/3] Add pointer chase benchmarks --- bench/armv8/pchase_linkedlist.ptt | 15 +++++++++++++++ bench/armv8/pchase_strided.ptt | 15 +++++++++++++++ bench/x86-64/pchase_linkedlist.ptt | 15 +++++++++++++++ bench/x86-64/pchase_strided.ptt | 15 +++++++++++++++ 4 files changed, 60 insertions(+) create mode 100644 bench/armv8/pchase_linkedlist.ptt create 
mode 100644 bench/armv8/pchase_strided.ptt create mode 100644 bench/x86-64/pchase_linkedlist.ptt create mode 100644 bench/x86-64/pchase_strided.ptt diff --git a/bench/armv8/pchase_linkedlist.ptt b/bench/armv8/pchase_linkedlist.ptt new file mode 100644 index 000000000..da3bd1398 --- /dev/null +++ b/bench/armv8/pchase_linkedlist.ptt @@ -0,0 +1,15 @@ +STREAMS 1 +TYPE INT +INIT LINKED_LIST 32 +FLOPS 0 +BYTES 4 +DESC Linked list pointer chase with 32 byte-sized list items +LOADS 4 +STORES 0 +INSTR_LOOP 1 +mov x2, xzr +LOOP 4 +ldr w3, [STR0, x2] +ldr w4, [STR0, x3] +ldr w5, [STR0, x4] +ldr w2, [STR0, x5] diff --git a/bench/armv8/pchase_strided.ptt b/bench/armv8/pchase_strided.ptt new file mode 100644 index 000000000..358913d79 --- /dev/null +++ b/bench/armv8/pchase_strided.ptt @@ -0,0 +1,15 @@ +STREAMS 1 +TYPE INT +INIT INDEX_STRIDE +FLOPS 0 +BYTES 4 +DESC Stride-based 32-bit integer pointer chase +LOADS 4 +STORES 0 +INSTR_LOOP 1 +mov x2, xzr +LOOP 4 +ldr w3, [STR0, x2, lsl 2] +ldr w4, [STR0, x3, lsl 2] +ldr w5, [STR0, x4, lsl 2] +ldr w2, [STR0, x5, lsl 2] diff --git a/bench/x86-64/pchase_linkedlist.ptt b/bench/x86-64/pchase_linkedlist.ptt new file mode 100644 index 000000000..794a53680 --- /dev/null +++ b/bench/x86-64/pchase_linkedlist.ptt @@ -0,0 +1,15 @@ +STREAMS 1 +TYPE INT +INIT LINKED_LIST 64 +FLOPS 0 +BYTES 4 +DESC Linked list pointer chase with 64 byte-sized list items +LOADS 4 +STORES 0 +INSTR_LOOP 1 +XOR R12, R12 +LOOP 4 +MOV R13D, [STR0 + R12] +MOV R14D, [STR0 + R13] +MOV R15D, [STR0 + R14] +MOV R12D, [STR0 + R15] diff --git a/bench/x86-64/pchase_strided.ptt b/bench/x86-64/pchase_strided.ptt new file mode 100644 index 000000000..c3a40d4ef --- /dev/null +++ b/bench/x86-64/pchase_strided.ptt @@ -0,0 +1,15 @@ +STREAMS 1 +TYPE INT +INIT INDEX_STRIDE +FLOPS 0 +BYTES 4 +DESC Stride-based 32-bit integer pointer chase +LOADS 4 +STORES 0 +INSTR_LOOP 1 +XOR R12, R12 +LOOP 4 +MOV R13D, [STR0 + R12 * 4] +MOV R14D, [STR0 + R13 * 4] +MOV R15D, [STR0 + R14 * 4] +MOV R12D, 
[STR0 + R15 * 4] From 5d08c5ab142d4394372f6de8f6d1251e2df48b7d Mon Sep 17 00:00:00 2001 From: Steffen Christgau Date: Wed, 30 Nov 2022 10:35:11 +0100 Subject: [PATCH 3/3] Fix LOADS statement in pointer chase benchmarks --- bench/armv8/pchase_linkedlist.ptt | 2 +- bench/armv8/pchase_strided.ptt | 2 +- bench/x86-64/pchase_linkedlist.ptt | 2 +- bench/x86-64/pchase_strided.ptt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bench/armv8/pchase_linkedlist.ptt b/bench/armv8/pchase_linkedlist.ptt index da3bd1398..5818ed445 100644 --- a/bench/armv8/pchase_linkedlist.ptt +++ b/bench/armv8/pchase_linkedlist.ptt @@ -4,7 +4,7 @@ INIT LINKED_LIST 32 FLOPS 0 BYTES 4 DESC Linked list pointer chase with 32 byte-sized list items -LOADS 4 +LOADS 1 STORES 0 INSTR_LOOP 1 mov x2, xzr diff --git a/bench/armv8/pchase_strided.ptt b/bench/armv8/pchase_strided.ptt index 358913d79..29c72a8df 100644 --- a/bench/armv8/pchase_strided.ptt +++ b/bench/armv8/pchase_strided.ptt @@ -4,7 +4,7 @@ INIT INDEX_STRIDE FLOPS 0 BYTES 4 DESC Stride-based 32-bit integer pointer chase -LOADS 4 +LOADS 1 STORES 0 INSTR_LOOP 1 mov x2, xzr diff --git a/bench/x86-64/pchase_linkedlist.ptt b/bench/x86-64/pchase_linkedlist.ptt index 794a53680..c7d14f943 100644 --- a/bench/x86-64/pchase_linkedlist.ptt +++ b/bench/x86-64/pchase_linkedlist.ptt @@ -4,7 +4,7 @@ INIT LINKED_LIST 64 FLOPS 0 BYTES 4 DESC Linked list pointer chase with 64 byte-sized list items -LOADS 4 +LOADS 1 STORES 0 INSTR_LOOP 1 XOR R12, R12 diff --git a/bench/x86-64/pchase_strided.ptt b/bench/x86-64/pchase_strided.ptt index c3a40d4ef..64b02f1af 100644 --- a/bench/x86-64/pchase_strided.ptt +++ b/bench/x86-64/pchase_strided.ptt @@ -4,7 +4,7 @@ INIT INDEX_STRIDE FLOPS 0 BYTES 4 DESC Stride-based 32-bit integer pointer chase -LOADS 4 +LOADS 1 STORES 0 INSTR_LOOP 1 XOR R12, R12