From 0f967f171a0ba5ca4c8408ad8cdbee93023a25b9 Mon Sep 17 00:00:00 2001 From: thomas Date: Thu, 19 Sep 2024 16:12:40 +0200 Subject: [PATCH] STREAM + numactl --- STREAM/.gitignore | 3 + STREAM/HISTORY.txt | 152 ++++++++++++ STREAM/LICENSE.txt | 34 +++ STREAM/Makefile | 22 ++ STREAM/README | 110 +++++++++ STREAM/mysecond.c | 27 +++ STREAM/stream.c | 591 +++++++++++++++++++++++++++++++++++++++++++++ STREAM/stream.f | 462 +++++++++++++++++++++++++++++++++++ STREAM/stream.sh | 12 + benchmarking.md | 17 ++ 10 files changed, 1430 insertions(+) create mode 100644 STREAM/.gitignore create mode 100644 STREAM/HISTORY.txt create mode 100644 STREAM/LICENSE.txt create mode 100644 STREAM/Makefile create mode 100644 STREAM/README create mode 100644 STREAM/mysecond.c create mode 100644 STREAM/stream.c create mode 100644 STREAM/stream.f create mode 100755 STREAM/stream.sh diff --git a/STREAM/.gitignore b/STREAM/.gitignore new file mode 100644 index 0000000..8073ea9 --- /dev/null +++ b/STREAM/.gitignore @@ -0,0 +1,3 @@ +*.o +stream_c +stream_f diff --git a/STREAM/HISTORY.txt b/STREAM/HISTORY.txt new file mode 100644 index 0000000..496fca6 --- /dev/null +++ b/STREAM/HISTORY.txt @@ -0,0 +1,152 @@ +------------------------------------------------------------------------- + +Revisions as of Thu, Jan 17, 2013 3:50:01 PM + +Version 5.10 of stream.c has been released. +This version includes improved validation code and will automatically +use 64-bit array indices on 64-bit systems to allow very large arrays. + +------------------------------------------------------------------------- + +Revisions as of Thu Feb 19 08:16:57 CST 2009 + +Note that the codes in the "Versions" subdirectory should be +considered obsolete -- the versions of stream.c and stream.f +in this main directory include the OpenMP directives and structure +for creating "TUNED" versions. 
+ +Only the MPI version in the "Versions" subdirectory should be +of any interest, and I have not recently checked that version for +errors or compliance with the current versions of stream.c and +stream.f. + +I added a simple Makefile to this directory. It works under Cygwin +on my Windows XP box (using gcc and g77). + +A user suggested a sneaky trick for "mysecond.c" -- instead of using +the #ifdef UNDERSCORE to generate the function name that the Fortran +compiler expects, the new version simply defines both "mysecond()" +and "mysecond_()", so it should automagically link with most Fortran +compilers. + +------------------------------------------------------------------------- + +Revisions as of Wed Nov 17 09:15:37 CST 2004 + +The most recent "official" versions have been renamed "stream.f" and +"stream.c" -- all other versions have been moved to the "Versions" +subdirectory. + +The "official" timer (was "second_wall.c") has been renamed "mysecond.c". +This is embedded in the C version ("stream.c"), but still needs to be +externally linked to the FORTRAN version ("stream.f"). + +------------------------------------------------------------------------- + +Revisions as of Tue May 27 11:51:23 CDT 2003 + +Copyright and License info added to stream_d.f, stream_mpi.f, and +stream_tuned.f + + +------------------------------------------------------------------------- + +Revisions as of Tue Apr 8 10:26:48 CDT 2003 + +I changed the name of the timer interface from "second" to "mysecond" +and removed the dummy argument in all versions of the source code (but +not the "Contrib" versions). + + +------------------------------------------------------------------------- + +Revisions as of Mon Feb 25 06:48:14 CST 2002 + +Added an OpenMP version of stream_d.c, called stream_d_omp.c. This is +still not up to date with the Fortran version, which includes error +checking and advanced data flow to prevent overoptimization, but it is +a good start.... 
+ + +------------------------------------------------------------------------- + +Revisions as of Tue Jun 4 16:31:31 EDT 1996 + +I have fixed an "off-by-one" error in the RMS time calculation in +stream_d.f. This was already corrected in stream_d.c. No results are +invalidated, since I use minimum time instead of RMS time anyway.... + +------------------------------------------------------------------------- + +Revisions as of Fri Dec 8 14:49:56 EST 1995 + +I have renamed the timer routines to: + second_cpu.c + second_wall.c + second_cpu.f + +All have a function interface named 'second' which returns a double +precision floating point number. It should be possible to link +second_wall.c with stream_d.f without too much trouble, though the +details will depend on your environment. + +If anyone builds versions of these timers for machines running the +Macintosh O/S or DOS/Windows, I would appreciate getting a copy. + +To clarify: + * For single-user machines, the wallclock timer is preferred. + * For parallel machines, the wallclock timer is required. + * For time-shared systems, the cpu timer is more reliable, + though less accurate. + + +------------------------------------------------------------------------- + +Revisions as of Wed Oct 25 09:40:32 EDT 1995 + +(1) NOTICE to C users: + + stream_d.c has been updated to version 4.0 (beta), and + should be functionally identical to stream_d.f + + Two timers are provided --- second_cpu.c and second_wall.c + second_cpu.c measures cpu time, while second_wall.c measures + elapsed (real) time. + + For single-user machines, the wallclock timer is preferred. + For parallel machines, the wallclock timer is required. + For time-shared systems, the cpu timer is more reliable, + though less accurate. 
+ +(2) cstream.c has been removed -- use stream_d.c + +(3) stream_wall.f has been removed --- to do parallel aggregate + bandwidth runs, comment out the definition of FUNCTION SECOND + in stream_d.f and compile/link with second_wall.c + +(4) stream_offset has been deprecated. It is still here + and usable, but stream_d.f is the "standard" version. + There are easy hooks in stream_d.f to change the + array offsets if you want to. + +(5) The rules of the game are clarified as follows: + + The reference case uses array sizes of 2,000,000 elements + and no additional offsets. I would like to see results + for this case. + + But, you are free to use any array size and any offset + you want, provided that the arrays are each bigger than + the last-level of cache. The output will show me what + parameters you chose. + + I expect that I will report just the best number, but + if there is a serious discrepancy between the reference + case and the "best" case, I reserve the right to report + both. + + Of course, I also reserve the right to reject any results + that I do not trust.... +-- +John D. McCalpin, Ph.D. +john@mccalpin.com diff --git a/STREAM/LICENSE.txt b/STREAM/LICENSE.txt new file mode 100644 index 0000000..cf1c8e0 --- /dev/null +++ b/STREAM/LICENSE.txt @@ -0,0 +1,34 @@ +*======================================================================= +*----------------------------------------------------------------------- +* Copyright 1991-2003: John D. McCalpin +*----------------------------------------------------------------------- +* License: +* 1. You are free to use this program and/or to redistribute +* this program. +* 2. You are free to modify this program for your own use, +* including commercial use, subject to the publication +* restrictions in item 3. +* 3. You are free to publish results obtained from running this +* program, or from works that you derive from this program, +* with the following limitations: +* 3a. 
In order to be referred to as "STREAM benchmark results", +* published results must be in conformance to the STREAM +* Run Rules, (briefly reviewed below) published at +* http://www.cs.virginia.edu/stream/ref.html +* and incorporated herein by reference. +* As the copyright holder, John McCalpin retains the +* right to determine conformity with the Run Rules. +* 3b. Results based on modified source code or on runs not in +* accordance with the STREAM Run Rules must be clearly +* labelled whenever they are published. Examples of +* proper labelling include: +* "tuned STREAM benchmark results" +* "based on a variant of the STREAM benchmark code" +* Other comparable, clear and reasonable labelling is +* acceptable. +* 3c. Submission of results to the STREAM benchmark web site +* is encouraged, but not required. +* 4. Use of this program or creation of derived works based on this +* program constitutes acceptance of these licensing restrictions. +* 5. Absolutely no warranty is expressed or implied. 
+*----------------------------------------------------------------------- diff --git a/STREAM/Makefile b/STREAM/Makefile new file mode 100644 index 0000000..2edd2d4 --- /dev/null +++ b/STREAM/Makefile @@ -0,0 +1,22 @@ +CC = gcc +CFLAGS = -Ofast -march=native -mtune=native -fopenmp -D STREAM_ARRAY_SIZE=400000000 + +FC = gfortran +FFLAGS = -Ofast -march=native -mtune=native -fopenmp -D STREAM_ARRAY_SIZE=400000000 + +all: stream_f stream_c + +stream_f: stream.f mysecond.o + $(CC) $(CFLAGS) -c mysecond.c + $(FC) $(FFLAGS) -c stream.f + $(FC) $(FFLAGS) stream.o mysecond.o -o stream_f + +stream_c: stream.c + $(CC) $(CFLAGS) stream.c -o stream_c + +clean: + rm -f stream_f stream_c *.o + +# an example of a more complex build line for the Intel icc compiler +stream.icc: stream.c + icc -O3 -xCORE-AVX2 -ffreestanding -qopenmp -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 stream.c -o stream.omp.AVX2.80M.20x.icc diff --git a/STREAM/README b/STREAM/README new file mode 100644 index 0000000..175a3f0 --- /dev/null +++ b/STREAM/README @@ -0,0 +1,110 @@ +=============================================== + +STREAM is the de facto industry standard benchmark +for measuring sustained memory bandwidth. + +Documentation for STREAM is on the web at: + http://www.cs.virginia.edu/stream/ref.html + +=============================================== +NEWS +=============================================== +UPDATE: October 28 2014: + +"stream_mpi.c" released in the Versions directory. + +Based on Version 5.10 of stream.c, stream_mpi.c +brings the following new features: +* MPI implementation that *distributes* the arrays + across all MPI ranks. (The older Fortran version + of STREAM in MPI *replicates* the arrays across + all MPI ranks.) +* Data is allocated using "posix_memalign" + rather than using static arrays. Different + compiler flags may be needed for both portability + and optimization. + See the READ.ME file in the Versions directory + for more details. 
+* Error checking and timing done by all ranks and + gathered by rank 0 for processing and output. +* Timing code uses barriers to ensure correct + operation even when multiple MPI ranks run on + shared memory systems. + +NOTE: MPI is not a preferred implementation for + STREAM, which is intended to measure memory + bandwidth in shared-memory systems. In stream_mpi, + the MPI calls are only used to properly synchronize + the timers (using MPI_Barrier) and to gather + timing and error data, so the performance should + scale linearly with the size of the cluster. + But it may be useful, and was an interesting + exercise to develop and debug. + +=============================================== +UPDATE: January 17 2013: + +Version 5.10 of stream.c is finally available! + +There are no changes to what is being measured, but +a number of long-awaited improvements have been made: + +* Updated validation code does not suffer from + accumulated roundoff error for large arrays. +* Defining the preprocessor variable "VERBOSE" + when compiling will (1) cause the code to print the + measured average relative absolute error (rather than + simply printing "Solution Validates"), and (2) print + the first 10 array entries with relative error exceeding + the error tolerance. +* Array index variables have been upgraded from + "int" to "ssize_t" to allow arrays with more + than 2 billion elements on 64-bit systems. +* Substantial improvements to the comments in + the source on how to configure/compile/run the + benchmark. +* The preprocessor variable controlling the array + size has been changed from "N" to "STREAM_ARRAY_SIZE". +* A new preprocessor variable "STREAM_TYPE" can be + used to override the data type from the default + "double" to "float". + This mechanism could also be used to change to + non-floating-point types, but several "printf" + statements would need to have their formats changed + to accommodate the modified data type. 
+* Some small changes in output, including printing + array sizes in GiB as well as MiB. +* Change to the default output format to print fewer + decimals for the bandwidth and more decimals for + the min/max/avg execution times. + + +=============================================== +UPDATE: February 19 2009: + +The most recent "official" versions have been renamed +"stream.f" and "stream.c" -- all other versions have +been moved to the "Versions" subdirectory and should be +considered obsolete. + +The "official" timer (was "second_wall.c") has been +renamed "mysecond.c". This is embedded in the C version +("stream.c"), but still needs to be externally linked to +the FORTRAN version ("stream.f"). The new version defines +entry points both with and without trailing underscores, +so it *should* link automagically with any Fortran compiler. + +=============================================== + +STREAM is a project of "Dr. Bandwidth": + John D. McCalpin, Ph.D. + john@mccalpin.com + +=============================================== + +The STREAM web and ftp sites are currently hosted at +the Department of Computer Science at the University of +Virginia under the generous sponsorship of Professor Bill +Wulf and Professor Alan Batson. + +=============================================== diff --git a/STREAM/mysecond.c b/STREAM/mysecond.c new file mode 100644 index 0000000..d206a4a --- /dev/null +++ b/STREAM/mysecond.c @@ -0,0 +1,27 @@ +/* A gettimeofday routine to give access to the wall + clock timer on most UNIX-like systems. 
+ + This version defines two entry points -- with + and without appended underscores, so it *should* + automagically link with FORTRAN */ + +#include + +double mysecond() +{ +/* struct timeval { long tv_sec; + long tv_usec; }; + +struct timezone { int tz_minuteswest; + int tz_dsttime; }; */ + + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +double mysecond_() {return mysecond();} + diff --git a/STREAM/stream.c b/STREAM/stream.c new file mode 100644 index 0000000..330da9b --- /dev/null +++ b/STREAM/stream.c @@ -0,0 +1,591 @@ +/*-----------------------------------------------------------------------*/ +/* Program: STREAM */ +/* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */ +/* Original code developed by John D. McCalpin */ +/* Programmers: John D. McCalpin */ +/* Joe R. Zagar */ +/* */ +/* This program measures memory transfer rates in MB/s for simple */ +/* computational kernels coded in C. */ +/*-----------------------------------------------------------------------*/ +/* Copyright 1991-2013: John D. McCalpin */ +/*-----------------------------------------------------------------------*/ +/* License: */ +/* 1. You are free to use this program and/or to redistribute */ +/* this program. */ +/* 2. You are free to modify this program for your own use, */ +/* including commercial use, subject to the publication */ +/* restrictions in item 3. */ +/* 3. You are free to publish results obtained from running this */ +/* program, or from works that you derive from this program, */ +/* with the following limitations: */ +/* 3a. In order to be referred to as "STREAM benchmark results", */ +/* published results must be in conformance to the STREAM */ +/* Run Rules, (briefly reviewed below) published at */ +/* http://www.cs.virginia.edu/stream/ref.html */ +/* and incorporated herein by reference. 
*/ +/* As the copyright holder, John McCalpin retains the */ +/* right to determine conformity with the Run Rules. */ +/* 3b. Results based on modified source code or on runs not in */ +/* accordance with the STREAM Run Rules must be clearly */ +/* labelled whenever they are published. Examples of */ +/* proper labelling include: */ +/* "tuned STREAM benchmark results" */ +/* "based on a variant of the STREAM benchmark code" */ +/* Other comparable, clear, and reasonable labelling is */ +/* acceptable. */ +/* 3c. Submission of results to the STREAM benchmark web site */ +/* is encouraged, but not required. */ +/* 4. Use of this program or creation of derived works based on this */ +/* program constitutes acceptance of these licensing restrictions. */ +/* 5. Absolutely no warranty is expressed or implied. */ +/*-----------------------------------------------------------------------*/ +# include +# include +# include +# include +# include +# include +# include + +/*----------------------------------------------------------------------- + * INSTRUCTIONS: + * + * 1) STREAM requires different amounts of memory to run on different + * systems, depending on both the system cache size(s) and the + * granularity of the system timer. + * You should adjust the value of 'STREAM_ARRAY_SIZE' (below) + * to meet *both* of the following criteria: + * (a) Each array must be at least 4 times the size of the + * available cache memory. I don't worry about the difference + * between 10^6 and 2^20, so in practice the minimum array size + * is about 3.8 times the cache size. + * Example 1: One Xeon E3 with 8 MB L3 cache + * STREAM_ARRAY_SIZE should be >= 4 million, giving + * an array size of 30.5 MB and a total memory requirement + * of 91.5 MB. + * Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP) + * STREAM_ARRAY_SIZE should be >= 20 million, giving + * an array size of 153 MB and a total memory requirement + * of 458 MB. 
+ * (b) The size should be large enough so that the 'timing calibration' + * output by the program is at least 20 clock-ticks. + * Example: most versions of Windows have a 10 millisecond timer + * granularity. 20 "ticks" at 10 ms/tick is 200 milliseconds. + * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. + * This means that each array must be at least 1 GB, or 128M elements. + * + * Version 5.10 increases the default array size from 2 million + * elements to 10 million elements in response to the increasing + * size of L3 caches. The new default size is large enough for caches + * up to 20 MB. + * Version 5.10 changes the loop index variables from "register int" + * to "ssize_t", which allows array indices >2^32 (4 billion) + * on properly configured 64-bit systems. Additional compiler options + * (such as "-mcmodel=medium") may be required for large memory runs. + * + * Array size can be set at compile time without modifying the source + * code for the (many) compilers that support preprocessor definitions + * on the compile line. E.g., + * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M + * will override the default size of 10M with a new size of 100M elements + * per array. + */ +#ifndef STREAM_ARRAY_SIZE +# define STREAM_ARRAY_SIZE 10000000 +#endif + +/* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result + * for any iteration after the first, therefore the minimum value + * for NTIMES is 2. + * There are no rules on maximum allowable values for NTIMES, but + * values larger than the default are unlikely to noticeably + * increase the reported performance. + * NTIMES can also be set on the compile line without changing the source + * code using, for example, "-DNTIMES=7". 
+ */ +#ifdef NTIMES +#if NTIMES<=1 +# define NTIMES 10 +#endif +#endif +#ifndef NTIMES +# define NTIMES 10 +#endif + +/* Users are allowed to modify the "OFFSET" variable, which *may* change the + * relative alignment of the arrays (though compilers may change the + * effective offset by making the arrays non-contiguous on some systems). + * Use of non-zero values for OFFSET can be especially helpful if the + * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. + * OFFSET can also be set on the compile line without changing the source + * code using, for example, "-DOFFSET=56". + */ +#ifndef OFFSET +# define OFFSET 0 +#endif + +/* + * 3) Compile the code with optimization. Many compilers generate + * unreasonably bad code before the optimizer tightens things up. + * If the results are unreasonably good, on the other hand, the + * optimizer might be too smart for me! + * + * For a simple single-core version, try compiling with: + * cc -O stream.c -o stream + * This is known to work on many, many systems.... + * + * To use multiple cores, you need to tell the compiler to obey the OpenMP + * directives in the code. This varies by compiler, but a common example is + * gcc -O -fopenmp stream.c -o stream_omp + * The environment variable OMP_NUM_THREADS allows runtime control of the + * number of threads/cores used when the resulting "stream_omp" program + * is executed. + * + * To run with single-precision variables and arithmetic, simply add + * -DSTREAM_TYPE=float + * to the compile line. + * Note that this changes the minimum array sizes required --- see (1) above. + * + * The preprocessor directive "TUNED" does not do much -- it simply causes the + * code to call separate functions to execute each kernel. Trivial versions + * of these functions are provided, but they are *not* tuned -- they just + * provide predefined interfaces to be replaced with tuned code. 
+ * + * + * 4) Optional: Mail the results to mccalpin@cs.virginia.edu + * Be sure to include info that will help me understand: + * a) the computer hardware configuration (e.g., processor model, memory type) + * b) the compiler name/version and compilation flags + * c) any run-time information (such as OMP_NUM_THREADS) + * d) all of the output from the test case. + * + * Thanks! + * + *-----------------------------------------------------------------------*/ + +# define HLINE "-------------------------------------------------------------\n" + +# ifndef MIN +# define MIN(x,y) ((x)<(y)?(x):(y)) +# endif +# ifndef MAX +# define MAX(x,y) ((x)>(y)?(x):(y)) +# endif + +#ifndef STREAM_TYPE +#define STREAM_TYPE double +#endif + +static STREAM_TYPE *a, *b, *c; + +static double avgtime[4] = {0}, maxtime[4] = {0}, + mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; + +static char *label[4] = {"Copy: ", "Scale: ", + "Add: ", "Triad: "}; + +static double bytes[4] = { + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 4 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, + 4 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE + }; + +extern double mysecond(); +extern void checkSTREAMresults(); +#ifdef TUNED +extern void tuned_STREAM_Copy(); +extern void tuned_STREAM_Scale(STREAM_TYPE scalar); +extern void tuned_STREAM_Add(); +extern void tuned_STREAM_Triad(STREAM_TYPE scalar); +#endif +#ifdef _OPENMP +extern int omp_get_num_threads(); +#endif +int +main() + { + int quantum, checktick(); + int BytesPerWord; + int k; + ssize_t j; + STREAM_TYPE scalar; + double t, times[4][NTIMES]; + + /* --- SETUP --- determine precision and check timing --- */ + a = malloc(sizeof(STREAM_TYPE) * (STREAM_ARRAY_SIZE + OFFSET)); + b = malloc(sizeof(STREAM_TYPE) * (STREAM_ARRAY_SIZE + OFFSET)); + c = malloc(sizeof(STREAM_TYPE) * (STREAM_ARRAY_SIZE + OFFSET)); + + printf(HLINE); + printf("STREAM version $Revision: 5.10 $\n"); + printf(HLINE); + BytesPerWord = 
sizeof(STREAM_TYPE); + printf("This system uses %d bytes per array element.\n", + BytesPerWord); + + printf(HLINE); +#ifdef N + printf("***** WARNING: ******\n"); + printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); + printf(" This version of the code uses the preprocessor variable STREAM_ARRAY_SIZE to control the array size\n"); + printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); + printf("***** WARNING: ******\n"); +#endif + + printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); + printf("Memory per array = %.1f MiB (= %.1f GiB).\n", + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), + BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); + printf("Total memory required = %.1f MiB (= %.1f GiB).\n", + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), + (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); + printf("Each kernel will be executed %d times.\n", NTIMES); + printf(" The *best* time for each kernel (excluding the first iteration)\n"); + printf(" will be used to compute the reported bandwidth.\n"); + +#ifdef _OPENMP + printf(HLINE); +#pragma omp parallel + { +#pragma omp master + { + k = omp_get_num_threads(); + printf ("Number of Threads requested = %i\n",k); + } + } +#endif + +#ifdef _OPENMP + k = 0; +#pragma omp parallel +#pragma omp atomic + k++; + printf ("Number of Threads counted = %i\n",k); +#endif + + /* Get initial value for system clock. 
*/ +#pragma omp parallel for + for (j=0; j= 1) + printf("Your clock granularity/precision appears to be " + "%d microseconds.\n", quantum); + else { + printf("Your clock granularity appears to be " + "less than one microsecond.\n"); + quantum = 1; + } + + t = mysecond(); +#pragma omp parallel for + for (j = 0; j < STREAM_ARRAY_SIZE; j++) + a[j] = 2.0E0 * a[j]; + t = 1.0E6 * (mysecond() - t); + + printf("Each test below will take on the order" + " of %d microseconds.\n", (int) t ); + printf(" (= %d clock ticks)\n", (int) (t/quantum) ); + printf("Increase the size of the arrays if this shows that\n"); + printf("you are not getting at least 20 clock ticks per test.\n"); + + printf(HLINE); + + printf("WARNING -- The above is only a rough guideline.\n"); + printf("For best results, please be sure you know the\n"); + printf("precision of your system timer.\n"); + printf(HLINE); + + /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ + + scalar = 3.0; + for (k=0; k + +double mysecond() +{ + struct timeval tp; + struct timezone tzp; + int i; + + i = gettimeofday(&tp,&tzp); + return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 ); +} + +#ifndef abs +#define abs(a) ((a) >= 0 ? 
(a) : -(a)) +#endif +void checkSTREAMresults () +{ + STREAM_TYPE aj,bj,cj,scalar; + STREAM_TYPE aSumErr,bSumErr,cSumErr; + STREAM_TYPE aAvgErr,bAvgErr,cAvgErr; + double epsilon; + ssize_t j; + int k,ierr,err; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (k=0; k epsilon) { + err++; + printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array a: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,aj,a[j],abs((aj-a[j])/aAvgErr)); + } +#endif + } + } + printf(" For array a[], %d errors were found.\n",ierr); + } + if (abs(bAvgErr/bj) > epsilon) { + err++; + printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array b: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,bj,b[j],abs((bj-b[j])/bAvgErr)); + } +#endif + } + } + printf(" For array b[], %d errors were found.\n",ierr); + } + if (abs(cAvgErr/cj) > epsilon) { + err++; + printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); + printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); + printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + ierr = 0; + for (j=0; j epsilon) { + ierr++; +#ifdef VERBOSE + if (ierr < 10) { + printf(" array c: index: %ld, expected: %e, observed: %e, relative error: %e\n", + j,cj,c[j],abs((cj-c[j])/cAvgErr)); + } +#endif + } + } + printf(" For array c[], %d errors were 
found.\n",ierr); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); + } +#ifdef VERBOSE + printf ("Results Validation Verbose Results: \n"); + printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); + printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); + printf (" Rel Errors on a, b, c: %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj)); +#endif +} + +#ifdef TUNED +/* stubs for "tuned" versions of the kernels */ +void tuned_STREAM_Copy() +{ + ssize_t j; +#pragma omp parallel for + for (j=0; j