From 1fd513960b773bfcf26b77d259b80d0c23797a68 Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Fri, 17 Jan 2025 00:24:11 -0500 Subject: [PATCH 1/5] codegen: Add --depend option --- tools/codegen.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/codegen.py b/tools/codegen.py index d79e89f2..273ed372 100755 --- a/tools/codegen.py +++ b/tools/codegen.py @@ -63,7 +63,7 @@ Makefile.blas.gen: force_gen endif endif - + force_gen: ; ---------------------------------------------------------------------- @@ -97,6 +97,7 @@ epilog=help ) parser.add_argument( '-v', '--verbose', action='store_true', help='Print verbose output to stderr' ) parser.add_argument( '-o', '--output', action='store_true', help='Generate list of output files' ) +parser.add_argument( '-d', '--depend', action='store_true', help='Generate list of dependencies (output1 output2: input)' ) parser.add_argument( '-m', '--make', action='store_true', help='Generate Makefile rules' ) parser.add_argument( '--prefix', action='store', help='Prefix for variables in Makefile', default='src') parser.add_argument( '-p', '--precision', action='append', help='Generate only given precision (s d c z ds zc ...). Repeatable.' ) @@ -344,6 +345,15 @@ def main(): # end print( " ".join( generated ) ) + elif opts.depend: + depends = '' + for filename in opts.args: + src = SourceFile( filename ) + (files, precs) = src.get_filenames( opts.precision ) + if (files): + print( " ".join( files ) + ": " + filename ) + # end + else: # default is to generate files for filename in opts.args: From a05afcc5250e9727ea51521c222f87abc959e7c7 Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Fri, 17 Jan 2025 16:12:29 -0500 Subject: [PATCH 2/5] codegen: fix subs for newer python --- tools/subs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/subs.py b/tools/subs.py index e8b7f99b..aad67cd2 100644 --- a/tools/subs.py +++ b/tools/subs.py @@ -437,8 +437,8 @@ def title( table ): #('plasma_s', 'plasma_c' ), # ----- Fortran examples - ('real\(', 'complex\(', ), - ('\(transpose\(', 'conjg\(transpose\(' ), + (r'real\(', r'complex\(', ), + (r'\(transpose\(', r'conjg\(transpose\(' ), ], # end mixed @@ -463,7 +463,7 @@ def title( table ): ('symmetric', 'symmetric', 'hermitian', 'hermitian' ), ('symmetric', 'symmetric', 'Hermitian', 'Hermitian' ), ('orthogonal', 'orthogonal', 'unitary', 'unitary' ), - ('\^T', '\^T', '\^H', '\^H' ), + (r'\^T', r'\^T', r'\^H', r'\^H' ), ('%f', '%lf', '%f', '%lf' ), # for scanf # ----- CBLAS @@ -551,8 +551,8 @@ def title( table ): # ----- Fortran examples ('wp = sp', 'wp = dp', 'wp = sp', 'wp = dp' ), - ('real\(wp\)', 'real\(wp\)', 'complex\(wp\)', 'complex\(wp\)' ), - ('\(transpose\(', '\(transpose\(', 'conjg\(transpose\(', 'conjg\(transpose\(' ), + (r'real\(wp\)', r'real\(wp\)', r'complex\(wp\)', r'complex\(wp\)' ), + (r'\(transpose\(', r'\(transpose\(', r'conjg\(transpose\(', r'conjg\(transpose\(' ), ], # end normal } # end subs From 8cf2db3ea359aca7a6c30192357bd533eefc2338 Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Fri, 17 Jan 2025 15:53:28 -0500 Subject: [PATCH 3/5] generate sstevx2 from dstevx2 instead of dummy zstevx2; similarly with others --- .gitignore | 4 - compute/{zlaebz2.c => dlaebz2.c} | 131 ++++++++-------- compute/{zlaneg2.c => dlaneg2.c} | 57 ++++--- compute/{zstevx2.c => dstevx2.c} | 191 ++++++++++++------------ test/{test_zstevx2.c => test_dstevx2.c} | 106 +++++++------ 5 files changed, 235 insertions(+), 254 deletions(-) rename compute/{zlaebz2.c => dlaebz2.c} (84%) rename compute/{zlaneg2.c => dlaneg2.c} (86%) rename compute/{zstevx2.c => dstevx2.c} (81%) rename test/{test_zstevx2.c => test_dstevx2.c} (84%) diff --git a/.gitignore b/.gitignore index e270d32a..768b2547 100644 --- a/.gitignore +++ b/.gitignore @@ -96,9 +96,7 @@ compute/dgetri.c compute/dgetri_aux.c compute/dgetrs.c compute/dlacpy.c -compute/dlaebz2.c compute/dlag2s.c -compute/dlaneg2.c compute/dlangb.c compute/dlange.c compute/dlansy.c @@ -122,7 +120,6 @@ compute/dpotrs.c compute/dsgbsv.c compute/dsgesv.c compute/dsposv.c -compute/dstevx2.c compute/dsymm.c compute/dsyr2k.c compute/dsyrk.c @@ -606,7 +603,6 @@ test/test_ds.h test/test_dsgbsv.c test/test_dsgesv.c test/test_dsposv.c -test/test_dstevx2.c test/test_dsymm.c test/test_dsyr2k.c test/test_dsyrk.c diff --git a/compute/zlaebz2.c b/compute/dlaebz2.c similarity index 84% rename from compute/zlaebz2.c rename to compute/dlaebz2.c index 6b0b211a..f592422d 100644 --- a/compute/zlaebz2.c +++ b/compute/dlaebz2.c @@ -1,17 +1,17 @@ /** * - * @file + * @file * * PLASMA is a software package provided by: * University of Tennessee, US, * - * @precisions normal z -> s d + * @precisions normal d -> s * **/ #include "plasma.h" #include "plasma_internal.h" /* needed for imin, imax. */ -#include "plasma_zlaebz2_work.h" /* work areas. */ +#include "plasma_dlaebz2_work.h" /* work areas. */ #include #include @@ -22,29 +22,26 @@ * * @ingroup plasma_gemm * - * - * This file is a z-template to generate s and d code. - * Only s and d are compiled; not c or z. * This code is not designed to be called directly by users; it is a subroutine - * for zstevx2.c. + * for dstevx2.c. * * Specifically, this is a task-based parallel algorithm, the parameters are - * contained in the already initialized and populated zlaebz2_Control_t; For - * example, from zstevx2: + * contained in the already initialized and populated dlaebz2_Control_t; For + * example, from dstevx2: * * #pragma omp parallel * { * #pragma omp single * { - * plasma_zlaebz2(&Control, ...etc...); + * plasma_dlaebz2(&Control, ...etc...); * } * } * - * + * ******************************************************************************* - * + * * @param[in] *Control - * A pointer to the global variables needed. + * A pointer to the global variables needed. * * @param[in] Control->N * int number of rows in the matrix. @@ -66,20 +63,20 @@ * PlasmaVec if user desires eigenvectors computed. * * @param[in] Control->il - * int enum. The lowerBound of an index range if range is - * PlasmaRangeI. + * int enum. The lowerBound of an index range if range is + * PlasmaRangeI. * * @param[in] Control->iu - * int enum. The upperBound of an index range, if range is + * int enum. The upperBound of an index range, if range is * PlasmaRangeI. * * @param[in] Control->stein_arrays - * array of [max_threads], type zlaebz2_Stein_Array_t, contains work + * array of [max_threads], type dlaebz2_Stein_Array_t, contains work * areas per thread for invoking _stein (inverse iteration to find * eigenvectors). * * @param[in] Control->baseIdx - * The index of the least eigenvalue to be found in the bracket, + * The index of the least eigenvalue to be found in the bracket, * used to calculate the offset into the return vectors/arrays. * * @param[out] Control->error @@ -130,7 +127,7 @@ * * * This algorithm uses Bisection by the Scaled Sturm Sequence, implemented in - * plasma_zlaebz2, followed by the LAPACK routine _STEIN, which uses inverse + * plasma_dlaebz2, followed by the LAPACK routine _STEIN, which uses inverse * iteration to find the eigenvalue. The initial 'bracket' parameters should * contain the full range for the eigenvalues we are to discover. The algorithm * is recursively task based, at each division the bracket is divided into two @@ -151,7 +148,7 @@ *****************************************************************************/ /******************************************************************************* - * Use LAPACK zstein to find a single eigenvector. We may use this routine + * Use LAPACK dstein to find a single eigenvector. We may use this routine * multiple times, so instead of allocating/freeing the work spaces repeatedly, * we have an array of pointers, per thread, to workspaces we allocate if not * already allocated for this thread. So we don't allocate more than once per @@ -160,9 +157,9 @@ * to converge. *******************************************************************************/ -int plasma_zstein( plasma_complex64_t *diag, plasma_complex64_t *offd, - plasma_complex64_t u, plasma_complex64_t *v, int N, - zlaebz2_Stein_Array_t *myArrays) { +int plasma_dstein( double *diag, double *offd, + double u, double *v, int N, + dlaebz2_Stein_Array_t *myArrays) { int M=1, LDZ=N, INFO; int thread = omp_get_thread_num(); @@ -176,22 +173,22 @@ int plasma_zstein( plasma_complex64_t *diag, plasma_complex64_t *offd, if (myArrays[thread].ISPLIT != NULL) myArrays[thread].ISPLIT[0]=N; } - if (myArrays[thread].WORK == NULL) myArrays[thread].WORK = (plasma_complex64_t*) calloc(5*N, sizeof(plasma_complex64_t)); + if (myArrays[thread].WORK == NULL) myArrays[thread].WORK = (double*) calloc(5*N, sizeof(double)); if (myArrays[thread].IWORK == NULL) myArrays[thread].IWORK = (int*) calloc(N, sizeof(int)); if (myArrays[thread].IFAIL == NULL) myArrays[thread].IFAIL = (int*) calloc(N, sizeof(int)); - if (myArrays[thread].IBLOCK == NULL || - myArrays[thread].ISPLIT == NULL || - myArrays[thread].WORK == NULL || - myArrays[thread].IWORK == NULL || + if (myArrays[thread].IBLOCK == NULL || + myArrays[thread].ISPLIT == NULL || + myArrays[thread].WORK == NULL || + myArrays[thread].IWORK == NULL || myArrays[thread].IFAIL == NULL) { return(PlasmaErrorOutOfMemory); } - plasma_complex64_t W = u; + double W = u; - /* We use the 'work' version so we can re-use our work arrays; using LAPACKE_zstein() */ - /* would re-allocate and release work areas on every call. */ - INFO = LAPACKE_zstein_work(LAPACK_COL_MAJOR, N, diag, offd, M, &W, myArrays[thread].IBLOCK, + /* We use the 'work' version so we can re-use our work arrays; using LAPACKE_dstein() */ + /* would re-allocate and release work areas on every call. */ + INFO = LAPACKE_dstein_work(LAPACK_COL_MAJOR, N, diag, offd, M, &W, myArrays[thread].IBLOCK, myArrays[thread].ISPLIT, v, LDZ, myArrays[thread].WORK, myArrays[thread].IWORK, myArrays[thread].IFAIL); return(INFO); @@ -213,23 +210,23 @@ int plasma_zstein( plasma_complex64_t *diag, plasma_complex64_t *offd, * nLT_Low or nLT_hi is computed. * ***************************************************************************/ -void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound, - plasma_complex64_t upperBound, int nLT_low, int nLT_hi, int numEV) { +void plasma_dlaebz2(dlaebz2_Control_t *Control, double lowerBound, + double upperBound, int nLT_low, int nLT_hi, int numEV) { - plasma_complex64_t *diag = Control->diag; - plasma_complex64_t *offd = Control->offd; + double *diag = Control->diag; + double *offd = Control->offd; int N = Control->N; - - plasma_complex64_t cp; + + double cp; int flag=0, evLess; if (nLT_low < 0) { - nLT_low = plasma_zlaneg2(diag, offd, N, lowerBound); + nLT_low = plasma_dlaneg2(diag, offd, N, lowerBound); flag=1; } if (nLT_hi < 0) { - nLT_hi = plasma_zlaneg2(diag, offd, N, upperBound); + nLT_hi = plasma_dlaneg2(diag, offd, N, upperBound); flag=1; } @@ -243,17 +240,17 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound, if (Control->range == PlasmaRangeI) { if (nLT_hi < Control->il || /* e.g if il=500, and nLT_hi=499, this bracket is under range of interest. */ nLT_low > Control->iu) { /* e.g if iu=1000, and nLT_low=1001, this bracket is above range of interest. */ - return; + return; } - } - + } + /* Bisect the bracket until we can't anymore. */ - + flag = 0; for (;;) { cp = (lowerBound+upperBound)*0.5; if (cp == lowerBound || cp == upperBound) { - /* Our bracket has been narrowed to machine epsilon for this magnitude (=ulp). + /* Our bracket has been narrowed to machine epsilon for this magnitude (=ulp). * We are done; the bracket is always [low,high). 'high' is not included, so * we have numEV eigenvalues at low, whether it == 1 or is > 1. We find * the eigenvector. (We can test multiplicity with GluedWilk). @@ -261,13 +258,13 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound, break; /* exit for(;;). */ } else { /* we have a new cutpoint. */ - evLess = plasma_zlaneg2(diag, offd, N, cp); + evLess = plasma_dlaneg2(diag, offd, N, cp); if (evLess < 0) { /* We could not compute the Sturm sequence for it. */ flag = -1; /* indicate an error. */ break; /* exit for (;;). */ } - + /* Discard empty halves in both PlasmaRangeV and PlasmaRangeI. * If #EV < cutpoint is the same as the #EV < high, it means * no EV are in [cutpoint, hi]. We can discard that range. @@ -277,16 +274,16 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound, upperBound = cp; continue; } - + /* If #EV < cutpoint is the same as #EV < low, it means no - * EV are in [low, cutpoint]. We can discard that range. + * EV are in [low, cutpoint]. We can discard that range. */ if (evLess == nLT_low) { lowerBound = cp; continue; } - + /* Note: If we were PlasmaRangeV, the initial bounds given by the user are the ranges, * so we have nothing further to do. In PlasmaRangeI; the initial bounds are Gerschgorin * limits and not enough: We must further narrow to the desired indices. @@ -295,8 +292,8 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound, if (Control->range == PlasmaRangeI) { /* For PlasmaRangeI: * Recall that il, iu are 1-relative; while evLess is zero-relative; i.e. - * if [il,iu]=[1,2], evless must be 0, or 1. - * when evLess= iu, @@ -311,7 +308,7 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound, numEV = (nLT_hi-nLT_low); continue; } - + if (evLess >= Control->iu) { /* The upper half [cp, upperBound) is not needed, it has no indices > iu; */ upperBound = cp; @@ -320,24 +317,24 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound, continue; } } /*end if index search. */ - + /* Here, the cutpoint has EV on both left right. We push off the right bracket. - * The new lowerBound is the cp, the upperBound is unchanged, the number of + * The new lowerBound is the cp, the upperBound is unchanged, the number of * eigenvalues changes. */ #pragma omp task - plasma_zlaebz2(Control, cp, upperBound, evLess, nLT_hi, (nLT_hi-evLess)); + plasma_dlaebz2(Control, cp, upperBound, evLess, nLT_hi, (nLT_hi-evLess)); /* Update the Left side I kept. The new number of EV less than upperBound - * is evLess, recompute number of EV in the bracket. */ + * is evLess, recompute number of EV in the bracket. */ upperBound = cp; nLT_hi = evLess; - numEV =( evLess - nLT_low); - continue; + numEV =( evLess - nLT_low); + continue; } } /* end for (;;) for Bisection. */ - + /* Okay, count this eigenpair done, add to the Done list. - * NOTE: nLT_low is the global zero-relative index of + * NOTE: nLT_low is the global zero-relative index of * this set of mpcity eigenvalues. * No other brackets can change our entry, so we * don't need any thread block or atomicity. @@ -349,24 +346,24 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound, } else { /* range == PlasmaRangeV */ myIdx = nLT_low - Control->baseIdx; } - + if (Control->jobtype == PlasmaVec) { /* get the eigenvector. */ - int ret=plasma_zstein(diag, offd, lowerBound, &(Control->pVec[myIdx*N]), N, Control->stein_arrays); + int ret=plasma_dstein(diag, offd, lowerBound, &(Control->pVec[myIdx*N]), N, Control->stein_arrays); if (ret != 0) { #pragma omp critical (UpdateStack) { - /* Only store first error we encounter */ + /* Only store first error we encounter */ if (Control->error == 0) Control->error = ret; } } } - + /* Add eigenvalue and multiplicity. */ Control->pVal[myIdx]=lowerBound; Control->pMul[myIdx]=numEV; - -// #pragma omp atomic + +// #pragma omp atomic // Control->finished += numEV; } diff --git a/compute/zlaneg2.c b/compute/dlaneg2.c similarity index 86% rename from compute/zlaneg2.c rename to compute/dlaneg2.c index 258292df..eafc5f5a 100644 --- a/compute/zlaneg2.c +++ b/compute/dlaneg2.c @@ -1,19 +1,14 @@ /** * - * @file + * @file * * PLASMA is a software package provided by: * University of Tennessee, US, * - * @precisions normal z -> s d + * @precisions normal d -> s * **/ -/* - * This file is a z-template to generate s and d code. - * Only s and d are compiled; not c or z. - */ - /****************************************************************************** * See https://archive.siam.org/meetings/la03/proceedings/zhangjy3.pdf * "J. Zhang, 2003, The Scaled Sturm Sequence Computation". Both the Sturm @@ -28,22 +23,22 @@ * p[-1] = 1.; // zero relative indexing. * p[0] = diag[0] - u; * p[i] = (diag[i]-u)*p[i-1] - offd[i-1]*offd[i-1]*p[i-2], i=1, N-1. - * + * * The Classical Sturm recurrence can be shown as a matrix computation; namely * P[i] = M[i]*P[i-1]. Be careful of the i-1 index: * M[i] = [(diag[i]-u) , -offd[i-1]*offd[i-1] ] and P[i-1] = [ p[i-1] ] * [ 1 , 0 ] [ p[i-2] ] * with P[-1] defined to be [1, 0] transposed. * notice 'p' is the classical Sturm, 'P' is a vector. - * - * the matrix computation results in the vector: + * + * the matrix computation results in the vector: * M[i]*P[i-1] = { (diag[i]-u)*p[i-1] -offd[i-1]*offd[i-1]*p[i-2] , p[i-1] } - * + * * So, in the classical case, P[i][0] is the classic Sturm sequence for p[i]; * the second element is just the classic Sturm for p[i-1]. * - * However, this won't remain that way. For the SCALED Sturm sequence, we - * will scale P[i] after each calculation, with the scalar 's': + * However, this won't remain that way. For the SCALED Sturm sequence, we + * will scale P[i] after each calculation, with the scalar 's': * * ********************************* * P[i] = s * M[i]*P[i-1], i=0, N-1. Note we are scaling a vector here. @@ -56,28 +51,28 @@ * save = s * Pm1_0; * Pm1_0 = s * ( (diag[i]-u)*Pm1_0 -offd[i-1]*offd[i-1]*Pm1_1 ); * Pm1_1 = save; - + * Pm1_0 is used like the classical Sturm sequence; meaning we must calculate * sign changes. - * + * * s is computed given the vector X[] = M[i]*P[i-1] above. * PHI is set to 10^{10}, UPSILON is set to 10^{-10}. Then: - * w = max(fabs(X[0]), fabs(X[1])). + * w = max(fabs(X[0]), fabs(X[1])). * if w > PHI then s = PHI/w; * else if w < UPSILON then s = UPSILON/w; * else s=1.0 (or, do not scale X). - * + * * This algorithm is backward stable. execution time is 1.5 times classic Sturm. - * + * * No sign change counts eigenvalues >= u. * sign changes count eigenvalues < u. * This routine returns the number of sign changes, which is the count of * eigenvalues strictly less than u. - * + * * computation: What we need for each computation: * M[i], which we compute on the fly from diag[i] and offd[i-1]. * P[i-1], which has two elements, [Pm1_0, Pm1_1]. (Pm1 means P minus 1). - * LAPACK routine DLAEBZ computes a standard Sturm sequences; there is no + * LAPACK routine DLAEBZ computes a standard Sturm sequences; there is no * comparable auto-scaling Sturm sequence. * * This routine is most similar to LAPACK DLANEG.f, but is not a replacement @@ -92,13 +87,13 @@ #include -int plasma_zlaneg2(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, plasma_complex64_t u) { +int plasma_dlaneg2(double *diag, double *offd, int n, double u) { int i, isneg=0; - plasma_complex64_t s, w, v0, v1, Pm1_0, Pm1_1, PHI, UPSILON; + double s, w, v0, v1, Pm1_0, Pm1_1, PHI, UPSILON; if (n==0) return (0); - PHI = ((plasma_complex64_t)(((long long) 1)<<34)); + PHI = ((double)(((long long) 1)<<34)); UPSILON = 1.0/PHI; - + Pm1_1 = 1.0; Pm1_0 = (diag[0]-u); if (Pm1_0 < 0) isneg = 1; /* our first test. */ @@ -108,12 +103,12 @@ int plasma_zlaneg2(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, pl v1 = fabs(Pm1_1); if (v0 > v1) w = v0; else w = v1; - + /*Go ahead and calculate P[i]: */ s = Pm1_0; Pm1_0 = (diag[i]-u)*Pm1_0 -((offd[i-1]*offd[i-1])*Pm1_1); Pm1_1 = s; - + /* Now determine whether to scale these new values. */ if (w > PHI) { s = PHI/w; @@ -124,13 +119,13 @@ int plasma_zlaneg2(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, pl Pm1_0 *= s; Pm1_1 *= s; } /* else skip scaling. */ - + /* Finally, see if the sign changed. */ - if ( (Pm1_0 < 0 && Pm1_1 >= 0) || + if ( (Pm1_0 < 0 && Pm1_1 >= 0) || (Pm1_0 >= 0 && Pm1_1 < 0) - ) isneg++; + ) isneg++; } - + return(isneg); -} /* end plasma_zlaneg2 */ +} /* end plasma_dlaneg2 */ diff --git a/compute/zstevx2.c b/compute/dstevx2.c similarity index 81% rename from compute/zstevx2.c rename to compute/dstevx2.c index 87a065be..0a241083 100644 --- a/compute/zstevx2.c +++ b/compute/dstevx2.c @@ -1,23 +1,18 @@ /** * - * @file + * @file * * PLASMA is a software package provided by: * University of Tennessee, US, * University of Manchester, UK. * - * @precisions normal z -> s d + * @precisions normal d -> s * **/ -/* - * This file is a z-template to generate s and d code. - * Only s and d are compiled; not c or z. - */ - #include "plasma.h" #include "plasma_internal.h" /* needed for imin, imax. */ -#include "plasma_zlaebz2_work.h" /* work areas. */ +#include "plasma_dlaebz2_work.h" /* work areas. */ #include #include @@ -34,14 +29,14 @@ * eigenvectors can be selected by specifying either a range of values or a * range of indices for the desired eigenvalues. * - * This is similiar to LAPACK dstevx, with more output parameters. + * This is similiar to LAPACK dstevx, with more output parameters. * * Because input matrices are expected to be extremely large and the exact * number of eigenvalues is not necessarily known to the caller, this routine * provides a way to get the number of eigenvalues in either a value range or * an index range; so the caller can allocate the return arrays. There are * three; the floating point vector pVal, the integer vector pMul, and the - * floating point matrix pVec, which is only required and only referenced for + * floating point matrix pVec, which is only required and only referenced for * jobtype=PLasmaVec. * * When the jobtype=PlasmaCount; the code returns the maximum number of @@ -60,7 +55,7 @@ * * Finding eigenvalues alone is much faster than finding eigenpairs; the * majority of the time consumed when eigenvectors are found is in - * orthogonalizing the eigenvectors; an O(N*K^2) operation. + * orthogonalizing the eigenvectors; an O(N*K^2) operation. ******************************************************************************* * * @param[in] jobtype @@ -68,22 +63,22 @@ * = PlasmaNoVec: computes eigenvalues only; * = PlasmaVec: computes eigenvalues and eigenvectors. * = PlasmaCount: computes pFound as the max number of eigenvalues/pairs - * in the given range if there is no ULP-multiplicity, so + * in the given range if there is no ULP-multiplicity, so * the user can allocate pVal[], pMul[], pVec[]. * * @param[in] range * enum: * PlasmaRangeV use vl, vu for range [vl, vu) - * PlasmaRangeI use il, iu for range [il, iu]. 1-relative; 1..N. + * PlasmaRangeI use il, iu for range [il, iu]. 1-relative; 1..N. * * @param[in] n * int. The order of the matrix A. n >= 0. * * @param[in] k * int. The space the user has allocated for eigenvalues; as reflected - * in pVal, pMul, pVec. + * in pVal, pMul, pVec. * - * @param[in] diag double[n]. Vector of [n] diagonal entries of A. + * @param[in] diag double[n]. Vector of [n] diagonal entries of A. * * @param[in] offd double[n-1]. A vector of [n-1] off-diagonal entries of A. * @@ -137,53 +132,53 @@ * with offd[-1], offd[n] = 0. * Indexes above are 0 relative. * Although Gerschgorin is mentioned in ?larr?.f LAPACK files, it is coded - * inline there. + * inline there. *****************************************************************************/ -void plasma_zstelg(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, - plasma_complex64_t *Min, plasma_complex64_t *Max) { +void plasma_dstelg(double *diag, double *offd, int n, + double *Min, double *Max) { int i; - plasma_complex64_t test, testdi, testdim1, min=__DBL_MAX__, max=-__DBL_MAX__; - + double test, testdi, testdim1, min=__DBL_MAX__, max=-__DBL_MAX__; + for (i=0; i max) { max=test; - } + } } - - - plasma_complex64_t cp, minLB=min, minUB=max, maxLB=min, maxUB=max; + + + double cp, minLB=min, minUB=max, maxLB=min, maxUB=max; /* Within that range, find the actual minimum. */ for (;;) { cp = (minLB+minUB)*0.5; if (cp == minLB || cp == minUB) break; - if (plasma_zlaneg2(diag, offd, n, cp) == n) minLB = cp; + if (plasma_dlaneg2(diag, offd, n, cp) == n) minLB = cp; else minUB = cp; } - + /* Within that range, find the actual maximum. */ for (;;) { cp = (maxLB+maxUB)*0.5; if (cp == maxLB || cp == maxUB) break; - if (plasma_zlaneg2(diag, offd, n, cp) == n) { + if (plasma_dlaneg2(diag, offd, n, cp) == n) { maxUB=cp; } else { maxLB=cp; } } - + *Min = minLB; *Max = maxUB; } @@ -191,7 +186,7 @@ void plasma_zstelg(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, /****************************************************************************** * STMVM: Symmetric Tridiagonal Matrix Vector Multiply. * Matrix multiply; A * X = Y. - * A = [diag[0], offd[0], + * A = [diag[0], offd[0], * [offd[0], diag[1], offd[1] * [ 0, offd[1], diag[2], offd[2], * ... @@ -201,12 +196,12 @@ void plasma_zstelg(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, * This could be done by 3 daxpy, but more code and I think more confusing. *****************************************************************************/ -void plasma_zstmv(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, - plasma_complex64_t *X, plasma_complex64_t *Y) { +void plasma_dstmv(double *diag, double *offd, int n, + double *X, double *Y) { int i; Y[0] = diag[0]*X[0] + offd[0]*X[1]; Y[n-1] = offd[n-2]*X[n-2] + diag[n-1]*X[n-1]; - + for (i=1; i<(n-1); i++) { Y[i] = offd[i-1]*X[i-1] + diag[i]*X[i] + offd[i]*X[i+1]; } @@ -218,23 +213,23 @@ void plasma_zstmv(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, * This routine is necessary to determine if eigenvectors should be swapped. * eigenpair error: If A*v = u*v, then A*v-u*v should == 0. We compute the * L_infinity norm of (A*v-u*v). - * We return DBL_MAX if the eigenvector (v) is all zeros, or if we fail to - * allocate memory. - * If u==0.0, we'll return L_INF of (A*V). + * We return DBL_MAX if the eigenvector (v) is all zeros, or if we fail to + * allocate memory. + * If u==0.0, we'll return L_INF of (A*V). *****************************************************************************/ -plasma_complex64_t plasma_zstepe(plasma_complex64_t *diag, - plasma_complex64_t *offd, int n, plasma_complex64_t u, - plasma_complex64_t *v) { +double plasma_dstepe(double *diag, + double *offd, int n, double u, + double *v) { int i, zeros=0; - plasma_complex64_t *AV; - plasma_complex64_t norm, dtemp; - - AV = (plasma_complex64_t*) malloc(n * sizeof(plasma_complex64_t)); + double *AV; + double norm, dtemp; + + AV = (double*) malloc(n * sizeof(double)); if (AV == NULL) return __DBL_MAX__; - - plasma_zstmv(diag, offd, n, v, AV); /* AV = A*v. */ - + + plasma_dstmv(diag, offd, n, v, AV); /* AV = A*v. */ + norm = -__DBL_MAX__; /* Trying to find maximum. */ zeros=0; for (i=0; i norm) norm=dtemp; if (v[i] == 0.) zeros++; } - + free(AV); if (zeros == n) return __DBL_MAX__; return norm; @@ -250,19 +245,19 @@ plasma_complex64_t plasma_zstepe(plasma_complex64_t *diag, /****************************************************************************** - * This is the main routine; plasma_zstevx2 - * Arguments are described at the top of this source. + * This is the main routine; plasma_dstevx2 + * Arguments are described at the top of this source. *****************************************************************************/ -int plasma_zstevx2( +int plasma_dstevx2( /* error report */ /* args 1 - 4 */ plasma_enum_t jobtype, plasma_enum_t range, int n, int k, - /* args 5,6 */ plasma_complex64_t *diag, plasma_complex64_t *offd, - /* args 7,8 */ plasma_complex64_t vl, plasma_complex64_t vu, - /* args 9 - 12*/ int il, int iu, int *pFound, plasma_complex64_t *pVal, - /* arg 13,14 */ int *pMul, plasma_complex64_t *pVec) + /* args 5,6 */ double *diag, double *offd, + /* args 7,8 */ double vl, double vu, + /* args 9 - 12*/ int il, int iu, int *pFound, double *pVal, + /* arg 13,14 */ int *pMul, double *pVec) { int i, max_threads; - zlaebz2_Stein_Array_t *stein_arrays = NULL; + dlaebz2_Stein_Array_t *stein_arrays = NULL; /* Get PLASMA context. */ plasma_context_t *plasma = plasma_context_self(); if (plasma == NULL) { @@ -295,7 +290,7 @@ int plasma_zstevx2( plasma_error("illegal pointer offd"); return -6; } - + if (range == PlasmaRangeV && vu <= vl ) { plasma_error("illegal value of vl and vu"); return -7; @@ -323,13 +318,13 @@ int plasma_zstevx2( if (jobtype == PlasmaVec) { /* we use calloc because we rely on pointer elements being NULL to single */ - /* a need to allocate. */ - stein_arrays = (zlaebz2_Stein_Array_t*) calloc(max_threads, sizeof(zlaebz2_Stein_Array_t)); + /* a need to allocate. */ + stein_arrays = (dlaebz2_Stein_Array_t*) calloc(max_threads, sizeof(dlaebz2_Stein_Array_t)); if (stein_arrays == NULL) { return PlasmaErrorOutOfMemory; } } - + /* Initialize sequence. */ plasma_sequence_t sequence; plasma_sequence_init(&sequence); @@ -338,10 +333,10 @@ int plasma_zstevx2( plasma_request_t request; plasma_request_init(&request); - plasma_complex64_t globMinEval, globMaxEval; + double globMinEval, globMaxEval; - zlaebz2_Control_t Control; - memset(&Control, 0, sizeof(zlaebz2_Control_t)); + dlaebz2_Control_t Control; + memset(&Control, 0, sizeof(dlaebz2_Control_t)); Control.N = n; Control.diag = diag; Control.offd = offd; @@ -352,15 +347,15 @@ int plasma_zstevx2( Control.stein_arrays = stein_arrays; /* Find actual least and greatest eigenvalues. */ - plasma_zstelg(Control.diag, Control.offd, Control.N, &globMinEval, &globMaxEval); + plasma_dstelg(Control.diag, Control.offd, Control.N, &globMinEval, &globMaxEval); int evLessThanVL=0, evLessThanVU=n, nEigVals=0; if (range == PlasmaRangeV) { /* We don't call Sturm if we already know the answer. */ - if (vl >= globMinEval) evLessThanVL=plasma_zlaneg2(diag, offd, n, vl); + if (vl >= globMinEval) evLessThanVL=plasma_dlaneg2(diag, offd, n, vl); else vl = globMinEval; /* optimize for computing step size. */ - if (vu <= globMaxEval) evLessThanVU=plasma_zlaneg2(diag, offd, n, vu); + if (vu <= globMaxEval) evLessThanVU=plasma_dlaneg2(diag, offd, n, vu); else vu = nexttoward(globMaxEval, __DBL_MAX__); /* optimize for computing step size */ /* Compute the number of eigenvalues in [vl, vu). */ nEigVals = (evLessThanVU - evLessThanVL); @@ -384,7 +379,7 @@ int plasma_zstevx2( /* Now if user's K (arg 4) isn't enough room, we have a problem. */ if (k < nEigVals) { return -4; /* problem with user's K value. */ - } + } /* We are going into discovery. Make sure we have arrays. */ if (pVal == NULL) return -12; /* pointers cannot be null. */ @@ -396,17 +391,17 @@ int plasma_zstevx2( Control.pVal = pVal; Control.pMul = pMul; Control.pVec = pVec; - + /* We launch the root task: The full range to subdivide. */ #pragma omp parallel { #pragma omp single { - #pragma omp task - plasma_zlaebz2(&Control, vl, vu, -1, -1, nEigVals); + #pragma omp task + plasma_dlaebz2(&Control, vl, vu, -1, -1, nEigVals); } } - + /* Now, all the eigenvalues should have unit eigenvectors in the array Control.pVec. * We don't need to sort that, but we do want to compress it; in case of multiplicity. * We compute the final number of eigenvectors in vectorsFound, and mpcity is recorded. @@ -424,14 +419,14 @@ int plasma_zstevx2( /* compress the array in case vectorsFound < nEigVals (due to multiplicities). */ /* Note that pMul[] is initialized to zeros, if still zero, a multiplicity entry. */ if (vectorsFound < nEigVals) { - int j=0; + int j=0; for (i=0; i 0) { /* If this is NOT a multiplicity, */ pMul[j] = pMul[i]; /* copy to next open slot j */ - pVal[j] = pVal[i]; + pVal[j] = pVal[i]; if (Control.jobtype == PlasmaVec) { if (j != i) { - memcpy(&pVec[j*Control.N], &pVec[i*Control.N], Control.N*sizeof(plasma_complex64_t)); + memcpy(&pVec[j*Control.N], &pVec[i*Control.N], Control.N*sizeof(double)); } } @@ -444,24 +439,24 @@ int plasma_zstevx2( plasma_desc_t T; int retqrf=0, retgqr=0; - retqrf = plasma_zgeqrf(Control.N, vectorsFound, /* This leaves pVec in compressed state of Q+R */ + retqrf = plasma_dgeqrf(Control.N, vectorsFound, /* This leaves pVec in compressed state of Q+R */ pVec, Control.N, &T); if (retqrf != 0) { - plasma_error("plasma_zgeqrf failed."); + plasma_error("plasma_dgeqrf failed."); } else { /* extract just the Q of the QR, in normal form, in workspace pQ */ - plasma_complex64_t* pQ = (plasma_complex64_t*) malloc(Control.N * vectorsFound * sizeof(plasma_complex64_t)); - retgqr = plasma_zungqr(Control.N, vectorsFound, vectorsFound, + double* pQ = (double*) malloc(Control.N * vectorsFound * sizeof(double)); + retgqr = plasma_dorgqr(Control.N, vectorsFound, vectorsFound, pVec, Control.N, T, pQ, Control.N); if (retgqr != 0) { - plasma_error("plasma_zungqr failed."); + plasma_error("plasma_dorgqr failed."); } /* copy orthonormal vectors from workspace pQ to pVec for user return. */ - memcpy(pVec, pQ, Control.N*vectorsFound*sizeof(plasma_complex64_t)); - free(pQ); + memcpy(pVec, pQ, Control.N*vectorsFound*sizeof(double)); + free(pQ); pQ = NULL; } @@ -469,40 +464,40 @@ int plasma_zstevx2( if (retqrf || retgqr) goto Cleanup; /************************************************************************* * When eigenvalue are crowded, it is possible that after orthogonalizing - * vectors, it can be better to swap neighboring eigenvectors. We just - * test all the pairs; basically ||(A*V-e*V)||_max is the error. if BOTH + * vectors, it can be better to swap neighboring eigenvectors. We just + * test all the pairs; basically ||(A*V-e*V)||_max is the error. if BOTH * vectors in a pair have less error by being swapped, we swap them. ************************************************************************/ int swaps=0; if (jobtype == PlasmaVec) { - int N = Control.N; - plasma_complex64_t *Y = malloc(N * sizeof(plasma_complex64_t)); - plasma_complex64_t test[4]; + int N = Control.N; + double *Y = malloc(N * sizeof(double)); + double test[4]; for (i=0; i 1.E-11) continue; /* We've tried to parallelize the following four tests * as four omp tasks. It works, but takes an average of - * 8% longer (~3.6 ms) than just serial execution. + * 8% longer (~3.6 ms) than just serial execution. * omp schedule and taskwait overhead, I presume. */ - test[0]= plasma_zstepe(Control.diag, Control.offd, N, + test[0]= plasma_dstepe(Control.diag, Control.offd, N, pVal[i], &pVec[i*N]); - test[1] = plasma_zstepe(Control.diag, Control.offd, N, + test[1] = plasma_dstepe(Control.diag, Control.offd, N, pVal[i+1], &pVec[(i+1)*N]); - - test[2] = plasma_zstepe(Control.diag, Control.offd, N, + + test[2] = plasma_dstepe(Control.diag, Control.offd, N, pVal[i], &pVec[(i+1)*N]); - test[3] = plasma_zstepe(Control.diag, Control.offd, N, + test[3] = plasma_dstepe(Control.diag, Control.offd, N, pVal[i+1], &pVec[i*N]); - + if ( (test[2] < test[0]) /* val1 with vec2 beats val1 with vec1 */ && (test[3] < test[1]) ) { /* val2 with vec1 beats val2 with vec2 */ - memcpy(Y, &pVec[i*N], N*sizeof(plasma_complex64_t)); - memcpy(&pVec[i*N], &pVec[(i+1)*N], N*sizeof(plasma_complex64_t)); - memcpy(&pVec[(i+1)*N], Y, N*sizeof(plasma_complex64_t)); + memcpy(Y, &pVec[i*N], N*sizeof(double)); + memcpy(&pVec[i*N], &pVec[(i+1)*N], N*sizeof(double)); + memcpy(&pVec[(i+1)*N], Y, N*sizeof(double)); swaps++; } } /* end swapping. */ diff --git a/test/test_zstevx2.c b/test/test_dstevx2.c similarity index 84% rename from test/test_zstevx2.c rename to test/test_dstevx2.c index 54efda93..d7f4dda4 100644 --- a/test/test_zstevx2.c +++ b/test/test_dstevx2.c @@ -6,7 +6,7 @@ * University of Tennessee, US, * University of Manchester, UK. * - * @precisions normal z -> s d + * @precisions normal d -> s * **/ #include "test.h" @@ -23,34 +23,32 @@ #include -#define COMPLEX +#define REAL /****************************************************************************** - * Matrix detailed in Kahan; et al. + * Matrix detailed in Kahan; et al. * Matrix Test: diag=[+x,-x,+x,-x,...+x,-x] for any real x, but Kahan chooses * a tiny x. * offd=[1,1,...1] - * Dimension: n. + * Dimension: n. * Computed eigenvalues: - * evalue[k] = [ x*x + 4*cos(k/(n+1))^2 ] ^(1/2), + * evalue[k] = [ x*x + 4*cos(k/(n+1))^2 ] ^(1/2), * evalue[n+1-k] = -evalue[k], for k=1,[n/2], * evalue[(n+1)/2] = 0 if n is odd. * Note k is 1-relative in these formulations. * The eigenvalues range from (-2,+2). * Note: This routine verified to match documentation for n=4,8,12,24. - * Note: This code is a template, it is not intended to work in complex - * arithmetic, it is only to be translated to either single or double. *****************************************************************************/ -static void testMatrix_Kahan(plasma_complex64_t* diag, plasma_complex64_t *offd, - plasma_complex64_t* evalue, lapack_int n, plasma_complex64_t myDiag) { +static void testMatrix_Kahan(double* diag, double *offd, + double* evalue, lapack_int n, double myDiag) { lapack_int i,k; for (k=1; k<=(n/2); k++) { - plasma_complex64_t ev; + double ev; ev = (M_PI*k+0.)/(n+1.0); /* angle in radians. */ ev = cos(ev); /* cos(angle) */ ev *= 4.*ev; /* 4*cos^2(angle) */ - ev += myDiag*myDiag; /* x^2 + 4*cos^2(angle) */ + ev += myDiag*myDiag; /* x^2 + 4*cos^2(angle) */ ev = sqrt(ev); /* (x^2 + 4*cos^2(angle))^(0.5) */ /* we reverse the -ev and ev here, to get in ascending sorted order. */ evalue[k-1] = -ev; @@ -72,40 +70,40 @@ static void testMatrix_Kahan(plasma_complex64_t* diag, plasma_complex64_t *offd, /****************************************************************************** * This tests an eigenvector X for the eigenvalue lambda. - * We should have A*X = lambda*X. Thus, (A*X)/lambda = X. + * We should have A*X = lambda*X. Thus, (A*X)/lambda = X. * We perform the matrix multiply for each element X[i], and divide the result * by lambda, yielding mmRes[i] which should equal X[i]. We sum the squares of * these results, and the squares of X[i], to compute the Frobenious Norm. We * return the absolute difference of these norms as the error in the vector. * * Matrix multiply; A * X = Y. - * A = [diag[0], offd[0], + * A = [diag[0], offd[0], * [offd[0], diag[1], offd[1] * [ 0, offd[1], diag[2], offd[2], * ... * [ 0...0 offd[n-2], diag[n-1] ] *****************************************************************************/ -static double testEVec(plasma_complex64_t *diag, plasma_complex64_t *offd, - int n, plasma_complex64_t *X, plasma_complex64_t lambda) { +static double testEVec(double *diag, double *offd, + int n, double *X, double lambda) { int i; double mmRes, vmRes, error, sumMM=0., sumVec=0., invLambda = 1.0/lambda; mmRes = (diag[0]*X[0] + offd[0]*X[1])*invLambda; vmRes = X[0]; sumMM += mmRes*mmRes; - sumVec += vmRes*vmRes; + sumVec += vmRes*vmRes; mmRes = (offd[n-2]*X[n-2] + diag[n-1]*X[n-1])*invLambda; vmRes = X[n-1]; sumMM += mmRes*mmRes; - sumVec += vmRes*vmRes; - + sumVec += vmRes*vmRes; + for (i=1; i<(n-1); i++) { mmRes = (offd[i-1]*X[i-1] + diag[i]*X[i] + offd[i]*X[i+1])*invLambda; vmRes = X[i]; sumMM += mmRes*mmRes; - sumVec += vmRes*vmRes; + sumVec += vmRes*vmRes; } sumMM = sqrt(sumMM); @@ -116,7 +114,7 @@ static double testEVec(plasma_complex64_t *diag, plasma_complex64_t *offd, /***************************************************************************//** - * @brief Tests ZSTEVX2. + * @brief Tests DSTEVX2. * * @param[in,out] param - array of parameters * @param[in] run - whether to run test @@ -124,7 +122,7 @@ static double testEVec(plasma_complex64_t *diag, plasma_complex64_t *offd, * Sets used flags in param indicating parameters that are used. * If run is true, also runs test and stores output parameters. ******************************************************************************/ -void test_zstevx2(param_value_t param[], bool run) +void test_dstevx2(param_value_t param[], bool run) { int i,j; /***************************************************************** @@ -149,34 +147,34 @@ void test_zstevx2(param_value_t param[], bool run) /***************************************************************** * Allocate and initialize arrays. ****************************************************************/ - plasma_complex64_t *Diag = - (plasma_complex64_t*)malloc((size_t)m*sizeof(plasma_complex64_t)); + double *Diag = + (double*)malloc((size_t)m*sizeof(double)); assert(Diag != NULL); - plasma_complex64_t *Offd = - (plasma_complex64_t*)malloc((size_t)(m-1)*sizeof(plasma_complex64_t)); + double *Offd = + (double*)malloc((size_t)(m-1)*sizeof(double)); assert(Offd != NULL); - plasma_complex64_t *eigenvalues = - (plasma_complex64_t*)malloc((size_t)m*sizeof(plasma_complex64_t)); + double *eigenvalues = + (double*)malloc((size_t)m*sizeof(double)); assert(eigenvalues != NULL); - plasma_complex64_t *pVal = - (plasma_complex64_t*)malloc((size_t)m*sizeof(plasma_complex64_t)); + double *pVal = + (double*)malloc((size_t)m*sizeof(double)); assert(pVal != NULL); int *pMul = (int*)malloc((size_t)m*sizeof(int)); assert(pMul != NULL); /************************************************************************** - * Kahan has eigenvalues from [-2.0 to +2.0]. However, eigenvalues are + * Kahan has eigenvalues from [-2.0 to +2.0]. However, eigenvalues are * dense near -2.0 and +2.0, so for large matrices, the density may cause * eigenvalues separated by less than machine precision, which causes us * multiplicity (eigenvalues are identical at machine precision). We first - * see this in single precision at m=14734, with a multiplicity of 2. + * see this in single precision at m=14734, with a multiplicity of 2. *************************************************************************/ - plasma_complex64_t myDiag=1.e-5; + double myDiag=1.e-5; testMatrix_Kahan(Diag, Offd, eigenvalues, m, myDiag); double minAbsEV=__DBL_MAX__, maxAbsEV=0., Kond; for (i=0; i worstEigenvalue_error) { worstEigenvalue_index = i; worstEigenvalue_error = error; - worstEigenvalue_eps = ev_eps; + worstEigenvalue_eps = ev_eps; worstEigenvalue_mpcty = pMul[i]; } evIdx++; /* advance known eigenvalue index for a multiplicity. */ if (evIdx == m) break; } - - i++; /* advance to next discovered eigenvalue. */ + + i++; /* advance to next discovered eigenvalue. */ } /********************************************************************** @@ -320,14 +318,14 @@ void test_zstevx2(param_value_t param[], bool run) * being too liberal. Obviously this is related to the number of bits * of error in the result. The condition number (Kond) of the Kahan * matrix also grows nearly linearly with m; Kond is computed above. - *********************************************************************/ + *********************************************************************/ for (i=0; i worstEigenvector_error) { - worstEigenvector_error = vErr; + worstEigenvector_error = vErr; worstEigenvector_index = i; } } @@ -342,7 +340,7 @@ void test_zstevx2(param_value_t param[], bool run) /***************************************************************** * Free arrays. ****************************************************************/ -TestingDone: +TestingDone: if (Diag != NULL) free(Diag); if (Offd != NULL) free(Offd); if (eigenvalues != NULL) free(eigenvalues); @@ -352,5 +350,5 @@ void test_zstevx2(param_value_t param[], bool run) if (test) { /* free any test specific matrices; currently none. */ - } + } } From 8634a734727985289dafafeaff9da86ca469aa68 Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Fri, 17 Jan 2025 15:41:08 -0500 Subject: [PATCH 4/5] cmake: generate files --- CMakeLists.txt | 634 +++++++++++++++++++++-------------- tools/generate_precisions.py | 43 --- 2 files changed, 376 insertions(+), 301 deletions(-) delete mode 100644 tools/generate_precisions.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 102e71ea..7310d498 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,11 @@ project( PLASMA VERSION 24.8.7 LANGUAGES C set(CMAKE_SUPPRESS_REGENERATION on) +if (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.31.0) + cmake_policy( SET CMP0171 NEW ) # recognize CMake's `codegen` target + set( CODEGEN "CODEGEN" ) +endif() + if (${CMAKE_VERSION} VERSION_GREATER 3.11.99) cmake_policy(PUSH) cmake_policy(SET CMP0074 NEW) # allows to use CBLAS_ROOT and LAPACKE_ROOT @@ -13,21 +18,12 @@ endif() #set( CMAKE_THREAD_PREFER_PTHREAD 1 ) #find_package( Threads ) -if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/compute/scamax.c") - message( STATUS "Some generated files already exist, proceeding" ) -else () - message( STATUS "Missing files some precision files, trying to generate" ) - - include( FindPython ) # requires CMake 3.12 - - if (Python_FOUND) - message( STATUS "Found Python interpreter wth ID ${Python_INTERPRETER_ID} and EXE ${Python_EXECUTABLE}" ) - message( STATUS "Generating files for all precisions. This may take a few minutes." ) - execute_process(COMMAND "${Python_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/tools/generate_precisions.py" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}") - else () +include( FindPython ) # requires CMake 3.12 +if (Python_FOUND) + message( STATUS "Found Python interpreter wth ID ${Python_INTERPRETER_ID} and EXE ${Python_EXECUTABLE}" ) +else() message( FATAL_ERROR "Couldn't find Python interpreter, cannot generate all precision files." ) - endif () -endif () +endif() # PLASMA uses C99 features (in-loop definition of for loop variables) if (CMAKE_VERSION VERSION_LESS "3.1") @@ -264,139 +260,230 @@ else() message(FATAL_ERROR "OpenMP not found.") endif() -add_library(plasma SHARED include/plasma.h -compute/clag2z.c compute/dzamax.c compute/scamax.c compute/samax.c compute/damax.c compute/pclag2z.c compute/pdzamax.c -compute/pzdesc2ge.c compute/pzdesc2pb.c compute/pzdesc2tr.c compute/pzgbtrf.c -compute/pzge2desc.c compute/pzgeadd.c compute/pzgelqf.c compute/pzgelqf_tree.c -compute/pzgemm.c compute/pzgeqrf.c compute/pzgeqrf_tree.c compute/pzgeswp.c -compute/pzgetrf.c compute/pzgetri_aux.c compute/pzhemm.c compute/pzher2k.c -compute/pzherk.c compute/pzhetrf_aasen.c compute/pzlacpy.c compute/pzlag2c.c -compute/pzlangb.c compute/pzlange.c compute/pzlanhe.c compute/pzlansy.c -compute/pzlantr.c compute/pzlascl.c compute/pzlaset.c compute/pzlauum.c -compute/pzpb2desc.c compute/pzpbtrf.c compute/pzpotrf.c compute/pzsymm.c -compute/pzsyr2k.c compute/pzsyrk.c compute/pztbsm.c compute/pztr2desc.c -compute/pztradd.c compute/pztrmm.c compute/pztrsm.c compute/pztrtri.c -compute/pzunglq.c compute/pzunglq_tree.c compute/pzungqr.c -compute/pzungqr_tree.c compute/pzunmlq.c compute/pzunmlq_tree.c -compute/pzunmqr.c compute/pzunmqr_tree.c compute/zcgbsv.c compute/zcgesv.c -compute/zcposv.c compute/zdesc2ge.c compute/zdesc2pb.c compute/zdesc2tr.c -compute/zgbsv.c compute/zgbtrf.c compute/zgbtrs.c compute/zge2desc.c -compute/zgeadd.c compute/zgeinv.c compute/zgelqf.c compute/zgelqs.c -compute/zgels.c compute/zgemm.c compute/zgeqrf.c compute/zgeqrs.c -compute/zgesv.c compute/zgeswp.c compute/zgetrf.c compute/zgetri_aux.c -compute/zgetri.c compute/zgetrs.c compute/zhemm.c compute/zher2k.c -compute/zherk.c compute/zhesv.c compute/zhetrf.c compute/zhetrs.c -compute/zlacpy.c compute/clag2z.c compute/zlag2c.c compute/zlangb.c compute/zlange.c -compute/zlanhe.c compute/zlansy.c compute/zlantr.c compute/zlascl.c -compute/zlaset.c compute/zlauum.c compute/zpb2desc.c compute/zpbsv.c -compute/zpbtrf.c compute/zpbtrs.c compute/zpoinv.c compute/zposv.c -compute/zpotrf.c compute/zpotri.c compute/zpotrs.c compute/zsymm.c -compute/zsyr2k.c compute/zsyrk.c compute/ztr2desc.c compute/ztradd.c -compute/ztrmm.c compute/ztrsm.c compute/ztrtri.c compute/zunglq.c -compute/zungqr.c compute/zunmlq.c compute/zunmqr.c compute/cgelqf.c -compute/cgemm.c compute/cgeqrf.c compute/cpotrf.c compute/cpotrs.c -compute/csymm.c compute/csyr2k.c compute/csyrk.c compute/ctradd.c -compute/ctrmm.c compute/ctrsm.c compute/ctrtri.c compute/cunglq.c -compute/cungqr.c compute/cunmlq.c compute/cunmqr.c compute/dgelqf.c -compute/dgemm.c compute/dgeqrf.c compute/dorglq.c compute/dorgqr.c -compute/dormlq.c compute/dormqr.c compute/dpotrf.c compute/dpotrs.c -compute/dsymm.c compute/dsyr2k.c compute/dsyrk.c compute/dtradd.c -compute/dtrmm.c compute/dtrsm.c compute/dtrtri.c compute/sgelqf.c -compute/sgemm.c compute/sgeqrf.c compute/sorglq.c compute/sorgqr.c -compute/sormlq.c compute/sormqr.c compute/spotrf.c compute/spotrs.c -compute/ssymm.c compute/ssyr2k.c compute/ssyrk.c compute/stradd.c -compute/strmm.c compute/strsm.c compute/strtri.c -compute/dsposv.c compute/dgbsv.c compute/cgbsv.c compute/sgbsv.c -compute/dgbtrf.c compute/dgbtrs.c compute/cgbtrf.c compute/cgbtrs.c -compute/sgbtrf.c compute/sgbtrs.c compute/dgeadd.c compute/cgeadd.c -compute/sgeadd.c compute/dgeinv.c compute/cgeinv.c compute/sgeinv.c -compute/dgelqs.c compute/cgelqs.c compute/sgelqs.c compute/dgels.c -compute/cgels.c compute/sgels.c compute/dgeqrs.c compute/cgeqrs.c -compute/sgeqrs.c compute/dsgesv.c compute/dsgbsv.c compute/dgesv.c -compute/cgesv.c compute/sgesv.c compute/dgetrf.c compute/cgetrf.c -compute/sgetrf.c compute/dgetri.c compute/cgetri.c compute/sgetri.c -compute/dgetri_aux.c compute/cgetri_aux.c compute/sgetri_aux.c -compute/dgetrf.c compute/dgetrs.c compute/cgetrf.c compute/cgetrs.c -compute/sgetrf.c compute/sgetrs.c compute/chemm.c compute/cher2k.c -compute/cherk.c compute/dsytrf.c compute/dsytrs.c compute/chetrf.c -compute/chetrs.c compute/ssytrf.c compute/ssytrs.c compute/dsysv.c -compute/chesv.c compute/ssysv.c compute/dlacpy.c compute/clacpy.c -compute/slacpy.c compute/dlag2s.c compute/slag2d.c compute/dlange.c -compute/clange.c compute/slange.c compute/clanhe.c compute/dlansy.c -compute/clansy.c compute/slansy.c compute/dlantr.c compute/clantr.c -compute/slantr.c compute/dlascl.c compute/clascl.c compute/slascl.c -compute/dlaset.c compute/claset.c compute/slaset.c compute/dgeswp.c -compute/cgeswp.c compute/sgeswp.c compute/dlauum.c compute/clauum.c -compute/slauum.c compute/dpbsv.c compute/cpbsv.c compute/spbsv.c -compute/dpbtrf.c compute/dpbtrs.c compute/cpbtrf.c compute/cpbtrs.c -compute/spbtrf.c compute/spbtrs.c compute/dlangb.c compute/clangb.c -compute/slangb.c compute/dposv.c compute/cposv.c compute/sposv.c -compute/dpoinv.c compute/cpoinv.c compute/spoinv.c compute/dpotri.c -compute/cpotri.c compute/spotri.c -compute/slaebz2.c compute/dlaebz2.c -compute/slaneg2.c compute/dlaneg2.c -compute/sstevx2.c compute/dstevx2.c -compute/pslange.c compute/pclaset.c compute/psorglq_tree.c -compute/psormqr_tree.c compute/pdgelqf_tree.c compute/pslag2d.c -compute/pcunmqr_tree.c compute/psgeqrf_tree.c compute/pspotrf.c -compute/pdsytrf_aasen.c compute/pslauum.c compute/pssytrf_aasen.c -compute/pstrsm.c compute/psgeqrf.c compute/pcgelqf_tree.c -compute/pcunglq_tree.c compute/pctrmm.c compute/pstrtri.c -compute/pcungqr_tree.c compute/pcsymm.c compute/psormqr.c compute/pdgemm.c -compute/pdlacpy.c compute/psgeadd.c compute/pdtrmm.c compute/pcungqr.c -compute/pcgemm.c compute/pslansy.c compute/pdtradd.c compute/pdormqr_tree.c -compute/pdtbsm.c compute/psormlq.c compute/pdpotrf.c compute/pcunglq.c -compute/pchemm.c compute/psgeswp.c compute/pcher2k.c compute/pdgetri_aux.c -compute/pcgeqrf_tree.c compute/pdorglq.c compute/pdlange.c -compute/pcunmlq_tree.c compute/psgetrf.c compute/pdgeqrf.c compute/pdlauum.c -compute/pdlaset.c compute/pclascl.c compute/pclauum.c compute/pcgeadd.c -compute/pdorglq_tree.c compute/pdgetrf.c compute/pdtrsm.c compute/psorglq.c -compute/pslangb.c compute/pdormlq_tree.c compute/pcherk.c compute/pcpbtrf.c -compute/psgemm.c compute/pdgeqrf_tree.c compute/pdlascl.c compute/pdsyr2k.c -compute/pdlantr.c compute/pdgeadd.c compute/pclansy.c compute/psgetri_aux.c -compute/pclantr.c compute/pstradd.c compute/pcgbtrf.c compute/pcsyrk.c -compute/pctradd.c compute/psgelqf_tree.c compute/pslantr.c compute/pdlag2s.c compute/pslag2d.c -compute/pchetrf_aasen.c compute/pssymm.c compute/pcunmqr.c compute/pclacpy.c -compute/pdsyrk.c compute/pcsyr2k.c compute/pdgelqf.c compute/pdamax.c -compute/pslacpy.c compute/pdormqr.c compute/pctrsm.c compute/pclangb.c -compute/pdlangb.c compute/pscamax.c compute/pdpbtrf.c compute/pcgeqrf.c -compute/pdgbtrf.c compute/psamax.c compute/pslascl.c compute/psgbtrf.c -compute/pdgeswp.c compute/pspbtrf.c compute/pctbsm.c compute/pdorgqr.c -compute/pcgelqf.c compute/pcpotrf.c compute/pstbsm.c compute/pstrmm.c -compute/pssyr2k.c compute/pclange.c compute/psorgqr.c compute/psormlq_tree.c -compute/pssyrk.c compute/pdorgqr_tree.c compute/pdsymm.c compute/pslaset.c -compute/pdlansy.c compute/pcgeswp.c compute/psorgqr_tree.c compute/pctrtri.c -compute/pcgetri_aux.c compute/pdormlq.c compute/pcunmlq.c compute/pcgetrf.c -compute/pclanhe.c compute/pdtrtri.c compute/psgelqf.c -compute/zdesc2ge.c compute/zdesc2pb.c compute/zdesc2tr.c -compute/cdesc2ge.c compute/cdesc2pb.c compute/cdesc2tr.c -compute/ddesc2ge.c compute/ddesc2pb.c compute/ddesc2tr.c -compute/sdesc2ge.c compute/sdesc2pb.c compute/sdesc2tr.c -compute/pzdesc2ge.c compute/pzdesc2pb.c compute/pzdesc2tr.c -compute/pcdesc2ge.c compute/pcdesc2pb.c compute/pcdesc2tr.c -compute/pddesc2ge.c compute/pddesc2pb.c compute/pddesc2tr.c -compute/psdesc2ge.c compute/psdesc2pb.c compute/psdesc2tr.c -compute/zge2desc.c compute/zpb2desc.c compute/ztr2desc.c -compute/cge2desc.c compute/cpb2desc.c compute/ctr2desc.c -compute/dge2desc.c compute/dpb2desc.c compute/dtr2desc.c -compute/sge2desc.c compute/spb2desc.c compute/str2desc.c -compute/pzge2desc.c compute/pzpb2desc.c compute/pztr2desc.c -compute/pcge2desc.c compute/pcpb2desc.c compute/pctr2desc.c -compute/pdge2desc.c compute/pdpb2desc.c compute/pdtr2desc.c -compute/psge2desc.c compute/pspb2desc.c compute/pstr2desc.c -compute/zgbmm.c compute/dgbmm.c compute/sgbmm.c compute/cgbmm.c -compute/zgbset.c compute/dgbset.c compute/sgbset.c compute/cgbset.c -compute/zgb2desc.c compute/dgb2desc.c compute/sgb2desc.c compute/cgb2desc.c -compute/pzgb2desc.c compute/pdgb2desc.c compute/psgb2desc.c compute/pcgb2desc.c -compute/zgesdd.c compute/dgesdd.c compute/sgesdd.c compute/cgesdd.c -compute/pzgbbrd_static.c compute/pcgbbrd_static.c compute/pdgbbrd_static.c compute/psgbbrd_static.c -compute/pzgecpy_tile2lapack_band.c compute/pcgecpy_tile2lapack_band.c compute/pdgecpy_tile2lapack_band.c compute/psgecpy_tile2lapack_band.c -compute/pzlarft_blgtrd.c compute/pclarft_blgtrd.c compute/pdlarft_blgtrd.c compute/pslarft_blgtrd.c -compute/pzunmqr_blgtrd.c compute/pcunmqr_blgtrd.c compute/pdormqr_blgtrd.c compute/psormqr_blgtrd.c -compute/pcge2gb.c compute/pdge2gb.c compute/psge2gb.c compute/pzge2gb.c -control/constants.c control/context.c control/descriptor.c -control/tree.c control/tuning.c control/workspace.c control/version.c) +#------------------------------------------------------------------------------- +# Parses a list of template source files to find what files should be generated. +# +# @param[in,out] src +# On input, variable that is a list of template files (source and +# headers) for codegen to process. May have non-template source files; +# codegen ignores them. +# On output, the list of generated files is appended. +# +# Example: +# set( src zgemm.c plasma_z.h ) +# generate_files( src ) +# # On output, src is zgemm.c plasma_z.h sgemm.c dgemm.c cgemm.c plasma_s.h +# # plasma_d.h plasma_c.h +# add_library( plasma ${src} ) +# +function( generate_files src ) + message( DEBUG "----- generate_files -----" ) + message( DEBUG "src is ${src} = <${${src}}>" ) + message( DEBUG "cache is ${src}_cache = <${${src}_cache}>" ) + + if (NOT "${${src}}" STREQUAL "${${src}_cache}") + message( STATUS "Running codegen to find files to generate for ${src}" ) + execute_process( + COMMAND "${Python_EXECUTABLE}" "tools/codegen.py" "--depend" ${${src}} + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + RESULT_VARIABLE error + OUTPUT_VARIABLE ${src}_depends ) + message( DEBUG "codegen error ${error}" ) + message( DEBUG "depends is ${src}_depends = <<<\n${${src}_depends}>>>" ) + + if (error) + message( STATUS "codegen returned error; cannot generate source files." ) + else() + # Cache src so we don't have to re-run codegen to get the + # list of dependencies again if src doesn't change. + set( ${src}_cache ${${src}} CACHE INTERNAL "" ) + + # Split lines and cache it. + string( REGEX REPLACE "\n" ";" ${src}_depends "${${src}_depends}" ) + set( ${src}_depends ${${src}_depends} CACHE INTERNAL "" ) + message( DEBUG "depends is ${src}_depends = <<<${${src}_depends}>>>" ) + endif() + endif() + + message( STATUS "Adding codegen commands to generate files for ${src}" ) + foreach( depend ${${src}_depends} ) + message( DEBUG "depend = <${depend}>" ) + string( REGEX MATCH "^(.*): (.*)$" out "${depend}" ) + set( outputs ${CMAKE_MATCH_1} ) + set( input ${CMAKE_MATCH_2} ) + string( REGEX REPLACE " " ";" outputs "${outputs}" ) + list( TRANSFORM outputs PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/" + OUTPUT_VARIABLE src_outputs ) + message( DEBUG " input: <${input}>" ) + message( DEBUG " outputs: <${outputs}>" ) + message( DEBUG " src_outputs: <${src_outputs}>" ) + add_custom_command( + OUTPUT ${src_outputs} + COMMAND "${Python_EXECUTABLE}" "tools/codegen.py" "${input}" + DEPENDS "${input}" + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + VERBATIM ${CODEGEN} ) + + list( APPEND ${src} "${outputs}" ) + message( DEBUG " src: <${${src}}>" ) + message( DEBUG "" ) + endforeach() + set( ${src} ${${src}} PARENT_SCOPE ) # propagate changes + message( DEBUG "src is ${src} = <${${src}}>" ) +endfunction() + +#------------------------------------------------------------------------------- +# List all template files (sources and headers) and non-template source +# files, e.g., zgemm.c, plasma_z.h, test.c. +# Do not list generated files, e.g., sgemm.c, plasma_s.h. +# Please add files in alphabetical order. +set( plasma_src + compute/clag2z.c + compute/dlaebz2.c + compute/dlaneg2.c + compute/dstevx2.c + compute/dzamax.c + compute/pclag2z.c + compute/pdzamax.c + compute/pzdesc2ge.c + compute/pzdesc2pb.c + compute/pzdesc2tr.c + compute/pzgb2desc.c + compute/pzgbbrd_static.c + compute/pzgbtrf.c + compute/pzge2desc.c + compute/pzge2gb.c + compute/pzgeadd.c + compute/pzgecpy_tile2lapack_band.c + compute/pzgelqf.c + compute/pzgelqf_tree.c + compute/pzgemm.c + compute/pzgeqrf.c + compute/pzgeqrf_tree.c + compute/pzgeswp.c + compute/pzgetrf.c + compute/pzgetri_aux.c + compute/pzhemm.c + compute/pzher2k.c + compute/pzherk.c + compute/pzhetrf_aasen.c + compute/pzlacpy.c + compute/pzlag2c.c + compute/pzlangb.c + compute/pzlange.c + compute/pzlanhe.c + compute/pzlansy.c + compute/pzlantr.c + compute/pzlarft_blgtrd.c + compute/pzlascl.c + compute/pzlaset.c + compute/pzlauum.c + compute/pzpb2desc.c + compute/pzpbtrf.c + compute/pzpotrf.c + compute/pzsymm.c + compute/pzsyr2k.c + compute/pzsyrk.c + compute/pztbsm.c + compute/pztr2desc.c + compute/pztradd.c + compute/pztrmm.c + compute/pztrsm.c + compute/pztrtri.c + compute/pzunglq.c + compute/pzunglq_tree.c + compute/pzungqr.c + compute/pzungqr_tree.c + compute/pzunmlq.c + compute/pzunmlq_tree.c + compute/pzunmqr.c + compute/pzunmqr_blgtrd.c + compute/pzunmqr_tree.c + compute/zcgbsv.c + compute/zcgesv.c + compute/zcposv.c + compute/zdesc2ge.c + compute/zdesc2pb.c + compute/zdesc2tr.c + compute/zgb2desc.c + compute/zgbmm.c + compute/zgbset.c + compute/zgbsv.c + compute/zgbtrf.c + compute/zgbtrs.c + compute/zge2desc.c + compute/zgeadd.c + compute/zgeinv.c + compute/zgelqf.c + compute/zgelqs.c + compute/zgels.c + compute/zgemm.c + compute/zgeqrf.c + compute/zgeqrs.c + compute/zgesdd.c + compute/zgesv.c + compute/zgeswp.c + compute/zgetrf.c + compute/zgetri.c + compute/zgetri_aux.c + compute/zgetrs.c + compute/zhemm.c + compute/zher2k.c + compute/zherk.c + compute/zhesv.c + compute/zhetrf.c + compute/zhetrs.c + compute/zlacpy.c + compute/zlag2c.c + compute/zlangb.c + compute/zlange.c + compute/zlanhe.c + compute/zlansy.c + compute/zlantr.c + compute/zlascl.c + compute/zlaset.c + compute/zlauum.c + compute/zpb2desc.c + compute/zpbsv.c + compute/zpbtrf.c + compute/zpbtrs.c + compute/zpoinv.c + compute/zposv.c + compute/zpotrf.c + compute/zpotri.c + compute/zpotrs.c + compute/zsymm.c + compute/zsyr2k.c + compute/zsyrk.c + compute/ztr2desc.c + compute/ztradd.c + compute/ztrmm.c + compute/ztrsm.c + compute/ztrtri.c + compute/zunglq.c + compute/zungqr.c + compute/zunmlq.c + compute/zunmqr.c + + control/constants.c + control/context.c + control/descriptor.c + control/tree.c + control/tuning.c + control/version.c + control/workspace.c + + include/core_lapack_z.h + include/plasma.h + include/plasma_internal_z.h + include/plasma_internal_zc.h + include/plasma_z.h + include/plasma_zc.h + include/plasma_zlaebz2_work.h +) +generate_files( plasma_src ) +add_library( plasma SHARED ${plasma_src} ) # CMake knows about "plasma" library at this point so inform CMake where the headers are target_include_directories(plasma PUBLIC @@ -404,126 +491,157 @@ target_include_directories(plasma PUBLIC $ ) -add_library(plasma_core_blas SHARED include/plasma_core_blas.h -core_blas/core_clag2z.c core_blas/core_dcabs1.c core_blas/core_scabs1.c core_blas/core_dzamax.c core_blas/core_zgeadd.c core_blas/core_zgelqt.c -core_blas/core_zgemm.c core_blas/core_zgeqrt.c core_blas/core_zgessq.c core_blas/core_zgeswp.c core_blas/core_zgetrf.c -core_blas/core_zhegst.c core_blas/core_zhemm.c core_blas/core_zher2k.c core_blas/core_zherk.c core_blas/core_zhessq.c -core_blas/core_zheswp.c core_blas/core_zlacpy_band.c core_blas/core_zlacpy.c core_blas/core_zlag2c.c core_blas/core_zlange.c -core_blas/core_zlanhe.c core_blas/core_zlansy.c core_blas/core_zlantr.c core_blas/core_zlascl.c core_blas/core_zlaset.c -core_blas/core_zlauum.c core_blas/core_zpamm.c core_blas/core_zpemv.c core_blas/core_zparfb.c core_blas/core_zpemv.c core_blas/core_zpotrf.c -core_blas/core_zsymm.c core_blas/core_zsyr2k.c core_blas/core_zsyrk.c core_blas/core_zsyssq.c core_blas/core_ztradd.c -core_blas/core_ztrmm.c core_blas/core_ztrsm.c core_blas/core_ztrssq.c core_blas/core_ztrtri.c core_blas/core_ztslqt.c -core_blas/core_ztsmlq.c core_blas/core_ztsmqr.c core_blas/core_ztsqrt.c core_blas/core_zttlqt.c core_blas/core_zttmlq.c -core_blas/core_zttmqr.c core_blas/core_zttqrt.c core_blas/core_zunmlq.c core_blas/core_zunmqr.c -core_blas/core_cgeadd.c core_blas/core_cgemm.c core_blas/core_cgeswp.c -core_blas/core_cgetrf.c core_blas/core_cheswp.c core_blas/core_clacpy.c -core_blas/core_clacpy_band.c core_blas/core_cparfb.c core_blas/core_ctrsm.c -core_blas/core_dgeadd.c core_blas/core_dgemm.c core_blas/core_dgeswp.c -core_blas/core_dgetrf.c core_blas/core_dlacpy.c core_blas/core_dlacpy_band.c -core_blas/core_dparfb.c core_blas/core_dsyswp.c core_blas/core_dtrsm.c -core_blas/core_sgeadd.c core_blas/core_sgemm.c core_blas/core_sgeswp.c -core_blas/core_sgetrf.c core_blas/core_slacpy.c core_blas/core_slacpy_band.c -core_blas/core_sparfb.c core_blas/core_ssyswp.c core_blas/core_strsm.c -core_blas/core_cgelqt.c core_blas/core_cgeqrt.c core_blas/core_cgessq.c -core_blas/core_chegst.c core_blas/core_chemm.c core_blas/core_cher2k.c -core_blas/core_cherk.c core_blas/core_chessq.c core_blas/core_clange.c -core_blas/core_clanhe.c core_blas/core_clansy.c core_blas/core_clantr.c -core_blas/core_clascl.c core_blas/core_claset.c core_blas/core_clauum.c -core_blas/core_cpamm.c core_blas/core_cpemv.c core_blas/core_cpotrf.c -core_blas/core_csymm.c core_blas/core_csyr2k.c core_blas/core_csyrk.c -core_blas/core_csyssq.c core_blas/core_ctradd.c core_blas/core_ctrmm.c -core_blas/core_ctrssq.c core_blas/core_ctrtri.c core_blas/core_ctslqt.c -core_blas/core_ctsmlq.c core_blas/core_ctsmqr.c core_blas/core_ctsqrt.c -core_blas/core_cttlqt.c core_blas/core_cttmlq.c core_blas/core_cttmqr.c -core_blas/core_cttqrt.c core_blas/core_cunmlq.c core_blas/core_cunmqr.c -core_blas/core_damax.c core_blas/core_dgelqt.c core_blas/core_dgeqrt.c -core_blas/core_dgessq.c core_blas/core_dlag2s.c core_blas/core_dlange.c -core_blas/core_dlansy.c core_blas/core_dlantr.c core_blas/core_dlascl.c -core_blas/core_dlaset.c core_blas/core_dlauum.c core_blas/core_dormlq.c -core_blas/core_dormqr.c core_blas/core_dpamm.c core_blas/core_dpemv.c -core_blas/core_dpotrf.c core_blas/core_dsygst.c core_blas/core_dsymm.c -core_blas/core_dsyr2k.c core_blas/core_dsyrk.c core_blas/core_dsyssq.c -core_blas/core_dtradd.c core_blas/core_dtrmm.c core_blas/core_dtrssq.c -core_blas/core_dtrtri.c core_blas/core_dtslqt.c core_blas/core_dtsmlq.c -core_blas/core_dtsmqr.c core_blas/core_dtsqrt.c core_blas/core_dttlqt.c -core_blas/core_dttmlq.c core_blas/core_dttmqr.c core_blas/core_dttqrt.c -core_blas/core_samax.c core_blas/core_scamax.c core_blas/core_sgelqt.c -core_blas/core_sgeqrt.c core_blas/core_sgessq.c core_blas/core_slag2d.c -core_blas/core_slange.c core_blas/core_slansy.c core_blas/core_slantr.c -core_blas/core_slascl.c core_blas/core_slaset.c core_blas/core_slauum.c -core_blas/core_sormlq.c core_blas/core_sormqr.c core_blas/core_spamm.c -core_blas/core_spemv.c core_blas/core_spotrf.c core_blas/core_ssygst.c -core_blas/core_ssymm.c core_blas/core_ssyr2k.c core_blas/core_ssyrk.c -core_blas/core_ssyssq.c core_blas/core_stradd.c core_blas/core_strmm.c -core_blas/core_strssq.c core_blas/core_strtri.c core_blas/core_stslqt.c -core_blas/core_stsmlq.c core_blas/core_stsmqr.c core_blas/core_stsqrt.c -core_blas/core_sttlqt.c core_blas/core_sttmlq.c core_blas/core_sttmqr.c -core_blas/core_sttqrt.c control/barrier.c control/async.c -core_blas/core_cgbtype1cb.c core_blas/core_dgbtype1cb.c core_blas/core_sgbtype1cb.c core_blas/core_zgbtype1cb.c -core_blas/core_cgbtype2cb.c core_blas/core_dgbtype2cb.c core_blas/core_sgbtype2cb.c core_blas/core_zgbtype2cb.c -core_blas/core_cgbtype3cb.c core_blas/core_dgbtype3cb.c core_blas/core_sgbtype3cb.c core_blas/core_zgbtype3cb.c -core_blas/core_clarfb_gemm.c core_blas/core_dlarfb_gemm.c core_blas/core_slarfb_gemm.c core_blas/core_zlarfb_gemm.c -core_blas/core_clacpy.c core_blas/core_dlacpy.c core_blas/core_slacpy.c core_blas/core_zlacpy.c +#------------------------------------------------------------------------------- +# See note above on plasma_src. +# Please add files in alphabetical order. +set( plasma_core_blas_src + control/async.c + control/barrier.c + + core_blas/core_clag2z.c + core_blas/core_dcabs1.c + core_blas/core_dzamax.c + core_blas/core_zgbtype1cb.c + core_blas/core_zgbtype2cb.c + core_blas/core_zgbtype3cb.c + core_blas/core_zgeadd.c + core_blas/core_zgelqt.c + core_blas/core_zgemm.c + core_blas/core_zgeqrt.c + core_blas/core_zgessq.c + core_blas/core_zgeswp.c + core_blas/core_zgetrf.c + core_blas/core_zhegst.c + core_blas/core_zhemm.c + core_blas/core_zher2k.c + core_blas/core_zherk.c + core_blas/core_zhessq.c + core_blas/core_zheswp.c + core_blas/core_zlacpy.c + core_blas/core_zlacpy_band.c + core_blas/core_zlag2c.c + core_blas/core_zlange.c + core_blas/core_zlanhe.c + core_blas/core_zlansy.c + core_blas/core_zlantr.c + core_blas/core_zlarfb_gemm.c + core_blas/core_zlascl.c + core_blas/core_zlaset.c + core_blas/core_zlauum.c + core_blas/core_zpamm.c + core_blas/core_zparfb.c + core_blas/core_zpemv.c + core_blas/core_zpotrf.c + core_blas/core_zsymm.c + core_blas/core_zsyr2k.c + core_blas/core_zsyrk.c + core_blas/core_zsyssq.c + core_blas/core_ztradd.c + core_blas/core_ztrmm.c + core_blas/core_ztrsm.c + core_blas/core_ztrssq.c + core_blas/core_ztrtri.c + core_blas/core_ztslqt.c + core_blas/core_ztsmlq.c + core_blas/core_ztsmqr.c + core_blas/core_ztsqrt.c + core_blas/core_zttlqt.c + core_blas/core_zttmlq.c + core_blas/core_zttmqr.c + core_blas/core_zttqrt.c + core_blas/core_zunmlq.c + core_blas/core_zunmqr.c + + include/core_lapack_z.h + include/plasma_core_blas.h + include/plasma_core_blas_z.h + include/plasma_core_blas_zc.h + include/plasma.h + include/plasma_internal_z.h + include/plasma_internal_zc.h + include/plasma_z.h + include/plasma_zc.h ) +generate_files( plasma_core_blas_src ) +add_library( plasma_core_blas SHARED ${plasma_core_blas_src} ) + target_include_directories(plasma_core_blas PUBLIC $ $ ) -add_executable(plasmatest test/test.h test/test.c include/plasma.h -test/test_dzamax.c test/test_damax.c test/test_scamax.c test/test_samax.c -test/test_zcposv.c test/test_dsposv.c test/test_zgbsv.c test/test_dgbsv.c -test/test_cgbsv.c test/test_sgbsv.c test/test_zgbmm.c test/test_dgbmm.c -test/test_cgbmm.c test/test_sgbmm.c test/test_zgbtrf.c test/test_dgbtrf.c -test/test_cgbtrf.c test/test_sgbtrf.c test/test_zgeadd.c test/test_dgeadd.c -test/test_cgeadd.c test/test_sgeadd.c test/test_zgeinv.c test/test_dgeinv.c -test/test_cgeinv.c test/test_sgeinv.c test/test_zgelqf.c test/test_dgelqf.c -test/test_cgelqf.c test/test_sgelqf.c test/test_zgelqs.c test/test_dgelqs.c -test/test_cgelqs.c test/test_sgelqs.c test/test_zgels.c test/test_dgels.c -test/test_cgels.c test/test_sgels.c test/test_zgemm.c test/test_dgemm.c -test/test_cgemm.c test/test_sgemm.c test/test_zgeqrf.c test/test_dgeqrf.c -test/test_cgeqrf.c test/test_sgeqrf.c test/test_zgeqrs.c test/test_dgeqrs.c -test/test_cgeqrs.c test/test_sgeqrs.c test/test_zcgesv.c test/test_dsgesv.c -test/test_zcgbsv.c test/test_dsgbsv.c test/test_zgesv.c test/test_dgesv.c -test/test_cgesv.c test/test_sgesv.c test/test_zgetrf.c test/test_dgetrf.c -test/test_cgetrf.c test/test_sgetrf.c test/test_zgetri.c test/test_dgetri.c -test/test_cgetri.c test/test_sgetri.c test/test_zgetri_aux.c -test/test_dgetri_aux.c test/test_cgetri_aux.c test/test_sgetri_aux.c -test/test_zgetrs.c test/test_dgetrs.c test/test_cgetrs.c test/test_sgetrs.c -test/test_zhemm.c test/test_chemm.c test/test_zher2k.c test/test_cher2k.c -test/test_zherk.c test/test_cherk.c test/test_zhetrf.c test/test_dsytrf.c -test/test_chetrf.c test/test_ssytrf.c test/test_zhesv.c test/test_dsysv.c -test/test_chesv.c test/test_ssysv.c test/test_zlacpy.c test/test_dlacpy.c -test/test_clacpy.c test/test_slacpy.c test/test_zlag2c.c test/test_clag2z.c -test/test_dlag2s.c test/test_slag2d.c test/test_zlange.c test/test_dlange.c -test/test_clange.c test/test_slange.c test/test_zlanhe.c test/test_clanhe.c -test/test_zlansy.c test/test_dlansy.c test/test_clansy.c test/test_slansy.c -test/test_zlantr.c test/test_dlantr.c test/test_clantr.c test/test_slantr.c -test/test_zlascl.c test/test_dlascl.c test/test_clascl.c test/test_slascl.c -test/test_zlaset.c test/test_dlaset.c test/test_claset.c test/test_slaset.c -test/test_zgeswp.c test/test_dgeswp.c test/test_cgeswp.c test/test_sgeswp.c -test/test_zlauum.c test/test_dlauum.c test/test_clauum.c test/test_slauum.c -test/test_zpbsv.c test/test_dpbsv.c test/test_cpbsv.c test/test_spbsv.c -test/test_zpbtrf.c test/test_dpbtrf.c test/test_cpbtrf.c test/test_spbtrf.c -test/test_zlangb.c test/test_dlangb.c test/test_clangb.c test/test_slangb.c -test/test_zposv.c test/test_dposv.c test/test_cposv.c test/test_sposv.c -test/test_zpoinv.c test/test_dpoinv.c test/test_cpoinv.c test/test_spoinv.c -test/test_zpotrf.c test/test_dpotrf.c test/test_cpotrf.c test/test_spotrf.c -test/test_zpotri.c test/test_dpotri.c test/test_cpotri.c test/test_spotri.c -test/test_zpotrs.c test/test_dpotrs.c test/test_cpotrs.c test/test_spotrs.c -test/test_dstevx2.c test/test_sstevx2.c -test/test_zsymm.c test/test_dsymm.c test/test_csymm.c test/test_ssymm.c -test/test_zsyr2k.c test/test_dsyr2k.c test/test_csyr2k.c test/test_ssyr2k.c -test/test_zsyrk.c test/test_dsyrk.c test/test_csyrk.c test/test_ssyrk.c -test/test_ztradd.c test/test_dtradd.c test/test_ctradd.c test/test_stradd.c -test/test_ztrmm.c test/test_dtrmm.c test/test_ctrmm.c test/test_strmm.c -test/test_ztrsm.c test/test_dtrsm.c test/test_ctrsm.c test/test_strsm.c -test/test_ztrtri.c test/test_dtrtri.c test/test_ctrtri.c test/test_strtri.c -test/test_zgesdd.c test/test_dgesdd.c test/test_cgesdd.c test/test_sgesdd.c -test/test_zunmlq.c test/test_dormlq.c test/test_cunmlq.c test/test_sormlq.c -test/test_zunmqr.c test/test_dormqr.c test/test_cunmqr.c test/test_sormqr.c) +#------------------------------------------------------------------------------- +# See note above on plasma_src. +# Please add files in alphabetical order. +set( plasma_test_src + include/plasma.h + + test/test.c + test/test.h + test/test_clag2z.c + test/test_dstevx2.c + test/test_dzamax.c + test/test_z.h + test/test_zc.h + test/test_zcgbsv.c + test/test_zcgesv.c + test/test_zcposv.c + test/test_zgbmm.c + test/test_zgbsv.c + test/test_zgbtrf.c + test/test_zgeadd.c + test/test_zgeinv.c + test/test_zgelqf.c + test/test_zgelqs.c + test/test_zgels.c + test/test_zgemm.c + test/test_zgeqrf.c + test/test_zgeqrs.c + test/test_zgesdd.c + test/test_zgesv.c + test/test_zgeswp.c + test/test_zgetrf.c + test/test_zgetri.c + test/test_zgetri_aux.c + test/test_zgetrs.c + test/test_zhemm.c + test/test_zher2k.c + test/test_zherk.c + test/test_zhesv.c + test/test_zhetrf.c + test/test_zlacpy.c + test/test_zlag2c.c + test/test_zlangb.c + test/test_zlange.c + test/test_zlanhe.c + test/test_zlansy.c + test/test_zlantr.c + test/test_zlascl.c + test/test_zlaset.c + test/test_zlauum.c + test/test_zpbsv.c + test/test_zpbtrf.c + test/test_zpoinv.c + test/test_zposv.c + test/test_zpotrf.c + test/test_zpotri.c + test/test_zpotrs.c + test/test_zsymm.c + test/test_zsyr2k.c + test/test_zsyrk.c + test/test_ztradd.c + test/test_ztrmm.c + test/test_ztrsm.c + test/test_ztrtri.c + test/test_zunmlq.c + test/test_zunmqr.c +) + +generate_files( plasma_test_src ) +add_executable( plasmatest ${plasma_test_src} ) +#------------------------------------------------------------------------------- find_library(MATH_LIBRARY m) if( MATH_LIBRARY ) # OpenBLAS needs to link C math library (usually -lm) but MKL doesn't diff --git a/tools/generate_precisions.py b/tools/generate_precisions.py deleted file mode 100644 index 8da61713..00000000 --- a/tools/generate_precisions.py +++ /dev/null @@ -1,43 +0,0 @@ -#! /usr/bin/env python -# -*- encoding: ascii -*- - -"To be executed from the top most directory where 'tools/codegen.py' is available." - -import os -import sys - -Output_Files = False # show files to be generated but don't generate - -def codegen(letters, filenames, fn_format): - for filename in filenames.split(): - if Output_Files: - os.system(sys.executable + " tools/codegen.py --output {}".format(fn_format.format(filename))) - continue - for letter in letters.split(): - os.system(sys.executable + " tools/codegen.py -p {} {}".format(letter, fn_format.format(filename))) - -def main(argv): - global Output_Files - if "--output" in argv: - Output_Files = True - - elif "--help" in argv or "-h" in argv: - print("{} [--output]\n".format(argv[0])) - print("--output show files to be generated but don't generate") - return 0 - - codegen("s d c", "plasma_z plasma_internal_z core_lapack_z plasma_core_blas_z plasma_zlaebz2_work", "include/{}.h") - codegen("ds", "include/plasma_zc.h include/plasma_internal_zc.h include/plasma_core_blas_zc.h test/test_zc.h", "{}") - codegen("s d c", "dzamax zgelqf zgemm zgbmm zgeqrf zgesdd zunglq zungqr zunmlq zunmqr zpotrf zpotrs zsymm zsyr2k zsyrk ztradd ztrmm ztrsm ztrtri zunglq zungqr zunmlq zunmqr zgbsv zgbtrf zgbtrs zgeadd zgeinv zgelqs zgels zgeqrs zgesv zgeswp zgetrf zgetri zgetrs zhemm zher2k zherk zhesv zhetrf zhetrs zlacpy zlangb zlange zlanhe zlansy zlantr zlascl zlaset zlauum zpbsv zpbtrf zpbtrs zpoinv zposv zpotri zgetri_aux zdesc2ge zdesc2pb zdesc2tr zge2desc zgb2desc zgbset zpb2desc ztr2desc pdzamax pzgbtrf pzgeadd pzgelqf pzgelqf_tree pzgemm pzgeqrf pzgeqrf_tree pzgeswp pzgetrf pzgetri_aux pzhemm pzher2k pzherk pzhetrf_aasen pzlacpy pzlangb pzlange pzlanhe pzlansy pzlantr pzlascl pzlaset pzlauum pzpbtrf pzpotrf pzsymm pzsyr2k pzsyrk pztbsm pztradd pztrmm pztrsm pztrtri pzunglq pzunglq_tree pzungqr pzungqr_tree pzunmlq pzunmlq_tree pzunmqr pzunmqr_tree pzdesc2ge pzdesc2pb pzdesc2tr pzge2desc pzgb2desc pzpb2desc pztr2desc pzge2gb pzgbbrd_static pzgecpy_tile2lapack_band pzlarft_blgtrd pzunmqr_blgtrd", "compute/{}.c") - codegen("s d", "zlaebz2 zlaneg2 zstevx2", "compute/{}.c") - codegen("ds", "zcposv zcgesv zcgbsv clag2z zlag2c pclag2z pzlag2c", "compute/{}.c") - codegen("s d c", "zgeadd zgemm zgeswp zgetrf zheswp zlacpy zlacpy_band zheswp ztrsm dzamax zgelqt zgeqrt zgessq zhegst zhemm zher2k zherk zhessq zlange zlanhe zlansy zlantr zlascl zlaset zlauum zunmlq zunmqr zpemv zpamm zpotrf zhegst zsymm zsyr2k zsyrk zsyssq ztradd ztrmm ztrssq ztrtri ztslqt ztsmlq ztsmqr ztsqrt zttlqt zttmlq zttmqr zttqrt zunmlq zunmqr zparfb dcabs1 zlarfb_gemm zgbtype1cb zgbtype2cb zgbtype3cb", "core_blas/core_{}.c") - codegen("ds", "zlag2c clag2z", "core_blas/core_{}.c") - codegen("s d c", "z.h", "test/test_{}") - codegen("s d", "zstevx2.c", "test/test_{}") - codegen("s d c", "dzamax zgbsv zgbtrf zgeadd zgeinv zgelqf zgelqs zgels zgemm zgbmm zgeqrf zgeqrs zgesv zgeswp zgetrf zgetri_aux zgetri zgetrs zhemm zher2k zherk zhesv zhetrf zlacpy zlangb zlange zlanhe zlansy zlantr zlascl zlaset zlauum zpbsv zpbtrf zpoinv zposv zpotrf zpotri zpotrs zsymm zsyr2k zsyrk ztradd ztrmm ztrsm ztrtri zunmlq zunmqr zgesdd", "test/test_{}.c") - codegen("ds", "zcposv zcgesv zcgbsv zlag2c clag2z", "test/test_{}.c") - return 0 - -if "__main__" == __name__: - sys.exit(main(sys.argv)) From d042b50b776150841826f1d785b7ed075539d270 Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Fri, 31 Jan 2025 20:24:20 -0500 Subject: [PATCH 5/5] cmake: -Wall option --- CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7310d498..ce2551ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -641,6 +641,18 @@ set( plasma_test_src generate_files( plasma_test_src ) add_executable( plasmatest ${plasma_test_src} ) +#------------------------------------------------------------------------------- +if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.15) + # Conditionally add -Wall. See CMake tutorial. + set( gcc_like "$" ) + target_compile_options( + plasma PRIVATE $<${gcc_like}:$> ) + target_compile_options( + plasma_core_blas PRIVATE $<${gcc_like}:$> ) + target_compile_options( + plasmatest PRIVATE $<${gcc_like}:$> ) +endif() + #------------------------------------------------------------------------------- find_library(MATH_LIBRARY m) if( MATH_LIBRARY )