Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Codegen #41

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,7 @@ compute/dgetri.c
compute/dgetri_aux.c
compute/dgetrs.c
compute/dlacpy.c
compute/dlaebz2.c
compute/dlag2s.c
compute/dlaneg2.c
compute/dlangb.c
compute/dlange.c
compute/dlansy.c
Expand All @@ -122,7 +120,6 @@ compute/dpotrs.c
compute/dsgbsv.c
compute/dsgesv.c
compute/dsposv.c
compute/dstevx2.c
compute/dsymm.c
compute/dsyr2k.c
compute/dsyrk.c
Expand Down Expand Up @@ -606,7 +603,6 @@ test/test_ds.h
test/test_dsgbsv.c
test/test_dsgesv.c
test/test_dsposv.c
test/test_dstevx2.c
test/test_dsymm.c
test/test_dsyr2k.c
test/test_dsyrk.c
Expand Down
646 changes: 388 additions & 258 deletions CMakeLists.txt

Large diffs are not rendered by default.

131 changes: 64 additions & 67 deletions compute/zlaebz2.c → compute/dlaebz2.c
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
/**
*
* @file
* @file
*
* PLASMA is a software package provided by:
* University of Tennessee, US,
*
* @precisions normal z -> s d
* @precisions normal d -> s
*
**/

#include "plasma.h"
#include "plasma_internal.h" /* needed for imin, imax. */
#include "plasma_zlaebz2_work.h" /* work areas. */
#include "plasma_dlaebz2_work.h" /* work areas. */

#include <string.h>
#include <omp.h>
Expand All @@ -22,29 +22,26 @@
*
* @ingroup plasma_gemm
*
*
* This file is a z-template to generate s and d code.
* Only s and d are compiled; not c or z.
* This code is not designed to be called directly by users; it is a subroutine
* for zstevx2.c.
* for dstevx2.c.
*
* Specifically, this is a task-based parallel algorithm, the parameters are
* contained in the already initialized and populated zlaebz2_Control_t; For
* example, from zstevx2:
* contained in the already initialized and populated dlaebz2_Control_t; For
* example, from dstevx2:
*
* #pragma omp parallel
* {
* #pragma omp single
* {
* plasma_zlaebz2(&Control, ...etc...);
* plasma_dlaebz2(&Control, ...etc...);
* }
* }
*
*
*
*******************************************************************************
*
*
* @param[in] *Control
* A pointer to the global variables needed.
* A pointer to the global variables needed.
*
* @param[in] Control->N
* int number of rows in the matrix.
Expand All @@ -66,20 +63,20 @@
* PlasmaVec if user desires eigenvectors computed.
*
* @param[in] Control->il
* int enum. The lowerBound of an index range if range is
* PlasmaRangeI.
* int enum. The lowerBound of an index range if range is
* PlasmaRangeI.
*
* @param[in] Control->iu
* int enum. The upperBound of an index range, if range is
* int enum. The upperBound of an index range, if range is
* PlasmaRangeI.
*
* @param[in] Control->stein_arrays
* array of [max_threads], type zlaebz2_Stein_Array_t, contains work
* array of [max_threads], type dlaebz2_Stein_Array_t, contains work
* areas per thread for invoking _stein (inverse iteration to find
* eigenvectors).
*
* @param[in] Control->baseIdx
* The index of the least eigenvalue to be found in the bracket,
* The index of the least eigenvalue to be found in the bracket,
* used to calculate the offset into the return vectors/arrays.
*
* @param[out] Control->error
Expand Down Expand Up @@ -130,7 +127,7 @@
*
*
* This algorithm uses Bisection by the Scaled Sturm Sequence, implemented in
* plasma_zlaebz2, followed by the LAPACK routine _STEIN, which uses inverse
* plasma_dlaebz2, followed by the LAPACK routine _STEIN, which uses inverse
* iteration to find the eigenvalue. The initial 'bracket' parameters should
* contain the full range for the eigenvalues we are to discover. The algorithm
* is recursively task based, at each division the bracket is divided into two
Expand All @@ -151,7 +148,7 @@
*****************************************************************************/

/*******************************************************************************
* Use LAPACK zstein to find a single eigenvector. We may use this routine
* Use LAPACK dstein to find a single eigenvector. We may use this routine
* multiple times, so instead of allocating/freeing the work spaces repeatedly,
* we have an array of pointers, per thread, to workspaces we allocate if not
* already allocated for this thread. So we don't allocate more than once per
Expand All @@ -160,9 +157,9 @@
* to converge.
*******************************************************************************/

int plasma_zstein( plasma_complex64_t *diag, plasma_complex64_t *offd,
plasma_complex64_t u, plasma_complex64_t *v, int N,
zlaebz2_Stein_Array_t *myArrays) {
int plasma_dstein( double *diag, double *offd,
double u, double *v, int N,
dlaebz2_Stein_Array_t *myArrays) {
int M=1, LDZ=N, INFO;
int thread = omp_get_thread_num();

Expand All @@ -176,22 +173,22 @@ int plasma_zstein( plasma_complex64_t *diag, plasma_complex64_t *offd,
if (myArrays[thread].ISPLIT != NULL) myArrays[thread].ISPLIT[0]=N;
}

if (myArrays[thread].WORK == NULL) myArrays[thread].WORK = (plasma_complex64_t*) calloc(5*N, sizeof(plasma_complex64_t));
if (myArrays[thread].WORK == NULL) myArrays[thread].WORK = (double*) calloc(5*N, sizeof(double));
if (myArrays[thread].IWORK == NULL) myArrays[thread].IWORK = (int*) calloc(N, sizeof(int));
if (myArrays[thread].IFAIL == NULL) myArrays[thread].IFAIL = (int*) calloc(N, sizeof(int));
if (myArrays[thread].IBLOCK == NULL ||
myArrays[thread].ISPLIT == NULL ||
myArrays[thread].WORK == NULL ||
myArrays[thread].IWORK == NULL ||
if (myArrays[thread].IBLOCK == NULL ||
myArrays[thread].ISPLIT == NULL ||
myArrays[thread].WORK == NULL ||
myArrays[thread].IWORK == NULL ||
myArrays[thread].IFAIL == NULL) {
return(PlasmaErrorOutOfMemory);
}

plasma_complex64_t W = u;
double W = u;

/* We use the 'work' version so we can re-use our work arrays; using LAPACKE_zstein() */
/* would re-allocate and release work areas on every call. */
INFO = LAPACKE_zstein_work(LAPACK_COL_MAJOR, N, diag, offd, M, &W, myArrays[thread].IBLOCK,
/* We use the 'work' version so we can re-use our work arrays; using LAPACKE_dstein() */
/* would re-allocate and release work areas on every call. */
INFO = LAPACKE_dstein_work(LAPACK_COL_MAJOR, N, diag, offd, M, &W, myArrays[thread].IBLOCK,
myArrays[thread].ISPLIT, v, LDZ, myArrays[thread].WORK, myArrays[thread].IWORK,
myArrays[thread].IFAIL);
return(INFO);
Expand All @@ -213,23 +210,23 @@ int plasma_zstein( plasma_complex64_t *diag, plasma_complex64_t *offd,
* nLT_Low or nLT_hi is computed.
* ***************************************************************************/

void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
plasma_complex64_t upperBound, int nLT_low, int nLT_hi, int numEV) {
void plasma_dlaebz2(dlaebz2_Control_t *Control, double lowerBound,
double upperBound, int nLT_low, int nLT_hi, int numEV) {

plasma_complex64_t *diag = Control->diag;
plasma_complex64_t *offd = Control->offd;
double *diag = Control->diag;
double *offd = Control->offd;
int N = Control->N;
plasma_complex64_t cp;

double cp;
int flag=0, evLess;

if (nLT_low < 0) {
nLT_low = plasma_zlaneg2(diag, offd, N, lowerBound);
nLT_low = plasma_dlaneg2(diag, offd, N, lowerBound);
flag=1;
}

if (nLT_hi < 0) {
nLT_hi = plasma_zlaneg2(diag, offd, N, upperBound);
nLT_hi = plasma_dlaneg2(diag, offd, N, upperBound);
flag=1;
}

Expand All @@ -243,31 +240,31 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
if (Control->range == PlasmaRangeI) {
if (nLT_hi < Control->il || /* e.g if il=500, and nLT_hi=499, this bracket is under range of interest. */
nLT_low > Control->iu) { /* e.g if iu=1000, and nLT_low=1001, this bracket is above range of interest. */
return;
return;
}
}
}

/* Bisect the bracket until we can't anymore. */

flag = 0;
for (;;) {
cp = (lowerBound+upperBound)*0.5;
if (cp == lowerBound || cp == upperBound) {
/* Our bracket has been narrowed to machine epsilon for this magnitude (=ulp).
/* Our bracket has been narrowed to machine epsilon for this magnitude (=ulp).
* We are done; the bracket is always [low,high). 'high' is not included, so
* we have numEV eigenvalues at low, whether it == 1 or is > 1. We find
* the eigenvector. (We can test multiplicity with GluedWilk).
*/
break; /* exit for(;;). */
} else {
/* we have a new cutpoint. */
evLess = plasma_zlaneg2(diag, offd, N, cp);
evLess = plasma_dlaneg2(diag, offd, N, cp);
if (evLess < 0) {
/* We could not compute the Sturm sequence for it. */
flag = -1; /* indicate an error. */
break; /* exit for (;;). */
}

/* Discard empty halves in both PlasmaRangeV and PlasmaRangeI.
* If #EV < cutpoint is the same as the #EV < high, it means
* no EV are in [cutpoint, hi]. We can discard that range.
Expand All @@ -277,16 +274,16 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
upperBound = cp;
continue;
}

/* If #EV < cutpoint is the same as #EV < low, it means no
* EV are in [low, cutpoint]. We can discard that range.
* EV are in [low, cutpoint]. We can discard that range.
*/

if (evLess == nLT_low) {
lowerBound = cp;
continue;
}

/* Note: If we were PlasmaRangeV, the initial bounds given by the user are the ranges,
* so we have nothing further to do. In PlasmaRangeI; the initial bounds are Gerschgorin
* limits and not enough: We must further narrow to the desired indices.
Expand All @@ -295,8 +292,8 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
if (Control->range == PlasmaRangeI) {
/* For PlasmaRangeI:
* Recall that il, iu are 1-relative; while evLess is zero-relative; i.e.
* if [il,iu]=[1,2], evless must be 0, or 1.
* when evLess<cp == il-1, or just <il, cp is a good boundary and
* if [il,iu]=[1,2], evless must be 0, or 1.
* when evLess<cp == il-1, or just <il, cp is a good boundary and
* we can discard the lower half.
*
* To judge the upper half, the cutpoint must be < iu, so if it is >= iu,
Expand All @@ -311,7 +308,7 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
numEV = (nLT_hi-nLT_low);
continue;
}

if (evLess >= Control->iu) {
/* The upper half [cp, upperBound) is not needed, it has no indices > iu; */
upperBound = cp;
Expand All @@ -320,24 +317,24 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
continue;
}
} /*end if index search. */

/* Here, the cutpoint has EV on both left right. We push off the right bracket.
* The new lowerBound is the cp, the upperBound is unchanged, the number of
* The new lowerBound is the cp, the upperBound is unchanged, the number of
* eigenvalues changes. */
#pragma omp task
plasma_zlaebz2(Control, cp, upperBound, evLess, nLT_hi, (nLT_hi-evLess));
plasma_dlaebz2(Control, cp, upperBound, evLess, nLT_hi, (nLT_hi-evLess));

/* Update the Left side I kept. The new number of EV less than upperBound
* is evLess, recompute number of EV in the bracket. */
* is evLess, recompute number of EV in the bracket. */
upperBound = cp;
nLT_hi = evLess;
numEV =( evLess - nLT_low);
continue;
numEV =( evLess - nLT_low);
continue;
}
} /* end for (;;) for Bisection. */

/* Okay, count this eigenpair done, add to the Done list.
* NOTE: nLT_low is the global zero-relative index of
* NOTE: nLT_low is the global zero-relative index of
* this set of mpcity eigenvalues.
* No other brackets can change our entry, so we
* don't need any thread block or atomicity.
Expand All @@ -349,24 +346,24 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
} else { /* range == PlasmaRangeV */
myIdx = nLT_low - Control->baseIdx;
}

if (Control->jobtype == PlasmaVec) {
/* get the eigenvector. */
int ret=plasma_zstein(diag, offd, lowerBound, &(Control->pVec[myIdx*N]), N, Control->stein_arrays);
int ret=plasma_dstein(diag, offd, lowerBound, &(Control->pVec[myIdx*N]), N, Control->stein_arrays);
if (ret != 0) {
#pragma omp critical (UpdateStack)
{
/* Only store first error we encounter */
/* Only store first error we encounter */
if (Control->error == 0) Control->error = ret;
}
}
}

/* Add eigenvalue and multiplicity. */
Control->pVal[myIdx]=lowerBound;
Control->pMul[myIdx]=numEV;
// #pragma omp atomic

// #pragma omp atomic
// Control->finished += numEV;
}

Loading