From 450d667cbbc7afc7bbaeb4a358160e32849c1935 Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Tue, 14 Jan 2025 17:15:19 -0500
Subject: [PATCH 01/12] more consistent spacing in headers

---
 include/plasma_core_blas_z.h  | 1136 +++++++++++++++++--------------
 include/plasma_core_blas_zc.h |   34 +-
 include/plasma_descriptor.h   |   55 +-
 include/plasma_internal_z.h   |  600 ++++++++--------
 include/plasma_internal_zc.h  |   10 +-
 include/plasma_tree.h         |   29 +-
 include/plasma_tuning.h       |  168 +++--
 include/plasma_z.h            | 1210 ++++++++++++++++++---------------
 include/plasma_zc.h           |   90 +--
 9 files changed, 1836 insertions(+), 1496 deletions(-)

diff --git a/include/plasma_core_blas_z.h b/include/plasma_core_blas_z.h
index e42c944a..317d0229 100644
--- a/include/plasma_core_blas_z.h
+++ b/include/plasma_core_blas_z.h
@@ -30,189 +30,222 @@ extern "C" {
 double plasma_core_dcabs1(plasma_complex64_t alpha);
 #endif
 
-void plasma_core_zgbtype1cb(plasma_enum_t uplo, int n, int nb,
-                      plasma_complex64_t *A, int lda,
-                      plasma_complex64_t *VQ, plasma_complex64_t *TAUQ,
-                      plasma_complex64_t *VP, plasma_complex64_t *TAUP,
-                      int st, int ed, int sweep, int Vblksiz, int WANTZ,
-                      plasma_complex64_t *work);
-    
-void plasma_core_zgbtype2cb(plasma_enum_t uplo, int n, int nb,
-                      plasma_complex64_t *A, int lda,
-                      plasma_complex64_t *VQ, plasma_complex64_t *TAUQ,
-                      plasma_complex64_t *VP, plasma_complex64_t *TAUP,
-                      int st, int ed, int sweep, int Vblksiz, int WANTZ,
-                      plasma_complex64_t *work);
-    
-void plasma_core_zgbtype3cb(plasma_enum_t uplo, int n, int nb,
-                      plasma_complex64_t *A, int lda,
-                      plasma_complex64_t *VQ, plasma_complex64_t *TAUQ,
-                      plasma_complex64_t *VP, plasma_complex64_t *TAUP,
-                      int st, int ed, int sweep, int Vblksiz, int WANTZ,
-                      plasma_complex64_t *work);
-    
-int plasma_core_zgeadd(plasma_enum_t transa,
-                int m, int n,
-                plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
-                plasma_complex64_t beta,        plasma_complex64_t *B, int ldb);
-
-int plasma_core_zgelqt(int m, int n, int ib,
-                plasma_complex64_t *A, int lda,
-                plasma_complex64_t *T, int ldt,
-                plasma_complex64_t *tau,
-                plasma_complex64_t *work);
-
-void plasma_core_zgemm(plasma_enum_t transa, plasma_enum_t transb,
-                int m, int n, int k,
-                plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
-                                          const plasma_complex64_t *B, int ldb,
-                plasma_complex64_t beta,        plasma_complex64_t *C, int ldc);
-
-int plasma_core_zgeqrt(int m, int n, int ib,
-                plasma_complex64_t *A, int lda,
-                plasma_complex64_t *T, int ldt,
-                plasma_complex64_t *tau,
-                plasma_complex64_t *work);
-
-void plasma_core_zgessq(int m, int n,
-                 const plasma_complex64_t *A, int lda,
-                 double *scale, double *sumsq);
-
-void plasma_core_zgetrf(plasma_desc_t A, int *ipiv, int ib, int rank, int size,
-                 volatile int *max_idx, volatile plasma_complex64_t *max_val,
-                 volatile int *info, plasma_barrier_t *barrier);
-
-int plasma_core_zhegst(int itype, plasma_enum_t uplo,
-                int n,
-                plasma_complex64_t *A, int lda,
-                plasma_complex64_t *B, int ldb);
-
-void plasma_core_zhemm(plasma_enum_t side, plasma_enum_t uplo,
-                int m, int n,
-                plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
-                                          const plasma_complex64_t *B, int ldb,
-                plasma_complex64_t beta,        plasma_complex64_t *C, int ldc);
-
-void plasma_core_zher2k(plasma_enum_t uplo, plasma_enum_t trans,
-                 int n, int k,
-                 plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
-                                           const plasma_complex64_t *B, int ldb,
-                 double beta,                    plasma_complex64_t *C, int ldc);
-
-void plasma_core_zherk(plasma_enum_t uplo, plasma_enum_t trans,
-                int n, int k,
-                double alpha, const plasma_complex64_t *A, int lda,
-                double beta,        plasma_complex64_t *C, int ldc);
-
-void plasma_core_zhessq(plasma_enum_t uplo,
-                 int n,
-                 const plasma_complex64_t *A, int lda,
-                 double *scale, double *sumsq);
-
-void plasma_core_zsyssq(plasma_enum_t uplo,
-                 int n,
-                 const plasma_complex64_t *A, int lda,
-                 double *scale, double *sumsq);
-
-void plasma_core_zlacpy(plasma_enum_t uplo, plasma_enum_t transa,
-                 int m, int n,
-                 const plasma_complex64_t *A, int lda,
-                       plasma_complex64_t *B, int ldb);
-
-void plasma_core_zlacpy_lapack2tile_band(plasma_enum_t uplo,
-                                  int it, int jt,
-                                  int m, int n, int nb, int kl, int ku,
-                                  const plasma_complex64_t *A, int lda,
-                                        plasma_complex64_t *B, int ldb);
-
-void plasma_core_zlacpy_tile2lapack_band(plasma_enum_t uplo,
-                                  int it, int jt,
-                                  int m, int n, int nb, int kl, int ku,
-                                  const plasma_complex64_t *B, int ldb,
-                                        plasma_complex64_t *A, int lda);
-
-void plasma_core_zlange(plasma_enum_t norm,
-                 int m, int n,
-                 const plasma_complex64_t *A, int lda,
-                 double *work, double *result);
-
-void plasma_core_zlanhe(plasma_enum_t norm, plasma_enum_t uplo,
-                 int n,
-                 const plasma_complex64_t *A, int lda,
-                 double *work, double *value);
-
-void plasma_core_zlansy(plasma_enum_t norm, plasma_enum_t uplo,
-                 int n,
-                 const plasma_complex64_t *A, int lda,
-                 double *work, double *value);
-
-void plasma_core_zlantr(plasma_enum_t norm, plasma_enum_t uplo, plasma_enum_t diag,
-                 int m, int n,
-                 const plasma_complex64_t *A, int lda,
-                 double *work, double *value);
-
-int plasma_core_zlarfb_gemm(plasma_enum_t side, plasma_enum_t trans, int direct, int storev,
-                     int M, int N, int K,
-                     const plasma_complex64_t *V, int LDV,
-                     const plasma_complex64_t *T, int LDT,
-                     plasma_complex64_t *C, int LDC,
-                     plasma_complex64_t *WORK, int LDWORK);
-
-void plasma_core_zlascl(plasma_enum_t uplo,
-                 double cfrom, double cto,
-                 int m, int n,
-                 plasma_complex64_t *A, int lda);
-
-void plasma_core_zlaset(plasma_enum_t uplo,
-                 int m, int n,
-                 plasma_complex64_t alpha, plasma_complex64_t beta,
-                 plasma_complex64_t *A, int lda);
-
-void plasma_core_zgeswp(plasma_enum_t colrow,
-                 plasma_desc_t A, int k1, int k2, const int *ipiv, int incx);
-
-void plasma_core_zheswp(int rank, int num_threads,
-                 int uplo, plasma_desc_t A, int k1, int k2, const int *ipiv,
-                 int incx, plasma_barrier_t *barrier);
-
-int plasma_core_zlauum(plasma_enum_t uplo,
-                int n,
-                plasma_complex64_t *A, int lda);
-
-int plasma_core_zpamm(plasma_enum_t op, plasma_enum_t side, plasma_enum_t storev,
-               int m, int n, int k, int l,
-               const plasma_complex64_t *A1, int lda1,
-                     plasma_complex64_t *A2, int lda2,
-               const plasma_complex64_t *V,  int ldv,
-                     plasma_complex64_t *W,  int ldw);
-
-int plasma_core_zparfb(plasma_enum_t side, plasma_enum_t trans, plasma_enum_t direct,
-                plasma_enum_t storev,
-                int m1, int n1, int m2, int n2, int k, int l,
-                      plasma_complex64_t *A1,   int lda1,
-                      plasma_complex64_t *A2,   int lda2,
-                const plasma_complex64_t *V,    int ldv,
-                const plasma_complex64_t *T,    int ldt,
-                      plasma_complex64_t *work, int ldwork);
-
-int plasma_core_zpemv(plasma_enum_t trans, int storev,
-               int m, int n, int l,
-               plasma_complex64_t alpha,
-               const plasma_complex64_t *A, int lda,
-               const plasma_complex64_t *X, int incx,
-               plasma_complex64_t beta,
-               plasma_complex64_t *Y, int incy,
-               plasma_complex64_t *work);
-
-int plasma_core_zpotrf(plasma_enum_t uplo,
-                int n,
-                plasma_complex64_t *A, int lda);
-
-void plasma_core_zsymm(plasma_enum_t side, plasma_enum_t uplo,
-                int m, int n,
-                plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
-                                          const plasma_complex64_t *B, int ldb,
-                plasma_complex64_t beta,        plasma_complex64_t *C, int ldc);
+void plasma_core_zgbtype1cb(
+    plasma_enum_t uplo, int n, int nb,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *VQ, plasma_complex64_t *TAUQ,
+    plasma_complex64_t *VP, plasma_complex64_t *TAUP,
+    int st, int ed, int sweep, int Vblksiz, int WANTZ,
+    plasma_complex64_t *work);
+
+void plasma_core_zgbtype2cb(
+    plasma_enum_t uplo, int n, int nb,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *VQ, plasma_complex64_t *TAUQ,
+    plasma_complex64_t *VP, plasma_complex64_t *TAUP,
+    int st, int ed, int sweep, int Vblksiz, int WANTZ,
+    plasma_complex64_t *work);
+
+void plasma_core_zgbtype3cb(
+    plasma_enum_t uplo, int n, int nb,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *VQ, plasma_complex64_t *TAUQ,
+    plasma_complex64_t *VP, plasma_complex64_t *TAUP,
+    int st, int ed, int sweep, int Vblksiz, int WANTZ,
+    plasma_complex64_t *work);
+
+int plasma_core_zgeadd(
+    plasma_enum_t transa,
+    int m, int n,
+    plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
+    plasma_complex64_t beta,        plasma_complex64_t *B, int ldb);
+
+int plasma_core_zgelqt(
+    int m, int n, int ib,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *T, int ldt,
+    plasma_complex64_t *tau,
+    plasma_complex64_t *work);
+
+void plasma_core_zgemm(
+    plasma_enum_t transa, plasma_enum_t transb,
+    int m, int n, int k,
+    plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
+                              const plasma_complex64_t *B, int ldb,
+    plasma_complex64_t beta,        plasma_complex64_t *C, int ldc);
+
+int plasma_core_zgeqrt(
+    int m, int n, int ib,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *T, int ldt,
+    plasma_complex64_t *tau,
+    plasma_complex64_t *work);
+
+void plasma_core_zgessq(
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *scale, double *sumsq);
+
+void plasma_core_zgetrf(
+    plasma_desc_t A, int *ipiv, int ib, int rank, int size,
+    volatile int *max_idx, volatile plasma_complex64_t *max_val,
+    volatile int *info, plasma_barrier_t *barrier);
+
+int plasma_core_zhegst(
+    int itype, plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *B, int ldb);
+
+void plasma_core_zhemm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    int m, int n,
+    plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
+                              const plasma_complex64_t *B, int ldb,
+    plasma_complex64_t beta,        plasma_complex64_t *C, int ldc);
+
+void plasma_core_zher2k(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    int n, int k,
+    plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
+                              const plasma_complex64_t *B, int ldb,
+    double beta,                    plasma_complex64_t *C, int ldc);
+
+void plasma_core_zherk(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    int n, int k,
+    double alpha, const plasma_complex64_t *A, int lda,
+    double beta,        plasma_complex64_t *C, int ldc);
+
+void plasma_core_zhessq(
+    plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *scale, double *sumsq);
+
+void plasma_core_zsyssq(
+    plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *scale, double *sumsq);
+
+void plasma_core_zlacpy(
+    plasma_enum_t uplo, plasma_enum_t transa,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    plasma_complex64_t *B, int ldb);
+
+void plasma_core_zlacpy_lapack2tile_band(
+    plasma_enum_t uplo,
+    int it, int jt,
+    int m, int n, int nb, int kl, int ku,
+    const plasma_complex64_t *A, int lda,
+    plasma_complex64_t *B, int ldb);
+
+void plasma_core_zlacpy_tile2lapack_band(
+    plasma_enum_t uplo,
+    int it, int jt,
+    int m, int n, int nb, int kl, int ku,
+    const plasma_complex64_t *B, int ldb,
+    plasma_complex64_t *A, int lda);
+
+void plasma_core_zlange(
+    plasma_enum_t norm,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *work, double *result);
+
+void plasma_core_zlanhe(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *work, double *value);
+
+void plasma_core_zlansy(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *work, double *value);
+
+void plasma_core_zlantr(
+    plasma_enum_t norm, plasma_enum_t uplo, plasma_enum_t diag,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *work, double *value);
+
+int plasma_core_zlarfb_gemm(
+    plasma_enum_t side, plasma_enum_t trans, int direct, int storev,
+    int M, int N, int K,
+    const plasma_complex64_t *V, int LDV,
+    const plasma_complex64_t *T, int LDT,
+    plasma_complex64_t *C, int LDC,
+    plasma_complex64_t *WORK, int LDWORK);
+
+void plasma_core_zlascl(
+    plasma_enum_t uplo,
+    double cfrom, double cto,
+    int m, int n,
+    plasma_complex64_t *A, int lda);
+
+void plasma_core_zlaset(
+    plasma_enum_t uplo,
+    int m, int n,
+    plasma_complex64_t alpha, plasma_complex64_t beta,
+    plasma_complex64_t *A, int lda);
+
+void plasma_core_zgeswp(
+    plasma_enum_t colrow,
+    plasma_desc_t A, int k1, int k2, const int *ipiv, int incx);
+
+void plasma_core_zheswp(
+    int rank, int num_threads,
+    int uplo, plasma_desc_t A, int k1, int k2, const int *ipiv,
+    int incx, plasma_barrier_t *barrier);
+
+int plasma_core_zlauum(
+    plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *A, int lda);
+
+int plasma_core_zpamm(
+    plasma_enum_t op, plasma_enum_t side, plasma_enum_t storev,
+    int m, int n, int k, int l,
+    const plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V,  int ldv,
+    plasma_complex64_t *W,  int ldw);
+
+int plasma_core_zparfb(
+    plasma_enum_t side, plasma_enum_t trans, plasma_enum_t direct,
+    plasma_enum_t storev,
+    int m1, int n1, int m2, int n2, int k, int l,
+    plasma_complex64_t *A1,   int lda1,
+    plasma_complex64_t *A2,   int lda2,
+    const plasma_complex64_t *V,    int ldv,
+    const plasma_complex64_t *T,    int ldt,
+    plasma_complex64_t *work, int ldwork);
+
+int plasma_core_zpemv(
+    plasma_enum_t trans, int storev,
+    int m, int n, int l,
+    plasma_complex64_t alpha,
+    const plasma_complex64_t *A, int lda,
+    const plasma_complex64_t *X, int incx,
+    plasma_complex64_t beta,
+    plasma_complex64_t *Y, int incy,
+    plasma_complex64_t *work);
+
+int plasma_core_zpotrf(
+    plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *A, int lda);
+
+void plasma_core_zsymm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    int m, int n,
+    plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
+                              const plasma_complex64_t *B, int ldb,
+    plasma_complex64_t beta,        plasma_complex64_t *C, int ldc);
 
 void plasma_core_zsyr2k(
     plasma_enum_t uplo, plasma_enum_t trans,
@@ -221,116 +254,133 @@ void plasma_core_zsyr2k(
                               const plasma_complex64_t *B, int ldb,
     plasma_complex64_t beta,        plasma_complex64_t *C, int ldc);
 
-void plasma_core_zsyrk(plasma_enum_t uplo, plasma_enum_t trans,
-                int n, int k,
-                plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
-                plasma_complex64_t beta,        plasma_complex64_t *C, int ldc);
-
-int plasma_core_ztradd(plasma_enum_t uplo, plasma_enum_t transa,
-                int m, int n,
-                plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
-                plasma_complex64_t beta,        plasma_complex64_t *B, int ldb);
-
-void plasma_core_ztrmm(plasma_enum_t side, plasma_enum_t uplo,
-                plasma_enum_t transa, plasma_enum_t diag,
-                int m, int n,
-                plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
-                                                plasma_complex64_t *B, int ldb);
-
-void plasma_core_ztrsm(plasma_enum_t side, plasma_enum_t uplo,
-                plasma_enum_t transa, plasma_enum_t diag,
-                int m, int n,
-                plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
-                                                plasma_complex64_t *B, int ldb);
-
-void plasma_core_ztrssq(plasma_enum_t uplo, plasma_enum_t diag,
-                 int m, int n,
-                 const plasma_complex64_t *A, int lda,
-                 double *scale, double *sumsq);
-
-int plasma_core_ztrtri(plasma_enum_t uplo, plasma_enum_t diag,
-                int n,
-                plasma_complex64_t *A, int lda);
-
-int plasma_core_ztslqt(int m, int n, int ib,
-                plasma_complex64_t *A1, int lda1,
-                plasma_complex64_t *A2, int lda2,
-                plasma_complex64_t *T,  int ldt,
-                plasma_complex64_t *tau,
-                plasma_complex64_t *work);
-
-int plasma_core_ztsmlq(plasma_enum_t side, plasma_enum_t trans,
-                int m1, int n1, int m2, int n2, int k, int ib,
-                      plasma_complex64_t *A1,   int lda1,
-                      plasma_complex64_t *A2,   int lda2,
-                const plasma_complex64_t *V,    int ldv,
-                const plasma_complex64_t *T,    int ldt,
-                      plasma_complex64_t *work, int ldwork);
-
-int plasma_core_ztsmqr(plasma_enum_t side, plasma_enum_t trans,
-                int m1, int n1, int m2, int n2, int k, int ib,
-                      plasma_complex64_t *A1,   int lda1,
-                      plasma_complex64_t *A2,   int lda2,
-                const plasma_complex64_t *V,    int ldv,
-                const plasma_complex64_t *T,    int ldt,
-                      plasma_complex64_t *work, int ldwork);
-
-int plasma_core_ztsqrt(int m, int n, int ib,
-                plasma_complex64_t *A1, int lda1,
-                plasma_complex64_t *A2, int lda2,
-                plasma_complex64_t *T,  int ldt,
-                plasma_complex64_t *tau,
-                plasma_complex64_t *work);
-
-int plasma_core_zttlqt(int m, int n, int ib,
-                plasma_complex64_t *A1, int lda1,
-                plasma_complex64_t *A2, int lda2,
-                plasma_complex64_t *T,  int ldt,
-                plasma_complex64_t *tau,
-                plasma_complex64_t *work);
-
-int plasma_core_zttmlq(plasma_enum_t side, plasma_enum_t trans,
-                int m1, int n1, int m2, int n2, int k, int ib,
-                      plasma_complex64_t *A1,   int lda1,
-                      plasma_complex64_t *A2,   int lda2,
-                const plasma_complex64_t *V,    int ldv,
-                const plasma_complex64_t *T,    int ldt,
-                      plasma_complex64_t *work, int ldwork);
-
-int plasma_core_zttmqr(plasma_enum_t side, plasma_enum_t trans,
-                int m1, int n1, int m2, int n2, int k, int ib,
-                      plasma_complex64_t *A1,   int lda1,
-                      plasma_complex64_t *A2,   int lda2,
-                const plasma_complex64_t *V,    int ldv,
-                const plasma_complex64_t *T,    int ldt,
-                      plasma_complex64_t *work, int ldwork);
-
-int plasma_core_zttqrt(int m, int n, int ib,
-                plasma_complex64_t *A1, int lda1,
-                plasma_complex64_t *A2, int lda2,
-                plasma_complex64_t *T,  int ldt,
-                plasma_complex64_t *tau,
-                plasma_complex64_t *work);
-
-int plasma_core_zunmlq(plasma_enum_t side, plasma_enum_t trans,
-                int m, int n, int k, int ib,
-                const plasma_complex64_t *A,    int lda,
-                const plasma_complex64_t *T,    int ldt,
-                      plasma_complex64_t *C,    int ldc,
-                      plasma_complex64_t *work, int ldwork);
-
-int plasma_core_zunmqr(plasma_enum_t side, plasma_enum_t trans,
-                int m, int n, int k, int ib,
-                const plasma_complex64_t *A,    int lda,
-                const plasma_complex64_t *T,    int ldt,
-                      plasma_complex64_t *C,    int ldc,
-                      plasma_complex64_t *work, int ldwork);
+void plasma_core_zsyrk(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    int n, int k,
+    plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
+    plasma_complex64_t beta,        plasma_complex64_t *C, int ldc);
+
+int plasma_core_ztradd(
+    plasma_enum_t uplo, plasma_enum_t transa,
+    int m, int n,
+    plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
+    plasma_complex64_t beta,        plasma_complex64_t *B, int ldb);
+
+void plasma_core_ztrmm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_enum_t transa, plasma_enum_t diag,
+    int m, int n,
+    plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
+                                    plasma_complex64_t *B, int ldb);
+
+void plasma_core_ztrsm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_enum_t transa, plasma_enum_t diag,
+    int m, int n,
+    plasma_complex64_t alpha, const plasma_complex64_t *A, int lda,
+                                    plasma_complex64_t *B, int ldb);
+
+void plasma_core_ztrssq(
+    plasma_enum_t uplo, plasma_enum_t diag,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *scale, double *sumsq);
+
+int plasma_core_ztrtri(
+    plasma_enum_t uplo, plasma_enum_t diag,
+    int n,
+    plasma_complex64_t *A, int lda);
+
+int plasma_core_ztslqt(
+    int m, int n, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    plasma_complex64_t *T,  int ldt,
+    plasma_complex64_t *tau,
+    plasma_complex64_t *work);
+
+int plasma_core_ztsmlq(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+    plasma_complex64_t *A1,   int lda1,
+    plasma_complex64_t *A2,   int lda2,
+    const plasma_complex64_t *V,    int ldv,
+    const plasma_complex64_t *T,    int ldt,
+    plasma_complex64_t *work, int ldwork);
+
+int plasma_core_ztsmqr(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+    plasma_complex64_t *A1,   int lda1,
+    plasma_complex64_t *A2,   int lda2,
+    const plasma_complex64_t *V,    int ldv,
+    const plasma_complex64_t *T,    int ldt,
+    plasma_complex64_t *work, int ldwork);
+
+int plasma_core_ztsqrt(
+    int m, int n, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    plasma_complex64_t *T,  int ldt,
+    plasma_complex64_t *tau,
+    plasma_complex64_t *work);
+
+int plasma_core_zttlqt(
+    int m, int n, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    plasma_complex64_t *T,  int ldt,
+    plasma_complex64_t *tau,
+    plasma_complex64_t *work);
+
+int plasma_core_zttmlq(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+    plasma_complex64_t *A1,   int lda1,
+    plasma_complex64_t *A2,   int lda2,
+    const plasma_complex64_t *V,    int ldv,
+    const plasma_complex64_t *T,    int ldt,
+    plasma_complex64_t *work, int ldwork);
+
+int plasma_core_zttmqr(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+    plasma_complex64_t *A1,   int lda1,
+    plasma_complex64_t *A2,   int lda2,
+    const plasma_complex64_t *V,    int ldv,
+    const plasma_complex64_t *T,    int ldt,
+    plasma_complex64_t *work, int ldwork);
+
+int plasma_core_zttqrt(
+    int m, int n, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    plasma_complex64_t *T,  int ldt,
+    plasma_complex64_t *tau,
+    plasma_complex64_t *work);
+
+int plasma_core_zunmlq(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m, int n, int k, int ib,
+    const plasma_complex64_t *A,    int lda,
+    const plasma_complex64_t *T,    int ldt,
+    plasma_complex64_t *C,    int ldc,
+    plasma_complex64_t *work, int ldwork);
+
+int plasma_core_zunmqr(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m, int n, int k, int ib,
+    const plasma_complex64_t *A,    int lda,
+    const plasma_complex64_t *T,    int ldt,
+    plasma_complex64_t *C,    int ldc,
+    plasma_complex64_t *work, int ldwork);
 
 /******************************************************************************/
-void plasma_core_omp_dzamax(int colrow, int m, int n,
-                     const plasma_complex64_t *A, int lda,
-                     double *values,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_core_omp_dzamax(
+    int colrow, int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *values,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
 void plasma_core_omp_zgeadd(
     plasma_enum_t transa, int m, int n,
@@ -338,11 +388,12 @@ void plasma_core_omp_zgeadd(
     plasma_complex64_t beta,        plasma_complex64_t *B, int ldb,
     plasma_sequence_t *sequence, plasma_request_t *request);
 
-void plasma_core_omp_zgelqt(int m, int n, int ib,
-                     plasma_complex64_t *A, int lda,
-                     plasma_complex64_t *T, int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_core_omp_zgelqt(
+    int m, int n, int ib,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *T, int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
 void plasma_core_omp_zgemm(
     plasma_enum_t transa, plasma_enum_t transb,
@@ -352,28 +403,32 @@ void plasma_core_omp_zgemm(
     plasma_complex64_t beta,        plasma_complex64_t *C, int ldc,
     plasma_sequence_t *sequence, plasma_request_t *request);
 
-void plasma_core_omp_zgeqrt(int m, int n, int ib,
-                     plasma_complex64_t *A, int lda,
-                     plasma_complex64_t *T, int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zgessq(int m, int n,
-                     const plasma_complex64_t *A, int lda,
-                     double *scale, double *sumsq,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_core_omp_zgeqrt(
+    int m, int n, int ib,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *T, int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
-void plasma_core_omp_zgessq_aux(int n,
-                         const double *scale, const double *sumsq,
-                         double *value,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
+void plasma_core_omp_zgessq(
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *scale, double *sumsq,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
-void plasma_core_omp_zhegst(int itype, plasma_enum_t uplo,
-                     int n,
-                     plasma_complex64_t *A, int lda,
-                     plasma_complex64_t *B, int ldb,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_core_omp_zgessq_aux(
+    int n,
+    const double *scale, const double *sumsq,
+    double *value,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_core_omp_zhegst(
+    int itype, plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *B, int ldb,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
 void plasma_core_omp_zhemm(
     plasma_enum_t side, plasma_enum_t uplo,
@@ -391,124 +446,143 @@ void plasma_core_omp_zher2k(
     double beta,                    plasma_complex64_t *C, int ldc,
     plasma_sequence_t *sequence, plasma_request_t *request);
 
-void plasma_core_omp_zherk(plasma_enum_t uplo, plasma_enum_t trans,
-                    int n, int k,
-                    double alpha, const plasma_complex64_t *A, int lda,
-                    double beta,        plasma_complex64_t *C, int ldc,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zhessq(plasma_enum_t uplo,
-                     int n,
-                     const plasma_complex64_t *A, int lda,
-                     double *scale, double *sumsq,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zsyssq(plasma_enum_t uplo,
-                     int n,
-                     const plasma_complex64_t *A, int lda,
-                     double *scale, double *sumsq,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zsyssq_aux(int m, int n,
-                         const double *scale, const double *sumsq,
-                         double *value,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_core_omp_zlacpy(plasma_enum_t uplo, plasma_enum_t transa,
-                     int m, int n,
-                     const plasma_complex64_t *A, int lda,
-                           plasma_complex64_t *B, int ldb,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zlacpy_lapack2tile_band(plasma_enum_t uplo,
-                                      int it, int jt,
-                                      int m, int n, int nb, int kl, int ku,
-                                      const plasma_complex64_t *A, int lda,
-                                            plasma_complex64_t *B, int ldb);
-
-void plasma_core_omp_zlacpy_tile2lapack_band(plasma_enum_t uplo,
-                                      int it, int jt,
-                                      int m, int n, int nb, int kl, int ku,
-                                      const plasma_complex64_t *B, int ldb,
-                                            plasma_complex64_t *A, int lda);
-
-void plasma_core_omp_zlange(plasma_enum_t norm,
-                     int m, int n,
-                     const plasma_complex64_t *A, int lda,
-                     double *work, double *result,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zlange_aux(plasma_enum_t norm,
-                         int m, int n,
-                         const plasma_complex64_t *A, int lda,
-                         double *value,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_core_omp_zlanhe(plasma_enum_t norm, plasma_enum_t uplo,
-                     int n,
-                     const plasma_complex64_t *A, int lda,
-                     double *work, double *value,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zlanhe_aux(plasma_enum_t norm, plasma_enum_t uplo,
-                         int n,
-                         const plasma_complex64_t *A, int lda,
-                         double *value,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_core_omp_zlansy(plasma_enum_t norm, plasma_enum_t uplo,
-                     int n,
-                     const plasma_complex64_t *A, int lda,
-                     double *work, double *value,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zlansy_aux(plasma_enum_t norm, plasma_enum_t uplo,
-                         int n,
-                         const plasma_complex64_t *A, int lda,
-                         double *value,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_core_omp_zlantr(plasma_enum_t norm, plasma_enum_t uplo, plasma_enum_t diag,
-                     int m, int n,
-                     const plasma_complex64_t *A, int lda,
-                     double *work, double *value,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zlantr_aux(plasma_enum_t norm, plasma_enum_t uplo,
-                         plasma_enum_t diag,
-                         int m, int n,
-                         const plasma_complex64_t *A, int lda,
-                         double *value,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_core_omp_zlascl(plasma_enum_t uplo,
-                     double cfrom, double cto,
-                     int m, int n,
-                     plasma_complex64_t *A, int lda,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zlaset(plasma_enum_t uplo,
-                     int mb, int nb,
-                     int i, int j,
-                     int m, int n,
-                     plasma_complex64_t alpha, plasma_complex64_t beta,
-                     plasma_complex64_t *A);
-
-void plasma_core_omp_zlauum(plasma_enum_t uplo,
-                     int n,
-                     plasma_complex64_t *A, int lda,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zpotrf(plasma_enum_t uplo,
-                     int n,
-                     plasma_complex64_t *A, int lda,
-                     int iinfo,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_core_omp_zherk(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    int n, int k,
+    double alpha, const plasma_complex64_t *A, int lda,
+    double beta,        plasma_complex64_t *C, int ldc,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zhessq(
+    plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *scale, double *sumsq,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zsyssq(
+    plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *scale, double *sumsq,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zsyssq_aux(
+    int m, int n,
+    const double *scale, const double *sumsq,
+    double *value,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_core_omp_zlacpy(
+    plasma_enum_t uplo, plasma_enum_t transa,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    plasma_complex64_t *B, int ldb,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zlacpy_lapack2tile_band(
+    plasma_enum_t uplo,
+    int it, int jt,
+    int m, int n, int nb, int kl, int ku,
+    const plasma_complex64_t *A, int lda,
+    plasma_complex64_t *B, int ldb);
+
+void plasma_core_omp_zlacpy_tile2lapack_band(
+    plasma_enum_t uplo,
+    int it, int jt,
+    int m, int n, int nb, int kl, int ku,
+    const plasma_complex64_t *B, int ldb,
+    plasma_complex64_t *A, int lda);
+
+void plasma_core_omp_zlange(
+    plasma_enum_t norm,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *work, double *result,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zlange_aux(
+    plasma_enum_t norm,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *value,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_core_omp_zlanhe(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zlanhe_aux(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *value,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_core_omp_zlansy(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zlansy_aux(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    int n,
+    const plasma_complex64_t *A, int lda,
+    double *value,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_core_omp_zlantr(
+    plasma_enum_t norm, plasma_enum_t uplo, plasma_enum_t diag,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zlantr_aux(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    plasma_enum_t diag,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *value,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_core_omp_zlascl(
+    plasma_enum_t uplo,
+    double cfrom, double cto,
+    int m, int n,
+    plasma_complex64_t *A, int lda,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zlaset(
+    plasma_enum_t uplo,
+    int mb, int nb,
+    int i, int j,
+    int m, int n,
+    plasma_complex64_t alpha, plasma_complex64_t beta,
+    plasma_complex64_t *A);
+
+void plasma_core_omp_zlauum(
+    plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *A, int lda,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zpotrf(
+    plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *A, int lda,
+    int iinfo,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
 void plasma_core_omp_zsymm(
     plasma_enum_t side, plasma_enum_t uplo,
@@ -556,97 +630,109 @@ void plasma_core_omp_ztrsm(
                                     plasma_complex64_t *B, int ldb,
     plasma_sequence_t *sequence, plasma_request_t *request);
 
-void plasma_core_omp_ztrssq(plasma_enum_t uplo, plasma_enum_t diag,
-                     int m, int n,
-                     const plasma_complex64_t *A, int lda,
-                     double *scale, double *sumsq,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_ztrtri(plasma_enum_t uplo, plasma_enum_t diag,
-                     int n,
-                     plasma_complex64_t *A, int lda,
-                     int iinfo,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_ztslqt(int m, int n, int ib,
-                     plasma_complex64_t *A1, int lda1,
-                     plasma_complex64_t *A2, int lda2,
-                     plasma_complex64_t *T,  int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_ztsmlq(plasma_enum_t side, plasma_enum_t trans,
-                     int m1, int n1, int m2, int n2, int k, int ib,
-                           plasma_complex64_t *A1, int lda1,
-                           plasma_complex64_t *A2, int lda2,
-                     const plasma_complex64_t *V,  int ldv,
-                     const plasma_complex64_t *T,  int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_ztsmqr(plasma_enum_t side, plasma_enum_t trans,
-                     int m1, int n1, int m2, int n2, int k, int ib,
-                           plasma_complex64_t *A1, int lda1,
-                           plasma_complex64_t *A2, int lda2,
-                     const plasma_complex64_t *V, int ldv,
-                     const plasma_complex64_t *T, int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_ztsqrt(int m, int n, int ib,
-                     plasma_complex64_t *A1, int lda1,
-                     plasma_complex64_t *A2, int lda2,
-                     plasma_complex64_t *T,  int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zttlqt(int m, int n, int ib,
-                     plasma_complex64_t *A1, int lda1,
-                     plasma_complex64_t *A2, int lda2,
-                     plasma_complex64_t *T,  int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zttmlq(plasma_enum_t side, plasma_enum_t trans,
-                     int m1, int n1, int m2, int n2, int k, int ib,
-                           plasma_complex64_t *A1, int lda1,
-                           plasma_complex64_t *A2, int lda2,
-                     const plasma_complex64_t *V,  int ldv,
-                     const plasma_complex64_t *T,  int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zttmqr(plasma_enum_t side, plasma_enum_t trans,
-                     int m1, int n1, int m2, int n2, int k, int ib,
-                           plasma_complex64_t *A1, int lda1,
-                           plasma_complex64_t *A2, int lda2,
-                     const plasma_complex64_t *V, int ldv,
-                     const plasma_complex64_t *T, int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zttqrt(int m, int n, int ib,
-                     plasma_complex64_t *A1, int lda1,
-                     plasma_complex64_t *A2, int lda2,
-                     plasma_complex64_t *T,  int ldt,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zunmlq(plasma_enum_t side, plasma_enum_t trans,
-                     int m, int n, int k, int ib,
-                     const plasma_complex64_t *A, int lda,
-                     const plasma_complex64_t *T, int ldt,
-                           plasma_complex64_t *C, int ldc,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_zunmqr(plasma_enum_t side, plasma_enum_t trans,
-                     int m, int n, int k, int ib,
-                     const plasma_complex64_t *A, int lda,
-                     const plasma_complex64_t *T, int ldt,
-                           plasma_complex64_t *C, int ldc,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_core_omp_ztrssq(
+    plasma_enum_t uplo, plasma_enum_t diag,
+    int m, int n,
+    const plasma_complex64_t *A, int lda,
+    double *scale, double *sumsq,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_ztrtri(
+    plasma_enum_t uplo, plasma_enum_t diag,
+    int n,
+    plasma_complex64_t *A, int lda,
+    int iinfo,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_ztslqt(
+    int m, int n, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_ztsmlq(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_ztsmqr(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V, int ldv,
+    const plasma_complex64_t *T, int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_ztsqrt(
+    int m, int n, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zttlqt(
+    int m, int n, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zttmlq(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zttmqr(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V, int ldv,
+    const plasma_complex64_t *T, int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zttqrt(
+    int m, int n, int ib,
+    plasma_complex64_t *A1, int lda1,
+    plasma_complex64_t *A2, int lda2,
+    plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zunmlq(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m, int n, int k, int ib,
+    const plasma_complex64_t *A, int lda,
+    const plasma_complex64_t *T, int ldt,
+    plasma_complex64_t *C, int ldc,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_zunmqr(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m, int n, int k, int ib,
+    const plasma_complex64_t *A, int lda,
+    const plasma_complex64_t *T, int ldt,
+    plasma_complex64_t *C, int ldc,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
 #undef COMPLEX
 
diff --git a/include/plasma_core_blas_zc.h b/include/plasma_core_blas_zc.h
index ccda882a..0d67553b 100644
--- a/include/plasma_core_blas_zc.h
+++ b/include/plasma_core_blas_zc.h
@@ -21,24 +21,28 @@ extern "C" {
 #endif
 
 /******************************************************************************/
-int plasma_core_zlag2c(int m, int n,
-                 plasma_complex64_t *A,  int lda,
-                 plasma_complex32_t *As, int ldas);
+int plasma_core_zlag2c(
+    int m, int n,
+    plasma_complex64_t *A,  int lda,
+    plasma_complex32_t *As, int ldas);
 
-void plasma_core_clag2z(int m, int n,
-                 plasma_complex32_t *As, int ldas,
-                 plasma_complex64_t *A,  int lda);
+void plasma_core_clag2z(
+    int m, int n,
+    plasma_complex32_t *As, int ldas,
+    plasma_complex64_t *A,  int lda);
 
 /******************************************************************************/
-void plasma_core_omp_zlag2c(int m, int n,
-                     plasma_complex64_t *A,  int lda,
-                     plasma_complex32_t *As, int ldas,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_core_omp_clag2z(int m, int n,
-                     plasma_complex32_t *As, int ldas,
-                     plasma_complex64_t *A,  int lda,
-                     plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_core_omp_zlag2c(
+    int m, int n,
+    plasma_complex64_t *A,  int lda,
+    plasma_complex32_t *As, int ldas,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_clag2z(
+    int m, int n,
+    plasma_complex32_t *As, int ldas,
+    plasma_complex64_t *A,  int lda,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/include/plasma_descriptor.h b/include/plasma_descriptor.h
index fa0de4fc..4a3400f4 100644
--- a/include/plasma_descriptor.h
+++ b/include/plasma_descriptor.h
@@ -246,33 +246,39 @@ static inline int plasma_tile_mmain_band(plasma_desc_t A, int m, int n)
 }
 
 /******************************************************************************/
-int plasma_desc_general_create(plasma_enum_t dtyp, int mb, int nb,
-                               int lm, int ln, int i, int j, int m, int n,
-                               plasma_desc_t *A);
-
-int plasma_desc_general_band_create(plasma_enum_t dtyp, plasma_enum_t uplo,
-                                    int mb, int nb, int lm, int ln,
-                                    int i, int j, int m, int n, int kl, int ku,
-                                    plasma_desc_t *A);
-
-int plasma_desc_triangular_create(plasma_enum_t dtyp, plasma_enum_t uplo, int mb, int nb,
-                                  int lm, int ln, int i, int j, int m, int n,
-                                  plasma_desc_t *A);
+int plasma_desc_general_create(
+    plasma_enum_t dtyp, int mb, int nb,
+    int lm, int ln, int i, int j, int m, int n,
+    plasma_desc_t *A);
+
+int plasma_desc_general_band_create(
+    plasma_enum_t dtyp, plasma_enum_t uplo,
+    int mb, int nb, int lm, int ln,
+    int i, int j, int m, int n, int kl, int ku,
+    plasma_desc_t *A);
+
+int plasma_desc_triangular_create(
+    plasma_enum_t dtyp, plasma_enum_t uplo, int mb, int nb,
+    int lm, int ln, int i, int j, int m, int n,
+    plasma_desc_t *A);
 
 int plasma_desc_destroy(plasma_desc_t *A);
 
-int plasma_desc_general_init(plasma_enum_t precision, void *matrix,
-                             int mb, int nb, int lm, int ln, int i, int j,
-                             int m, int n, plasma_desc_t *A);
+int plasma_desc_general_init(
+    plasma_enum_t precision, void *matrix,
+    int mb, int nb, int lm, int ln, int i, int j,
+    int m, int n, plasma_desc_t *A);
 
-int plasma_desc_general_band_init(plasma_enum_t precision, plasma_enum_t uplo,
-                                  void *matrix, int mb, int nb, int lm, int ln,
-                                  int i, int j, int m, int n, int kl, int ku,
-                                  plasma_desc_t *A);
+int plasma_desc_general_band_init(
+    plasma_enum_t precision, plasma_enum_t uplo,
+    void *matrix, int mb, int nb, int lm, int ln,
+    int i, int j, int m, int n, int kl, int ku,
+    plasma_desc_t *A);
 
-int plasma_desc_triangular_init(plasma_enum_t precision, plasma_enum_t uplo, void *matrix,
-                                int mb, int nb, int lm, int ln, int i, int j,
-                                int m, int n, plasma_desc_t *A);
+int plasma_desc_triangular_init(
+    plasma_enum_t precision, plasma_enum_t uplo, void *matrix,
+    int mb, int nb, int lm, int ln, int i, int j,
+    int m, int n, plasma_desc_t *A);
 
 int plasma_desc_check(plasma_desc_t A);
 int plasma_desc_general_check(plasma_desc_t A);
@@ -280,8 +286,9 @@ int plasma_desc_general_band_check(plasma_desc_t A);
 
 plasma_desc_t plasma_desc_view(plasma_desc_t A, int i, int j, int m, int n);
 
-int plasma_descT_create(plasma_desc_t A, int ib, plasma_enum_t householder_mode,
-                        plasma_desc_t *T);
+int plasma_descT_create(
+    plasma_desc_t A, int ib, plasma_enum_t householder_mode,
+    plasma_desc_t *T);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/include/plasma_internal_z.h b/include/plasma_internal_z.h
index fbb191f8..7be57888 100644
--- a/include/plasma_internal_z.h
+++ b/include/plasma_internal_z.h
@@ -22,280 +22,336 @@ extern "C" {
 #endif
 
 /******************************************************************************/
-void plasma_pdzamax(plasma_enum_t colrow,
-                    plasma_desc_t A, double *work, double *values,
+void plasma_pdzamax(
+    plasma_enum_t colrow,
+    plasma_desc_t A, double *work, double *values,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzgbtrf(
+    plasma_desc_t A, int *ipiv,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzdesc2ge(
+    plasma_desc_t A,
+    plasma_complex64_t *pA, int lda,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzdesc2pb(
+    plasma_desc_t A,
+    plasma_complex64_t *pA, int lda,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzgbbrd_static(
+    plasma_enum_t uplo, int minmn, int nb, int Vblksiz,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *VQ, plasma_complex64_t *TAUQ,
+    plasma_complex64_t *VP, plasma_complex64_t *TAUP,
+    double *D, double *E, int WANTZ,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzdesc2tr(
+    plasma_desc_t A,
+    plasma_complex64_t *pA, int lda,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzgb2desc(
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzge2desc(
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzgeadd(
+    plasma_enum_t transa,
+    plasma_complex64_t alpha,  plasma_desc_t A,
+    plasma_complex64_t beta,   plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzgelqf(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzgelqf_tree(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzgemm(
+    plasma_enum_t transa, plasma_enum_t transb,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzgeqrf(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzgeqrf_tree(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzgetri_aux(
+    plasma_desc_t A, plasma_desc_t W,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzgetrf(
+    plasma_desc_t A, int *ipiv,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzge2gb(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_workspace_t work,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
-void plasma_pzgbtrf(plasma_desc_t A, int *ipiv,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzdesc2ge(plasma_desc_t A,
-                      plasma_complex64_t *pA, int lda,
-                      plasma_sequence_t *sequence,
-                      plasma_request_t *request);
-
-void plasma_pzdesc2pb(plasma_desc_t A,
-                      plasma_complex64_t *pA, int lda,
-                      plasma_sequence_t *sequence,
-                      plasma_request_t *request);
-
-void plasma_pzgbbrd_static(plasma_enum_t uplo, int minmn, int nb, int Vblksiz,
-                    plasma_complex64_t *A, int lda,
-                    plasma_complex64_t *VQ, plasma_complex64_t *TAUQ,
-                    plasma_complex64_t *VP, plasma_complex64_t *TAUP,
-                    double *D, double *E, int WANTZ,
-                    plasma_workspace_t work,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-    
-void plasma_pzdesc2tr(plasma_desc_t A,
-                      plasma_complex64_t *pA, int lda,
-                      plasma_sequence_t *sequence,
-                      plasma_request_t *request);
-
-void plasma_pzgb2desc(plasma_complex64_t *pA, int lda,
-                      plasma_desc_t A,
-                      plasma_sequence_t *sequence,
-                      plasma_request_t *request);
-
-void plasma_pzge2desc(plasma_complex64_t *pA, int lda,
-                      plasma_desc_t A,
-                      plasma_sequence_t *sequence,
-                      plasma_request_t *request);
-
-void plasma_pzgeadd(plasma_enum_t transa,
-                    plasma_complex64_t alpha,  plasma_desc_t A,
-                    plasma_complex64_t beta,   plasma_desc_t B,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzgelqf(plasma_desc_t A, plasma_desc_t T,
-                    plasma_workspace_t work,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzgelqf_tree(plasma_desc_t A, plasma_desc_t T,
-                         plasma_workspace_t work,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_pzgemm(plasma_enum_t transa, plasma_enum_t transb,
-                   plasma_complex64_t alpha, plasma_desc_t A,
-                                             plasma_desc_t B,
-                   plasma_complex64_t beta,  plasma_desc_t C,
-                   plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzgeqrf(plasma_desc_t A, plasma_desc_t T,
-                    plasma_workspace_t work,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzgeqrf_tree(plasma_desc_t A, plasma_desc_t T,
-                         plasma_workspace_t work,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_pzgetri_aux(plasma_desc_t A, plasma_desc_t W,
-                        plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzgetrf(plasma_desc_t A, int *ipiv,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzge2gb(plasma_desc_t A, plasma_desc_t T,
-                    plasma_workspace_t work,
-                    plasma_sequence_t *sequence, plasma_request_t *request);    
-
-void plasma_pzgecpy_tile2lapack_band(plasma_enum_t uplo, plasma_desc_t A,
-                                     plasma_complex64_t *pA_band, int lda_band,
-                                     plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzhecpy_tile2lapack_band(plasma_enum_t uplo, plasma_desc_t A,
-                               plasma_complex64_t *AB, int ldab,
-                               plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzhemm(plasma_enum_t side, plasma_enum_t uplo,
-                   plasma_complex64_t alpha, plasma_desc_t A,
-                                             plasma_desc_t B,
-                   plasma_complex64_t beta,  plasma_desc_t C,
-                   plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzher2k(plasma_enum_t uplo, plasma_enum_t trans,
-                    plasma_complex64_t alpha, plasma_desc_t A,
-                                              plasma_desc_t B,
-                    double beta,              plasma_desc_t C,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzherk(plasma_enum_t uplo, plasma_enum_t trans,
-                   double alpha, plasma_desc_t A,
-                   double beta,  plasma_desc_t C,
-                   plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzhetrf_aasen(plasma_enum_t uplo,
-                          plasma_desc_t A, int *ipiv,
-                          plasma_desc_t T,
-                          plasma_desc_t W,
-                          plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlacpy(plasma_enum_t uplo, plasma_enum_t transa,
-                    plasma_desc_t A, plasma_desc_t B,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlangb(plasma_enum_t norm,
-                    plasma_desc_t A, double *work, double *value,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlarft_blgtrd(int N, int NB,int Vblksiz,
-    			   plasma_complex64_t *V,
-    			   plasma_complex64_t *T,
-   			   plasma_complex64_t *TAU,
- 			   plasma_sequence_t *sequence,
- 			   plasma_request_t *request);
-
-void plasma_pzlange(plasma_enum_t norm,
-                    plasma_desc_t A, double *work, double *value,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlanhe(plasma_enum_t norm, plasma_enum_t uplo,
-                    plasma_desc_t A, double *work, double *value,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlansy(plasma_enum_t norm, plasma_enum_t uplo,
-                    plasma_desc_t A, double *work, double *value,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlantr(plasma_enum_t norm, plasma_enum_t uplo, plasma_enum_t diag,
-                    plasma_desc_t A, double *work, double *value,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlangb(plasma_enum_t norm,
-                    plasma_desc_t A, double *work, double *value,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlascl(plasma_enum_t uplo,
-                    double cfrom, double cto,
-                    plasma_desc_t A,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlaset(plasma_enum_t uplo,
-                    plasma_complex64_t alpha, plasma_complex64_t beta,
-                    plasma_desc_t A,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzgeswp(plasma_enum_t colrow,
-                    plasma_desc_t A, int *ipiv, int incx,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzlauum(plasma_enum_t uplo, plasma_desc_t A,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzpb2desc(plasma_complex64_t *pA, int lda,
-                      plasma_desc_t A,
-                      plasma_sequence_t *sequence,
-                      plasma_request_t *request);
-
-void plasma_pzpbtrf(plasma_enum_t uplo, plasma_desc_t A,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzpotrf(plasma_enum_t uplo, plasma_desc_t A,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzsymm(plasma_enum_t side, plasma_enum_t uplo,
-                   plasma_complex64_t alpha, plasma_desc_t A,
-                                             plasma_desc_t B,
-                   plasma_complex64_t beta,  plasma_desc_t C,
-                   plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzsyr2k(plasma_enum_t uplo, plasma_enum_t trans,
-                    plasma_complex64_t alpha, plasma_desc_t A,
-                                              plasma_desc_t B,
-                    plasma_complex64_t beta,  plasma_desc_t C,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzsyrk(plasma_enum_t uplo, plasma_enum_t trans,
-                   plasma_complex64_t alpha, plasma_desc_t A,
-                   plasma_complex64_t beta,  plasma_desc_t C,
-                   plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pztbsm(plasma_enum_t side, plasma_enum_t uplo,
-                   plasma_enum_t trans, plasma_enum_t diag,
-                   plasma_complex64_t alpha, plasma_desc_t A,
-                                             plasma_desc_t B,
-                   const int *ipiv,
-                   plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pztr2desc(plasma_complex64_t *pA, int lda,
-                      plasma_desc_t A,
-                      plasma_sequence_t *sequence,
-                      plasma_request_t *request);
-
-void plasma_pztradd(plasma_enum_t uplo, plasma_enum_t transa,
-                    plasma_complex64_t alpha,  plasma_desc_t A,
-                    plasma_complex64_t beta,   plasma_desc_t B,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pztrmm(plasma_enum_t side, plasma_enum_t uplo,
-                   plasma_enum_t trans, plasma_enum_t diag,
-                   plasma_complex64_t alpha, plasma_desc_t A,
-                                             plasma_desc_t B,
-                   plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pztrsm(plasma_enum_t side, plasma_enum_t uplo,
-                   plasma_enum_t trans, plasma_enum_t diag,
-                   plasma_complex64_t alpha, plasma_desc_t A,
-                                             plasma_desc_t B,
-                   plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pztrtri(plasma_enum_t uplo, plasma_enum_t diag,
-                    plasma_desc_t A,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzunglq(plasma_desc_t A, plasma_desc_t T, plasma_desc_t Q,
-                    plasma_workspace_t work,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzunglq_tree(plasma_desc_t A, plasma_desc_t T, plasma_desc_t Q,
-                         plasma_workspace_t work,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_pzungqr(plasma_desc_t A, plasma_desc_t T, plasma_desc_t Q,
-                    plasma_workspace_t work,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzungqr_tree(plasma_desc_t A, plasma_desc_t T, plasma_desc_t Q,
-                         plasma_workspace_t work,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_pzunmlq(plasma_enum_t side, plasma_enum_t trans,
-                    plasma_desc_t A, plasma_desc_t T, plasma_desc_t B,
-                    plasma_workspace_t work,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzunmlq_tree(plasma_enum_t side, plasma_enum_t trans,
-                         plasma_desc_t A, plasma_desc_t T, plasma_desc_t B,
-                         plasma_workspace_t work,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_pzunmqr(plasma_enum_t side, plasma_enum_t trans,
-                    plasma_desc_t A, plasma_desc_t T, plasma_desc_t B,
-                    plasma_workspace_t work,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_pzunmqr_blgtrd(plasma_enum_t side,
-			   plasma_enum_t trans,
-			   int N, int NB, int NE, 
-			   int Vblksiz, int WANTZ,
-			   plasma_complex64_t *V,
- 			   plasma_complex64_t *T,
-  			   plasma_complex64_t *TAU,
-		           plasma_complex64_t *E, int LDE,
-			   plasma_workspace_t work,
-   			   plasma_sequence_t *sequence,
-   			   plasma_request_t *request);
-
-void plasma_pzunmqr_tree(plasma_enum_t side, plasma_enum_t trans,
-                         plasma_desc_t A, plasma_desc_t T, plasma_desc_t B,
-                         plasma_workspace_t work,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
+void plasma_pzgecpy_tile2lapack_band(
+    plasma_enum_t uplo, plasma_desc_t A,
+    plasma_complex64_t *pA_band, int lda_band,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzhecpy_tile2lapack_band(
+    plasma_enum_t uplo, plasma_desc_t A,
+    plasma_complex64_t *AB, int ldab,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzhemm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzher2k(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    double beta,              plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzherk(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    double alpha, plasma_desc_t A,
+    double beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzhetrf_aasen(
+    plasma_enum_t uplo,
+    plasma_desc_t A, int *ipiv,
+    plasma_desc_t T,
+    plasma_desc_t W,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlacpy(
+    plasma_enum_t uplo, plasma_enum_t transa,
+    plasma_desc_t A, plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlangb(
+    plasma_enum_t norm,
+    plasma_desc_t A, double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlarft_blgtrd(
+    int N, int NB, int Vblksiz,
+    plasma_complex64_t *V,
+    plasma_complex64_t *T,
+    plasma_complex64_t *TAU,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzlange(
+    plasma_enum_t norm,
+    plasma_desc_t A, double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlanhe(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    plasma_desc_t A, double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlansy(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    plasma_desc_t A, double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlantr(
+    plasma_enum_t norm, plasma_enum_t uplo, plasma_enum_t diag,
+    plasma_desc_t A, double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlangb(
+    plasma_enum_t norm,
+    plasma_desc_t A, double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlascl(
+    plasma_enum_t uplo,
+    double cfrom, double cto,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlaset(
+    plasma_enum_t uplo,
+    plasma_complex64_t alpha, plasma_complex64_t beta,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzgeswp(
+    plasma_enum_t colrow,
+    plasma_desc_t A, int *ipiv, int incx,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzlauum(
+    plasma_enum_t uplo, plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzpb2desc(
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzpbtrf(
+    plasma_enum_t uplo, plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzpotrf(
+    plasma_enum_t uplo, plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzsymm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzsyr2k(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzsyrk(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    plasma_complex64_t alpha, plasma_desc_t A,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pztbsm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_enum_t trans, plasma_enum_t diag,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    const int *ipiv,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pztr2desc(
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pztradd(
+    plasma_enum_t uplo, plasma_enum_t transa,
+    plasma_complex64_t alpha,  plasma_desc_t A,
+    plasma_complex64_t beta,   plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pztrmm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_enum_t trans, plasma_enum_t diag,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pztrsm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_enum_t trans, plasma_enum_t diag,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pztrtri(
+    plasma_enum_t uplo, plasma_enum_t diag,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzunglq(
+    plasma_desc_t A, plasma_desc_t T, plasma_desc_t Q,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzunglq_tree(
+    plasma_desc_t A, plasma_desc_t T, plasma_desc_t Q,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzungqr(
+    plasma_desc_t A, plasma_desc_t T, plasma_desc_t Q,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzungqr_tree(
+    plasma_desc_t A, plasma_desc_t T, plasma_desc_t Q,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzunmlq(
+    plasma_enum_t side, plasma_enum_t trans,
+    plasma_desc_t A, plasma_desc_t T, plasma_desc_t B,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzunmlq_tree(
+    plasma_enum_t side, plasma_enum_t trans,
+    plasma_desc_t A, plasma_desc_t T, plasma_desc_t B,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzunmqr(
+    plasma_enum_t side, plasma_enum_t trans,
+    plasma_desc_t A, plasma_desc_t T, plasma_desc_t B,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzunmqr_blgtrd(
+    plasma_enum_t side,
+    plasma_enum_t trans,
+    int N, int NB, int NE,
+    int Vblksiz, int WANTZ,
+    plasma_complex64_t *V,
+    plasma_complex64_t *T,
+    plasma_complex64_t *TAU,
+    plasma_complex64_t *E, int LDE,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_pzunmqr_tree(
+    plasma_enum_t side, plasma_enum_t trans,
+    plasma_desc_t A, plasma_desc_t T, plasma_desc_t B,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/include/plasma_internal_zc.h b/include/plasma_internal_zc.h
index 79452d7a..5948b2e9 100644
--- a/include/plasma_internal_zc.h
+++ b/include/plasma_internal_zc.h
@@ -22,11 +22,13 @@ extern "C" {
 #endif
 
 /******************************************************************************/
-void plasma_pzlag2c(plasma_desc_t A, plasma_desc_t As,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_pzlag2c(
+    plasma_desc_t A, plasma_desc_t As,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
-void plasma_pclag2z(plasma_desc_t As, plasma_desc_t A,
-                    plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_pclag2z(
+    plasma_desc_t As, plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/include/plasma_tree.h b/include/plasma_tree.h
index 10420972..124f0a09 100644
--- a/include/plasma_tree.h
+++ b/include/plasma_tree.h
@@ -30,11 +30,12 @@ enum {
  *  QR and LQ factorization.
  * @see plasma_omp_zgeqrf
  **/
-static inline int plasma_tree_insert_operation(int *operations,
-                                               int loperations,
-                                               int ind_op,
-                                               plasma_enum_t kernel,
-                                               int col, int row, int rowpiv)
+static inline int plasma_tree_insert_operation(
+    int *operations,
+    int loperations,
+    int ind_op,
+    plasma_enum_t kernel,
+    int col, int row, int rowpiv)
 {
     assert(ind_op < loperations);
 
@@ -53,10 +54,11 @@ static inline int plasma_tree_insert_operation(int *operations,
  *  QR and LQ factorization.
  * @see plasma_omp_zgeqrf
  **/
-static inline void plasma_tree_get_operation(int *operations,
-                                             int ind_op,
-                                             plasma_enum_t *kernel,
-                                             int *col, int *row, int *rowpiv)
+static inline void plasma_tree_get_operation(
+    int *operations,
+    int ind_op,
+    plasma_enum_t *kernel,
+    int *col, int *row, int *rowpiv)
 {
     *kernel = operations[ind_op*4];
     *col    = operations[ind_op*4+1];
@@ -64,9 +66,10 @@ static inline void plasma_tree_get_operation(int *operations,
     *rowpiv = operations[ind_op*4+3];
 }
 
-void plasma_tree_operations(int mt, int nt,
-                            int **operations, int *num_operations,
-                            plasma_sequence_t *sequence,
-                            plasma_request_t *request);
+void plasma_tree_operations(
+    int mt, int nt,
+    int **operations, int *num_operations,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
 
 #endif // PLASMA_TREE_H
diff --git a/include/plasma_tuning.h b/include/plasma_tuning.h
index 0633a227..342efa10 100644
--- a/include/plasma_tuning.h
+++ b/include/plasma_tuning.h
@@ -18,64 +18,120 @@ extern "C" {
 
 /******************************************************************************/
 void plasma_tuning_init(plasma_context_t *plasma);
+
 void plasma_tuning_finalize(plasma_context_t *plasma);
 
-void plasma_tune_gbmm(plasma_context_t *plasma, plasma_enum_t dtyp,
-                      int m, int n, int k, int kl, int ku);
-void plasma_tune_gbtrf(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int n, int bw);
-void plasma_tune_geadd(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_geinv(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_gelqf(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_gemm(plasma_context_t *plasma, plasma_enum_t dtyp,
-                      int m, int n, int k);
-void plasma_tune_geqrf(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_geswp(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_getrf(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_hetrf(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int n);
-void plasma_tune_lacpy(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_lag2c(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_lange(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_lansy(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int n);
-void plasma_tune_lantr(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_lascl(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_laset(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_lauum(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int n);
-void plasma_tune_pbtrf(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int n);
-void plasma_tune_poinv(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int n);
-void plasma_tune_potrf(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int n);
-void plasma_tune_symm(plasma_context_t *plasma, plasma_enum_t dtyp,
-                      int m, int n);
-void plasma_tune_syr2k(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int n, int k);
-void plasma_tune_syrk(plasma_context_t *plasma, plasma_enum_t dtyp,
-                      int n, int k);
-void plasma_tune_tradd(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int m, int n);
-void plasma_tune_trmm(plasma_context_t *plasma, plasma_enum_t dtyp,
-                      int m, int n);
-void plasma_tune_trsm(plasma_context_t *plasma, plasma_enum_t dtyp,
-                      int m, int n);
-void plasma_tune_trtri(plasma_context_t *plasma, plasma_enum_t dtyp,
-                       int n);
+void plasma_tune_gbmm(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n, int k, int kl, int ku);
+
+void plasma_tune_gbtrf(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n, int bw);
+
+void plasma_tune_geadd(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_geinv(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_gelqf(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_gemm(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n, int k);
+
+void plasma_tune_geqrf(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_geswp(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_getrf(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_hetrf(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n);
+
+void plasma_tune_lacpy(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_lag2c(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_lange(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_lansy(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n);
+
+void plasma_tune_lantr(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_lascl(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_laset(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_lauum(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n);
+
+void plasma_tune_pbtrf(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n);
+
+void plasma_tune_poinv(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n);
+
+void plasma_tune_potrf(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n);
+
+void plasma_tune_symm(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_syr2k(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n, int k);
+
+void plasma_tune_syrk(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n, int k);
+
+void plasma_tune_tradd(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_trmm(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_trsm(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int m, int n);
+
+void plasma_tune_trtri(
+    plasma_context_t *plasma, plasma_enum_t dtyp,
+    int n);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/include/plasma_z.h b/include/plasma_z.h
index f26448cb..4f1a856c 100644
--- a/include/plasma_z.h
+++ b/include/plasma_z.h
@@ -26,565 +26,681 @@ extern "C" {
 /***************************************************************************//**
  *  Standard interface.
  **/
-int plasma_dzamax(plasma_enum_t colrow,
-                  int m, int n,
-                  plasma_complex64_t *pA, int lda, double *values);
-
-int plasma_zgbmm(plasma_enum_t transa, plasma_enum_t transb,
-                 int m, int n, int k, int kl, int ku,
-                 plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                                           plasma_complex64_t *pB, int ldb,
-                 plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
-
-int plasma_zgbsv(int n, int kl, int ku, int nrhs,
-                 plasma_complex64_t *pAB, int ldab, int *ipiv,
-                 plasma_complex64_t *pB,  int ldb);
-
-int plasma_zgbtrf(int m, int n, int kl, int ku,
-                  plasma_complex64_t *pA, int lda, int *ipiv);
-
-int plasma_zgbtrs(plasma_enum_t transa, int n, int kl, int ku, int nrhs,
-                  plasma_complex64_t *pAB, int ldab,
-                  int *ipiv,
-                  plasma_complex64_t *pB,  int ldb);
-
-int plasma_zgeadd(plasma_enum_t transa,
-                  int m, int n,
-                  plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                  plasma_complex64_t beta,  plasma_complex64_t *pB, int ldb);
+int plasma_dzamax(
+    plasma_enum_t colrow,
+    int m, int n,
+    plasma_complex64_t *pA, int lda, double *values);
+
+int plasma_zgbmm(
+    plasma_enum_t transa, plasma_enum_t transb,
+    int m, int n, int k, int kl, int ku,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+                              plasma_complex64_t *pB, int ldb,
+    plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
+
+int plasma_zgbsv(
+    int n, int kl, int ku, int nrhs,
+    plasma_complex64_t *pAB, int ldab, int *ipiv,
+    plasma_complex64_t *pB,  int ldb);
+
+int plasma_zgbtrf(
+    int m, int n, int kl, int ku,
+    plasma_complex64_t *pA, int lda, int *ipiv);
+
+int plasma_zgbtrs(
+    plasma_enum_t transa, int n, int kl, int ku, int nrhs,
+    plasma_complex64_t *pAB, int ldab,
+    int *ipiv,
+    plasma_complex64_t *pB,  int ldb);
+
+int plasma_zgeadd(
+    plasma_enum_t transa,
+    int m, int n,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+    plasma_complex64_t beta,  plasma_complex64_t *pB, int ldb);
 
 int plasma_zgeinv(int m, int n, plasma_complex64_t *pA, int lda, int *ipiv);
 
-int plasma_zgelqf(int m, int n,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t *T);
-
-int plasma_zgelqs(int m, int n, int nrhs,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t T,
-                  plasma_complex64_t *pB, int ldb);
-
-int plasma_zgels(plasma_enum_t trans,
-                 int m, int n, int nrhs,
-                 plasma_complex64_t *pA, int lda,
-                 plasma_desc_t *T,
-                 plasma_complex64_t *pB, int ldb);
-
-int plasma_zgemm(plasma_enum_t transa, plasma_enum_t transb,
-                 int m, int n, int k,
-                 plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                                           plasma_complex64_t *pB, int ldb,
-                 plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
-
-int plasma_zgeqrf(int m, int n,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t *T);
-
-int plasma_zgeqrs(int m, int n, int nrhs,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t T,
-                  plasma_complex64_t *pB, int ldb);
-
-int plasma_zgesv(int n, int nrhs,
-                 plasma_complex64_t *pA, int lda, int *ipiv,
-                 plasma_complex64_t *pB, int ldb);
-
-void plasma_omp_zgesdd(plasma_enum_t jobu, plasma_enum_t jobvt,
-                       plasma_desc_t A, plasma_desc_t T,
-                       double *S,
-                       plasma_complex64_t *pU,  int ldu,
-                       plasma_complex64_t *pVT, int ldvt,
-                       plasma_workspace_t work,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-    
-int plasma_zgesdd(plasma_enum_t jobu, plasma_enum_t jobvt,
-                  int m, int n,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t *T,
-                  double *S,
-                  plasma_complex64_t *pU,  int ldu,
-                  plasma_complex64_t *pVT, int ldvt);
-    
-int plasma_zgetrf(int m, int n,
-                  plasma_complex64_t *pA, int lda, int *ipiv);
+int plasma_zgelqf(
+    int m, int n,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t *T);
+
+int plasma_zgelqs(
+    int m, int n, int nrhs,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t T,
+    plasma_complex64_t *pB, int ldb);
+
+int plasma_zgels(
+    plasma_enum_t trans,
+    int m, int n, int nrhs,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t *T,
+    plasma_complex64_t *pB, int ldb);
+
+int plasma_zgemm(
+    plasma_enum_t transa, plasma_enum_t transb,
+    int m, int n, int k,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+                              plasma_complex64_t *pB, int ldb,
+    plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
+
+int plasma_zgeqrf(
+    int m, int n,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t *T);
+
+int plasma_zgeqrs(
+    int m, int n, int nrhs,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t T,
+    plasma_complex64_t *pB, int ldb);
+
+int plasma_zgesv(
+    int n, int nrhs,
+    plasma_complex64_t *pA, int lda, int *ipiv,
+    plasma_complex64_t *pB, int ldb);
+
+void plasma_omp_zgesdd(
+    plasma_enum_t jobu, plasma_enum_t jobvt,
+    plasma_desc_t A, plasma_desc_t T,
+    double *S,
+    plasma_complex64_t *pU,  int ldu,
+    plasma_complex64_t *pVT, int ldvt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+int plasma_zgesdd(
+    plasma_enum_t jobu, plasma_enum_t jobvt,
+    int m, int n,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t *T,
+    double *S,
+    plasma_complex64_t *pU,  int ldu,
+    plasma_complex64_t *pVT, int ldvt);
+
+int plasma_zgetrf(
+    int m, int n,
+    plasma_complex64_t *pA, int lda, int *ipiv);
 
 int plasma_zgetri(int n, plasma_complex64_t *pA, int lda, int *ipiv);
 
 int plasma_zgetri_aux(int n, plasma_complex64_t *pA, int lda);
 
-int plasma_zgetrs(plasma_enum_t trans, int n, int nrhs,
-                  plasma_complex64_t *pA, int lda, int *ipiv,
-                  plasma_complex64_t *pB, int ldb);
-
-int plasma_zhemm(plasma_enum_t side, plasma_enum_t uplo,
-                 int m, int n,
-                 plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                                           plasma_complex64_t *pB, int ldb,
-                 plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
-
-int plasma_zher2k(plasma_enum_t uplo, plasma_enum_t trans,
-                  int n, int k,
-                  plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                                            plasma_complex64_t *pB, int ldb,
-                  double beta,              plasma_complex64_t *pC, int ldc);
-
-int plasma_zherk(plasma_enum_t uplo, plasma_enum_t trans,
-                 int n, int k,
-                 double alpha, plasma_complex64_t *pA, int lda,
-                 double beta,  plasma_complex64_t *pC, int ldc);
-
-int plasma_zhetrf(plasma_enum_t uplo,
-                  int n,
-                  plasma_complex64_t *pA, int lda, int *ipiv,
-                  plasma_complex64_t *pT, int ldt, int *ipiv2);
-
-int plasma_zhesv(plasma_enum_t uplo, int n, int nrhs,
-                 plasma_complex64_t *pA, int lda,
-                 int *ipiv,
-                 plasma_complex64_t *pT, int ldt,
-                 int *ipiv2,
-                 plasma_complex64_t *pB,  int ldb);
-
-int plasma_zhetrs(plasma_enum_t uplo, int n, int nrhs,
-                  plasma_complex64_t *pA, int lda,
-                  int *ipiv,
-                  plasma_complex64_t *pT, int ldt,
-                  int *ipiv2,
-                  plasma_complex64_t *pB,  int ldb);
-
-int plasma_zlacpy(plasma_enum_t uplo, plasma_enum_t transa,
-                  int m, int n,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_complex64_t *pB, int ldb);
-
-void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
-        plasma_complex64_t upperBound, int nLT_low, int nLT_hi, int numEV);
-
-int plasma_zlaneg2(plasma_complex64_t *diag, plasma_complex64_t *offd, 
-                   int n, plasma_complex64_t u);
-
-double plasma_zlangb(plasma_enum_t norm,
-                     int m, int n, int kl, int ku,
-                     plasma_complex64_t *pAB, int ldab);
-
-double plasma_zlange(plasma_enum_t norm,
-                     int m, int n,
-                     plasma_complex64_t *pA, int lda);
-
-double plasma_zlanhe(plasma_enum_t norm, plasma_enum_t uplo,
-                     int n,
-                     plasma_complex64_t *pA, int lda);
-
-double plasma_zlansy(plasma_enum_t norm, plasma_enum_t uplo,
-                     int n,
-                     plasma_complex64_t *pA, int lda);
-
-double plasma_zlantr(plasma_enum_t norm, plasma_enum_t uplo, plasma_enum_t diag,
-                     int m, int n,
-                     plasma_complex64_t *pA, int lda);
-
-double plasma_zlangb(plasma_enum_t norm,
-                     int m, int n, int kl, int ku,
-                     plasma_complex64_t *pAB, int ldab);
-
-int plasma_zlascl(plasma_enum_t uplo,
-                  double cfrom, double cto,
-                  int m, int n,
-                  plasma_complex64_t *pA, int lda);
-
-int plasma_zlaset(plasma_enum_t uplo,
-                  int m, int n,
-                  plasma_complex64_t alpha, plasma_complex64_t beta,
-                  plasma_complex64_t *pA, int lda);
-
-int plasma_zgeswp(plasma_enum_t colrow,
-                  int m, int n,
-                  plasma_complex64_t *pA, int lda,
-                  int *ipiv, int incx);
-
-int plasma_zlauum(plasma_enum_t uplo, int n,
-                  plasma_complex64_t *pA, int lda);
-
-int plasma_zpbsv(plasma_enum_t uplo,
-                 int n, int kd, int nrhs,
-                 plasma_complex64_t *pAB, int ldab,
-                 plasma_complex64_t *pB,  int ldb);
-
-int plasma_zpbtrf(plasma_enum_t uplo,
-                  int n, int kd,
-                  plasma_complex64_t *pAB, int ldab);
-
-int plasma_zpbtrs(plasma_enum_t uplo,
-                  int n, int kd, int nrhs,
-                  plasma_complex64_t *pAB, int ldab,
-                  plasma_complex64_t *pB,  int ldb);
-
-int plasma_zpoinv(plasma_enum_t uplo,
-                  int n,
-                  plasma_complex64_t *pA, int lda);
-
-int plasma_zposv(plasma_enum_t uplo,
-                 int n, int nrhs,
-                 plasma_complex64_t *pA, int lda,
-                 plasma_complex64_t *pB, int ldb);
-
-int plasma_zpotrf(plasma_enum_t uplo,
-                  int n,
-                  plasma_complex64_t *pA, int lda);
-
-int plasma_zpotri(plasma_enum_t uplo,
-                  int n,
-                  plasma_complex64_t *pA, int lda);
-
-int plasma_zpotrs(plasma_enum_t uplo,
-                  int n, int nrhs,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_complex64_t *pB, int ldb);
-
-int plasma_zstevx2(plasma_enum_t jobtype, plasma_enum_t range, int n, int k, 
-                   plasma_complex64_t *diag, plasma_complex64_t *offd,
-                   plasma_complex64_t vl, plasma_complex64_t vu, int il,
-                   int iu, int *pFound, plasma_complex64_t *pVal, int *pMul,
-                   plasma_complex64_t *pVec);
-
-int plasma_zsymm(plasma_enum_t side, plasma_enum_t uplo,
-                 int m, int n,
-                 plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                                           plasma_complex64_t *pB, int ldb,
-                 plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
-
-int plasma_zsyr2k(plasma_enum_t uplo, plasma_enum_t trans,
-                  int n, int k,
-                  plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                                            plasma_complex64_t *pB, int ldb,
-                  plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
-
-int plasma_zsyrk(plasma_enum_t uplo, plasma_enum_t trans,
-                 int n, int k,
-                 plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                 plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
-
-int plasma_ztradd(plasma_enum_t uplo, plasma_enum_t transa,
-                  int m, int n,
-                  plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                  plasma_complex64_t beta,  plasma_complex64_t *pB, int ldb);
-
-int plasma_ztrmm(plasma_enum_t side, plasma_enum_t uplo,
-                 plasma_enum_t transa, plasma_enum_t diag,
-                 int m, int n,
-                 plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                                           plasma_complex64_t *pB, int ldb);
-
-int plasma_ztrsm(plasma_enum_t side, plasma_enum_t uplo,
-                 plasma_enum_t transa, plasma_enum_t diag,
-                 int m, int n,
-                 plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
-                                           plasma_complex64_t *pB, int ldb);
-
-int plasma_ztrtri(plasma_enum_t uplo, plasma_enum_t diag,
-                  int n, plasma_complex64_t *pA, int lda);
-
-int plasma_zunglq(int m, int n, int k,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t T,
-                  plasma_complex64_t *pQ, int ldq);
-
-int plasma_zungqr(int m, int n, int k,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t T,
-                  plasma_complex64_t *pQ, int ldq);
-
-int plasma_zunmlq(plasma_enum_t side, plasma_enum_t trans,
-                  int m, int n, int k,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t T,
-                  plasma_complex64_t *pC, int ldc);
-
-int plasma_zunmqr(plasma_enum_t side, plasma_enum_t trans,
-                  int m, int n, int k,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t T,
-                  plasma_complex64_t *pC, int ldc);
+int plasma_zgetrs(
+    plasma_enum_t trans, int n, int nrhs,
+    plasma_complex64_t *pA, int lda, int *ipiv,
+    plasma_complex64_t *pB, int ldb);
+
+int plasma_zhemm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    int m, int n,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+                              plasma_complex64_t *pB, int ldb,
+    plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
+
+int plasma_zher2k(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    int n, int k,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+                              plasma_complex64_t *pB, int ldb,
+    double beta,              plasma_complex64_t *pC, int ldc);
+
+int plasma_zherk(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    int n, int k,
+    double alpha, plasma_complex64_t *pA, int lda,
+    double beta,  plasma_complex64_t *pC, int ldc);
+
+int plasma_zhetrf(
+    plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *pA, int lda, int *ipiv,
+    plasma_complex64_t *pT, int ldt, int *ipiv2);
+
+int plasma_zhesv(
+    plasma_enum_t uplo, int n, int nrhs,
+    plasma_complex64_t *pA, int lda,
+    int *ipiv,
+    plasma_complex64_t *pT, int ldt,
+    int *ipiv2,
+    plasma_complex64_t *pB, int ldb);
+
+int plasma_zhetrs(
+    plasma_enum_t uplo, int n, int nrhs,
+    plasma_complex64_t *pA, int lda,
+    int *ipiv,
+    plasma_complex64_t *pT, int ldt,
+    int *ipiv2,
+    plasma_complex64_t *pB, int ldb);
+
+int plasma_zlacpy(
+    plasma_enum_t uplo, plasma_enum_t transa,
+    int m, int n,
+    plasma_complex64_t *pA, int lda,
+    plasma_complex64_t *pB, int ldb);
+
+void plasma_zlaebz2(
+    zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
+    plasma_complex64_t upperBound, int nLT_low, int nLT_hi, int numEV);
+
+int plasma_zlaneg2(plasma_complex64_t *diag, plasma_complex64_t *offd,
+    int n, plasma_complex64_t u);
+
+double plasma_zlangb(
+    plasma_enum_t norm,
+    int m, int n, int kl, int ku,
+    plasma_complex64_t *pAB, int ldab);
+
+double plasma_zlange(
+    plasma_enum_t norm,
+    int m, int n,
+    plasma_complex64_t *pA, int lda);
+
+double plasma_zlanhe(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *pA, int lda);
+
+double plasma_zlansy(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *pA, int lda);
+
+double plasma_zlantr(
+    plasma_enum_t norm, plasma_enum_t uplo, plasma_enum_t diag,
+    int m, int n,
+    plasma_complex64_t *pA, int lda);
+
+double plasma_zlangb(
+    plasma_enum_t norm,
+    int m, int n, int kl, int ku,
+    plasma_complex64_t *pAB, int ldab);
+
+int plasma_zlascl(
+    plasma_enum_t uplo,
+    double cfrom, double cto,
+    int m, int n,
+    plasma_complex64_t *pA, int lda);
+
+int plasma_zlaset(
+    plasma_enum_t uplo,
+    int m, int n,
+    plasma_complex64_t alpha, plasma_complex64_t beta,
+    plasma_complex64_t *pA, int lda);
+
+int plasma_zgeswp(
+    plasma_enum_t colrow,
+    int m, int n,
+    plasma_complex64_t *pA, int lda,
+    int *ipiv, int incx);
+
+int plasma_zlauum(
+    plasma_enum_t uplo, int n,
+    plasma_complex64_t *pA, int lda);
+
+int plasma_zpbsv(
+    plasma_enum_t uplo,
+    int n, int kd, int nrhs,
+    plasma_complex64_t *pAB, int ldab,
+    plasma_complex64_t *pB,  int ldb);
+
+int plasma_zpbtrf(
+    plasma_enum_t uplo,
+    int n, int kd,
+    plasma_complex64_t *pAB, int ldab);
+
+int plasma_zpbtrs(
+    plasma_enum_t uplo,
+    int n, int kd, int nrhs,
+    plasma_complex64_t *pAB, int ldab,
+    plasma_complex64_t *pB,  int ldb);
+
+int plasma_zpoinv(
+    plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *pA, int lda);
+
+int plasma_zposv(
+    plasma_enum_t uplo,
+    int n, int nrhs,
+    plasma_complex64_t *pA, int lda,
+    plasma_complex64_t *pB, int ldb);
+
+int plasma_zpotrf(
+    plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *pA, int lda);
+
+int plasma_zpotri(
+    plasma_enum_t uplo,
+    int n,
+    plasma_complex64_t *pA, int lda);
+
+int plasma_zpotrs(
+    plasma_enum_t uplo,
+    int n, int nrhs,
+    plasma_complex64_t *pA, int lda,
+    plasma_complex64_t *pB, int ldb);
+
+int plasma_zstevx2(
+    plasma_enum_t jobtype, plasma_enum_t range, int n, int k,
+    plasma_complex64_t *diag, plasma_complex64_t *offd,
+    plasma_complex64_t vl, plasma_complex64_t vu, int il, int iu,
+    int *pFound, plasma_complex64_t *pVal, int *pMul, plasma_complex64_t *pVec);
+
+int plasma_zsymm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    int m, int n,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+                              plasma_complex64_t *pB, int ldb,
+    plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
+
+int plasma_zsyr2k(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    int n, int k,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+                              plasma_complex64_t *pB, int ldb,
+    plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
+
+int plasma_zsyrk(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    int n, int k,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+    plasma_complex64_t beta,  plasma_complex64_t *pC, int ldc);
+
+int plasma_ztradd(
+    plasma_enum_t uplo, plasma_enum_t transa,
+    int m, int n,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+    plasma_complex64_t beta,  plasma_complex64_t *pB, int ldb);
+
+int plasma_ztrmm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_enum_t transa, plasma_enum_t diag,
+    int m, int n,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+                              plasma_complex64_t *pB, int ldb);
+
+int plasma_ztrsm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_enum_t transa, plasma_enum_t diag,
+    int m, int n,
+    plasma_complex64_t alpha, plasma_complex64_t *pA, int lda,
+                              plasma_complex64_t *pB, int ldb);
+
+int plasma_ztrtri(
+    plasma_enum_t uplo, plasma_enum_t diag,
+    int n, plasma_complex64_t *pA, int lda);
+
+int plasma_zunglq(
+    int m, int n, int k,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t T,
+    plasma_complex64_t *pQ, int ldq);
+
+int plasma_zungqr(
+    int m, int n, int k,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t T,
+    plasma_complex64_t *pQ, int ldq);
+
+int plasma_zunmlq(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m, int n, int k,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t T,
+    plasma_complex64_t *pC, int ldc);
+
+int plasma_zunmqr(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m, int n, int k,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t T,
+    plasma_complex64_t *pC, int ldc);
 
 /***************************************************************************//**
  *  Tile asynchronous interface.
  **/
-void plasma_omp_dzamax(plasma_enum_t colrow, plasma_desc_t A,
-                       double *work, double *values,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgbmm(plasma_enum_t transa, plasma_enum_t transb,
-                      plasma_complex64_t alpha, plasma_desc_t A,
-                                                plasma_desc_t B,
-                      plasma_complex64_t beta,  plasma_desc_t C,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgbsv(plasma_desc_t AB, int *ipiv, plasma_desc_t B,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgbtrf(plasma_desc_t A, int *ipiv,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgbtrs(plasma_enum_t transa, plasma_desc_t AB, int *ipiv,
-                       plasma_desc_t B,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zdesc2ge(plasma_desc_t A,
-                         plasma_complex64_t *pA, int lda,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_omp_zdesc2pb(plasma_desc_t A,
-                         plasma_complex64_t *pA, int lda,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_omp_zdesc2tr(plasma_desc_t A,
-                         plasma_complex64_t *pA, int lda,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_omp_zgb2desc(plasma_complex64_t *pA, int lda,
-                         plasma_desc_t A,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_zgbset(int m, int n, int kl, int ku,
-                   plasma_complex64_t *pA, int lda);
-
-void plasma_omp_zge2desc(plasma_complex64_t *pA, int lda,
-                         plasma_desc_t A,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_omp_zgeadd(plasma_enum_t transa,
-                       plasma_complex64_t alpha, plasma_desc_t A,
-                       plasma_complex64_t beta,  plasma_desc_t B,
-                       plasma_sequence_t *sequence, plasma_request_t  *request);
-
-void plasma_omp_zgeinv(plasma_desc_t A, int *ipiv, plasma_desc_t W,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgelqf(plasma_desc_t A, plasma_desc_t T,
-                       plasma_workspace_t work,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgelqs(plasma_desc_t A, plasma_desc_t T,
-                       plasma_desc_t B, plasma_workspace_t work,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgels(plasma_enum_t trans,
-                      plasma_desc_t A, plasma_desc_t T,
-                      plasma_desc_t B, plasma_workspace_t work,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgemm(plasma_enum_t transa, plasma_enum_t transb,
-                      plasma_complex64_t alpha, plasma_desc_t A,
-                                                plasma_desc_t B,
-                      plasma_complex64_t beta,  plasma_desc_t C,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgeqrf(plasma_desc_t A, plasma_desc_t T,
-                       plasma_workspace_t work,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgeqrs(plasma_desc_t A, plasma_desc_t T,
-                       plasma_desc_t B, plasma_workspace_t work,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgesv(plasma_desc_t A, int *ipiv,
-                      plasma_desc_t B,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgetrf(plasma_desc_t A, int *ipiv,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgetri(plasma_desc_t A, int *ipiv, plasma_desc_t W,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgetri_aux(plasma_desc_t A, plasma_desc_t W,
-                           plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgetrs(plasma_enum_t trans, plasma_desc_t A, int *ipiv,
-                       plasma_desc_t B,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zhemm(plasma_enum_t side, plasma_enum_t uplo,
-                      plasma_complex64_t alpha, plasma_desc_t A,
-                                                plasma_desc_t B,
-                      plasma_complex64_t beta,  plasma_desc_t C,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zher2k(plasma_enum_t uplo, plasma_enum_t trans,
-                       plasma_complex64_t alpha, plasma_desc_t A,
-                                                 plasma_desc_t B,
-                       double beta,              plasma_desc_t C,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zherk(plasma_enum_t uplo, plasma_enum_t trans,
-                      double alpha, plasma_desc_t A,
-                      double beta,  plasma_desc_t C,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zhetrf(plasma_enum_t uplo,
-                       plasma_desc_t A, int *ipiv,
-                       plasma_desc_t T, int *ipiv2,
-                       plasma_desc_t W,
-                       plasma_sequence_t *sequence,
-                       plasma_request_t *request);
-
-void plasma_omp_zhesv(plasma_enum_t uplo,
-                      plasma_desc_t A, int *ipiv,
-                      plasma_desc_t T, int *ipiv2,
-                      plasma_desc_t B,
-                      plasma_desc_t W,
-                      plasma_sequence_t *sequence,
-                      plasma_request_t *request);
-
-void plasma_omp_zhetrs(plasma_enum_t uplo,
-                       plasma_desc_t A, int *ipiv,
-                       plasma_desc_t T, int *ipiv2,
-                       plasma_desc_t B,
-                       plasma_sequence_t *sequence,
-                       plasma_request_t *request);
-
-void plasma_omp_zlacpy(plasma_enum_t uplo, plasma_enum_t transa,
-                       plasma_desc_t A, plasma_desc_t B,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zlangb(plasma_enum_t norm, plasma_desc_t AB,
-                       double *work, double *value,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zlange(plasma_enum_t norm, plasma_desc_t A,
-                       double *work, double *value,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zlanhe(plasma_enum_t norm, plasma_enum_t uplo, plasma_desc_t A,
-                       double *work, double *value,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zlansy(plasma_enum_t norm, plasma_enum_t uplo, plasma_desc_t A,
-                       double *work, double *value,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zlantr(plasma_enum_t norm, plasma_enum_t uplo,
-                       plasma_enum_t diag, plasma_desc_t A,
-                       double *work, double *value,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zlangb(plasma_enum_t norm, plasma_desc_t AB,
-                       double *work, double *value,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zlascl(plasma_enum_t uplo,
-                       double cfrom, double cto,
-                       plasma_desc_t A,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zlaset(plasma_enum_t uplo,
-                       plasma_complex64_t alpha, plasma_complex64_t beta,
-                       plasma_desc_t A,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zgeswp(plasma_enum_t colrow,
-                       plasma_desc_t A,
-                       int *ipiv, int incx,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zlauum(plasma_enum_t uplo,
-                       plasma_desc_t A,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zpb2desc(plasma_complex64_t *pA, int lda,
-                         plasma_desc_t A,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_omp_zpbsv(plasma_enum_t uplo, plasma_desc_t AB, plasma_desc_t B,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zpbtrf(plasma_enum_t uplo, plasma_desc_t AB,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zpbtrs(plasma_enum_t uplo, plasma_desc_t AB, plasma_desc_t B,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zpoinv(plasma_enum_t uplo, plasma_desc_t A,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zposv(plasma_enum_t uplo, plasma_desc_t A, plasma_desc_t B,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zpotrf(plasma_enum_t uplo, plasma_desc_t A,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zpotri(plasma_enum_t uplo, plasma_desc_t A,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zpotrs(plasma_enum_t uplo, plasma_desc_t A, plasma_desc_t B,
-                        plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zsymm(plasma_enum_t side, plasma_enum_t uplo,
-                      plasma_complex64_t alpha, plasma_desc_t A,
-                                                plasma_desc_t B,
-                      plasma_complex64_t beta,  plasma_desc_t C,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zsyr2k(plasma_enum_t uplo, plasma_enum_t trans,
-                       plasma_complex64_t alpha, plasma_desc_t A,
-                                                 plasma_desc_t B,
-                       plasma_complex64_t beta,  plasma_desc_t C,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zsyrk(plasma_enum_t uplo, plasma_enum_t trans,
-                      plasma_complex64_t alpha, plasma_desc_t A,
-                      plasma_complex64_t beta,  plasma_desc_t C,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_ztr2desc(plasma_complex64_t *pA, int lda,
-                         plasma_desc_t A,
-                         plasma_sequence_t *sequence,
-                         plasma_request_t *request);
-
-void plasma_omp_ztradd(plasma_enum_t uplo, plasma_enum_t transa,
-                       plasma_complex64_t alpha, plasma_desc_t A,
-                       plasma_complex64_t beta,  plasma_desc_t B,
-                       plasma_sequence_t *sequence, plasma_request_t  *request);
-
-void plasma_omp_ztrmm(plasma_enum_t side, plasma_enum_t uplo,
-                      plasma_enum_t transa, plasma_enum_t diag,
-                      plasma_complex64_t alpha, plasma_desc_t A,
-                                                plasma_desc_t B,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_ztrsm(plasma_enum_t side, plasma_enum_t uplo,
-                      plasma_enum_t transa, plasma_enum_t diag,
-                      plasma_complex64_t alpha, plasma_desc_t A,
-                                                plasma_desc_t B,
-                      plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_ztrtri(plasma_enum_t uplo, plasma_enum_t diag,
-                       plasma_desc_t A,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zunglq(plasma_desc_t A, plasma_desc_t T,
-                       plasma_desc_t Q, plasma_workspace_t work,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zungqr(plasma_desc_t A, plasma_desc_t T,
-                       plasma_desc_t Q, plasma_workspace_t work,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zunmlq(plasma_enum_t side, plasma_enum_t trans,
-                       plasma_desc_t A, plasma_desc_t T,
-                       plasma_desc_t C, plasma_workspace_t work,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
-
-void plasma_omp_zunmqr(plasma_enum_t side, plasma_enum_t trans,
-                       plasma_desc_t A, plasma_desc_t T,
-                       plasma_desc_t C, plasma_workspace_t work,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_omp_dzamax(
+    plasma_enum_t colrow, plasma_desc_t A,
+    double *work, double *values,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgbmm(
+    plasma_enum_t transa, plasma_enum_t transb,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgbsv(
+    plasma_desc_t AB, int *ipiv, plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgbtrf(
+    plasma_desc_t A, int *ipiv,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgbtrs(
+    plasma_enum_t transa, plasma_desc_t AB, int *ipiv,
+    plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zdesc2ge(
+    plasma_desc_t A,
+    plasma_complex64_t *pA, int lda,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_omp_zdesc2pb(
+    plasma_desc_t A,
+    plasma_complex64_t *pA, int lda,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_omp_zdesc2tr(
+    plasma_desc_t A,
+    plasma_complex64_t *pA, int lda,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_omp_zgb2desc(
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_zgbset(
+    int m, int n, int kl, int ku,
+    plasma_complex64_t *pA, int lda);
+
+void plasma_omp_zge2desc(
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_omp_zgeadd(
+    plasma_enum_t transa,
+    plasma_complex64_t alpha, plasma_desc_t A,
+    plasma_complex64_t beta,  plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgeinv(
+    plasma_desc_t A, int *ipiv, plasma_desc_t W,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgelqf(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgelqs(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_desc_t B, plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgels(
+    plasma_enum_t trans,
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_desc_t B, plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgemm(
+    plasma_enum_t transa, plasma_enum_t transb,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgeqrf(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgeqrs(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_desc_t B, plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgesv(
+    plasma_desc_t A, int *ipiv,
+    plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgetrf(
+    plasma_desc_t A, int *ipiv,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgetri(
+    plasma_desc_t A, int *ipiv, plasma_desc_t W,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgetri_aux(
+    plasma_desc_t A, plasma_desc_t W,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgetrs(
+    plasma_enum_t trans, plasma_desc_t A, int *ipiv,
+    plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zhemm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zher2k(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    double beta,              plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zherk(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    double alpha, plasma_desc_t A,
+    double beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zhetrf(
+    plasma_enum_t uplo,
+    plasma_desc_t A, int *ipiv,
+    plasma_desc_t T, int *ipiv2,
+    plasma_desc_t W,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_omp_zhesv(
+    plasma_enum_t uplo,
+    plasma_desc_t A, int *ipiv,
+    plasma_desc_t T, int *ipiv2,
+    plasma_desc_t B,
+    plasma_desc_t W,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_omp_zhetrs(
+    plasma_enum_t uplo,
+    plasma_desc_t A, int *ipiv,
+    plasma_desc_t T, int *ipiv2,
+    plasma_desc_t B,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_omp_zlacpy(
+    plasma_enum_t uplo, plasma_enum_t transa,
+    plasma_desc_t A, plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zlangb(
+    plasma_enum_t norm, plasma_desc_t AB,
+    double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zlange(
+    plasma_enum_t norm, plasma_desc_t A,
+    double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zlanhe(
+    plasma_enum_t norm, plasma_enum_t uplo, plasma_desc_t A,
+    double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zlansy(
+    plasma_enum_t norm, plasma_enum_t uplo, plasma_desc_t A,
+    double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zlantr(
+    plasma_enum_t norm, plasma_enum_t uplo,
+    plasma_enum_t diag, plasma_desc_t A,
+    double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zlangb(
+    plasma_enum_t norm, plasma_desc_t AB,
+    double *work, double *value,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zlascl(
+    plasma_enum_t uplo,
+    double cfrom, double cto,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zlaset(
+    plasma_enum_t uplo,
+    plasma_complex64_t alpha, plasma_complex64_t beta,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zgeswp(
+    plasma_enum_t colrow,
+    plasma_desc_t A,
+    int *ipiv, int incx,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zlauum(
+    plasma_enum_t uplo,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zpb2desc(
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_omp_zpbsv(
+    plasma_enum_t uplo, plasma_desc_t AB, plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zpbtrf(
+    plasma_enum_t uplo, plasma_desc_t AB,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zpbtrs(
+    plasma_enum_t uplo, plasma_desc_t AB, plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zpoinv(
+    plasma_enum_t uplo, plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zposv(
+    plasma_enum_t uplo, plasma_desc_t A, plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zpotrf(
+    plasma_enum_t uplo, plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zpotri(
+    plasma_enum_t uplo, plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zpotrs(
+    plasma_enum_t uplo, plasma_desc_t A, plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zsymm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zsyr2k(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zsyrk(
+    plasma_enum_t uplo, plasma_enum_t trans,
+    plasma_complex64_t alpha, plasma_desc_t A,
+    plasma_complex64_t beta,  plasma_desc_t C,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_ztr2desc(
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence,
+    plasma_request_t *request);
+
+void plasma_omp_ztradd(
+    plasma_enum_t uplo, plasma_enum_t transa,
+    plasma_complex64_t alpha, plasma_desc_t A,
+    plasma_complex64_t beta,  plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_ztrmm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_enum_t transa, plasma_enum_t diag,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_ztrsm(
+    plasma_enum_t side, plasma_enum_t uplo,
+    plasma_enum_t transa, plasma_enum_t diag,
+    plasma_complex64_t alpha, plasma_desc_t A,
+                              plasma_desc_t B,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_ztrtri(
+    plasma_enum_t uplo, plasma_enum_t diag,
+    plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zunglq(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_desc_t Q, plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zungqr(
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_desc_t Q, plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zunmlq(
+    plasma_enum_t side, plasma_enum_t trans,
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_desc_t C, plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_omp_zunmqr(
+    plasma_enum_t side, plasma_enum_t trans,
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_desc_t C, plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/include/plasma_zc.h b/include/plasma_zc.h
index 38e7ab71..b29e8b53 100644
--- a/include/plasma_zc.h
+++ b/include/plasma_zc.h
@@ -24,58 +24,68 @@ extern "C" {
 /***************************************************************************//**
  *  Standard interface
  **/
-int plasma_zcgesv(int n, int nrhs,
-                  plasma_complex64_t *pA, int lda, int *ipiv,
-                  plasma_complex64_t *pB, int ldb,
-                  plasma_complex64_t *pX, int ldx, int *iter);
+int plasma_zcgesv(
+    int n, int nrhs,
+    plasma_complex64_t *pA, int lda, int *ipiv,
+    plasma_complex64_t *pB, int ldb,
+    plasma_complex64_t *pX, int ldx, int *iter);
 
-int plasma_zcposv(plasma_enum_t uplo, int n, int nrhs,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_complex64_t *pB, int ldb,
-                  plasma_complex64_t *pX, int ldx, int *iter);
+int plasma_zcposv(
+    plasma_enum_t uplo, int n, int nrhs,
+    plasma_complex64_t *pA, int lda,
+    plasma_complex64_t *pB, int ldb,
+    plasma_complex64_t *pX, int ldx, int *iter);
 
-int plasma_zcgbsv(int n, int kl, int ku, int nrhs,
-                  plasma_complex64_t *pAB, int ldab, int *ipiv,
-                  plasma_complex64_t *pB, int ldb,
-                  plasma_complex64_t *pX, int ldx, int *iter);
+int plasma_zcgbsv(
+    int n, int kl, int ku, int nrhs,
+    plasma_complex64_t *pAB, int ldab, int *ipiv,
+    plasma_complex64_t *pB, int ldb,
+    plasma_complex64_t *pX, int ldx, int *iter);
 
-int plasma_zlag2c(int m, int n,
-                  plasma_complex64_t *pA,  int lda,
-                  plasma_complex32_t *pAs, int ldas);
+int plasma_zlag2c(
+    int m, int n,
+    plasma_complex64_t *pA,  int lda,
+    plasma_complex32_t *pAs, int ldas);
 
-int plasma_clag2z(int m, int n,
-                  plasma_complex32_t *pAs, int ldas,
-                  plasma_complex64_t *pA,  int lda);
+int plasma_clag2z(
+    int m, int n,
+    plasma_complex32_t *pAs, int ldas,
+    plasma_complex64_t *pA,  int lda);
 
 /***************************************************************************//**
  *  Tile asynchronous interface
  **/
-void plasma_omp_zcgesv(plasma_desc_t A,  int *ipiv,
-                       plasma_desc_t B,  plasma_desc_t X,
-                       plasma_desc_t As, plasma_desc_t Xs, plasma_desc_t R,
-                       double *work, double *Rnorm, double *Xnorm, int *iter,
-                       plasma_sequence_t *sequence,
-                       plasma_request_t  *request);
+void plasma_omp_zcgesv(
+    plasma_desc_t A,  int *ipiv,
+    plasma_desc_t B,  plasma_desc_t X,
+    plasma_desc_t As, plasma_desc_t Xs, plasma_desc_t R,
+    double *work, double *Rnorm, double *Xnorm, int *iter,
+    plasma_sequence_t *sequence,
+    plasma_request_t  *request);
 
-void plasma_omp_zcposv(plasma_enum_t uplo,
-                       plasma_desc_t A,  plasma_desc_t B,  plasma_desc_t X,
-                       plasma_desc_t As, plasma_desc_t Xs, plasma_desc_t R,
-                       double *W,  double *Rnorm, double *Xnorm, int *iter,
-                       plasma_sequence_t *sequence,
-                       plasma_request_t  *request);
+void plasma_omp_zcposv(
+    plasma_enum_t uplo,
+    plasma_desc_t A,  plasma_desc_t B,  plasma_desc_t X,
+    plasma_desc_t As, plasma_desc_t Xs, plasma_desc_t R,
+    double *W,  double *Rnorm, double *Xnorm, int *iter,
+    plasma_sequence_t *sequence,
+    plasma_request_t  *request);
 
-void plasma_omp_zcgbsv(plasma_desc_t A,  int *ipiv,
-                       plasma_desc_t B,  plasma_desc_t X,
-                       plasma_desc_t As, plasma_desc_t Xs, plasma_desc_t R,
-                       double *work, double *Rnorm, double *Xnorm, int *iter,
-                       plasma_sequence_t *sequence,
-                       plasma_request_t  *request);
+void plasma_omp_zcgbsv(
+    plasma_desc_t A,  int *ipiv,
+    plasma_desc_t B,  plasma_desc_t X,
+    plasma_desc_t As, plasma_desc_t Xs, plasma_desc_t R,
+    double *work, double *Rnorm, double *Xnorm, int *iter,
+    plasma_sequence_t *sequence,
+    plasma_request_t  *request);
 
-void plasma_omp_zlag2c(plasma_desc_t A, plasma_desc_t As,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_omp_zlag2c(
+    plasma_desc_t A, plasma_desc_t As,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
-void plasma_omp_clag2z(plasma_desc_t As, plasma_desc_t A,
-                       plasma_sequence_t *sequence, plasma_request_t *request);
+void plasma_omp_clag2z(
+    plasma_desc_t As, plasma_desc_t A,
+    plasma_sequence_t *sequence, plasma_request_t *request);
 
 #ifdef __cplusplus
 }  // extern "C"

From 653f7c50422d1b285d78b8e131db74b01c551c6e Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Wed, 15 Jan 2025 13:15:00 -0500
Subject: [PATCH 02/12] Add PLASMA style guide. Piotr rescued it from being
 lost in the ether.

---
 docs/style-guide.md | 1316 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1316 insertions(+)
 create mode 100644 docs/style-guide.md

diff --git a/docs/style-guide.md b/docs/style-guide.md
new file mode 100644
index 00000000..0cb7c799
--- /dev/null
+++ b/docs/style-guide.md
@@ -0,0 +1,1316 @@
+[TOC]
+
+About this guide
+================
+
+This guide is mostly based on the
+[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
+> Sometimes the Google rules are tweaked, sometimes contradicted.
+>
+> This guide includes rules beyond those in the Google guide.
+
+Other notable sources of best software engineering practices include:
+
+* [JSF Air Vehicle C++ Coding Standards](http://www.stroustrup.com/JSF-AV-rules.pdf),
+* [Code Complete](http://cc2e.com) by Steve McConnell,
+* [Source codes of the Trilinos project](https://trilinos.org),
+* [Microsoft .Net Guidelines](https://msdn.microsoft.com/en-us/library/ms229042(v=vs.110).aspx),
+* [Linux Kernel Coding Style](https://www.kernel.org/doc/Documentation/CodingStyle).
+
+Some conventions introduced in this guide originate from:
+
+* NVIDIA CUDA,
+* Intel MKL,
+* Microsoft.
+
+This guide is created using Markdown.
+For Markdown documentation consult:
+
+* http://daringfireball.net/projects/markdown/,
+* https://en.wikipedia.org/wiki/Markdown.
+
+General Guidelines
+==================
+
+* Be consistent in your own code.
+* Follow the conventions already established by the project.
+* If you spot inconsistencies, fix them.
+* Break rules if it helps readability. This is only a guide.
+
+> Established ICL projects (PLASMA, MAGMA, PaRSEC, PAPI) already have their
+> conventions. In most cases, existing project conventions override conventions
+> in this guide, unless you can fix bad practices in an existing project by consistently
+> applying a better convention across the entire source code, a task which can sometimes
+> be automated.
+>
+> Definitely follow this guide if starting a new project or a prototype.
+
+Standard Compliance
+===================
+
+C codes should be C99 compliant and compiled with the `-std=c99` flag,
+and C++ codes should be C++11 compliant and compiled with the `-std=c++11` flag.
+
+Avoid features present only in C but not in C++. That is, C code should compile with either a C or a C++ compiler.
+
+> Microsoft's C compiler doesn't support C99, so code must use C++11.
+
+Header Files
+============
+
+Self-contained Headers
+----------------------
+
+Header files should be self-contained and end in `.h` for C and in `.hh` for C++.
+Files that are meant for textual inclusion, but are not headers, should end in `.inc`.
+
+> Google uses `.cc` for C++ source files, but `.h` for C++ header files.
+> However, it is useful to have a distinction between C and C++ headers.
+> Therefore, we use `.c` and `.h` for C and `.cc` and `.hh` for C++.
+>
+> Trilinos uses another common convention of `.cpp` and `.hpp`.
+> However, in a long list of files, this puts a lot of `p`s on the screen.
+> The `.cc` and `.hh` endings are shorter and cleaner.
+
+\#define Guards
+--------------
+
+All header files should have `#define` guards to prevent multiple inclusion.
+The format of the symbol name should be `<ICL>_<PROJECT>_<FILE>_H` for C
+and `<ICL>_<PROJECT>_<FILE>_HH` for C++.
+
+```
+#!cpp
+
+#ifndef ICL_MAGMA_BLAS_H
+#define ICL_MAGMA_BLAS_H
+
+...
+
+#endif // ICL_MAGMA_BLAS_H
+```
+
+> Google uses an underscore at the end.
+> Some projects use an underscore at the beginning and an underscore at the end.
+> Trilinos does not use underscores.
+> In the case of header guards, beginning/ending underscores seem pointless.
+> Underscores in front of a name are used for system-level hacking.
+
+extern C
+--------
+
+C library headers should wrap function declarations in `extern "C"`, to allow C++ codes to use the library.
+
+```
+#!cpp
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+...
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+```
+
+Forward Declarations
+--------------------
+
+Avoid using forward declarations, i.e., declarations of classes, functions,
+or templates without associated definitions.
+Instead, `#include` the headers you need. This limits the number of places that
+have to be modified if the members change.
+Check [Google Style Guide](https://google.github.io/styleguide/cppguide.html#Forward_Declarations)
+to see why.
+
+Inline Functions
+----------------
+
+Mark small functions as inline, specifically those that serve as macro replacements.
+Although the compiler will, on its own, identify functions
+suitable for inlining, it is a good idea to hint it directly.
+Define as inline member accessors and functions that would otherwise be macros,
+e.g., address arithmetic functions and the like.
+Check [Google Style Guide](https://google.github.io/styleguide/cppguide.html#Inline_Functions)
+for further guidelines.
+
+Names and Order of Included Header Files
+----------------------------------------
+
+Use the following order of inclusion:
+
+* your project headers,
+* standard headers,
+* other libraries' headers.
+
+For example, the include section might look like this:
+
+```
+#!cpp
+
+#include "common_magma.h"
+#include "batched_kernel_param.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <omp.h>
+#include <cuda.h>
+#include <mkl_blas.h>
+```
+
+> This ordering is a little different from the one in the
+> [Google Style Guide](https://google.github.io/styleguide/cppguide.html#Names_and_Order_of_Includes).
+
+All of the project's header files should be listed as descendants of the project's
+source directory without use of UNIX directory shortcuts `.` (the current directory)
+or `..` (the parent directory).
+
+You should include all the headers that define the symbols you rely upon.
+If you rely on symbols from `bar.h`, do not count on the fact that you included `foo.h`
+which (currently) includes `bar.h`.
+Include `bar.h` explicitly.
+However, any includes present in the related header do not need to be included again
+in the related `.cc` (i.e., `foo.cc` can rely on `foo.h`'s includes).
+
+Sometimes, system-specific code needs conditional includes.
+Such code can put conditional includes after other includes.
+Keep your system-specific code small and localized.
+
+```
+#!cpp
+
+#ifndef __APPLE__
+#include <pthread.h>
+#else
+#include <libkern/OSAtomic.h>
+
+...
+
+#endif // __APPLE__
+```
+
+Scoping
+=======
+
+Namespaces
+----------
+
+With few exceptions, place code in a namespace.
+Use named namespaces as follows:
+
+* Namespaces wrap the entire source file after includes.
+* Do not declare anything in namespace std, including forward declarations
+  of standard library classes. Declaring entities in namespace std is undefined
+  behavior. To declare entities from the standard library, include the appropriate
+  header file.
+* Do not use a `using` directive to make all names from a namespace available.
+* Do not use `using` declarations in `.hh` files, because anything imported
+  into a namespace in a `.hh` file becomes part of the public API exported by that file.
+* Use a `using` declaration anywhere in a `.cc` file (including in the global namespace),
+  and in functions, methods, and classes.
+* Minimize the use of namespace aliases.
+* Do not use inline namespaces.
+* Use Pascal case for namespace components.
+
+Namespaces should have unique names based on the project name.
+Generic components, that may be shared among multiple projects, such as, e.g.,
+an efficient implementation of a thread-safe hash table, may be placed in
+a namespace `Icl`, while project-specific components may be placed
+in a namespace, e.g., `Icl::Magma`.
+
+Format namespaces in `.hh` files as follows:
+
+```
+#!cpp
+
+#ifndef ICL_MAGMA_HH
+#define ICL_MAGMA_HH
+
+#include <cuda.h>
+
+namespace Icl {
+namespace Magma {
+
+...
+
+} // namespace Magma
+} // namespace Icl
+
+#endif // ICL_MAGMA_HH
+```
+
+Nonmember, Static Member, and Global Functions
+----------------------------------------------
+
+Prefer placing nonmember functions in a namespace; use completely global functions rarely.
+Prefer grouping functions with a namespace instead of using a class as if it was a namespace.
+
+Sometimes it is useful to define a function not bound to a class instance.
+Such a function can be either a static member or a nonmember function.
+Nonmember functions should not depend on external variables,
+and should nearly always exist in a namespace.
+Rather than creating classes only to group static member functions
+which do not share static data, use namespaces instead.
+
+If you must define a nonmember function and it is only needed in its `.cc` file,
+use an unnamed namespace or static linkage (e.g., `static int Foo() {...}`)
+to limit its scope.
+
+Local Variables
+---------------
+
+Place local variables in the narrowest scope possible.
+Declare loop counters inside the `for` and `while` statements if possible.
+
+Initialize variables that are initialized once in the declaration.
+If the variable is reused, declare it on a separate line preceding its first use,
+e.g.:
+
+```
+#!cpp
+
+cublasStatus_t retval;
+
+retval = cublasCreate(...);
+assert(retval == CUBLAS_STATUS_SUCCESS);
+
+retval = cublasDestroy(...);
+assert(retval == CUBLAS_STATUS_SUCCESS);
+```
+
+Static and Global Variables
+---------------------------
+
+Do not use global variables.
+Use static variables only for Plain Old Data (POD): only ints, chars, floats,
+or pointers, or arrays/structs of POD. Check the
+[Google Style Guide](https://google.github.io/styleguide/cppguide.html#Static_and_Global_Variables)
+for detailed explanations.
+
+Classes
+=======
+
+In the context of numerical software, many advanced features of C++ can,
+and should be, ignored.
+
+* Do not define implicit conversions.
+
+* Do not use delegating and inheriting constructors. Use helper functions instead.
+
+* Use a `struct` only for passive objects that carry data; everything else is a class.
+> Note that member variables in structs and classes have different naming rules.
+
+* Use composition instead of inheritance.
+  Encapsulate rather than derive.
+  If you end up using inheritance, try to limit `protected` to functions;
+  do not use it for data members.
+
+* Overload operators judiciously.
+  Define overloaded operators only if their meaning is obvious, unsurprising,
+  and consistent with the corresponding built-in operators.
+  Define operators only on your own types.
+> Operator overloading makes sense for numerical objects, such as complex numbers
+> or extended precision numbers.
+
+* Make data members private, unless they are `static const`.
+
+* Use private before public. Within each section, use the following order:
+    - constants,
+    - data members,
+    - constructors,
+    - destructors,
+    - methods.
+
+Copyable and Movable Types
+--------------------------
+
+In numerical libraries, we basically have two types of classes:
+small classes to represent complex numbers and extended precision numbers,
+and big classes to represent contexts and descriptors.
+Handling of the small classes is straightforward.
+They can be copyable and movable, and use overloaded operators.
+Handling of the big classes requires a little more caution, due to the issue
+of ownership of pointers.
+The issue basically boils down to the definition of a descriptor.
+
+Numerical libraries often introduce descriptors, which describe the layout
+of the data, but do not contain the actual memory references.
+Instead, memory pointers are passed alongside descriptors to library routines.
+ScaLAPACK can serve as a legacy example,
+NVIDIA cuDNN can serve as a contemporary example.
+In this case, the descriptor itself is perfectly copyable and movable.
+
+However, at some level, this separation goes against object oriented programming
+(defies the principle of encapsulation).
+PLASMA already set a precedent by including the memory pointer in the
+matrix descriptor.
+Decoupling memory references from descriptors becomes even more troublesome
+if a single descriptor describes data requiring more than one memory pointer,
+e.g., a low-rank matrix approximation represented by its SVD.
+
+Therefore, big classes, representing large mathematical objects,
+should be handled as follows:
+
+* Be self-contained, i.e., contain all memory pointers.
+
+* Set all memory pointers to NULL at the time of initialization
+  (in the constructor in C++).
+  Rely on constructors or factory methods or `Init()` methods to perform
+  allocations and initializations.
+  Free memory in destructors or `Finalize()` methods and include a check
+  for NULL.
+
+* Be non-copyable and non-movable and be passed by reference.
+
+Functions
+=========
+
+* Write short functions.
+  If possible, write functions that fit on one screen.
+  If a function exceeds about 50 lines, think about whether it can be broken up
+  without harming the structure of the program.
+
+* All parameters passed by reference must be labeled `const` unless they are
+  modified in the function.
+
+* Use function overloading judiciously.
+
+* Avoid default arguments.
+
+* Avoid trailing return type syntax.
+
+Parameter Ordering
+------------------
+
+Place all input-only parameters before any output parameters.
+In particular, do not add new parameters to the end of the function
+just because they are new; place new input-only parameters before the output parameters.
+
+Use the following ordering of parameters:
+
+* library handle/context,
+* data layout specifier,
+* input parameters,
+* output parameters,
+* flags.
+
+Place array size before the memory pointer.
+
+Other C++ Features
+==================
+
+* Use friend classes within reason.
+  A common use of friend is to have a `FooBuilder` class be a friend of `Foo`
+  so that it can construct the inner state of `Foo` correctly, without exposing
+  this state to the world.
+  In some cases it may be useful to make a unit test class a friend of the class it tests.
+
+* Avoid using Run Time Type Information.
+
+* Use traditional, C-style, casting.
+  In numerical codes virtually all casts are conversions and there is no ambiguity.
+  The syntax of C++ style casts is nasty.
+  Also, `(long long)x` is the only way to convert to the `long long` type, because
+  of the space.
+  And also, the proper name of `long` is `long int`, which also includes the space.
+
+* Do not use C++ streams.
+  Use C standard IO functions instead.
+  C++ streams are cumbersome and the
+  [Google Style Guide](https://google.github.io/styleguide/cppguide.html#Streams)
+  provides a long list of reasons why.
+
+* Use preincrement/predecrement (`++i`) as opposed to postincrement/postdecrement (`i++`).
+  This is a common C++ convention.
+
+* Use the following notations when initializing variables:
+  `1` for integers, `1.0` for doubles, `1.0f` for floats, `0x01` for bit patterns.
+
+* Prefer `sizeof(type)` to `(sizeof varname)`.
+  It is more traditional and more explicit.
+
+* Use `auto` for long type names.
+
+* Avoid complicated template programming.
+  Use templates for handling numerical types, such as complex or extended precision.
+  Some projects, such as BEAST, use templates for specialized purposes, such as
+  parametrization of tunable codes.
+
+* Do not use Boost for codes meant for public releases, such as numerical libraries.
+  Use it for small projects.
+
+* Feel free to use C++11 features.
+
+Use of enum
+-----------
+
+Use `enum` whenever possible.
+More type safety is good.
+Wrap them in typedefs, so the word `enum` only shows up in the definition.
+
+Use of const
+------------
+
+The use of `const` is strongly encouraged in external and internal interfaces.
+
+Use `const` such that the declaration can be read from right to left.
+
+> `const` is viral: if you pass a `const` variable to a function,
+> that function must have `const` in its prototype.
+>
+> Intel MKL uses `const` for all input function parameters.
+> NVIDIA cuBLAS does not use `const` for input parameters passed by value,
+> only for input parameters passed by pointer.
+
+Use of restrict
+---------------
+
+Avoid `restrict`. Only use if there is a clear performance benefit.
+
+Exceptions
+----------
+
+You may use C++ exceptions.
+[Google Style Guide](https://google.github.io/styleguide/cppguide.html#Exceptions)
+argues against it, but Trilinos uses it.
+Pick one way or the other and stick to it.
+
+If you use exceptions, list exceptions in function comments, but not in signatures.
+Listing in signatures is a bad idea (http://www.gotw.ca/publications/mill22.htm),
+and it is deprecated in C++11.
+
+Integer Types
+-------------
+
+Most of the time, use the `int` type and safely assume that it is at least 32 bits.
+Do not assume that it is more than 32 bits, though.
+If you need any other integer type than `int`, use a precise-width integer type
+from `<cstdint>`.
+Always use `size_t` to describe the size of a memory region or offset.
+
+Do not use unsigned types unless specifically required.
+In particular, do not use unsigned types to say a number will never be negative.
+Use `int64_t` for integers for which 32 bits is not enough.
+If you really need an unsigned type, use the width-specific type,
+even for a 32-bit integer. I.e., use `uint32_t` instead of `unsigned int`.
+Basically, never use the `unsigned` keyword.
+
+> The use of unsigned types for loop counters can introduce bugs and prevent
+> compiler optimizations.
+
+Do not use a larger type than `int` for a variable, just because it will be used
+in an intermediate calculation that may overflow, such as address calculation.
+Use explicit casts to `size_t` for all address arithmetic that may overflow.
+Declare variables only of the size required for the maximum value the variable
+may store. Also, do not use integer types shorter than `int` unless specifically
+required, e.g., to minimize the memory footprint of a large array.
+
+NULL, nullptr, 0
+----------------
+
+In C++, use `nullptr`; it removes ambiguity. You can take `sizeof(nullptr)`.
+
+64-bit Addressing
+-----------------
+
+In numerical codes, the need for using 64-bit variables to index arrays is rare,
+but the chance of the memory offset not fitting in 32 bits is high. Therefore:
+
+* Always use a cast to `size_t` when indexing large arrays involves
+  arithmetic operations.
+
+* Use a cast to `size_t` inside all memory allocation functions.
+
+```
+#!cpp
+
+array[(size_t)...
+
+malloc((size_t)...
+```
+
+> Generally, we assume the matrix dimensions can be passed as 32-bit `int`,
+> only the offset (i + j * lda) needs to be computed as 64-bit.
+> So, we can use the 32-bit LAPACK interface.
+> One place this fails is passing `lwork`, if it needs O(m * n) workspace,
+> which it does for the `syev` and `gesvd` routines.
+
+Preprocessing Macros
+--------------------
+
+Do not use macros for anything other than:
+
+* conditionally including software dependencies,
+* guards in header files preventing multiple inclusion.
+
+This code summarizes the legitimate uses of macros:
+
+```
+#!cpp
+
+#ifndef ICL_PTHREAD_H
+#define ICL_PTHREAD_H
+
+#ifndef __APPLE__
+#include <pthread.h>
+#else
+#include <libkern/OSAtomic.h>
+
+...
+
+#endif // __APPLE__
+#endif // ICL_PTHREAD_H
+```
+
+Check out the
+[Google Style Guide](https://google.github.io/styleguide/cppguide.html#Preprocessor_Macros)
+for more explanations.
+
+Concurrency
+===========
+
+* POSIX Threads (Pthreads) and OpenMP are preferred ways of multithreading.
+  Do not refrain from using recent features. The bottom line is: if it is
+  supported in GCC, it is okay to use.
+
+* Prefer spinlocks over regular locks.
+  There is no reason for doing otherwise for high performance parallel codes.
+
+* Use [GNU atomic builtins](https://gcc.gnu.org/onlinedocs/gcc-4.4.3/gcc/Atomic-Builtins.html)
+  to implement low-level synchronization mechanisms.
+  They are supported by all major compilers (GNU, Intel).
+
+* Declare all synchronization variables as `volatile`, even if you are
+  only accessing them using atomic builtins.
+  If a variable can be accessed by more than one thread, it needs to be
+  `volatile` to prevent compiler from applying optimizations that might result
+  in incorrect code.
+
+* If you find yourself considering the use of memory barriers,
+  you went too low-level. Use atomic builtins or spinlocks instead.
+
+Building with Missing Dependencies
+==================================
+
+If a certain environment is missing a mainstream component,
+e.g., Pthread spinlocks on OSX, do not create a new abstraction layer,
+but implement the missing functions on top of the available native functions
+(e.g., PLASMA on MS Windows, PULSAR on OSX).
+
+```
+#!cpp
+
+#ifndef __APPLE__
+#include <pthread.h>
+#else
+#include <libkern/OSAtomic.h>
+
+typedef OSSpinLock pthread_spinlock_t;
+
+inline int pthread_spin_lock(pthread_spinlock_t *lock) {
+    OSSpinLockLock(lock);
+    return 0;
+}
+...
+#endif // __APPLE__
+```
+
+A similar principle applies when a component is completely missing.
+Use the API as if the component was there.
+Provide a header file with stubs for the missing functions.
+Write your code such that it works correctly if stubs are called
+instead of the missing component.
+PULSAR is written that way with respect to support for MPI and CUDA.
+It may not always be possible, but at least investigate the possibility.
+
+Naming
+======
+
+* Use abbreviations within reason. Follow common conventions.
+  Name a variable `retval` rather than `return_value`.
+
+* There are no special requirements for global variables,
+  which should be rare anyway, but if you use them, consider prefixing them
+  with `g_` or some other marker to easily distinguish it from local variables.
+
+File Names
+----------
+
+Most projects already have a convention.
+Follow the project's convention.
+
+In C++ the file name should match the class name.
+Consider using the Trilinos convention:
+`<NameSpace>_<ClassName>`, i.e., namespace using Pascal case,
+underscore, class name using Pascal case,
+e.g., `IclMagma_SomeClass.hh`, `IclMagma_SomeClass.cc`.
+
+Use `.h` and `.c` extensions for C files, and `.hh` and `.cc` for C++ files.
+
+Type Names
+----------
+
+* In C++ use Pascal case for class names, e.g., `MyExcitingClass`.
+
+* In C use camel case for type names followed by `_t`, e.g., `myExcitingType_t`
+  (NVIDIA's convention).
+
+Wrap enums and structs in typedefs and do not use `enum` and `struct` keywords
+when declaring variables.
+
+Variable Names
+--------------
+
+* The names of variables and data members are all lowercase,
+  with underscores between words.
+  Data members of classes (but not structs) additionally have trailing underscores.
+  For instance:
+
+    - `a_local_variable`,
+    - `a_struct_data_member`,
+    - `a_class_data_member_`.
+
+Constant Names
+--------------
+
+Use Pascal case for constants, i.e., `const int ArraySize`.
+
+> This is Microsoft's convention.
+> Google's convention is to use a leading `k`, i.e., `kArraySize`,
+> which seems a little arbitrary and looks somewhat awkward.
+
+Prefix global variables with `g_` and global constants with `G_`, e.g.,
+`g_scary_global_variable`, `G_ScaryGlobalConstant`.
+
+Function Names
+--------------
+
+Use snake case for C function names and C++ method names,
+e.g., `my_awesome_c_function`, `my_awesome_cpp_method`.
+
+Namespace Names
+---------------
+
+Namespace names are Pascal case (Trilinos, Microsoft).
+
+Enumerator Names
+----------------
+
+Individual enumerators should be named like constants, i.e., Pascal case.
+
+Macro Names
+-----------
+
+1. Do not use macros.
+
+2. If you really need to use a macro, see 1.
+
+Comments
+========
+
+* Use C99/C++ style `//` comments.
+
+* Comment your code heavily, but do not state the obvious.
+
+* If a comment is a sentence, start with a capital letter and end with a period.
+
+* If a comment is not a sentence, start with a small letter and do not end with a period.
+
+* Start each file with a boilerplate.
+
+* Do not duplicate comments in both the `.hh` and the `.cc` file.
+  Duplicated comments diverge.
+
+Function Comments
+-----------------
+
+Use Doxygen (`///`) for function comments:
+
+* Start with `@brief`,
+* Follow with extended description if necessary, indented one more space
+  than `@`.
+* Follow with parameters and indicate direction (in, out, inout),
+* Follow with return values,
+* Separate the name of the parameter from the description with a dash,
+* Do not capitalize the description,
+* Do not follow with a period.
+
+```
+#!cpp
+
+///
+/// @brief Solves a complex problem.
+///
+///  Uses such and such algorithm
+///  with such and such properties.
+///
+/// @param[in] n - array size
+/// @param[in] array - array of input data
+/// ...
+/// @param[out] result - array of output data
+///
+/// @returns error code
+///
+```
+
+Use `@retval` for a list of discrete return values.
+
+```
+#!cpp
+
+/// @retval  0 - success
+/// @retval -1 - failure
+```
+
+The descriptions should be declarative ("Solves a problem.")
+rather than imperative ("Solve a problem").
+This is the convention of Google, and also LAPACK.
+
+If the function does something trivial, just skip the comment.
+It is quite common for destructors not to have header comments.
+
+Variable Comments
+-----------------
+
+* Local variables should have names descriptive enough to not require comments.
+
+* Use Doxygen (`///<`) for data members of classes and structures.
+
+Implementation Comments
+-----------------------
+
+Use standard C++ (`//`) comments (not Doxygen) for implementation comments.
+Put the comments before the code.
+Do not use inline comments. They make the code harder to read and make it hard
+to respect the 80-characters line limit.
+
+TODO Comments
+-------------
+
+Use the Doxygen `@todo` tag for code that is temporary, a short-term solution,
+or good-enough but not perfect.
+
+```
+#!cpp
+
+// @todo Make it better.
+// @todo (Jakub) Make it even better.
+// @todo (kurzak@eecs.utk.edu) Make it yet better.
+```
+
+Formatting
+==========
+
+Line Length
+-----------
+
+Each line of text in your code should be at most 80 characters long.
+
+> It is a Google rule and is easier to follow than you may think.
+> [Google Style Guide](https://google.github.io/styleguide/cppguide.html#Line_Length)
+> gives some good reasons for this rule.
+> Many ICL projects are pretty good at following this rule.
+
+Indentation
+-----------
+
+Use only spaces (no tabs), and indent 4 spaces at a time.
+
+Most editors can be configured to insert spaces instead of tabs.
+For vim, put these in ~/.vimrc:
+```
+set shiftwidth=4
+set softtabstop=4
+set expandtab
+```
+
+For emacs, put these in ~/.emacs:
+```
+(setq-default c-default-style "k&r"
+              c-basic-offset 4
+              tab-width 4
+              indent-tabs-mode nil)
+```
+
+For jedit, set Tab width to 4 and check "Soft (emulated with spaces) tabs"
+
+Function Declarations and Definitions
+-------------------------------------
+
+Use one of the following styles in declarations and definitions of functions
+that don't fit in a single line:
+
+```
+#!cpp
+
+ReturnType ClassName::ReallyLongFunctionName(Type par_name1, Type par_name2,
+                                             Type par_name3)
+{
+    DoSomething();
+    ...
+}
+```
+
+```
+#!cpp
+
+ReturnType ClassName::ReallyLongFunctionName(
+    Type par_name1, Type par_name2, Type par_name3)
+{
+    DoSomething();
+    ...
+}
+```
+
+```
+#!cpp
+
+ReturnType ClassName::ReallyLongFunctionName(
+    Type par_name1,
+    Type par_name2,
+    Type par_name3)
+{
+    DoSomething();
+    ...
+}
+```
+
+Some points to note:
+
+* Choose good parameter names.
+  Try to be consistent with existing codes (e.g., LAPACK) or literature.
+
+* Never omit parameter names in declarations.
+
+* The open parenthesis is always on the same line as the function name.
+
+* There is never a space between the parentheses and the parameters.
+
+* The open curly brace is always on the start of the next line after
+  function declaration.
+  The exception to the rule are really short functions, e.g., accessors
+  in the body of a class.
+  If this is the case, then everything can be in a single line
+  (if it fits in the 80-characters limit).
+  There should be a space between the close parenthesis
+  and the open curly brace.
+
+* The close curly brace is either on the last line by itself
+  or on the same line as the open curly brace.
+
+Function Calls
+--------------
+
+Use one of the following styles when calling functions
+that don't fit in a single line:
+
+```
+#!cpp
+
+bool result = ReallyLongFunctionName(ReallyLongArgument1,
+                                     ReallyLongArgument2,
+                                     ReallyLongArgument3);
+```
+
+```
+#!cpp
+
+bool result = ReallyLongFunctionName(
+    ReallyLongArgument1, ReallyLongArgument2, ReallyLongArgument3);
+```
+
+```
+#!cpp
+
+bool result = ReallyLongFunctionName(
+    ReallyLongArgument1,
+    ReallyLongArgument2,
+    ReallyLongArgument3);
+```
+
+```
+#!cpp
+
+bool result =
+    ReallyLongFunctionName(
+        ReallyLongArgument1,
+        ReallyLongArgument2,
+        ReallyLongArgument3);
+```
+
+Functions may have natural line breaking patterns, e.g.:
+
+```
+#!cpp
+
+cblas_dgemm(
+    CblasColMajor,
+    CblasNoTrans, CblasNoTrans,
+    m, n, k,
+    1.0, a, lda,
+         b, ldb,
+    1.0, c, ldc);
+```
+
+Conditionals
+------------
+
+The boilerplate for conditionals is:
+
+```
+#!cpp
+
+if (condition) {
+    ...
+}
+else if (...) {
+    ...
+}
+else {
+    ...
+}
+```
+
+Note:
+
+* a space between `if` and the opening parenthesis,
+* no spaces inside the parentheses,
+* the opening curly brace in the same line,
+* a space between the closing parenthesis and the opening curly brace,
+* closing curly brace in a separate line,
+* `else` in the next line after the closing curly brace,
+* the opening curly brace of `else` in the same line after a space,
+* same goes for `else if`.
+
+Short conditional statements may be written on one line
+if this enhances readability.
+
+```
+#!cpp
+
+if (x == foo) return bar;
+```
+
+This is not allowed when the `if` statement has an `else`.
+
+Curly braces are not required for single-line statements.
+However, if one part of an if-else statement uses curly braces,
+the other part must too.
+
+Loops
+-----
+
+The boilerplates for loops are:
+
+```
+#!cpp
+
+for (int i = 0; i < SomeNumber; i++)
+    ...
+
+for (int i = 0; i < SomeNumber; i++) {
+    ...
+}
+```
+
+```
+#!cpp
+
+while (condition)
+    ...
+
+while (condition) {
+    ...
+}
+
+do {
+    ...
+} while (condition);
+```
+
+Switch
+------
+
+The boilerplate for `switch` is:
+
+```
+#!cpp
+
+switch (var) {
+case 0:
+    ...
+    break;
+case 1:
+    ...
+    break;
+default:
+    assert(false);
+}
+```
+
+If the default case should never execute, simply assert false.
+
+Pointer and Reference Expressions
+---------------------------------
+
+No spaces around period or arrow. Pointer operators do not have trailing spaces.
+The following are examples of correctly-formatted pointer and reference expressions:
+
+```
+#!cpp
+
+x = *p;
+p = &x;
+x = r.y;
+x = r->y;
+```
+
+Note that:
+
+* There are no spaces around the period or arrow when accessing a member.
+* Pointer operators have no space after the * or &.
+
+When declaring a pointer variable or argument, always place the asterisk
+adjacent to the variable name:
+
+```
+#!cpp
+
+char *c;
+const string &str;
+```
+
+Boolean Expressions
+-------------------
+
+When you have a boolean expression that is longer than the standard line length,
+break lines like this:
+
+```
+#!cpp
+
+if (this_one_thing > this_other_thing &&
+    a_third_thing == a_fourth_thing &&
+    yet_another && last_one) {
+    ...
+}
+```
+
+Note that when the code wraps in this example, both of the && operators
+are at the end of the line.
+Also note that you should always use the punctuation operators,
+such as && and ~, rather than the word operators, such as `and` and `compl`.
+
+Return Values
+-------------
+
+Do not needlessly surround the `return` expression with parentheses.
+Use parentheses in `return expr;` only where you would use them in `x = expr;`.
+
+```
+#!cpp
+
+return result;
+
+return (some_long_condition && another_condition);
+```
+
+Preprocessor Directives
+-----------------------
+
+The hash mark that starts a preprocessor directive should always be
+at the beginning of the line.
+Even when preprocessor directives are within the body of indented code,
+the directives should start at the beginning of the line.
+
+Class Format
+------------
+
+The class boilerplate is:
+
+```
+#!cpp
+
+class MyClass {
+    friend class FriendClass;
+
+public:
+    MyClass(int value) : value_(value), pointer_(NULL) {}
+    ~MyClass();
+
+    int getValue() { return value_; }
+    int getPointer() { return pointer_; }
+
+    void awesomePublicMethod();
+    void anotherPublicMethod();
+
+private:
+    void awesomeSecretMethod();
+    void anotherSecretMethod();
+
+    int value_;
+    int *poiter_;
+};
+```
+
+Note that:
+
+* The `friend` keyword is indented.
+* The `private` and `public` keywords are not indented.
+* Except for the first instance, these keywords are preceded by a blank line.
+* There are no blank lines after these keywords.
+* The `friend` section is first, followed by the `public` section
+  and the `private` section.
+* In the private section, methods are first, followed by attributes.
+
+> This is the order of the
+> [Google Style Guide](https://google.github.io/styleguide/cppguide.html#Class_Format),
+> the Trilinos project, and also Doxygen produces documentation
+> in this order.
+>
+> The order of methods is top to bottom, e.g., the deeper in the call tree
+> the lower on the list. This applies both to the order of declarations
+> and the order of definitions, which should be identical.
+
+Constructor Initializer Lists
+-----------------------------
+
+Constructor initializer lists can be all on one line or with subsequent lines
+indented four spaces.
+The acceptable formats for initializer lists are:
+
+```
+#!cpp
+
+    MyClass(int value)
+        : value_(value), pointer_(NULL)
+    {
+        ...
+    }
+
+    MyClass(int value)
+        : value_(value),
+          pointer_(NULL)
+    {
+        ...
+    }
+```
+
+In each case the closing curly brace can be in the same line as the opening
+curly brace if it fits.
+
+> Initialization list is part of constructor's definition,
+> so you need to define it at the same place you define constructor's body.
+
+Namespace Formatting
+--------------------
+
+The contents of namespaces are not indented.
+
+When declaring nested namespaces, put each namespace on its own line.
+
+```
+#!cpp
+
+namespace Icl {
+namespace Magma {
+
+...
+
+} // namespace Magma
+} // namespace Icl
+```
+
+Horizontal Whitespace
+---------------------
+
+* Never put trailing whitespace at the end of a line.
+> Trailing whitespace can cause extra work for others editing the same file,
+> when they merge.
+
+* Opening curly braces should always have a space before them.
+
+* Semicolons have no space before them.
+
+* Put spaces around the colon in initializer lists.
+  The same applies for inheritance if you end up using it.
+
+* For inline function implementations, put spaces between the braces
+  and the implementation itself.
+
+* No spaces inside empty parentheses and curly braces.
+
+* Put space after the keyword in conditions and loops.
+
+* `for` loops always have a space after the semicolon,
+  and never a space before the semicolon.
+
+* No space before colon in a `switch` case.
+  A space after a colon if there's code after it.
+
+* Assignment operators always have spaces around them.
+
+* Other binary operators usually have spaces around them, but it's
+  okay to remove spaces around factors.
+  Parentheses should have no internal padding.
+
+* No spaces separating unary operators and their arguments.
+
+```
+#!cpp
+
+x = 0;
+
+v = w * x + y / z;
+v = w*x + y/z;
+v = w * (x + z);
+
+x = -5;
+++x;
+if (x && !y)
+    ...
+```
+
+* In templates, no spaces inside the angle brackets.
+  No spaces between type and pointer.
+  C++11 notation for nesting.
+
+```
+#!cpp
+
+vector<string> x;
+
+y = static_cast<char*>(x);
+
+vector<char*> x;
+
+set<list<string>> x;
+```
+
+Vertical Whitespace
+-------------------
+
+* Never use more than a single blank line.
+
+Horizontal Rules
+----------------
+
+* Horizontal rules are great.
+  Use whatever works for you.
+  Be consistent throughout each project.
+  Make them 80 characters wide.
+
+```
+#!cpp
+
+//------------------------------------------------------------------------------
+
+////////////////////////////////////////////////////////////////////////////////
+
+/******************************************************************************/
+```
\ No newline at end of file

From 11b64d5217e061658adcb303f9e7da307c2e3dda Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Wed, 15 Jan 2025 13:16:41 -0500
Subject: [PATCH 03/12] Update style guide. Add SLATE comments. Drop ICL
 prefix/namespace.

---
 docs/style-guide.md | 140 +++++++++++++++++++++++++++-----------------
 1 file changed, 86 insertions(+), 54 deletions(-)

diff --git a/docs/style-guide.md b/docs/style-guide.md
index 0cb7c799..1c913444 100644
--- a/docs/style-guide.md
+++ b/docs/style-guide.md
@@ -38,19 +38,15 @@ General Guidelines
 * If you spot inconsistencies, fix them.
 * Break rules if it helps readability. This is only a guide.
 
-> Established ICL projects (PLASMA, MAGMA, PaRSEC, PAPI) already have their
-> conventions. In most cases, existing project conventions override conventions
-> in this guide. Unless, you can fix bad practices in an existing project by consistently
-> applying a better convention across the entire source code, a task which can sometimes
-> be automated.
->
-> Definitely follow this guide if starting a new project or a prototype.
+> This guide was written for PLASMA. Other ICL projects have their own
+> conventions. A few differences with SLATE's style are noted here.
 
 Standard Compliance
 ===================
 
 C codes should be C99 compliant and compiled with the `-std=c99` flag,
-and C++ codes should be C++11 compliant and compiled with the `-std=c++11` flag.
+and C++ codes should be C++11 compliant and compiled with the `-std=c++11` flag
+(or later).
 
 Avoid features present only in C but not in C++. That is, C code should compile with either C or C++ compiler.
 
@@ -70,32 +66,31 @@ Files that are meant for textual inclusion, but are not headers, should end in `
 > Therefore, we use `.c` and `.h` for C and `.cc` and `.hh` for C++.
 >
 > Trilinos uses another common convention of `.cpp` and `.hpp`.
-> However, in a long list of files, this puts a lot of <p>s on the screen.
+> However, in a long list of files, this puts a lot of p's on the screen.
 > The `.cc` and `.hh` endings are shorter and cleaner.
 
 \#define Guards
 --------------
 
 All header files should have `#define` guards to prevent multiple inclusion.
-The format of the symbol name should be `<ICL>_<PROJECT>_<FILE>_H` for C
-and `<ICL>_<PROJECT>_<FILE>_HH` for C++.
+The format of the symbol name should be `<PROJECT>_<FILE>_H` for C
+and `<PROJECT>_<FILE>_HH` for C++.
 
 ```
 #!cpp
 
-#ifndef ICL_MAGMA_BLAS_H
-#define ICL_MAGMA_BLAS_H
+#ifndef PLASMA_BLAS_H
+#define PLASMA_BLAS_H
 
 ...
 
-#endif // ICL_MAGMA_BLAS_H
+#endif // PLASMA_BLAS_H
 ```
 
 > Google uses an underscore at the end.
-> Some projects use an underscore at the beginning and an underscore at the end.
 > Trilinos does not use underscores.
 > In the case of header guards, beginning/ending underscores seem pointless.
-> Underscores in front of a name are used for system-level hacking.
+> Underscores in front of a name are reserved for system-level headers.
 
 extern C
 --------
@@ -190,7 +185,6 @@ Keep your system-specific code small and localized.
 #include <libkern/OSAtomic.h>
 
 ...
-
 #endif // __APPLE__
 ```
 
@@ -200,13 +194,14 @@ Scoping
 Namespaces
 ----------
 
-With few exceptions, place code in a namespace.
+With few exceptions, place C++ code in a namespace.
 Use named namespaces as follows:
 
 * Namespaces wrap the entire source file after includes.
-* Do not declare anything in namespace std, including forward declarations
-  of standard library classes. Declaring entities in namespace std is undefined
-  behavior. To declare entities from the standard library, include the appropriate
+* Do not declare anything in namespace `std`, including forward declarations
+  of standard library classes. Declaring entities in namespace `std` is undefined
+  behavior (unless specifically allowed in the C++ standard).
+  To declare entities from the standard library, include the appropriate
   header file.
 * Do not use a `using` directive to make all names from a namespace available.
 * Do not use `using` declarations in `.hh` files, because anything imported
@@ -218,30 +213,28 @@ Use named namespaces as follows:
 * Use Pascal case for namespace components.
 
 Namespaces should have unique names based on the project name.
-Generic components, that may be shared among multiple projects, such as, e.g.,
+Generic components, that may be shared among multiple projects, such as
 an efficient implementation of a thread-safe hash table, may be placed in
-a namespace `Icl`, while project-specific components may be placed
-in a namespace, e.g., `Icl::Magma`.
+a namespace `icl`, while project-specific components must be placed
+in a namespace, e.g., `plasma`.
 
 Format namespaces in `.hh` files as follows:
 
 ```
 #!cpp
 
-#ifndef ICL_MAGMA_HH
-#define ICL_MAGMA_HH
+#ifndef PLASMA_HH
+#define PLASMA_HH
 
 #include <cuda.h>
 
-namespace Icl {
-namespace Magma {
+namespace plasma {
 
 ...
 
-} // namespace Magma
-} // namespace Icl
+} // namespace plasma
 
-#endif // ICL_MAGMA_HH
+#endif // PLASMA_HH
 ```
 
 Nonmember, Static Member, and Global Functions
@@ -358,7 +351,7 @@ should be handled as follows:
 
 * Be self-contained, i.e., contain all memory pointers.
 
-* Set all memory pointers to NULL at the time of initialization
+* Set all memory pointers to `NULL` (`nullptr` in C++) at the time of initialization
   (in the constructor in C++).
   Rely on constructors or factory methods or `Init()` methods to perform
   allocations and initializations.
@@ -419,6 +412,11 @@ Other C++ Features
   of the space.
   And also, the proper name of `long` is `long int`, which also includes the space.
 
+> SLATE uses C++ style casts, which are clearer about what expression
+> the cast applies to, e.g., `(int) x * y` is the same as `int( x ) * y`,
+> not `int( x * y )`. For `long long`, SLATE has a typedef `llong` so that
+> `llong( x )` works. Otherwise, `(long long)( x )` works.
+
 * Do not use C++ streams.
   Use C standard IO functions instead.
   C++ streams are cumbersome and the
@@ -426,7 +424,8 @@ Other C++ Features
   provides a long list of reasons why.
 
 * Use preincrement/predecrement (`++i`) as opposed to postincrement/postdecrement (`i++`).
-  This is a common C++ convention.
+  This is a common C++ convention since postincrement introduces a
+  needless temporary return value that can be expensive for C++ classes.
 
 * Use the following notations when initializing variables:
   `1` for integers, `1.0` for doubles, `1.0f` for floats, `0x01` for bit patterns.
@@ -482,7 +481,7 @@ Pick one way or the other and stick to it.
 
 If you use exceptions, list exceptions in function comments, but not in signatures.
 Listing in signatures is a bad idea (http://www.gotw.ca/publications/mill22.htm),
-and it is deprected in C++11.
+and it is deprecated in C++11.
 
 Integer Types
 -------------
@@ -513,7 +512,8 @@ required, e.g., to minimize the memory footprint of a large array.
 NULL, nullptr, 0
 ----------------
 
-In C++ use nullptr - removes ambiguity. You can take sizeof(nullptr).
+In C++ use `nullptr`, which removes ambiguity. `NULL` is an integer (0);
+`nullptr` is a pointer. You can take `sizeof(nullptr)`.
 
 64-bit Addressing
 -----------------
@@ -564,6 +564,7 @@ This code summarizes the legitimate uses of macros:
 ...
 
 #endif // __APPLE__
+
 #endif // ICL_PTHREAD_H
 ```
 
@@ -585,11 +586,17 @@ Concurrency
   to implement low-level synchronization mechanisms.
   They are supported by all major compilers (GNU, Intel).
 
-* Declare all synchronization variables as `volatile`, even if you are
+> SLATE uses C++ `std::atomic`.
+
+* <strike> Declare all synchronization variables as `volatile`, even if you are
   only accessing them using atomic builtins.
   If a variable can be accessed by more than one thread, it needs to be
   `volatile` to prevent compiler from applying optimizations that might result
-  in incorrect code.
+  in incorrect code. </strike>
+
+> This misunderstands `volatile`. Read Scott Meyers, "Effective Modern C++",
+> Item 40: Use `std::atomic` for concurrency, `volatile` for special memory
+> (e.g. memory-mapped I/O).
 
 * If you find yourself considering the use of memory barriers,
   you went too low-level. Use atomic builtins or spinlocks instead.
@@ -597,6 +604,12 @@ Concurrency
 Building with Missing Dependencies
 ==================================
 
+> !!! This section violates xSDK policies such as (M9) using a
+> well-defined name space and (M12) linking with external dependencies.
+> Consider a library linked with stub MPI or OpenMP functions; it cannot
+> be used in an application that links with real MPI or OpenMP due to
+> name collisions. !!!
+
 If a certain environment is missing a mainstream component,
 e.g., Pthread spinlocks on OSX, do not create a new abstraction layer,
 but implement the missing functions on top of the available native functions
@@ -648,7 +661,7 @@ In C++ the file name should match the class name.
 Consider using the Trilinos convention:
 `<NameSpace>_<ClassName>`, i.e., namespace using Pascal case,
 underscore, class name using Pascal case,
-e.g., `IclMagma_SomeClass.hh`, `IclMagma_SomeClass.cc`.
+e.g., `Magma_SomeClass.hh`, `Magma_SomeClass.cc`.
 
 Use `.h` and `.c` extensions for C files, and `.hh` and `.cc` for C++ files.
 
@@ -693,11 +706,16 @@ Function Names
 Use snake case for C function names and C++ method names,
 e.g., `my_awesome_c_function`, `my_awesome_cpp_method`.
 
+> SLATE used lowerCamelCase for methods, but there has been discussion
+> to make them snake_case.
+
 Namespace Names
 ---------------
 
 Namespace names are Pascal case (Trilinos, Microsoft).
 
+> SLATE uses lowercase for namespaces.
+
 Enumerator Names
 ----------------
 
@@ -774,6 +792,8 @@ This is the convention of Google, and also LAPACK.
 If the function does something trivial, just skip the comment.
 It is quite common for destructors not to have header comments.
 
+> SLATE skips `@brief` in favor of Doxygen's autobrief feature.
+
 Variable Comments
 -----------------
 
@@ -1010,10 +1030,10 @@ The boilerplates for loops are:
 ```
 #!cpp
 
-for (int i = 0; i < SomeNumber; i++)
+for (int i = 0; i < SomeNumber; ++i)
     ...
 
-for (int i = 0; i < SomeNumber; i++) {
+for (int i = 0; i < SomeNumber; ++i) {
     ...
 }
 ```
@@ -1055,6 +1075,8 @@ default:
 
 If the default case should never execute, simply assert false.
 
+> SLATE indents `case` statements one level, since they're inside a block.
+
 Pointer and Reference Expressions
 ---------------------------------
 
@@ -1085,6 +1107,8 @@ char *c;
 const string &str;
 ```
 
+> SLATE places * and & next to the type, since it is part of the type.
+
 Boolean Expressions
 -------------------
 
@@ -1106,6 +1130,11 @@ are at the end of the line.
 Also note that you should always use the punctuation operators,
 such as && and ~, rather than the word operators, such as `and` and `compl`.
 
+> SLATE places operators at the beginning of the line for clarity,
+> consistent with math typesetting conventions.
+> (Although older SLATE code doesn't.)
+> See AMS "Mathematics into Type", section 3.3.5.
+
 Return Values
 -------------
 
@@ -1128,6 +1157,9 @@ at the beginning of the line.
 Even when preprocessor directives are within the body of indented code,
 the directives should start at the beginning of the line.
 
+> SLATE indents preprocessor directives and code inside #if ... #else ... #endif
+> for improved readability (excluding header guards).
+
 Class Format
 ------------
 
@@ -1140,18 +1172,18 @@ class MyClass {
     friend class FriendClass;
 
 public:
-    MyClass(int value) : value_(value), pointer_(NULL) {}
+    MyClass(int value) : value_(value), pointer_(nullptr) {}
     ~MyClass();
 
-    int getValue() { return value_; }
-    int getPointer() { return pointer_; }
+    int get_value() { return value_; }
+    int get_pointer() { return pointer_; }
 
-    void awesomePublicMethod();
-    void anotherPublicMethod();
+    void awesome_public_method();
+    void another_public_method();
 
 private:
-    void awesomeSecretMethod();
-    void anotherSecretMethod();
+    void awesome_secret_method();
+    void another_secret_method();
 
     int value_;
     int *poiter_;
@@ -1188,14 +1220,14 @@ The acceptable formats for initializer lists are:
 #!cpp
 
     MyClass(int value)
-        : value_(value), pointer_(NULL)
+        : value_(value), pointer_(nullptr)
     {
         ...
     }
 
     MyClass(int value)
         : value_(value),
-          pointer_(NULL)
+          pointer_(nullptr)
     {
         ...
     }
@@ -1217,13 +1249,13 @@ When declaring nested namespaces, put each namespace on its own line.
 ```
 #!cpp
 
-namespace Icl {
-namespace Magma {
+namespace plasma {
+namespace internal {
 
 ...
 
-} // namespace Magma
-} // namespace Icl
+} // namespace internal
+} // namespace plasma
 ```
 
 Horizontal Whitespace
@@ -1313,4 +1345,4 @@ Horizontal Rules
 ////////////////////////////////////////////////////////////////////////////////
 
 /******************************************************************************/
-```
\ No newline at end of file
+```

From cf0d1b6edd1e46a4ddac517dcd093a25f27b53eb Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Fri, 17 Jan 2025 00:24:11 -0500
Subject: [PATCH 04/12] codegen: Add --depend option

---
 tools/codegen.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/codegen.py b/tools/codegen.py
index d79e89f2..273ed372 100755
--- a/tools/codegen.py
+++ b/tools/codegen.py
@@ -63,7 +63,7 @@
 Makefile.blas.gen: force_gen
 endif
 endif
-							
+
 force_gen: ;
 
 ----------------------------------------------------------------------
@@ -97,6 +97,7 @@
     epilog=help )
 parser.add_argument( '-v', '--verbose',   action='store_true', help='Print verbose output to stderr' )
 parser.add_argument( '-o', '--output',    action='store_true', help='Generate list of output files' )
+parser.add_argument( '-d', '--depend',    action='store_true', help='Generate list of dependencies (output1 output2: input)' )
 parser.add_argument( '-m', '--make',      action='store_true', help='Generate Makefile rules' )
 parser.add_argument(       '--prefix',    action='store',      help='Prefix for variables in Makefile', default='src')
 parser.add_argument( '-p', '--precision', action='append',     help='Generate only given precision (s d c z ds zc ...). Repeatable.' )
@@ -344,6 +345,15 @@ def main():
         # end
         print( " ".join( generated ) )
 
+    elif opts.depend:
+        depends = ''
+        for filename in opts.args:
+            src = SourceFile( filename )
+            (files, precs) = src.get_filenames( opts.precision )
+            if (files):
+                print( " ".join( files ) + ": " + filename )
+        # end
+
     else:
         # default is to generate files
         for filename in opts.args:

From 3394a3b187073458f5d2d22d9003db590a7584ad Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Fri, 17 Jan 2025 16:12:29 -0500
Subject: [PATCH 05/12] codegen: fix subs for newer python

---
 tools/subs.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/subs.py b/tools/subs.py
index e8b7f99b..aad67cd2 100644
--- a/tools/subs.py
+++ b/tools/subs.py
@@ -437,8 +437,8 @@ def title( table ):
     #('plasma_s',             'plasma_c'            ),
 
     # ----- Fortran examples
-    ('real\(',               'complex\(',          ),
-    ('\(transpose\(',        'conjg\(transpose\('  ),
+    (r'real\(',             r'complex\(',          ),
+    (r'\(transpose\(',      r'conjg\(transpose\('  ),
 
   ],  # end mixed
 
@@ -463,7 +463,7 @@ def title( table ):
     ('symmetric',            'symmetric',            'hermitian',            'hermitian'           ),
     ('symmetric',            'symmetric',            'Hermitian',            'Hermitian'           ),
     ('orthogonal',           'orthogonal',           'unitary',              'unitary'             ),
-    ('\^T',                  '\^T',                  '\^H',                  '\^H'                 ),
+    (r'\^T',                r'\^T',                 r'\^H',                 r'\^H'                 ),
     ('%f',                   '%lf',                  '%f',                   '%lf'                 ),  # for scanf
 
     # ----- CBLAS
@@ -551,8 +551,8 @@ def title( table ):
 
     # ----- Fortran examples
     ('wp = sp',              'wp = dp',              'wp = sp',              'wp = dp'             ),
-    ('real\(wp\)',           'real\(wp\)',           'complex\(wp\)',        'complex\(wp\)'       ),
-    ('\(transpose\(',        '\(transpose\(',        'conjg\(transpose\(',   'conjg\(transpose\('  ),
+    (r'real\(wp\)',         r'real\(wp\)',          r'complex\(wp\)',       r'complex\(wp\)'       ),
+    (r'\(transpose\(',      r'\(transpose\(',       r'conjg\(transpose\(',  r'conjg\(transpose\('  ),
 
   ],  # end normal
 } # end subs

From 67249bc3e505a80c1bf625baf9e636194c9ff6ce Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Fri, 17 Jan 2025 15:53:28 -0500
Subject: [PATCH 06/12] generate sstevx2 from dstevx2 instead of dummy zstevx2;
 similarly with others

---
 .gitignore                              |   4 -
 compute/{zlaebz2.c => dlaebz2.c}        | 131 ++++++++--------
 compute/{zlaneg2.c => dlaneg2.c}        |  57 ++++---
 compute/{zstevx2.c => dstevx2.c}        | 191 ++++++++++++------------
 test/{test_zstevx2.c => test_dstevx2.c} | 106 +++++++------
 5 files changed, 235 insertions(+), 254 deletions(-)
 rename compute/{zlaebz2.c => dlaebz2.c} (84%)
 rename compute/{zlaneg2.c => dlaneg2.c} (86%)
 rename compute/{zstevx2.c => dstevx2.c} (81%)
 rename test/{test_zstevx2.c => test_dstevx2.c} (84%)

diff --git a/.gitignore b/.gitignore
index e270d32a..768b2547 100644
--- a/.gitignore
+++ b/.gitignore
@@ -96,9 +96,7 @@ compute/dgetri.c
 compute/dgetri_aux.c
 compute/dgetrs.c
 compute/dlacpy.c
-compute/dlaebz2.c
 compute/dlag2s.c
-compute/dlaneg2.c
 compute/dlangb.c
 compute/dlange.c
 compute/dlansy.c
@@ -122,7 +120,6 @@ compute/dpotrs.c
 compute/dsgbsv.c
 compute/dsgesv.c
 compute/dsposv.c
-compute/dstevx2.c
 compute/dsymm.c
 compute/dsyr2k.c
 compute/dsyrk.c
@@ -606,7 +603,6 @@ test/test_ds.h
 test/test_dsgbsv.c
 test/test_dsgesv.c
 test/test_dsposv.c
-test/test_dstevx2.c
 test/test_dsymm.c
 test/test_dsyr2k.c
 test/test_dsyrk.c
diff --git a/compute/zlaebz2.c b/compute/dlaebz2.c
similarity index 84%
rename from compute/zlaebz2.c
rename to compute/dlaebz2.c
index 6b0b211a..f592422d 100644
--- a/compute/zlaebz2.c
+++ b/compute/dlaebz2.c
@@ -1,17 +1,17 @@
 /**
  *
- * @file 
+ * @file
  *
  *  PLASMA is a software package provided by:
  *  University of Tennessee, US,
  *
- * @precisions normal z -> s d 
+ * @precisions normal d -> s
  *
  **/
 
 #include "plasma.h"
 #include "plasma_internal.h"     /* needed for imin, imax. */
-#include "plasma_zlaebz2_work.h" /* work areas. */
+#include "plasma_dlaebz2_work.h" /* work areas. */
 
 #include <string.h>
 #include <omp.h>
@@ -22,29 +22,26 @@
  *
  * @ingroup plasma_gemm
  *
- *
- * This file is a z-template to generate s and d code.
- * Only s and d are compiled; not c or z. 
  * This code is not designed to be called directly by users; it is a subroutine
- * for zstevx2.c. 
+ * for dstevx2.c.
  *
  * Specifically, this is a task-based parallel algorithm, the parameters are
- * contained in the already initialized and populated zlaebz2_Control_t; For 
- * example, from zstevx2:
+ * contained in the already initialized and populated dlaebz2_Control_t; For
+ * example, from dstevx2:
  *
  *  #pragma omp parallel
  *  {
  *      #pragma omp single
  *      {
- *          plasma_zlaebz2(&Control, ...etc...);
+ *          plasma_dlaebz2(&Control, ...etc...);
  *      }
  *  }
  *
- *  
+ *
  *******************************************************************************
- *  
+ *
  * @param[in] *Control
- *          A pointer to the global variables needed. 
+ *          A pointer to the global variables needed.
  *
  * @param[in] Control->N
  *          int number of rows in the matrix.
@@ -66,20 +63,20 @@
  *              PlasmaVec if user desires eigenvectors computed.
  *
  * @param[in] Control->il
- *          int enum. The lowerBound of an index range if range is 
- *          PlasmaRangeI. 
+ *          int enum. The lowerBound of an index range if range is
+ *          PlasmaRangeI.
  *
  * @param[in] Control->iu
- *          int enum. The upperBound of an index range, if range is 
+ *          int enum. The upperBound of an index range, if range is
  *          PlasmaRangeI.
  *
  * @param[in] Control->stein_arrays
- *          array of [max_threads], type zlaebz2_Stein_Array_t, contains work
+ *          array of [max_threads], type dlaebz2_Stein_Array_t, contains work
  *          areas per thread for invoking _stein (inverse iteration to find
  *          eigenvectors).
  *
  * @param[in] Control->baseIdx
- *          The index of the least eigenvalue to be found in the bracket, 
+ *          The index of the least eigenvalue to be found in the bracket,
  *          used to calculate the offset into the return vectors/arrays.
  *
  * @param[out] Control->error
@@ -130,7 +127,7 @@
  *
  *
  * This algorithm uses Bisection by the Scaled Sturm Sequence, implemented in
- * plasma_zlaebz2, followed by the LAPACK routine _STEIN, which uses inverse
+ * plasma_dlaebz2, followed by the LAPACK routine _STEIN, which uses inverse
  * iteration to find the eigenvalue.  The initial 'bracket' parameters should
  * contain the full range for the eigenvalues we are to discover. The algorithm
  * is recursively task based, at each division the bracket is divided into two
@@ -151,7 +148,7 @@
  *****************************************************************************/
 
 /*******************************************************************************
- * Use LAPACK zstein to find a single eigenvector.  We may use this routine
+ * Use LAPACK dstein to find a single eigenvector.  We may use this routine
  * multiple times, so instead of allocating/freeing the work spaces repeatedly,
  * we have an array of pointers, per thread, to workspaces we allocate if not
  * already allocated for this thread. So we don't allocate more than once per
@@ -160,9 +157,9 @@
  * to converge.
 *******************************************************************************/
 
-int plasma_zstein( plasma_complex64_t *diag, plasma_complex64_t *offd, 
-        plasma_complex64_t u,     plasma_complex64_t *v, int N, 
-        zlaebz2_Stein_Array_t *myArrays) {
+int plasma_dstein( double *diag, double *offd,
+        double u,     double *v, int N,
+        dlaebz2_Stein_Array_t *myArrays) {
     int M=1, LDZ=N, INFO;
     int thread = omp_get_thread_num();
 
@@ -176,22 +173,22 @@ int plasma_zstein( plasma_complex64_t *diag, plasma_complex64_t *offd,
         if (myArrays[thread].ISPLIT != NULL) myArrays[thread].ISPLIT[0]=N;
     }
 
-    if (myArrays[thread].WORK   == NULL) myArrays[thread].WORK   = (plasma_complex64_t*) calloc(5*N, sizeof(plasma_complex64_t));
+    if (myArrays[thread].WORK   == NULL) myArrays[thread].WORK   = (double*) calloc(5*N, sizeof(double));
     if (myArrays[thread].IWORK  == NULL) myArrays[thread].IWORK  = (int*) calloc(N, sizeof(int));
     if (myArrays[thread].IFAIL  == NULL) myArrays[thread].IFAIL  = (int*) calloc(N, sizeof(int));
-    if (myArrays[thread].IBLOCK == NULL || 
-        myArrays[thread].ISPLIT == NULL || 
-        myArrays[thread].WORK   == NULL || 
-        myArrays[thread].IWORK  == NULL || 
+    if (myArrays[thread].IBLOCK == NULL ||
+        myArrays[thread].ISPLIT == NULL ||
+        myArrays[thread].WORK   == NULL ||
+        myArrays[thread].IWORK  == NULL ||
         myArrays[thread].IFAIL  == NULL) {
         return(PlasmaErrorOutOfMemory);
     }
 
-    plasma_complex64_t W = u;
+    double W = u;
 
-    /* We use the 'work' version so we can re-use our work arrays; using LAPACKE_zstein() */
-    /* would re-allocate and release work areas on every call.                            */ 
-    INFO = LAPACKE_zstein_work(LAPACK_COL_MAJOR, N, diag, offd, M, &W, myArrays[thread].IBLOCK, 
+    /* We use the 'work' version so we can re-use our work arrays; using LAPACKE_dstein() */
+    /* would re-allocate and release work areas on every call.                            */
+    INFO = LAPACKE_dstein_work(LAPACK_COL_MAJOR, N, diag, offd, M, &W, myArrays[thread].IBLOCK,
             myArrays[thread].ISPLIT, v, LDZ, myArrays[thread].WORK, myArrays[thread].IWORK,
             myArrays[thread].IFAIL);
     return(INFO);
@@ -213,23 +210,23 @@ int plasma_zstein( plasma_complex64_t *diag, plasma_complex64_t *offd,
  *                  nLT_Low or nLT_hi is computed.
  * ***************************************************************************/
 
-void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
-        plasma_complex64_t upperBound, int nLT_low, int nLT_hi, int numEV) {
+void plasma_dlaebz2(dlaebz2_Control_t *Control, double lowerBound,
+        double upperBound, int nLT_low, int nLT_hi, int numEV) {
 
-    plasma_complex64_t *diag = Control->diag;
-    plasma_complex64_t *offd = Control->offd;
+    double *diag = Control->diag;
+    double *offd = Control->offd;
     int    N = Control->N;
- 
-    plasma_complex64_t cp;
+
+    double cp;
     int flag=0, evLess;
 
     if (nLT_low < 0) {
-        nLT_low = plasma_zlaneg2(diag, offd, N, lowerBound);
+        nLT_low = plasma_dlaneg2(diag, offd, N, lowerBound);
         flag=1;
     }
 
     if (nLT_hi < 0) {
-        nLT_hi =  plasma_zlaneg2(diag, offd, N, upperBound);
+        nLT_hi =  plasma_dlaneg2(diag, offd, N, upperBound);
         flag=1;
     }
 
@@ -243,17 +240,17 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
     if (Control->range == PlasmaRangeI) {
         if (nLT_hi  < Control->il ||    /* e.g if il=500, and nLT_hi=499, this bracket is under range of interest. */
             nLT_low > Control->iu) {    /* e.g if iu=1000, and nLT_low=1001, this bracket is above range of interest. */
-            return; 
+            return;
         }
-    } 
-                
+    }
+
     /* Bisect the bracket until we can't anymore. */
-               
+
     flag = 0;
     for (;;) {
         cp = (lowerBound+upperBound)*0.5;
         if (cp == lowerBound || cp == upperBound) {
-            /* Our bracket has been narrowed to machine epsilon for this magnitude (=ulp). 
+            /* Our bracket has been narrowed to machine epsilon for this magnitude (=ulp).
              * We are done; the bracket is always [low,high). 'high' is not included, so
              * we have numEV eigenvalues at low, whether it == 1 or is > 1. We find
              * the eigenvector. (We can test multiplicity with GluedWilk).
@@ -261,13 +258,13 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
             break; /* exit for(;;). */
         } else {
             /* we have a new cutpoint. */
-            evLess = plasma_zlaneg2(diag, offd, N, cp);
+            evLess = plasma_dlaneg2(diag, offd, N, cp);
             if (evLess < 0) {
                 /* We could not compute the Sturm sequence for it. */
                 flag = -1; /* indicate an error. */
                 break; /* exit for (;;). */
             }
-        
+
             /* Discard empty halves in both PlasmaRangeV and PlasmaRangeI.
              * If #EV < cutpoint is the same as the #EV < high, it means
              * no EV are in [cutpoint, hi]. We can discard that range.
@@ -277,16 +274,16 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
                 upperBound = cp;
                 continue;
             }
-        
+
             /* If #EV < cutpoint is the same as #EV < low, it means no
-             * EV are in [low, cutpoint]. We can discard that range. 
+             * EV are in [low, cutpoint]. We can discard that range.
              */
 
             if (evLess == nLT_low) {
                 lowerBound = cp;
                 continue;
             }
-        
+
             /* Note: If we were PlasmaRangeV, the initial bounds given by the user are the ranges,
              * so we have nothing further to do. In PlasmaRangeI; the initial bounds are Gerschgorin
              * limits and not enough: We must further narrow to the desired indices.
@@ -295,8 +292,8 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
             if (Control->range == PlasmaRangeI) {
                 /* For PlasmaRangeI:
                  * Recall that il, iu are 1-relative; while evLess is zero-relative; i.e.
-                 * if [il,iu]=[1,2], evless must be 0, or 1. 
-                 * when evLess<cp == il-1, or just <il, cp is a good boundary and 
+                 * if [il,iu]=[1,2], evless must be 0, or 1.
+                 * when evLess<cp == il-1, or just <il, cp is a good boundary and
                  * we can discard the lower half.
                  *
                  * To judge the upper half, the cutpoint must be < iu, so if it is >= iu,
@@ -311,7 +308,7 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
                     numEV = (nLT_hi-nLT_low);
                     continue;
                 }
-        
+
                 if (evLess >= Control->iu) {
                     /* The upper half [cp, upperBound) is not needed, it has no indices > iu; */
                     upperBound = cp;
@@ -320,24 +317,24 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
                     continue;
                 }
             } /*end if index search. */
-        
+
             /* Here, the cutpoint has EV on both left right. We push off the right bracket.
-             * The new lowerBound is the cp, the upperBound is unchanged, the number of 
+             * The new lowerBound is the cp, the upperBound is unchanged, the number of
              * eigenvalues changes. */
             #pragma omp task
-                plasma_zlaebz2(Control, cp, upperBound, evLess, nLT_hi, (nLT_hi-evLess));
+                plasma_dlaebz2(Control, cp, upperBound, evLess, nLT_hi, (nLT_hi-evLess));
 
             /* Update the Left side I kept. The new number of EV less than upperBound
-             * is evLess, recompute number of EV in the bracket. */               
+             * is evLess, recompute number of EV in the bracket. */
             upperBound = cp;
             nLT_hi = evLess;
-            numEV =( evLess - nLT_low); 
-            continue; 
+            numEV =( evLess - nLT_low);
+            continue;
          }
     } /* end for (;;) for Bisection. */
-                
+
     /* Okay, count this eigenpair done, add to the Done list.
-     * NOTE: nLT_low is the global zero-relative index of 
+     * NOTE: nLT_low is the global zero-relative index of
      *       this set of mpcity eigenvalues.
      *       No other brackets can change our entry, so we
      *       don't need any thread block or atomicity.
@@ -349,24 +346,24 @@ void plasma_zlaebz2(zlaebz2_Control_t *Control, plasma_complex64_t lowerBound,
     } else { /* range == PlasmaRangeV */
         myIdx = nLT_low - Control->baseIdx;
     }
-    
+
     if (Control->jobtype == PlasmaVec) {
         /* get the eigenvector. */
-        int ret=plasma_zstein(diag, offd, lowerBound, &(Control->pVec[myIdx*N]), N, Control->stein_arrays);
+        int ret=plasma_dstein(diag, offd, lowerBound, &(Control->pVec[myIdx*N]), N, Control->stein_arrays);
         if (ret != 0) {
             #pragma omp critical (UpdateStack)
             {
-                /* Only store first error we encounter */ 
+                /* Only store first error we encounter */
                 if (Control->error == 0) Control->error = ret;
             }
         }
     }
-    
+
     /* Add eigenvalue and multiplicity. */
     Control->pVal[myIdx]=lowerBound;
     Control->pMul[myIdx]=numEV;
-    
-//    #pragma omp atomic 
+
+//    #pragma omp atomic
 //        Control->finished += numEV;
 }
 
diff --git a/compute/zlaneg2.c b/compute/dlaneg2.c
similarity index 86%
rename from compute/zlaneg2.c
rename to compute/dlaneg2.c
index 258292df..eafc5f5a 100644
--- a/compute/zlaneg2.c
+++ b/compute/dlaneg2.c
@@ -1,19 +1,14 @@
 /**
  *
- * @file 
+ * @file
  *
  *  PLASMA is a software package provided by:
  *  University of Tennessee, US,
  *
- * @precisions normal z -> s d 
+ * @precisions normal d -> s
  *
  **/
 
-/*
- * This file is a z-template to generate s and d code.
- * Only s and d are compiled; not c or z. 
- */
- 
 /******************************************************************************
  * See https://archive.siam.org/meetings/la03/proceedings/zhangjy3.pdf
  * "J. Zhang, 2003, The Scaled Sturm Sequence Computation".  Both the Sturm
@@ -28,22 +23,22 @@
  * p[-1] = 1.;             // zero relative indexing.
  * p[0] = diag[0] - u;
  * p[i] = (diag[i]-u)*p[i-1] - offd[i-1]*offd[i-1]*p[i-2], i=1, N-1.
- * 
+ *
  * The Classical Sturm recurrence can be shown as a matrix computation; namely
  * P[i] = M[i]*P[i-1]. Be careful of the i-1 index:
  * M[i] = [(diag[i]-u) , -offd[i-1]*offd[i-1] ] and P[i-1] = [ p[i-1] ]
  *        [          1 ,                    0 ]              [ p[i-2] ]
  * with P[-1] defined to be [1, 0] transposed.
  * notice 'p' is the classical Sturm, 'P' is a vector.
- * 
- * the matrix computation results in the vector: 
+ *
+ * the matrix computation results in the vector:
  * M[i]*P[i-1] = { (diag[i]-u)*p[i-1] -offd[i-1]*offd[i-1]*p[i-2] , p[i-1] }
- * 
+ *
  * So, in the classical case, P[i][0] is the classic Sturm sequence for p[i];
  * the second element is just the classic Sturm for p[i-1].
  *
- * However, this won't remain that way. For the SCALED Sturm sequence, we 
- * will scale P[i] after each calculation, with the scalar 's': 
+ * However, this won't remain that way. For the SCALED Sturm sequence, we
+ * will scale P[i] after each calculation, with the scalar 's':
  *
  * *********************************
  * P[i] = s * M[i]*P[i-1], i=0, N-1. Note we are scaling a vector here.
@@ -56,28 +51,28 @@
  * save = s * Pm1_0;
  * Pm1_0 = s * ( (diag[i]-u)*Pm1_0 -offd[i-1]*offd[i-1]*Pm1_1 );
  * Pm1_1 = save;
- 
+
  * Pm1_0 is used like the classical Sturm sequence; meaning we must calculate
  * sign changes.
- * 
+ *
  * s is computed given the vector X[] = M[i]*P[i-1] above.
  * PHI is set to 10^{10}, UPSILON is set to 10^{-10}. Then:
- *    w = max(fabs(X[0]), fabs(X[1])). 
+ *    w = max(fabs(X[0]), fabs(X[1])).
  *    if w > PHI then s = PHI/w;
  *    else if w < UPSILON then s = UPSILON/w;
  *    else s=1.0 (or, do not scale X).
- * 
+ *
  * This algorithm is backward stable. execution time is 1.5 times classic Sturm.
- * 
+ *
  * No sign change counts eigenvalues >= u.
  * sign changes count eigenvalues <  u.
  * This routine returns the number of sign changes, which is the count of
  * eigenvalues strictly less than u.
- * 
+ *
  * computation: What we need for each computation:
  * M[i], which we compute on the fly from diag[i] and offd[i-1].
  * P[i-1], which has two elements, [Pm1_0, Pm1_1]. (Pm1 means P minus 1).
- * LAPACK routine DLAEBZ computes a standard Sturm sequences; there is no 
+ * LAPACK routine DLAEBZ computes a standard Sturm sequence; there is no
  * comparable auto-scaling Sturm sequence.
  *
  * This routine is most similar to LAPACK DLANEG.f, but is not a replacement
@@ -92,13 +87,13 @@
 
 #include <math.h>
 
-int plasma_zlaneg2(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, plasma_complex64_t u) {
+int plasma_dlaneg2(double *diag, double *offd, int n, double u) {
     int i, isneg=0;
-    plasma_complex64_t s, w, v0, v1, Pm1_0, Pm1_1, PHI, UPSILON;
+    double s, w, v0, v1, Pm1_0, Pm1_1, PHI, UPSILON;
     if (n==0) return (0);
-    PHI = ((plasma_complex64_t)(((long long) 1)<<34));
+    PHI = ((double)(((long long) 1)<<34));
     UPSILON = 1.0/PHI;
- 
+
     Pm1_1 = 1.0;
     Pm1_0 = (diag[0]-u);
     if (Pm1_0 < 0) isneg = 1;  /* our first test. */
@@ -108,12 +103,12 @@ int plasma_zlaneg2(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, pl
         v1 = fabs(Pm1_1);
         if (v0 > v1) w = v0;
         else         w = v1;
- 
+
         /*Go ahead and calculate P[i]: */
         s = Pm1_0;
         Pm1_0 = (diag[i]-u)*Pm1_0 -((offd[i-1]*offd[i-1])*Pm1_1);
         Pm1_1 = s;
- 
+
         /* Now determine whether to scale these new values. */
         if (w > PHI) {
             s = PHI/w;
@@ -124,13 +119,13 @@ int plasma_zlaneg2(plasma_complex64_t *diag, plasma_complex64_t *offd, int n, pl
             Pm1_0 *= s;
             Pm1_1 *= s;
         } /* else skip scaling. */
- 
+
         /* Finally, see if the sign changed. */
-        if ( (Pm1_0 < 0 && Pm1_1 >= 0) ||  
+        if ( (Pm1_0 < 0 && Pm1_1 >= 0) ||
              (Pm1_0 >= 0 && Pm1_1 < 0)
-           ) isneg++;  
+           ) isneg++;
     }
-             
+
     return(isneg);
-} /* end plasma_zlaneg2 */
+} /* end plasma_dlaneg2 */
 
diff --git a/compute/zstevx2.c b/compute/dstevx2.c
similarity index 81%
rename from compute/zstevx2.c
rename to compute/dstevx2.c
index 87a065be..0a241083 100644
--- a/compute/zstevx2.c
+++ b/compute/dstevx2.c
@@ -1,23 +1,18 @@
 /**
  *
- * @file 
+ * @file
  *
  *  PLASMA is a software package provided by:
  *  University of Tennessee, US,
  *  University of Manchester, UK.
  *
- * @precisions normal z -> s d 
+ * @precisions normal d -> s
  *
  **/
 
-/*
- * This file is a z-template to generate s and d code.
- * Only s and d are compiled; not c or z. 
- */
- 
 #include "plasma.h"
 #include "plasma_internal.h"     /* needed for imin, imax. */
-#include "plasma_zlaebz2_work.h" /* work areas. */
+#include "plasma_dlaebz2_work.h" /* work areas. */
 
 #include <string.h>
 #include <omp.h>
@@ -34,14 +29,14 @@
  * eigenvectors can be selected by specifying either a range of values or a
  * range of indices for the desired eigenvalues.
  *
- * This is similiar to LAPACK dstevx, with more output parameters. 
+ * This is similar to LAPACK dstevx, with more output parameters.
  *
  * Because input matrices are expected to be extremely large and the exact
  * number of eigenvalues is not necessarily known to the caller, this routine
  * provides a way to get the number of eigenvalues in either a value range or
  * an index range; so the caller can allocate the return arrays. There are
  * three; the floating point vector pVal, the integer vector pMul, and the
- * floating point matrix pVec, which is only required and only referenced for 
+ * floating point matrix pVec, which is only required and only referenced for
  * jobtype=PLasmaVec.
  *
  * When the jobtype=PlasmaCount; the code returns the maximum number of
@@ -60,7 +55,7 @@
  *
  * Finding eigenvalues alone is much faster than finding eigenpairs; the
  * majority of the time consumed when eigenvectors are found is in
- * orthogonalizing the eigenvectors; an O(N*K^2) operation. 
+ * orthogonalizing the eigenvectors; an O(N*K^2) operation.
  *******************************************************************************
  *
  * @param[in] jobtype
@@ -68,22 +63,22 @@
  *          = PlasmaNoVec: computes eigenvalues only;
  *          = PlasmaVec:   computes eigenvalues and eigenvectors.
  *          = PlasmaCount: computes pFound as the max number of eigenvalues/pairs
- *                         in the given range if there is no ULP-multiplicity, so 
+ *                         in the given range if there is no ULP-multiplicity, so
  *                         the user can allocate pVal[], pMul[], pVec[].
  *
  * @param[in] range
  *          enum:
  *          PlasmaRangeV use vl, vu for range [vl, vu)
- *          PlasmaRangeI use il, iu for range [il, iu]. 1-relative; 1..N. 
+ *          PlasmaRangeI use il, iu for range [il, iu]. 1-relative; 1..N.
  *
  * @param[in] n
  *          int. The order of the matrix A. n >= 0.
  *
  * @param[in] k
  *          int. The space the user has allocated for eigenvalues; as reflected
- *          in pVal, pMul, pVec. 
+ *          in pVal, pMul, pVec.
  *
- * @param[in] diag double[n]. Vector of [n] diagonal entries of A. 
+ * @param[in] diag double[n]. Vector of [n] diagonal entries of A.
  *
  * @param[in] offd double[n-1]. A vector of [n-1] off-diagonal entries of A.
  *
@@ -137,53 +132,53 @@
  * with offd[-1], offd[n] = 0.
  * Indexes above are 0 relative.
  * Although Gerschgorin is mentioned in ?larr?.f LAPACK files, it is coded
- * inline there. 
+ * inline there.
  *****************************************************************************/
 
-void plasma_zstelg(plasma_complex64_t *diag,  plasma_complex64_t *offd, int n,
-        plasma_complex64_t *Min, plasma_complex64_t *Max) {
+void plasma_dstelg(double *diag,  double *offd, int n,
+        double *Min, double *Max) {
     int i;
-    plasma_complex64_t test, testdi, testdim1, min=__DBL_MAX__, max=-__DBL_MAX__;
- 
+    double test, testdi, testdim1, min=__DBL_MAX__, max=-__DBL_MAX__;
+
     for (i=0; i<n; i++) {
         if (i == 0) testdim1=0.;
         else        testdim1=offd[i-1];
-        
+
         if (i==(n-1)) testdi=0;
         else          testdi=offd[i];
-        
+
         test=diag[i] - fabs(testdi) - fabs(testdim1);
         if (test < min) {
             min=test;
-        } 
-        
+        }
+
         test=diag[i] + fabs(testdi) + fabs(testdim1);
         if (test > max) {
             max=test;
-        }      
+        }
     }
-       
- 
-    plasma_complex64_t cp, minLB=min, minUB=max, maxLB=min, maxUB=max;
+
+
+    double cp, minLB=min, minUB=max, maxLB=min, maxUB=max;
     /* Within that range, find the actual minimum. */
     for (;;) {
         cp = (minLB+minUB)*0.5;
         if (cp == minLB || cp == minUB) break;
-        if (plasma_zlaneg2(diag, offd, n, cp) == n) minLB = cp;
+        if (plasma_dlaneg2(diag, offd, n, cp) == n) minLB = cp;
         else                                      minUB = cp;
     }
-     
+
     /* Within that range, find the actual maximum. */
     for (;;) {
         cp = (maxLB+maxUB)*0.5;
         if (cp == maxLB || cp == maxUB) break;
-        if (plasma_zlaneg2(diag, offd, n, cp) == n) {
+        if (plasma_dlaneg2(diag, offd, n, cp) == n) {
             maxUB=cp;
         } else {
             maxLB=cp;
         }
     }
- 
+
     *Min = minLB;
     *Max = maxUB;
 }
@@ -191,7 +186,7 @@ void plasma_zstelg(plasma_complex64_t *diag,  plasma_complex64_t *offd, int n,
 /******************************************************************************
  * STMVM: Symmetric Tridiagonal Matrix Vector Multiply.
  * Matrix multiply; A * X = Y.
- * A = [diag[0], offd[0], 
+ * A = [diag[0], offd[0],
  *     [offd[0], diag[1], offd[1]
  *     [      0, offd[1], diag[2], offd[2],
  *     ...
@@ -201,12 +196,12 @@ void plasma_zstelg(plasma_complex64_t *diag,  plasma_complex64_t *offd, int n,
  * This could be done by 3 daxpy, but more code and I think more confusing.
  *****************************************************************************/
 
-void plasma_zstmv(plasma_complex64_t *diag, plasma_complex64_t *offd, int n,
-        plasma_complex64_t *X, plasma_complex64_t *Y) {
+void plasma_dstmv(double *diag, double *offd, int n,
+        double *X, double *Y) {
     int i;
     Y[0] = diag[0]*X[0] + offd[0]*X[1];
     Y[n-1] = offd[n-2]*X[n-2] + diag[n-1]*X[n-1];
- 
+
     for (i=1; i<(n-1); i++) {
         Y[i] = offd[i-1]*X[i-1] + diag[i]*X[i] + offd[i]*X[i+1];
     }
@@ -218,23 +213,23 @@ void plasma_zstmv(plasma_complex64_t *diag, plasma_complex64_t *offd, int n,
  * This routine is necessary to determine if eigenvectors should be swapped.
  * eigenpair error: If A*v = u*v, then A*v-u*v should == 0. We compute the
  * L_infinity norm of (A*v-u*v).
- * We return DBL_MAX if the eigenvector (v) is all zeros, or if we fail to 
- * allocate memory. 
- * If u==0.0, we'll return L_INF of (A*V). 
+ * We return DBL_MAX if the eigenvector (v) is all zeros, or if we fail to
+ * allocate memory.
+ * If u==0.0, we'll return L_INF of (A*V).
  *****************************************************************************/
 
-plasma_complex64_t plasma_zstepe(plasma_complex64_t *diag, 
-    plasma_complex64_t *offd, int n, plasma_complex64_t u, 
-    plasma_complex64_t *v) {
+double plasma_dstepe(double *diag,
+    double *offd, int n, double u,
+    double *v) {
     int i, zeros=0;
-    plasma_complex64_t *AV;
-    plasma_complex64_t norm, dtemp;
- 
-    AV = (plasma_complex64_t*) malloc(n * sizeof(plasma_complex64_t));
+    double *AV;
+    double norm, dtemp;
+
+    AV = (double*) malloc(n * sizeof(double));
     if (AV == NULL) return __DBL_MAX__;
-     
-    plasma_zstmv(diag, offd, n, v, AV); /* AV = A*v. */
- 
+
+    plasma_dstmv(diag, offd, n, v, AV); /* AV = A*v. */
+
     norm = -__DBL_MAX__;  /* Trying to find maximum. */
     zeros=0;
     for (i=0; i<n; i++) {
@@ -242,7 +237,7 @@ plasma_complex64_t plasma_zstepe(plasma_complex64_t *diag,
         if (dtemp > norm) norm=dtemp;
         if (v[i] == 0.) zeros++;
     }
- 
+
     free(AV);
     if (zeros == n) return __DBL_MAX__;
     return norm;
@@ -250,19 +245,19 @@ plasma_complex64_t plasma_zstepe(plasma_complex64_t *diag,
 
 
 /******************************************************************************
- * This is the main routine; plasma_zstevx2
- * Arguments are described at the top of this source. 
+ * This is the main routine; plasma_dstevx2
+ * Arguments are described at the top of this source.
  *****************************************************************************/
-int plasma_zstevx2(
+int plasma_dstevx2(
   /* error report */
   /* args 1 - 4 */ plasma_enum_t jobtype, plasma_enum_t range, int n, int k,
-  /* args 5,6   */ plasma_complex64_t *diag, plasma_complex64_t *offd,
-  /* args 7,8   */ plasma_complex64_t vl, plasma_complex64_t vu,
-  /* args 9 - 12*/ int il, int iu, int *pFound, plasma_complex64_t *pVal,
-  /* arg 13,14  */ int    *pMul, plasma_complex64_t *pVec)
+  /* args 5,6   */ double *diag, double *offd,
+  /* args 7,8   */ double vl, double vu,
+  /* args 9 - 12*/ int il, int iu, int *pFound, double *pVal,
+  /* arg 13,14  */ int    *pMul, double *pVec)
 {
     int i, max_threads;
-    zlaebz2_Stein_Array_t *stein_arrays = NULL;
+    dlaebz2_Stein_Array_t *stein_arrays = NULL;
     /* Get PLASMA context. */
     plasma_context_t *plasma = plasma_context_self();
     if (plasma == NULL) {
@@ -295,7 +290,7 @@ int plasma_zstevx2(
         plasma_error("illegal pointer offd");
         return -6;
     }
-    
+
     if (range == PlasmaRangeV && vu <= vl ) {
         plasma_error("illegal value of vl and vu");
         return -7;
@@ -323,13 +318,13 @@ int plasma_zstevx2(
 
     if (jobtype == PlasmaVec) {
         /* we use calloc because we rely on pointer elements being NULL to single */
-        /* a need to allocate.                                                    */ 
-        stein_arrays = (zlaebz2_Stein_Array_t*) calloc(max_threads, sizeof(zlaebz2_Stein_Array_t));
+        /* a need to allocate.                                                    */
+        stein_arrays = (dlaebz2_Stein_Array_t*) calloc(max_threads, sizeof(dlaebz2_Stein_Array_t));
         if (stein_arrays == NULL) {
             return PlasmaErrorOutOfMemory;
         }
     }
-        
+
     /* Initialize sequence. */
     plasma_sequence_t sequence;
     plasma_sequence_init(&sequence);
@@ -338,10 +333,10 @@ int plasma_zstevx2(
     plasma_request_t request;
     plasma_request_init(&request);
 
-    plasma_complex64_t globMinEval, globMaxEval; 
+    double globMinEval, globMaxEval;
 
-    zlaebz2_Control_t Control;
-    memset(&Control, 0, sizeof(zlaebz2_Control_t)); 
+    dlaebz2_Control_t Control;
+    memset(&Control, 0, sizeof(dlaebz2_Control_t));
     Control.N = n;
     Control.diag = diag;
     Control.offd = offd;
@@ -352,15 +347,15 @@ int plasma_zstevx2(
     Control.stein_arrays = stein_arrays;
 
     /* Find actual least and greatest eigenvalues. */
-    plasma_zstelg(Control.diag, Control.offd, Control.N, &globMinEval, &globMaxEval);
+    plasma_dstelg(Control.diag, Control.offd, Control.N, &globMinEval, &globMaxEval);
 
     int evLessThanVL=0, evLessThanVU=n, nEigVals=0;
     if (range == PlasmaRangeV) {
         /* We don't call Sturm if we already know the answer. */
-        if (vl >= globMinEval) evLessThanVL=plasma_zlaneg2(diag, offd, n, vl);
+        if (vl >= globMinEval) evLessThanVL=plasma_dlaneg2(diag, offd, n, vl);
         else vl = globMinEval; /* optimize for computing step size. */
 
-        if (vu <= globMaxEval) evLessThanVU=plasma_zlaneg2(diag, offd, n, vu);
+        if (vu <= globMaxEval) evLessThanVU=plasma_dlaneg2(diag, offd, n, vu);
         else vu = nexttoward(globMaxEval, __DBL_MAX__);  /* optimize for computing step size */
         /* Compute the number of eigenvalues in [vl, vu). */
         nEigVals = (evLessThanVU - evLessThanVL);
@@ -384,7 +379,7 @@ int plasma_zstevx2(
     /* Now if user's K (arg 4) isn't enough room, we have a problem. */
     if (k < nEigVals) {
         return -4;             /* problem with user's K value. */
-    }   
+    }
 
     /* We are going into discovery. Make sure we have arrays. */
     if (pVal == NULL) return -12;   /* pointers cannot be null. */
@@ -396,17 +391,17 @@ int plasma_zstevx2(
     Control.pVal = pVal;
     Control.pMul = pMul;
     Control.pVec = pVec;
-    
+
     /* We launch the root task: The full range to subdivide. */
     #pragma omp parallel
     {
         #pragma omp single
         {
-            #pragma omp task 
-                plasma_zlaebz2(&Control, vl, vu, -1, -1, nEigVals);
+            #pragma omp task
+                plasma_dlaebz2(&Control, vl, vu, -1, -1, nEigVals);
         }
     }
- 
+
     /* Now, all the eigenvalues should have unit eigenvectors in the array Control.pVec.
      * We don't need to sort that, but we do want to compress it; in case of multiplicity.
      * We compute the final number of eigenvectors in vectorsFound, and mpcity is recorded.
@@ -424,14 +419,14 @@ int plasma_zstevx2(
     /* compress the array in case vectorsFound < nEigVals (due to multiplicities).    */
     /* Note that pMul[] is initialized to zeros, if still zero, a multiplicity entry. */
     if (vectorsFound < nEigVals) {
-        int j=0;   
+        int j=0;
         for (i=0; i<nEigVals; i++) {
             if (pMul[i] > 0) {                          /* If this is NOT a multiplicity, */
                 pMul[j] = pMul[i];                      /* copy to next open slot j       */
-                pVal[j] = pVal[i];      
+                pVal[j] = pVal[i];
                 if (Control.jobtype == PlasmaVec) {
                     if (j != i) {
-                        memcpy(&pVec[j*Control.N], &pVec[i*Control.N], Control.N*sizeof(plasma_complex64_t));
+                        memcpy(&pVec[j*Control.N], &pVec[i*Control.N], Control.N*sizeof(double));
                     }
                 }
 
@@ -444,24 +439,24 @@ int plasma_zstevx2(
     plasma_desc_t T;
     int retqrf=0, retgqr=0;
 
-    retqrf = plasma_zgeqrf(Control.N, vectorsFound, /* This leaves pVec in compressed state of Q+R */
+    retqrf = plasma_dgeqrf(Control.N, vectorsFound, /* This leaves pVec in compressed state of Q+R */
         pVec, Control.N, &T);
 
     if (retqrf != 0) {
-        plasma_error("plasma_zgeqrf failed.");
+        plasma_error("plasma_dgeqrf failed.");
     } else {
         /* extract just the Q of the QR, in normal form, in workspace pQ */
-        plasma_complex64_t* pQ = (plasma_complex64_t*) malloc(Control.N * vectorsFound * sizeof(plasma_complex64_t));
-        retgqr = plasma_zungqr(Control.N, vectorsFound, vectorsFound,
+        double* pQ = (double*) malloc(Control.N * vectorsFound * sizeof(double));
+        retgqr = plasma_dorgqr(Control.N, vectorsFound, vectorsFound,
                       pVec, Control.N, T, pQ, Control.N);
 
         if (retgqr != 0) {
-            plasma_error("plasma_zungqr failed.");
+            plasma_error("plasma_dorgqr failed.");
         }
 
         /* copy orthonormal vectors from workspace pQ to pVec for user return. */
-        memcpy(pVec, pQ, Control.N*vectorsFound*sizeof(plasma_complex64_t));
-        free(pQ); 
+        memcpy(pVec, pQ, Control.N*vectorsFound*sizeof(double));
+        free(pQ);
         pQ = NULL;
     }
 
@@ -469,40 +464,40 @@ int plasma_zstevx2(
     if (retqrf || retgqr) goto Cleanup;
     /*************************************************************************
      * When eigenvalue are crowded, it is possible that after orthogonalizing
-     * vectors, it can be better to swap neighboring eigenvectors. We just 
-     * test all the pairs; basically ||(A*V-e*V)||_max is the error.  if BOTH 
+     * vectors, it can be better to swap neighboring eigenvectors. We just
+     * test all the pairs; basically ||(A*V-e*V)||_max is the error.  if BOTH
      * vectors in a pair have less error by being swapped, we swap them.
      ************************************************************************/
     int swaps=0;
     if (jobtype == PlasmaVec) {
-        int N = Control.N; 
-        plasma_complex64_t *Y = malloc(N * sizeof(plasma_complex64_t));
-        plasma_complex64_t test[4];
+        int N = Control.N;
+        double *Y = malloc(N * sizeof(double));
+        double test[4];
 
         for (i=0; i<vectorsFound-1; i++) {
             if (fabs(pVal[i+1]-pVal[i]) > 1.E-11) continue;
 
             /* We've tried to parallelize the following four tests
              * as four omp tasks. It works, but takes an average of
-             * 8% longer (~3.6 ms) than just serial execution. 
+             * 8% longer (~3.6 ms) than just serial execution.
              * omp schedule and taskwait overhead, I presume.
              */
 
-            test[0]= plasma_zstepe(Control.diag, Control.offd, N,
+            test[0]= plasma_dstepe(Control.diag, Control.offd, N,
                     pVal[i], &pVec[i*N]);
-            test[1] = plasma_zstepe(Control.diag, Control.offd, N,
+            test[1] = plasma_dstepe(Control.diag, Control.offd, N,
                     pVal[i+1], &pVec[(i+1)*N]);
-            
-            test[2] = plasma_zstepe(Control.diag, Control.offd, N,
+
+            test[2] = plasma_dstepe(Control.diag, Control.offd, N,
                     pVal[i], &pVec[(i+1)*N]);
-            test[3] = plasma_zstepe(Control.diag, Control.offd, N,
+            test[3] = plasma_dstepe(Control.diag, Control.offd, N,
                     pVal[i+1], &pVec[i*N]);
-            
+
             if ( (test[2] < test[0])         /* val1 with vec2 beats val1 with vec1 */
               && (test[3] < test[1]) ) {     /* val2 with vec1 beats val2 with vec2 */
-                memcpy(Y, &pVec[i*N], N*sizeof(plasma_complex64_t));
-                memcpy(&pVec[i*N], &pVec[(i+1)*N], N*sizeof(plasma_complex64_t));
-                memcpy(&pVec[(i+1)*N], Y, N*sizeof(plasma_complex64_t));
+                memcpy(Y, &pVec[i*N], N*sizeof(double));
+                memcpy(&pVec[i*N], &pVec[(i+1)*N], N*sizeof(double));
+                memcpy(&pVec[(i+1)*N], Y, N*sizeof(double));
                 swaps++;
             }
         } /* end swapping. */
diff --git a/test/test_zstevx2.c b/test/test_dstevx2.c
similarity index 84%
rename from test/test_zstevx2.c
rename to test/test_dstevx2.c
index 54efda93..d7f4dda4 100644
--- a/test/test_zstevx2.c
+++ b/test/test_dstevx2.c
@@ -6,7 +6,7 @@
  *  University of Tennessee, US,
  *  University of Manchester, UK.
  *
- * @precisions normal z -> s d 
+ * @precisions normal d -> s
  *
  **/
 #include "test.h"
@@ -23,34 +23,32 @@
 
 #include <omp.h>
 
-#define COMPLEX
+#define REAL
 
 /******************************************************************************
- * Matrix detailed in Kahan; et al. 
+ * Matrix detailed in Kahan; et al.
  * Matrix Test: diag=[+x,-x,+x,-x,...+x,-x] for any real x, but Kahan chooses
  *                                          a tiny x.
  *              offd=[1,1,...1]
- * Dimension: n. 
+ * Dimension: n.
  * Computed eigenvalues:
- * evalue[k] = [ x*x + 4*cos(k/(n+1))^2 ] ^(1/2), 
+ * evalue[k] = [ x*x + 4*cos(k/(n+1))^2 ] ^(1/2),
  * evalue[n+1-k] = -evalue[k], for k=1,[n/2],
  * evalue[(n+1)/2] = 0 if n is odd.
  * Note k is 1-relative in these formulations.
  * The eigenvalues range from (-2,+2).
  * Note: This routine verified to match documentation for n=4,8,12,24.
- * Note: This code is a template, it is not intended to work in complex
- *       arithmetic, it is only to be translated to either single or double.
  *****************************************************************************/
 
-static void testMatrix_Kahan(plasma_complex64_t* diag, plasma_complex64_t *offd, 
-            plasma_complex64_t* evalue, lapack_int n, plasma_complex64_t myDiag) {
+static void testMatrix_Kahan(double* diag, double *offd,
+            double* evalue, lapack_int n, double myDiag) {
    lapack_int i,k;
    for (k=1; k<=(n/2); k++) {
-      plasma_complex64_t ev;
+      double ev;
       ev = (M_PI*k+0.)/(n+1.0); /* angle in radians.                       */
       ev = cos(ev);             /* cos(angle)                              */
       ev *= 4.*ev;              /* 4*cos^2(angle)                          */
-      ev += myDiag*myDiag;      /* x^2 + 4*cos^2(angle)                    */ 
+      ev += myDiag*myDiag;      /* x^2 + 4*cos^2(angle)                    */
       ev = sqrt(ev);            /* (x^2 + 4*cos^2(angle))^(0.5)            */
       /* we reverse the -ev and ev here, to get in ascending sorted order. */
       evalue[k-1] = -ev;
@@ -72,40 +70,40 @@ static void testMatrix_Kahan(plasma_complex64_t* diag, plasma_complex64_t *offd,
 
 /******************************************************************************
  * This tests an eigenvector X for the eigenvalue lambda.
- * We should have A*X = lambda*X. Thus, (A*X)/lambda = X. 
+ * We should have A*X = lambda*X. Thus, (A*X)/lambda = X.
  * We perform the matrix multiply for each element X[i], and divide the result
  * by lambda, yielding mmRes[i] which should equal X[i]. We sum the squares of
  * these results, and the squares of X[i], to compute the Frobenious Norm. We
  * return the absolute difference of these norms as the error in the vector.
  *
  * Matrix multiply; A * X = Y.
- * A = [diag[0], offd[0], 
+ * A = [diag[0], offd[0],
  *     [offd[0], diag[1], offd[1]
  *     [      0, offd[1], diag[2], offd[2],
  *     ...
  *     [ 0...0                     offd[n-2], diag[n-1] ]
  *****************************************************************************/
 
-static double testEVec(plasma_complex64_t *diag, plasma_complex64_t *offd, 
-              int n, plasma_complex64_t *X, plasma_complex64_t lambda) {
+static double testEVec(double *diag, double *offd,
+              int n, double *X, double lambda) {
     int i;
     double mmRes, vmRes, error, sumMM=0., sumVec=0., invLambda = 1.0/lambda;
 
     mmRes = (diag[0]*X[0] + offd[0]*X[1])*invLambda;
     vmRes = X[0];
     sumMM += mmRes*mmRes;
-    sumVec += vmRes*vmRes; 
+    sumVec += vmRes*vmRes;
 
     mmRes = (offd[n-2]*X[n-2] + diag[n-1]*X[n-1])*invLambda;
     vmRes = X[n-1];
     sumMM += mmRes*mmRes;
-    sumVec += vmRes*vmRes; 
- 
+    sumVec += vmRes*vmRes;
+
     for (i=1; i<(n-1); i++) {
         mmRes = (offd[i-1]*X[i-1] + diag[i]*X[i] + offd[i]*X[i+1])*invLambda;
         vmRes = X[i];
         sumMM += mmRes*mmRes;
-        sumVec += vmRes*vmRes; 
+        sumVec += vmRes*vmRes;
     }
 
     sumMM = sqrt(sumMM);
@@ -116,7 +114,7 @@ static double testEVec(plasma_complex64_t *diag, plasma_complex64_t *offd,
 
 
 /***************************************************************************//**
- * @brief Tests ZSTEVX2.
+ * @brief Tests DSTEVX2.
  *
  * @param[in,out] param - array of parameters
  * @param[in]     run - whether to run test
@@ -124,7 +122,7 @@ static double testEVec(plasma_complex64_t *diag, plasma_complex64_t *offd,
  * Sets used flags in param indicating parameters that are used.
  * If run is true, also runs test and stores output parameters.
  ******************************************************************************/
-void test_zstevx2(param_value_t param[], bool run)
+void test_dstevx2(param_value_t param[], bool run)
 {
     int i,j;
     /*****************************************************************
@@ -149,34 +147,34 @@ void test_zstevx2(param_value_t param[], bool run)
     /*****************************************************************
      * Allocate and initialize arrays.
      ****************************************************************/
-    plasma_complex64_t *Diag =
-        (plasma_complex64_t*)malloc((size_t)m*sizeof(plasma_complex64_t));
+    double *Diag =
+        (double*)malloc((size_t)m*sizeof(double));
     assert(Diag != NULL);
 
-    plasma_complex64_t *Offd =
-        (plasma_complex64_t*)malloc((size_t)(m-1)*sizeof(plasma_complex64_t));
+    double *Offd =
+        (double*)malloc((size_t)(m-1)*sizeof(double));
     assert(Offd != NULL);
 
-    plasma_complex64_t *eigenvalues =
-        (plasma_complex64_t*)malloc((size_t)m*sizeof(plasma_complex64_t));
+    double *eigenvalues =
+        (double*)malloc((size_t)m*sizeof(double));
     assert(eigenvalues != NULL);
 
-    plasma_complex64_t *pVal = 
-        (plasma_complex64_t*)malloc((size_t)m*sizeof(plasma_complex64_t));
+    double *pVal =
+        (double*)malloc((size_t)m*sizeof(double));
     assert(pVal != NULL);
 
     int *pMul = (int*)malloc((size_t)m*sizeof(int));
     assert(pMul != NULL);
 
     /**************************************************************************
-     * Kahan has eigenvalues from [-2.0 to +2.0]. However, eigenvalues are 
+     * Kahan has eigenvalues from [-2.0 to +2.0]. However, eigenvalues are
      * dense near -2.0 and +2.0, so for large matrices, the density may cause
      * eigenvalues separated by less than machine precision, which causes us
      * multiplicity (eigenvalues are identical at machine precision). We first
-     * see this in single precision at m=14734, with a multiplicity of 2. 
+     * see this in single precision at m=14734, with a multiplicity of 2.
      *************************************************************************/
 
-    plasma_complex64_t myDiag=1.e-5;
+    double myDiag=1.e-5;
     testMatrix_Kahan(Diag, Offd, eigenvalues, m, myDiag);
     double minAbsEV=__DBL_MAX__, maxAbsEV=0., Kond;
     for (i=0; i<m; i++) {
@@ -187,17 +185,17 @@ void test_zstevx2(param_value_t param[], bool run)
 
     lapack_int nEigVals=0, vectorsFound=0;
     lapack_int il=0, iu=500;
-    plasma_complex64_t vl=1.5, vu=2.01;
-    plasma_complex64_t *pVec = NULL;
+    double vl=1.5, vu=2.01;
+    double *pVec = NULL;
 
     /**************************************************************************
-     * Get the number of eigenvalues in a value range. Note these can include 
-     * multiplicity; the number of unique eigenvectors will be discovered by 
+     * Get the number of eigenvalues in a value range. Note these can include
+     * multiplicity; the number of unique eigenvectors will be discovered by
      * plasma_dstevx2.
      *************************************************************************/
 
     lapack_int ret;
-    ret=plasma_zstevx2(
+    ret=plasma_dstevx2(
             PlasmaCount,    /* Type of call (1)         */
             PlasmaRangeV,   /* Range type (2)           */
             m, 0,           /* N, k (3,4)               */
@@ -208,9 +206,9 @@ void test_zstevx2(param_value_t param[], bool run)
             pVal,           /* p eigenvals array. (12)  */
             pMul,           /* p eigenMult array  (13)  */
             pVec);          /* p eigenVec  array  (14)  */
-    
+
     if (nEigVals < 1) {
-        plasma_error("plasma_zstevx2() found no eigenvalues for test matrix.");
+        plasma_error("plasma_dstevx2() found no eigenvalues for test matrix.");
         param[PARAM_TIME].d    = 0.0;
         param[PARAM_GFLOPS].d  = 0.0;
         param[PARAM_ERROR].d   = 1.0;
@@ -220,16 +218,16 @@ void test_zstevx2(param_value_t param[], bool run)
 
     /**************************************************************************
      * We allocate pVec late, we cannot afford to allocate m*m entries
-     * (to cover every possibility) when m is huge. 
+     * (to cover every possibility) when m is huge.
      *************************************************************************/
 
-    pVec = (plasma_complex64_t*)malloc((size_t)m*nEigVals*sizeof(plasma_complex64_t));
+    pVec = (double*)malloc((size_t)m*nEigVals*sizeof(double));
     assert(pVec != NULL);
 
     /* Run and time plasma_dstevx2, range based on values. */
     plasma_time_t start = omp_get_wtime();
 
-    ret=plasma_zstevx2(
+    ret=plasma_dstevx2(
             PlasmaVec,     /* Type of call (1)          */
             PlasmaRangeV,  /* Range type (2)            */
             m, nEigVals,   /* N, k (3,4)                */
@@ -246,7 +244,7 @@ void test_zstevx2(param_value_t param[], bool run)
 
     if (ret != 0) {
         char errstr[128];
-        sprintf(errstr, "plasma_zstevx2() failed returned %i", ret);
+        sprintf(errstr, "plasma_dstevx2() failed returned %i", ret);
         plasma_error(errstr);
         param[PARAM_TIME].d    = 0.0;
         param[PARAM_GFLOPS].d  = 0.0;
@@ -271,13 +269,13 @@ void test_zstevx2(param_value_t param[], bool run)
          * Find worst eigenvalue error. However, we must worry about
          * multiplicity. In single precision this first occurs at m=14734, with
          * vl=1.5, vu=2.01; mpcity=2. At m=75000, vl=1.5, vu=2.01, mpcity=10.
-         * We must also worry about the magnitude of eigenvalues; machine 
+         * We must also worry about the magnitude of eigenvalues; machine
          * epsilon for large eigenvalues is much greater than for small ones.
          *********************************************************************/
 
-        plasma_complex64_t worstEigenvalue_error = 0, worstEigenvalue_eps;
+        double worstEigenvalue_error = 0, worstEigenvalue_eps;
         lapack_int worstEigenvalue_index = 0, worstEigenvalue_mpcty = 0, max_mpcty = 0;
-        plasma_complex64_t worstEigenvector_error = 0;
+        double worstEigenvector_error = 0;
         lapack_int worstEigenvector_index = 0;
         i=0;
         lapack_int evIdx=m-nEigVals;
@@ -286,19 +284,19 @@ void test_zstevx2(param_value_t param[], bool run)
 
             for (j=0; j<pMul[i]; j++) {
                 double ev_eps = nexttoward(fabs(eigenvalues[evIdx]), __DBL_MAX__) - fabs(eigenvalues[evIdx]);
-                plasma_complex64_t error = fabs(pVal[i]-eigenvalues[evIdx]) / ev_eps;
+                double error = fabs(pVal[i]-eigenvalues[evIdx]) / ev_eps;
                 if (error > worstEigenvalue_error) {
                     worstEigenvalue_index = i;
                     worstEigenvalue_error = error;
-                    worstEigenvalue_eps = ev_eps; 
+                    worstEigenvalue_eps = ev_eps;
                     worstEigenvalue_mpcty = pMul[i];
                 }
 
                 evIdx++; /* advance known eigenvalue index for a multiplicity. */
                 if (evIdx == m) break;
             }
-           
-            i++; /* advance to next discovered eigenvalue. */         
+
+            i++; /* advance to next discovered eigenvalue. */
         }
 
         /**********************************************************************
@@ -320,14 +318,14 @@ void test_zstevx2(param_value_t param[], bool run)
          * being too liberal. Obviously this is related to the number of bits
          * of error in the result. The condition number (Kond) of the Kahan
          * matrix also grows nearly linearly with m; Kond is computed above.
-         *********************************************************************/ 
+         *********************************************************************/
 
         for (i=0; i<vectorsFound; i++) {
             double vErr;
             vErr=testEVec(Diag, Offd, m, &pVec[m*i], pVal[i]);
 
             if (vErr > worstEigenvector_error) {
-                worstEigenvector_error = vErr; 
+                worstEigenvector_error = vErr;
                 worstEigenvector_index = i;
             }
         }
@@ -342,7 +340,7 @@ void test_zstevx2(param_value_t param[], bool run)
     /*****************************************************************
      * Free arrays.
      ****************************************************************/
-TestingDone: 
+TestingDone:
     if (Diag != NULL) free(Diag);
     if (Offd != NULL) free(Offd);
     if (eigenvalues != NULL) free(eigenvalues);
@@ -352,5 +350,5 @@ void test_zstevx2(param_value_t param[], bool run)
 
     if (test) {
         /* free any test specific matrices; currently none. */
-    }  
+    }
 }

From 11c575a96c09a627e29cad3e07a4a8dd5769de73 Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Fri, 17 Jan 2025 15:41:08 -0500
Subject: [PATCH 07/12] cmake: generate files

---
 CMakeLists.txt               | 633 +++++++++++++++++++++--------------
 tools/generate_precisions.py |  43 ---
 2 files changed, 375 insertions(+), 301 deletions(-)
 delete mode 100644 tools/generate_precisions.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 102e71ea..6bc02698 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,11 @@ project( PLASMA VERSION 24.8.7 LANGUAGES C
 
 set(CMAKE_SUPPRESS_REGENERATION on)
 
+if (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.31.0)
+    cmake_policy( SET CMP0171 NEW ) # recognize CMake's `codegen` target
+    set( CODEGEN "CODEGEN" )
+endif()
+
 if (${CMAKE_VERSION} VERSION_GREATER 3.11.99)
   cmake_policy(PUSH)
   cmake_policy(SET CMP0074 NEW) # allows to use CBLAS_ROOT and LAPACKE_ROOT
@@ -13,21 +18,12 @@ endif()
 #set( CMAKE_THREAD_PREFER_PTHREAD 1 )
 #find_package( Threads )
 
-if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/compute/scamax.c")
-  message( STATUS "Some generated files already exist, proceeding" )
-else ()
-  message( STATUS "Missing files some precision files, trying to generate" )
-
-  include( FindPython )  # requires CMake 3.12
-
-  if (Python_FOUND)
-      message( STATUS "Found Python interpreter wth ID ${Python_INTERPRETER_ID} and EXE ${Python_EXECUTABLE}" )
-      message( STATUS "Generating files for all precisions. This may take a few minutes." )
-      execute_process(COMMAND "${Python_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/tools/generate_precisions.py" WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
-  else ()
+include( FindPython )  # requires CMake 3.12
+if (Python_FOUND)
+    message( STATUS "Found Python interpreter with ID ${Python_INTERPRETER_ID} and EXE ${Python_EXECUTABLE}" )
+else()
     message( FATAL_ERROR "Couldn't find Python interpreter, cannot generate all precision files." )
-  endif ()
-endif ()
+endif()
 
 # PLASMA uses C99 features (in-loop definition of for loop variables)
 if (CMAKE_VERSION VERSION_LESS "3.1")
@@ -264,139 +260,229 @@ else()
   message(FATAL_ERROR "OpenMP not found.")
 endif()
 
-add_library(plasma SHARED include/plasma.h
-compute/clag2z.c compute/dzamax.c compute/scamax.c compute/samax.c compute/damax.c compute/pclag2z.c compute/pdzamax.c
-compute/pzdesc2ge.c compute/pzdesc2pb.c compute/pzdesc2tr.c compute/pzgbtrf.c
-compute/pzge2desc.c compute/pzgeadd.c compute/pzgelqf.c compute/pzgelqf_tree.c
-compute/pzgemm.c compute/pzgeqrf.c compute/pzgeqrf_tree.c compute/pzgeswp.c
-compute/pzgetrf.c compute/pzgetri_aux.c compute/pzhemm.c compute/pzher2k.c
-compute/pzherk.c compute/pzhetrf_aasen.c compute/pzlacpy.c compute/pzlag2c.c
-compute/pzlangb.c compute/pzlange.c compute/pzlanhe.c compute/pzlansy.c
-compute/pzlantr.c compute/pzlascl.c compute/pzlaset.c compute/pzlauum.c
-compute/pzpb2desc.c compute/pzpbtrf.c compute/pzpotrf.c compute/pzsymm.c
-compute/pzsyr2k.c compute/pzsyrk.c compute/pztbsm.c compute/pztr2desc.c
-compute/pztradd.c compute/pztrmm.c compute/pztrsm.c compute/pztrtri.c
-compute/pzunglq.c compute/pzunglq_tree.c compute/pzungqr.c
-compute/pzungqr_tree.c compute/pzunmlq.c compute/pzunmlq_tree.c
-compute/pzunmqr.c compute/pzunmqr_tree.c compute/zcgbsv.c compute/zcgesv.c
-compute/zcposv.c compute/zdesc2ge.c compute/zdesc2pb.c compute/zdesc2tr.c
-compute/zgbsv.c compute/zgbtrf.c compute/zgbtrs.c compute/zge2desc.c
-compute/zgeadd.c compute/zgeinv.c compute/zgelqf.c compute/zgelqs.c
-compute/zgels.c compute/zgemm.c compute/zgeqrf.c compute/zgeqrs.c
-compute/zgesv.c compute/zgeswp.c compute/zgetrf.c compute/zgetri_aux.c
-compute/zgetri.c compute/zgetrs.c compute/zhemm.c compute/zher2k.c
-compute/zherk.c compute/zhesv.c compute/zhetrf.c compute/zhetrs.c
-compute/zlacpy.c compute/clag2z.c compute/zlag2c.c compute/zlangb.c compute/zlange.c
-compute/zlanhe.c compute/zlansy.c compute/zlantr.c compute/zlascl.c
-compute/zlaset.c compute/zlauum.c compute/zpb2desc.c compute/zpbsv.c
-compute/zpbtrf.c compute/zpbtrs.c compute/zpoinv.c compute/zposv.c
-compute/zpotrf.c compute/zpotri.c compute/zpotrs.c compute/zsymm.c
-compute/zsyr2k.c compute/zsyrk.c compute/ztr2desc.c compute/ztradd.c
-compute/ztrmm.c compute/ztrsm.c compute/ztrtri.c compute/zunglq.c
-compute/zungqr.c compute/zunmlq.c compute/zunmqr.c compute/cgelqf.c
-compute/cgemm.c compute/cgeqrf.c compute/cpotrf.c compute/cpotrs.c
-compute/csymm.c compute/csyr2k.c compute/csyrk.c compute/ctradd.c
-compute/ctrmm.c compute/ctrsm.c compute/ctrtri.c compute/cunglq.c
-compute/cungqr.c compute/cunmlq.c compute/cunmqr.c compute/dgelqf.c
-compute/dgemm.c compute/dgeqrf.c compute/dorglq.c compute/dorgqr.c
-compute/dormlq.c compute/dormqr.c compute/dpotrf.c compute/dpotrs.c
-compute/dsymm.c compute/dsyr2k.c compute/dsyrk.c compute/dtradd.c
-compute/dtrmm.c compute/dtrsm.c compute/dtrtri.c compute/sgelqf.c
-compute/sgemm.c compute/sgeqrf.c compute/sorglq.c compute/sorgqr.c
-compute/sormlq.c compute/sormqr.c compute/spotrf.c compute/spotrs.c
-compute/ssymm.c compute/ssyr2k.c compute/ssyrk.c compute/stradd.c
-compute/strmm.c compute/strsm.c compute/strtri.c
-compute/dsposv.c compute/dgbsv.c compute/cgbsv.c compute/sgbsv.c
-compute/dgbtrf.c compute/dgbtrs.c compute/cgbtrf.c compute/cgbtrs.c
-compute/sgbtrf.c compute/sgbtrs.c compute/dgeadd.c compute/cgeadd.c
-compute/sgeadd.c compute/dgeinv.c compute/cgeinv.c compute/sgeinv.c
-compute/dgelqs.c compute/cgelqs.c compute/sgelqs.c compute/dgels.c
-compute/cgels.c compute/sgels.c compute/dgeqrs.c compute/cgeqrs.c
-compute/sgeqrs.c compute/dsgesv.c compute/dsgbsv.c compute/dgesv.c
-compute/cgesv.c compute/sgesv.c compute/dgetrf.c compute/cgetrf.c
-compute/sgetrf.c compute/dgetri.c compute/cgetri.c compute/sgetri.c
-compute/dgetri_aux.c compute/cgetri_aux.c compute/sgetri_aux.c
-compute/dgetrf.c compute/dgetrs.c compute/cgetrf.c compute/cgetrs.c
-compute/sgetrf.c compute/sgetrs.c compute/chemm.c compute/cher2k.c
-compute/cherk.c compute/dsytrf.c compute/dsytrs.c compute/chetrf.c
-compute/chetrs.c compute/ssytrf.c compute/ssytrs.c compute/dsysv.c
-compute/chesv.c compute/ssysv.c compute/dlacpy.c compute/clacpy.c
-compute/slacpy.c compute/dlag2s.c compute/slag2d.c compute/dlange.c
-compute/clange.c compute/slange.c compute/clanhe.c compute/dlansy.c
-compute/clansy.c compute/slansy.c compute/dlantr.c compute/clantr.c
-compute/slantr.c compute/dlascl.c compute/clascl.c compute/slascl.c
-compute/dlaset.c compute/claset.c compute/slaset.c compute/dgeswp.c
-compute/cgeswp.c compute/sgeswp.c compute/dlauum.c compute/clauum.c
-compute/slauum.c compute/dpbsv.c compute/cpbsv.c compute/spbsv.c
-compute/dpbtrf.c compute/dpbtrs.c compute/cpbtrf.c compute/cpbtrs.c
-compute/spbtrf.c compute/spbtrs.c compute/dlangb.c compute/clangb.c
-compute/slangb.c compute/dposv.c compute/cposv.c compute/sposv.c
-compute/dpoinv.c compute/cpoinv.c compute/spoinv.c compute/dpotri.c
-compute/cpotri.c compute/spotri.c
-compute/slaebz2.c compute/dlaebz2.c
-compute/slaneg2.c compute/dlaneg2.c
-compute/sstevx2.c compute/dstevx2.c
-compute/pslange.c compute/pclaset.c compute/psorglq_tree.c
-compute/psormqr_tree.c compute/pdgelqf_tree.c compute/pslag2d.c
-compute/pcunmqr_tree.c compute/psgeqrf_tree.c compute/pspotrf.c
-compute/pdsytrf_aasen.c compute/pslauum.c compute/pssytrf_aasen.c
-compute/pstrsm.c compute/psgeqrf.c compute/pcgelqf_tree.c
-compute/pcunglq_tree.c compute/pctrmm.c compute/pstrtri.c
-compute/pcungqr_tree.c compute/pcsymm.c compute/psormqr.c compute/pdgemm.c
-compute/pdlacpy.c compute/psgeadd.c compute/pdtrmm.c compute/pcungqr.c
-compute/pcgemm.c compute/pslansy.c compute/pdtradd.c compute/pdormqr_tree.c
-compute/pdtbsm.c compute/psormlq.c compute/pdpotrf.c compute/pcunglq.c
-compute/pchemm.c compute/psgeswp.c compute/pcher2k.c compute/pdgetri_aux.c
-compute/pcgeqrf_tree.c compute/pdorglq.c compute/pdlange.c
-compute/pcunmlq_tree.c compute/psgetrf.c compute/pdgeqrf.c compute/pdlauum.c
-compute/pdlaset.c compute/pclascl.c compute/pclauum.c compute/pcgeadd.c
-compute/pdorglq_tree.c compute/pdgetrf.c compute/pdtrsm.c compute/psorglq.c
-compute/pslangb.c compute/pdormlq_tree.c compute/pcherk.c compute/pcpbtrf.c
-compute/psgemm.c compute/pdgeqrf_tree.c compute/pdlascl.c compute/pdsyr2k.c
-compute/pdlantr.c compute/pdgeadd.c compute/pclansy.c compute/psgetri_aux.c
-compute/pclantr.c compute/pstradd.c compute/pcgbtrf.c compute/pcsyrk.c
-compute/pctradd.c compute/psgelqf_tree.c compute/pslantr.c compute/pdlag2s.c compute/pslag2d.c
-compute/pchetrf_aasen.c compute/pssymm.c compute/pcunmqr.c compute/pclacpy.c
-compute/pdsyrk.c compute/pcsyr2k.c compute/pdgelqf.c compute/pdamax.c
-compute/pslacpy.c compute/pdormqr.c compute/pctrsm.c compute/pclangb.c
-compute/pdlangb.c compute/pscamax.c compute/pdpbtrf.c compute/pcgeqrf.c
-compute/pdgbtrf.c compute/psamax.c compute/pslascl.c compute/psgbtrf.c
-compute/pdgeswp.c compute/pspbtrf.c compute/pctbsm.c compute/pdorgqr.c
-compute/pcgelqf.c compute/pcpotrf.c compute/pstbsm.c compute/pstrmm.c
-compute/pssyr2k.c compute/pclange.c compute/psorgqr.c compute/psormlq_tree.c
-compute/pssyrk.c compute/pdorgqr_tree.c compute/pdsymm.c compute/pslaset.c
-compute/pdlansy.c compute/pcgeswp.c compute/psorgqr_tree.c compute/pctrtri.c
-compute/pcgetri_aux.c compute/pdormlq.c compute/pcunmlq.c compute/pcgetrf.c
-compute/pclanhe.c compute/pdtrtri.c compute/psgelqf.c
-compute/zdesc2ge.c compute/zdesc2pb.c compute/zdesc2tr.c
-compute/cdesc2ge.c compute/cdesc2pb.c compute/cdesc2tr.c
-compute/ddesc2ge.c compute/ddesc2pb.c compute/ddesc2tr.c
-compute/sdesc2ge.c compute/sdesc2pb.c compute/sdesc2tr.c
-compute/pzdesc2ge.c compute/pzdesc2pb.c compute/pzdesc2tr.c
-compute/pcdesc2ge.c compute/pcdesc2pb.c compute/pcdesc2tr.c
-compute/pddesc2ge.c compute/pddesc2pb.c compute/pddesc2tr.c
-compute/psdesc2ge.c compute/psdesc2pb.c compute/psdesc2tr.c
-compute/zge2desc.c compute/zpb2desc.c compute/ztr2desc.c
-compute/cge2desc.c compute/cpb2desc.c compute/ctr2desc.c
-compute/dge2desc.c compute/dpb2desc.c compute/dtr2desc.c
-compute/sge2desc.c compute/spb2desc.c compute/str2desc.c
-compute/pzge2desc.c compute/pzpb2desc.c compute/pztr2desc.c
-compute/pcge2desc.c compute/pcpb2desc.c compute/pctr2desc.c
-compute/pdge2desc.c compute/pdpb2desc.c compute/pdtr2desc.c
-compute/psge2desc.c compute/pspb2desc.c compute/pstr2desc.c
-compute/zgbmm.c compute/dgbmm.c compute/sgbmm.c compute/cgbmm.c
-compute/zgbset.c compute/dgbset.c compute/sgbset.c compute/cgbset.c
-compute/zgb2desc.c compute/dgb2desc.c compute/sgb2desc.c compute/cgb2desc.c
-compute/pzgb2desc.c compute/pdgb2desc.c compute/psgb2desc.c compute/pcgb2desc.c
-compute/zgesdd.c compute/dgesdd.c compute/sgesdd.c compute/cgesdd.c
-compute/pzgbbrd_static.c compute/pcgbbrd_static.c compute/pdgbbrd_static.c compute/psgbbrd_static.c
-compute/pzgecpy_tile2lapack_band.c compute/pcgecpy_tile2lapack_band.c compute/pdgecpy_tile2lapack_band.c compute/psgecpy_tile2lapack_band.c
-compute/pzlarft_blgtrd.c compute/pclarft_blgtrd.c compute/pdlarft_blgtrd.c compute/pslarft_blgtrd.c
-compute/pzunmqr_blgtrd.c compute/pcunmqr_blgtrd.c compute/pdormqr_blgtrd.c compute/psormqr_blgtrd.c
-compute/pcge2gb.c compute/pdge2gb.c compute/psge2gb.c compute/pzge2gb.c
-control/constants.c control/context.c control/descriptor.c
-control/tree.c control/tuning.c control/workspace.c control/version.c)
+#-------------------------------------------------------------------------------
+# Parses a list of template source files to find what files should be generated.
+#
+# @param[in,out] src
+#   Name of a list variable. On input, it lists template files (source and
+#   headers) for codegen to process. May have non-template source files;
+#   codegen ignores them. On output, the generated files are appended.
+#
+# Example:
+#   set( src zgemm.c plasma_z.h )
+#   generate_files( src )
+#   # On output, src is zgemm.c plasma_z.h sgemm.c dgemm.c cgemm.c plasma_s.h
+#   #                   plasma_d.h plasma_c.h
+#   add_library( plasma ${src} )
+#
+function( generate_files src )
+    message( DEBUG "----- generate_files -----" )
+    message( DEBUG "src   is ${src}       = <${${src}}>" )
+    message( DEBUG "cache is ${src}_cache = <${${src}_cache}>" )
+
+    if (NOT "${${src}}" STREQUAL "${${src}_cache}")  # list differs from the cached one
+        message( STATUS "Running codegen to find files to generate for ${src}" )
+        execute_process(
+            COMMAND "${Python_EXECUTABLE}" "tools/codegen.py" "--depend" ${${src}}
+            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+            RESULT_VARIABLE error
+            OUTPUT_VARIABLE ${src}_depends )
+        message( DEBUG "codegen error ${error}" )
+        message( DEBUG "depends is ${src}_depends = <<<\n${${src}_depends}>>>" )
+
+        if (error)  # ${src}_depends now holds the failed run's output; stop here
+            message( FATAL_ERROR "codegen returned error; cannot generate source files." )
+        else()
+            # Cache src so we don't have to re-run codegen to get the
+            # list of dependencies again if src doesn't change.
+            set( ${src}_cache ${${src}} CACHE INTERNAL "" )
+
+            # Split lines and cache it.
+            string( REGEX REPLACE "\n" ";" ${src}_depends "${${src}_depends}" )
+            set( ${src}_depends ${${src}_depends} CACHE INTERNAL "" )
+            message( DEBUG "depends is ${src}_depends = <<<${${src}_depends}>>>" )
+        endif()
+    endif()
+
+    message( STATUS "Adding codegen commands to generate files for ${src}" )
+    foreach( depend ${${src}_depends} )
+        message( DEBUG "depend = <${depend}>" )
+        string( REGEX MATCH "^(.*): (.*)$" out "${depend}" )  # "outputs: input"
+        set( outputs ${CMAKE_MATCH_1} )
+        set( input   ${CMAKE_MATCH_2} )
+        string( REGEX REPLACE " " ";" outputs "${outputs}" )  # space-separated -> list
+        list( TRANSFORM outputs PREPEND "${CMAKE_CURRENT_SOURCE_DIR}/"
+              OUTPUT_VARIABLE src_outputs )
+        message( DEBUG "    input:       <${input}>" )
+        message( DEBUG "    outputs:     <${outputs}>" )
+        message( DEBUG "    src_outputs: <${src_outputs}>" )
+        add_custom_command(
+            OUTPUT   ${src_outputs}
+            COMMAND "${Python_EXECUTABLE}" "tools/codegen.py" "${input}"
+            DEPENDS "${input}"
+            WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+            VERBATIM ${CODEGEN} )
+
+        list( APPEND ${src} "${outputs}" )
+        message( DEBUG "    src:  <${${src}}>" )
+        message( DEBUG "" )
+    endforeach()
+    set( ${src} ${${src}} PARENT_SCOPE ) # propagate changes
+    message( DEBUG "src is ${src} = <${${src}}>" )
+endfunction()
+
+#-------------------------------------------------------------------------------
+# List all template files (sources and headers) and non-template source
+# files, e.g., zgemm.c, plasma_z.h, test.c.
+# Do not list generated files, e.g., sgemm.c, plasma_s.h.
+# Please add files in alphabetical order.
+set( plasma_src
+    compute/clag2z.c
+    compute/dlaebz2.c
+    compute/dlaneg2.c
+    compute/dstevx2.c
+    compute/dzamax.c
+    compute/pclag2z.c
+    compute/pdzamax.c
+    compute/pzdesc2ge.c
+    compute/pzdesc2pb.c
+    compute/pzdesc2tr.c
+    compute/pzgb2desc.c
+    compute/pzgbbrd_static.c
+    compute/pzgbtrf.c
+    compute/pzge2desc.c
+    compute/pzge2gb.c
+    compute/pzgeadd.c
+    compute/pzgecpy_tile2lapack_band.c
+    compute/pzgelqf.c
+    compute/pzgelqf_tree.c
+    compute/pzgemm.c
+    compute/pzgeqrf.c
+    compute/pzgeqrf_tree.c
+    compute/pzgeswp.c
+    compute/pzgetrf.c
+    compute/pzgetri_aux.c
+    compute/pzhemm.c
+    compute/pzher2k.c
+    compute/pzherk.c
+    compute/pzhetrf_aasen.c
+    compute/pzlacpy.c
+    compute/pzlag2c.c
+    compute/pzlangb.c
+    compute/pzlange.c
+    compute/pzlanhe.c
+    compute/pzlansy.c
+    compute/pzlantr.c
+    compute/pzlarft_blgtrd.c
+    compute/pzlascl.c
+    compute/pzlaset.c
+    compute/pzlauum.c
+    compute/pzpb2desc.c
+    compute/pzpbtrf.c
+    compute/pzpotrf.c
+    compute/pzsymm.c
+    compute/pzsyr2k.c
+    compute/pzsyrk.c
+    compute/pztbsm.c
+    compute/pztr2desc.c
+    compute/pztradd.c
+    compute/pztrmm.c
+    compute/pztrsm.c
+    compute/pztrtri.c
+    compute/pzunglq.c
+    compute/pzunglq_tree.c
+    compute/pzungqr.c
+    compute/pzungqr_tree.c
+    compute/pzunmlq.c
+    compute/pzunmlq_tree.c
+    compute/pzunmqr.c
+    compute/pzunmqr_blgtrd.c
+    compute/pzunmqr_tree.c
+    compute/zcgbsv.c
+    compute/zcgesv.c
+    compute/zcposv.c
+    compute/zdesc2ge.c
+    compute/zdesc2pb.c
+    compute/zdesc2tr.c
+    compute/zgb2desc.c
+    compute/zgbmm.c
+    compute/zgbset.c
+    compute/zgbsv.c
+    compute/zgbtrf.c
+    compute/zgbtrs.c
+    compute/zge2desc.c
+    compute/zgeadd.c
+    compute/zgeinv.c
+    compute/zgelqf.c
+    compute/zgelqs.c
+    compute/zgels.c
+    compute/zgemm.c
+    compute/zgeqrf.c
+    compute/zgeqrs.c
+    compute/zgesdd.c
+    compute/zgesv.c
+    compute/zgeswp.c
+    compute/zgetrf.c
+    compute/zgetri.c
+    compute/zgetri_aux.c
+    compute/zgetrs.c
+    compute/zhemm.c
+    compute/zher2k.c
+    compute/zherk.c
+    compute/zhesv.c
+    compute/zhetrf.c
+    compute/zhetrs.c
+    compute/zlacpy.c
+    compute/zlag2c.c
+    compute/zlangb.c
+    compute/zlange.c
+    compute/zlanhe.c
+    compute/zlansy.c
+    compute/zlantr.c
+    compute/zlascl.c
+    compute/zlaset.c
+    compute/zlauum.c
+    compute/zpb2desc.c
+    compute/zpbsv.c
+    compute/zpbtrf.c
+    compute/zpbtrs.c
+    compute/zpoinv.c
+    compute/zposv.c
+    compute/zpotrf.c
+    compute/zpotri.c
+    compute/zpotrs.c
+    compute/zsymm.c
+    compute/zsyr2k.c
+    compute/zsyrk.c
+    compute/ztr2desc.c
+    compute/ztradd.c
+    compute/ztrmm.c
+    compute/ztrsm.c
+    compute/ztrtri.c
+    compute/zunglq.c
+    compute/zungqr.c
+    compute/zunmlq.c
+    compute/zunmqr.c
+
+    control/constants.c
+    control/context.c
+    control/descriptor.c
+    control/tree.c
+    control/tuning.c
+    control/version.c
+    control/workspace.c
+
+    include/core_lapack_z.h
+    include/plasma.h
+    include/plasma_internal_z.h
+    include/plasma_internal_zc.h
+    include/plasma_z.h
+    include/plasma_zc.h
+    include/plasma_zlaebz2_work.h
+)
 
+generate_files( plasma_src )
+add_library( plasma SHARED ${plasma_src} )
 
 # CMake knows about "plasma" library at this point so inform CMake where the headers are
 target_include_directories(plasma PUBLIC
@@ -404,126 +490,157 @@ target_include_directories(plasma PUBLIC
 	$<INSTALL_INTERFACE:include>
 )
 
-add_library(plasma_core_blas SHARED include/plasma_core_blas.h
-core_blas/core_clag2z.c core_blas/core_dcabs1.c core_blas/core_scabs1.c core_blas/core_dzamax.c core_blas/core_zgeadd.c core_blas/core_zgelqt.c
-core_blas/core_zgemm.c core_blas/core_zgeqrt.c core_blas/core_zgessq.c core_blas/core_zgeswp.c core_blas/core_zgetrf.c
-core_blas/core_zhegst.c core_blas/core_zhemm.c core_blas/core_zher2k.c core_blas/core_zherk.c core_blas/core_zhessq.c
-core_blas/core_zheswp.c core_blas/core_zlacpy_band.c core_blas/core_zlacpy.c core_blas/core_zlag2c.c core_blas/core_zlange.c
-core_blas/core_zlanhe.c core_blas/core_zlansy.c core_blas/core_zlantr.c core_blas/core_zlascl.c core_blas/core_zlaset.c
-core_blas/core_zlauum.c core_blas/core_zpamm.c core_blas/core_zpemv.c core_blas/core_zparfb.c core_blas/core_zpemv.c core_blas/core_zpotrf.c
-core_blas/core_zsymm.c core_blas/core_zsyr2k.c core_blas/core_zsyrk.c core_blas/core_zsyssq.c core_blas/core_ztradd.c
-core_blas/core_ztrmm.c core_blas/core_ztrsm.c core_blas/core_ztrssq.c core_blas/core_ztrtri.c core_blas/core_ztslqt.c
-core_blas/core_ztsmlq.c core_blas/core_ztsmqr.c core_blas/core_ztsqrt.c core_blas/core_zttlqt.c core_blas/core_zttmlq.c
-core_blas/core_zttmqr.c core_blas/core_zttqrt.c core_blas/core_zunmlq.c core_blas/core_zunmqr.c
-core_blas/core_cgeadd.c core_blas/core_cgemm.c core_blas/core_cgeswp.c
-core_blas/core_cgetrf.c core_blas/core_cheswp.c core_blas/core_clacpy.c
-core_blas/core_clacpy_band.c core_blas/core_cparfb.c core_blas/core_ctrsm.c
-core_blas/core_dgeadd.c core_blas/core_dgemm.c core_blas/core_dgeswp.c
-core_blas/core_dgetrf.c core_blas/core_dlacpy.c core_blas/core_dlacpy_band.c
-core_blas/core_dparfb.c core_blas/core_dsyswp.c core_blas/core_dtrsm.c
-core_blas/core_sgeadd.c core_blas/core_sgemm.c core_blas/core_sgeswp.c
-core_blas/core_sgetrf.c core_blas/core_slacpy.c core_blas/core_slacpy_band.c
-core_blas/core_sparfb.c core_blas/core_ssyswp.c core_blas/core_strsm.c
-core_blas/core_cgelqt.c core_blas/core_cgeqrt.c core_blas/core_cgessq.c
-core_blas/core_chegst.c core_blas/core_chemm.c core_blas/core_cher2k.c
-core_blas/core_cherk.c core_blas/core_chessq.c core_blas/core_clange.c
-core_blas/core_clanhe.c core_blas/core_clansy.c core_blas/core_clantr.c
-core_blas/core_clascl.c core_blas/core_claset.c core_blas/core_clauum.c
-core_blas/core_cpamm.c core_blas/core_cpemv.c core_blas/core_cpotrf.c
-core_blas/core_csymm.c core_blas/core_csyr2k.c core_blas/core_csyrk.c
-core_blas/core_csyssq.c core_blas/core_ctradd.c core_blas/core_ctrmm.c
-core_blas/core_ctrssq.c core_blas/core_ctrtri.c core_blas/core_ctslqt.c
-core_blas/core_ctsmlq.c core_blas/core_ctsmqr.c core_blas/core_ctsqrt.c
-core_blas/core_cttlqt.c core_blas/core_cttmlq.c core_blas/core_cttmqr.c
-core_blas/core_cttqrt.c core_blas/core_cunmlq.c core_blas/core_cunmqr.c
-core_blas/core_damax.c core_blas/core_dgelqt.c core_blas/core_dgeqrt.c
-core_blas/core_dgessq.c core_blas/core_dlag2s.c core_blas/core_dlange.c
-core_blas/core_dlansy.c core_blas/core_dlantr.c core_blas/core_dlascl.c
-core_blas/core_dlaset.c core_blas/core_dlauum.c core_blas/core_dormlq.c
-core_blas/core_dormqr.c core_blas/core_dpamm.c core_blas/core_dpemv.c
-core_blas/core_dpotrf.c core_blas/core_dsygst.c core_blas/core_dsymm.c
-core_blas/core_dsyr2k.c core_blas/core_dsyrk.c core_blas/core_dsyssq.c
-core_blas/core_dtradd.c core_blas/core_dtrmm.c core_blas/core_dtrssq.c
-core_blas/core_dtrtri.c core_blas/core_dtslqt.c core_blas/core_dtsmlq.c
-core_blas/core_dtsmqr.c core_blas/core_dtsqrt.c core_blas/core_dttlqt.c
-core_blas/core_dttmlq.c core_blas/core_dttmqr.c core_blas/core_dttqrt.c
-core_blas/core_samax.c core_blas/core_scamax.c core_blas/core_sgelqt.c
-core_blas/core_sgeqrt.c core_blas/core_sgessq.c core_blas/core_slag2d.c
-core_blas/core_slange.c core_blas/core_slansy.c core_blas/core_slantr.c
-core_blas/core_slascl.c core_blas/core_slaset.c core_blas/core_slauum.c
-core_blas/core_sormlq.c core_blas/core_sormqr.c core_blas/core_spamm.c
-core_blas/core_spemv.c core_blas/core_spotrf.c core_blas/core_ssygst.c
-core_blas/core_ssymm.c core_blas/core_ssyr2k.c core_blas/core_ssyrk.c
-core_blas/core_ssyssq.c core_blas/core_stradd.c core_blas/core_strmm.c
-core_blas/core_strssq.c core_blas/core_strtri.c core_blas/core_stslqt.c
-core_blas/core_stsmlq.c core_blas/core_stsmqr.c core_blas/core_stsqrt.c
-core_blas/core_sttlqt.c core_blas/core_sttmlq.c core_blas/core_sttmqr.c
-core_blas/core_sttqrt.c control/barrier.c control/async.c
-core_blas/core_cgbtype1cb.c  core_blas/core_dgbtype1cb.c  core_blas/core_sgbtype1cb.c  core_blas/core_zgbtype1cb.c
-core_blas/core_cgbtype2cb.c  core_blas/core_dgbtype2cb.c  core_blas/core_sgbtype2cb.c  core_blas/core_zgbtype2cb.c
-core_blas/core_cgbtype3cb.c  core_blas/core_dgbtype3cb.c  core_blas/core_sgbtype3cb.c  core_blas/core_zgbtype3cb.c
-core_blas/core_clarfb_gemm.c core_blas/core_dlarfb_gemm.c core_blas/core_slarfb_gemm.c core_blas/core_zlarfb_gemm.c
-core_blas/core_clacpy.c core_blas/core_dlacpy.c core_blas/core_slacpy.c core_blas/core_zlacpy.c
+#-------------------------------------------------------------------------------
+# See note above on plasma_src.
+# Please add files in alphabetical order.
+set( plasma_core_blas_src
+    control/async.c
+    control/barrier.c
+
+    core_blas/core_clag2z.c
+    core_blas/core_dcabs1.c
+    core_blas/core_dzamax.c
+    core_blas/core_zgbtype1cb.c
+    core_blas/core_zgbtype2cb.c
+    core_blas/core_zgbtype3cb.c
+    core_blas/core_zgeadd.c
+    core_blas/core_zgelqt.c
+    core_blas/core_zgemm.c
+    core_blas/core_zgeqrt.c
+    core_blas/core_zgessq.c
+    core_blas/core_zgeswp.c
+    core_blas/core_zgetrf.c
+    core_blas/core_zhegst.c
+    core_blas/core_zhemm.c
+    core_blas/core_zher2k.c
+    core_blas/core_zherk.c
+    core_blas/core_zhessq.c
+    core_blas/core_zheswp.c
+    core_blas/core_zlacpy.c
+    core_blas/core_zlacpy_band.c
+    core_blas/core_zlag2c.c
+    core_blas/core_zlange.c
+    core_blas/core_zlanhe.c
+    core_blas/core_zlansy.c
+    core_blas/core_zlantr.c
+    core_blas/core_zlarfb_gemm.c
+    core_blas/core_zlascl.c
+    core_blas/core_zlaset.c
+    core_blas/core_zlauum.c
+    core_blas/core_zpamm.c
+    core_blas/core_zparfb.c
+    core_blas/core_zpemv.c
+    core_blas/core_zpotrf.c
+    core_blas/core_zsymm.c
+    core_blas/core_zsyr2k.c
+    core_blas/core_zsyrk.c
+    core_blas/core_zsyssq.c
+    core_blas/core_ztradd.c
+    core_blas/core_ztrmm.c
+    core_blas/core_ztrsm.c
+    core_blas/core_ztrssq.c
+    core_blas/core_ztrtri.c
+    core_blas/core_ztslqt.c
+    core_blas/core_ztsmlq.c
+    core_blas/core_ztsmqr.c
+    core_blas/core_ztsqrt.c
+    core_blas/core_zttlqt.c
+    core_blas/core_zttmlq.c
+    core_blas/core_zttmqr.c
+    core_blas/core_zttqrt.c
+    core_blas/core_zunmlq.c
+    core_blas/core_zunmqr.c
+
+    include/core_lapack_z.h
+    include/plasma_core_blas.h
+    include/plasma_core_blas_z.h
+    include/plasma_core_blas_zc.h
+    include/plasma.h
+    include/plasma_internal_z.h
+    include/plasma_internal_zc.h
+    include/plasma_z.h
+    include/plasma_zc.h
 )
 
+generate_files( plasma_core_blas_src )
+add_library( plasma_core_blas SHARED ${plasma_core_blas_src} )
+
 target_include_directories(plasma_core_blas PUBLIC
 	$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
 	$<INSTALL_INTERFACE:include>
 )
 
-add_executable(plasmatest test/test.h test/test.c include/plasma.h
-test/test_dzamax.c test/test_damax.c test/test_scamax.c test/test_samax.c
-test/test_zcposv.c test/test_dsposv.c test/test_zgbsv.c test/test_dgbsv.c
-test/test_cgbsv.c test/test_sgbsv.c test/test_zgbmm.c test/test_dgbmm.c
-test/test_cgbmm.c test/test_sgbmm.c test/test_zgbtrf.c test/test_dgbtrf.c
-test/test_cgbtrf.c test/test_sgbtrf.c test/test_zgeadd.c test/test_dgeadd.c
-test/test_cgeadd.c test/test_sgeadd.c test/test_zgeinv.c test/test_dgeinv.c
-test/test_cgeinv.c test/test_sgeinv.c test/test_zgelqf.c test/test_dgelqf.c
-test/test_cgelqf.c test/test_sgelqf.c test/test_zgelqs.c test/test_dgelqs.c
-test/test_cgelqs.c test/test_sgelqs.c test/test_zgels.c test/test_dgels.c
-test/test_cgels.c test/test_sgels.c test/test_zgemm.c test/test_dgemm.c
-test/test_cgemm.c test/test_sgemm.c test/test_zgeqrf.c test/test_dgeqrf.c
-test/test_cgeqrf.c test/test_sgeqrf.c test/test_zgeqrs.c test/test_dgeqrs.c
-test/test_cgeqrs.c test/test_sgeqrs.c test/test_zcgesv.c test/test_dsgesv.c
-test/test_zcgbsv.c test/test_dsgbsv.c test/test_zgesv.c test/test_dgesv.c
-test/test_cgesv.c test/test_sgesv.c test/test_zgetrf.c test/test_dgetrf.c
-test/test_cgetrf.c test/test_sgetrf.c test/test_zgetri.c test/test_dgetri.c
-test/test_cgetri.c test/test_sgetri.c test/test_zgetri_aux.c
-test/test_dgetri_aux.c test/test_cgetri_aux.c test/test_sgetri_aux.c
-test/test_zgetrs.c test/test_dgetrs.c test/test_cgetrs.c test/test_sgetrs.c
-test/test_zhemm.c test/test_chemm.c test/test_zher2k.c test/test_cher2k.c
-test/test_zherk.c test/test_cherk.c test/test_zhetrf.c test/test_dsytrf.c
-test/test_chetrf.c test/test_ssytrf.c test/test_zhesv.c test/test_dsysv.c
-test/test_chesv.c test/test_ssysv.c test/test_zlacpy.c test/test_dlacpy.c
-test/test_clacpy.c test/test_slacpy.c test/test_zlag2c.c test/test_clag2z.c
-test/test_dlag2s.c test/test_slag2d.c test/test_zlange.c test/test_dlange.c
-test/test_clange.c test/test_slange.c test/test_zlanhe.c test/test_clanhe.c
-test/test_zlansy.c test/test_dlansy.c test/test_clansy.c test/test_slansy.c
-test/test_zlantr.c test/test_dlantr.c test/test_clantr.c test/test_slantr.c
-test/test_zlascl.c test/test_dlascl.c test/test_clascl.c test/test_slascl.c
-test/test_zlaset.c test/test_dlaset.c test/test_claset.c test/test_slaset.c
-test/test_zgeswp.c test/test_dgeswp.c test/test_cgeswp.c test/test_sgeswp.c
-test/test_zlauum.c test/test_dlauum.c test/test_clauum.c test/test_slauum.c
-test/test_zpbsv.c test/test_dpbsv.c test/test_cpbsv.c test/test_spbsv.c
-test/test_zpbtrf.c test/test_dpbtrf.c test/test_cpbtrf.c test/test_spbtrf.c
-test/test_zlangb.c test/test_dlangb.c test/test_clangb.c test/test_slangb.c
-test/test_zposv.c test/test_dposv.c test/test_cposv.c test/test_sposv.c
-test/test_zpoinv.c test/test_dpoinv.c test/test_cpoinv.c test/test_spoinv.c
-test/test_zpotrf.c test/test_dpotrf.c test/test_cpotrf.c test/test_spotrf.c
-test/test_zpotri.c test/test_dpotri.c test/test_cpotri.c test/test_spotri.c
-test/test_zpotrs.c test/test_dpotrs.c test/test_cpotrs.c test/test_spotrs.c
-test/test_dstevx2.c test/test_sstevx2.c
-test/test_zsymm.c test/test_dsymm.c test/test_csymm.c test/test_ssymm.c
-test/test_zsyr2k.c test/test_dsyr2k.c test/test_csyr2k.c test/test_ssyr2k.c
-test/test_zsyrk.c test/test_dsyrk.c test/test_csyrk.c test/test_ssyrk.c
-test/test_ztradd.c test/test_dtradd.c test/test_ctradd.c test/test_stradd.c
-test/test_ztrmm.c test/test_dtrmm.c test/test_ctrmm.c test/test_strmm.c
-test/test_ztrsm.c test/test_dtrsm.c test/test_ctrsm.c test/test_strsm.c
-test/test_ztrtri.c test/test_dtrtri.c test/test_ctrtri.c test/test_strtri.c
-test/test_zgesdd.c test/test_dgesdd.c test/test_cgesdd.c test/test_sgesdd.c
-test/test_zunmlq.c test/test_dormlq.c test/test_cunmlq.c test/test_sormlq.c
-test/test_zunmqr.c test/test_dormqr.c test/test_cunmqr.c test/test_sormqr.c)
+#-------------------------------------------------------------------------------
+# See note above on plasma_src.
+# Please add files in alphabetical order.
+set( plasma_test_src
+    include/plasma.h
+
+    test/test.c
+    test/test.h
+    test/test_clag2z.c
+    test/test_dstevx2.c
+    test/test_dzamax.c
+    test/test_z.h
+    test/test_zc.h
+    test/test_zcgbsv.c
+    test/test_zcgesv.c
+    test/test_zcposv.c
+    test/test_zgbmm.c
+    test/test_zgbsv.c
+    test/test_zgbtrf.c
+    test/test_zgeadd.c
+    test/test_zgeinv.c
+    test/test_zgelqf.c
+    test/test_zgelqs.c
+    test/test_zgels.c
+    test/test_zgemm.c
+    test/test_zgeqrf.c
+    test/test_zgeqrs.c
+    test/test_zgesdd.c
+    test/test_zgesv.c
+    test/test_zgeswp.c
+    test/test_zgetrf.c
+    test/test_zgetri.c
+    test/test_zgetri_aux.c
+    test/test_zgetrs.c
+    test/test_zhemm.c
+    test/test_zher2k.c
+    test/test_zherk.c
+    test/test_zhesv.c
+    test/test_zhetrf.c
+    test/test_zlacpy.c
+    test/test_zlag2c.c
+    test/test_zlangb.c
+    test/test_zlange.c
+    test/test_zlanhe.c
+    test/test_zlansy.c
+    test/test_zlantr.c
+    test/test_zlascl.c
+    test/test_zlaset.c
+    test/test_zlauum.c
+    test/test_zpbsv.c
+    test/test_zpbtrf.c
+    test/test_zpoinv.c
+    test/test_zposv.c
+    test/test_zpotrf.c
+    test/test_zpotri.c
+    test/test_zpotrs.c
+    test/test_zsymm.c
+    test/test_zsyr2k.c
+    test/test_zsyrk.c
+    test/test_ztradd.c
+    test/test_ztrmm.c
+    test/test_ztrsm.c
+    test/test_ztrtri.c
+    test/test_zunmlq.c
+    test/test_zunmqr.c
+)
+
+generate_files( plasma_test_src )
+add_executable( plasmatest ${plasma_test_src} )
 
+#-------------------------------------------------------------------------------
 find_library(MATH_LIBRARY m)
 if( MATH_LIBRARY )
   # OpenBLAS needs to link C math library (usually -lm) but MKL doesn't
diff --git a/tools/generate_precisions.py b/tools/generate_precisions.py
deleted file mode 100644
index 8da61713..00000000
--- a/tools/generate_precisions.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#! /usr/bin/env python
-# -*- encoding: ascii -*-
-
-"To be executed from the top most directory where 'tools/codegen.py' is available."
-
-import os
-import sys
-
-Output_Files = False # show files to be generated but don't generate
-
-def codegen(letters, filenames, fn_format):
-    for filename in filenames.split():
-        if Output_Files:
-            os.system(sys.executable + " tools/codegen.py --output {}".format(fn_format.format(filename)))
-            continue
-        for letter in letters.split():
-            os.system(sys.executable + " tools/codegen.py -p {} {}".format(letter, fn_format.format(filename)))
-
-def main(argv):
-    global Output_Files
-    if "--output" in argv:
-        Output_Files = True
-
-    elif "--help" in argv or "-h" in argv:
-        print("{} [--output]\n".format(argv[0]))
-        print("--output  show files to be generated but don't generate")
-        return 0
-
-    codegen("s d c", "plasma_z plasma_internal_z core_lapack_z plasma_core_blas_z plasma_zlaebz2_work", "include/{}.h")
-    codegen("ds", "include/plasma_zc.h include/plasma_internal_zc.h include/plasma_core_blas_zc.h test/test_zc.h", "{}")
-    codegen("s d c", "dzamax zgelqf zgemm zgbmm zgeqrf zgesdd zunglq zungqr zunmlq zunmqr zpotrf zpotrs zsymm zsyr2k zsyrk ztradd ztrmm ztrsm ztrtri zunglq zungqr zunmlq zunmqr zgbsv zgbtrf zgbtrs zgeadd zgeinv zgelqs zgels zgeqrs zgesv zgeswp zgetrf zgetri zgetrs zhemm zher2k zherk zhesv zhetrf zhetrs zlacpy zlangb zlange zlanhe zlansy zlantr zlascl zlaset zlauum zpbsv zpbtrf zpbtrs zpoinv zposv zpotri zgetri_aux zdesc2ge zdesc2pb zdesc2tr zge2desc zgb2desc zgbset zpb2desc ztr2desc pdzamax pzgbtrf pzgeadd pzgelqf pzgelqf_tree pzgemm pzgeqrf pzgeqrf_tree pzgeswp pzgetrf pzgetri_aux pzhemm pzher2k pzherk pzhetrf_aasen pzlacpy pzlangb pzlange pzlanhe pzlansy pzlantr pzlascl pzlaset pzlauum pzpbtrf pzpotrf pzsymm pzsyr2k pzsyrk pztbsm pztradd pztrmm pztrsm pztrtri pzunglq pzunglq_tree pzungqr pzungqr_tree pzunmlq pzunmlq_tree pzunmqr pzunmqr_tree pzdesc2ge pzdesc2pb pzdesc2tr pzge2desc pzgb2desc pzpb2desc pztr2desc pzge2gb pzgbbrd_static pzgecpy_tile2lapack_band pzlarft_blgtrd pzunmqr_blgtrd", "compute/{}.c")
-    codegen("s d", "zlaebz2 zlaneg2 zstevx2", "compute/{}.c")
-    codegen("ds", "zcposv zcgesv zcgbsv clag2z zlag2c pclag2z pzlag2c", "compute/{}.c")
-    codegen("s d c", "zgeadd zgemm zgeswp zgetrf zheswp zlacpy zlacpy_band zheswp ztrsm dzamax zgelqt zgeqrt zgessq zhegst zhemm zher2k zherk zhessq zlange zlanhe zlansy zlantr zlascl zlaset zlauum zunmlq zunmqr zpemv zpamm zpotrf zhegst zsymm zsyr2k zsyrk zsyssq ztradd ztrmm ztrssq ztrtri ztslqt ztsmlq ztsmqr ztsqrt zttlqt zttmlq zttmqr zttqrt zunmlq zunmqr zparfb dcabs1 zlarfb_gemm zgbtype1cb zgbtype2cb zgbtype3cb", "core_blas/core_{}.c")
-    codegen("ds", "zlag2c clag2z", "core_blas/core_{}.c")
-    codegen("s d c", "z.h", "test/test_{}")
-    codegen("s d", "zstevx2.c", "test/test_{}")
-    codegen("s d c", "dzamax zgbsv zgbtrf zgeadd zgeinv zgelqf zgelqs zgels zgemm zgbmm zgeqrf zgeqrs zgesv zgeswp zgetrf zgetri_aux zgetri zgetrs zhemm zher2k zherk zhesv zhetrf zlacpy zlangb zlange zlanhe zlansy zlantr zlascl zlaset zlauum zpbsv zpbtrf zpoinv zposv zpotrf zpotri zpotrs zsymm zsyr2k zsyrk ztradd ztrmm ztrsm ztrtri zunmlq zunmqr zgesdd", "test/test_{}.c")
-    codegen("ds", "zcposv zcgesv zcgbsv zlag2c clag2z", "test/test_{}.c")
-    return 0
-
-if "__main__" == __name__:
-    sys.exit(main(sys.argv))

From 64d128a2683cd99b067e666314925fe4dd18c10e Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Wed, 15 Jan 2025 16:04:41 -0500
Subject: [PATCH 08/12] copy eig routines from plasma_sevp repo

---
 compute/pzhe2hb.c                  | 226 +++++++++++++
 compute/pzheb2trd_static.c         | 237 +++++++++++++
 compute/pzhecpy_tile2lapack_band.c | 103 ++++++
 compute/zheevd.c                   | 527 +++++++++++++++++++++++++++++
 core_blas/core_zhbtype1cb.c        | 139 ++++++++
 core_blas/core_zhbtype2cb.c        | 164 +++++++++
 core_blas/core_zhbtype3cb.c        | 124 +++++++
 core_blas/core_zherfb.c            | 225 ++++++++++++
 core_blas/core_zlarfy.c            |  99 ++++++
 core_blas/core_ztsmlq_corner.c     | 252 ++++++++++++++
 core_blas/core_ztsmlq_hetra1.c     | 196 +++++++++++
 core_blas/core_ztsmqr_corner.c     | 248 ++++++++++++++
 core_blas/core_ztsmqr_hetra1.c     | 199 +++++++++++
 test/test_zheevd.c                 | 189 +++++++++++
 14 files changed, 2928 insertions(+)
 create mode 100644 compute/pzhe2hb.c
 create mode 100755 compute/pzheb2trd_static.c
 create mode 100644 compute/pzhecpy_tile2lapack_band.c
 create mode 100644 compute/zheevd.c
 create mode 100644 core_blas/core_zhbtype1cb.c
 create mode 100644 core_blas/core_zhbtype2cb.c
 create mode 100644 core_blas/core_zhbtype3cb.c
 create mode 100644 core_blas/core_zherfb.c
 create mode 100644 core_blas/core_zlarfy.c
 create mode 100644 core_blas/core_ztsmlq_corner.c
 create mode 100644 core_blas/core_ztsmlq_hetra1.c
 create mode 100644 core_blas/core_ztsmqr_corner.c
 create mode 100644 core_blas/core_ztsmqr_hetra1.c
 create mode 100644 test/test_zheevd.c

diff --git a/compute/pzhe2hb.c b/compute/pzhe2hb.c
new file mode 100644
index 00000000..2ed83598
--- /dev/null
+++ b/compute/pzhe2hb.c
@@ -0,0 +1,226 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> s d c
+ *
+ **/
+
+#include "plasma_async.h"
+#include "plasma_context.h"
+#include "plasma_descriptor.h"
+#include "plasma_types.h"
+#include "plasma_internal.h"
+#include "core_blas_z.h"
+
+#define A(m, n) ((plasma_complex64_t*) plasma_tile_addr(A, m, n))
+#define T(m, n) ((plasma_complex64_t*) plasma_tile_addr(T, m, n))
+/***************************************************************************//**
+ *  Parallel tile BAND Tridiagonal Reduction.
+ *  Submits the core_omp_* tile kernels for the triangle of A selected by uplo:
+ *  PlasmaLower uses the QR-type kernels on tiles below the diagonal, any other
+ *  value uses the LQ-type kernels on tiles right of the diagonal. The matching
+ *  tiles of T and the workspace work are passed to every kernel.
+ **/
+void plasma_pzhe2hb(plasma_enum_t uplo,
+                    plasma_desc_t A, plasma_desc_t T,
+                    plasma_workspace_t work,
+                    plasma_sequence_t *sequence, plasma_request_t *request)
+{
+    // Check sequence status.
+    if (sequence->status != PlasmaSuccess)
+        return;
+
+    // Case nb>n  only 1 tile
+    // NOTE(review): the test below compares A.mt with A.m -- confirm intent.
+    if(A.mt > A.m)
+        return;
+
+    // Set inner blocking from the plasma context
+    plasma_context_t *plasma = plasma_context_self();
+    if (plasma == NULL) {
+        plasma_error("PLASMA not initialized");
+        plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
+        return;
+    }
+    int ib = plasma->ib;
+
+    if (uplo == PlasmaLower) {
+       for (int k = 0; k < A.nt-1; k++){
+           int nvak = plasma_tile_nview(A, k+1);
+           int ldak = plasma_tile_mmain(A, k+1);
+           core_omp_zgeqrt(
+               nvak, A.nb, ib,
+               A(k+1, k), ldak,
+               T(k+1, k), T.mb,
+               work,
+               sequence, request);
+
+           // LEFT and RIGHT on the symmetric diagonal block
+           core_omp_zherfb(
+               PlasmaLower,
+               nvak, nvak, ib,
+               A(k+1,   k), ldak,
+               T(k+1,   k), T.mb,
+               A(k+1, k+1), ldak,
+               work,
+               sequence, request);
+
+           // RIGHT on the remaining tiles until the bottom
+           for (int m = k+2; m < A.mt ; m++) {
+               int mvam = plasma_tile_mview(A, m);
+               int ldam = plasma_tile_mmain(A, m);
+               core_omp_zunmqr(
+                   PlasmaRight, PlasmaNoTrans,
+                   mvam, A.nb, nvak, ib,
+                   A(k+1,   k), ldak,
+                   T(k+1,   k), T.mb,
+                   A(m  , k+1), ldam,
+                   work,
+                   sequence, request);
+           }
+           // Couple each remaining tile (m, k) with the panel tile (k+1, k).
+           for (int m = k+2; m < A.mt; m++) {
+               int mvam = plasma_tile_mview(A, m);
+               int ldam = plasma_tile_mmain(A, m);
+               core_omp_ztsqrt(
+                   mvam, A.nb, ib,
+                   A(k+1, k), ldak,
+                   A(m  , k), ldam,
+                   T(m  , k), T.mb,
+                   work,
+                   sequence, request);
+               // LEFT
+               for (int i = k+2; i < m; i++) {
+                   int ldai = plasma_tile_mmain(A, i);
+                   core_omp_ztsmqr_hetra1(
+                       PlasmaLeft, Plasma_ConjTrans,
+                       A.mb, A.nb, mvam, A.nb, A.nb, ib,
+                       A(i, k+1), ldai,
+                       A(m,   i), ldam,
+                       A(m,   k), ldam,
+                       T(m,   k), T.mb,
+                       work,
+                       sequence, request);
+               }
+
+               // RIGHT
+               for (int j = m+1; j < A.mt ; j++) {
+                   int mvaj = plasma_tile_mview(A, j);
+                   int ldaj = plasma_tile_mmain(A, j);
+                   core_omp_ztsmqr(
+                       PlasmaRight, PlasmaNoTrans,
+                       mvaj, A.nb, mvaj, mvam, A.nb, ib,
+                       A(j, k+1), ldaj,
+                       A(j,   m), ldaj,
+                       A(m,   k), ldam,
+                       T(m,   k), T.mb,
+                       work,
+                       sequence, request);
+               }
+               // LEFT->RIGHT
+               core_omp_ztsmqr_corner(
+                   A.nb, A.nb, mvam, A.nb,
+                   mvam, mvam, A.nb, ib,
+                   A(k+1, k+1), ldak,
+                   A(m  , k+1), ldam,
+                   A(m  ,   m), ldam,
+                   A(m  ,   k), ldam,
+                   T(m  ,   k), T.mb,
+                   work,
+                   sequence, request);
+           }
+       }
+    }
+    else {
+       for (int k = 0; k < A.nt-1; k++){
+           int nvak = plasma_tile_nview(A, k+1);
+           int ldak  = plasma_tile_mmain(A, k);
+           int ldak1 = plasma_tile_mmain(A, k+1);
+           core_omp_zgelqt(
+               A.nb, nvak, ib,
+               A(k, k+1), ldak,
+               T(k, k+1), T.mb,
+               work,
+               sequence, request);
+
+           // RIGHT and LEFT on the symmetric diagonal block
+           core_omp_zherfb(
+               PlasmaUpper,
+               nvak, nvak, ib,
+               A(k,   k+1), ldak,
+               T(k,   k+1), T.mb,
+               A(k+1, k+1), ldak1,
+               work,
+               sequence, request);
+
+           // LEFT on the remaining tiles of tile row k+1 (n = k+2 .. A.nt-1)
+           for (int n = k+2; n < A.nt ; n++) {
+               int nvan = plasma_tile_nview(A, n);
+               core_omp_zunmlq(
+                   PlasmaLeft, PlasmaNoTrans,
+                   A.nb, nvan, nvak, ib,
+                   A(k,   k+1), ldak,
+                   T(k,   k+1), T.mb,
+                   A(k+1,   n), ldak1,
+                   work,
+                   sequence, request);
+           }
+           // Couple each remaining tile (k, n) with the panel tile (k, k+1).
+           for (int n = k+2; n < A.nt; n++) {
+               int nvan = plasma_tile_nview(A, n);
+               int ldan = plasma_tile_nmain(A, n);
+               core_omp_ztslqt(
+                   A.nb, nvan, ib,
+                   A(k, k+1), ldak,
+                   A(k,   n), ldak,
+                   T(k,   n), T.mb,
+                   work,
+                   sequence, request);
+               // RIGHT
+               for (int i = k+2; i < n; i++) {
+                   int ldai = plasma_tile_nmain(A, i);
+                   core_omp_ztsmlq_hetra1(
+                       PlasmaRight, Plasma_ConjTrans,
+                       A.mb, A.nb, A.nb, nvan, A.nb, ib,
+                       A(k+1, i), ldak1,
+                       A(i,   n), ldai,
+                       A(k,   n), ldak,
+                       T(k,   n), T.mb,
+                       work,
+                       sequence, request);
+               }
+
+               // LEFT
+               for (int j = n+1; j < A.nt ; j++) {
+                   int nvaj = plasma_tile_nview(A, j);
+                   core_omp_ztsmlq(
+                       PlasmaLeft, PlasmaNoTrans,
+                       A.nb, nvaj, nvan, nvaj, A.nb, ib,
+                       A(k+1, j), ldak1,
+                       A(n,   j), ldan,
+                       A(k,   n), ldak,
+                       T(k,   n), T.mb,
+                       work,
+                       sequence, request);
+               }
+
+               // RIGHT->LEFT
+               core_omp_ztsmlq_corner(
+                   A.nb, A.nb, A.nb, nvan,
+                   nvan, nvan, A.nb, ib,
+                   A(k+1, k+1), ldak1,
+                   A(k+1,   n), ldak1,
+                   A(n  ,   n), ldan,
+                   A(k  ,   n), ldak,
+                   T(k  ,   n), T.mb,
+                   work,
+                   sequence, request);
+           }
+       }
+    }
+}
diff --git a/compute/pzheb2trd_static.c b/compute/pzheb2trd_static.c
new file mode 100755
index 00000000..0ae4e041
--- /dev/null
+++ b/compute/pzheb2trd_static.c
@@ -0,0 +1,237 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> s d c
+ *
+ **/
+
+#include "plasma_async.h"
+#include "plasma_context.h"
+#include "plasma_descriptor.h"
+#include "plasma_internal.h"
+#include "plasma_types.h"
+#include "plasma_workspace.h"
+#include "bulge.h"
+#include "core_blas.h"
+#include <omp.h>
+#include <sched.h>
+#include <string.h>
+
+#undef REAL
+#define COMPLEX
+
+/***************************************************************************//**
+ *  Static scheduler
+ **/
+
+#define shift 3
+
+// ss_cond_set stores val in slot (m, n) of plasma->ss_progress; ss_cond_wait
+#define ss_cond_set(m, n, val)                                      \
+    do {                                                            \
+        plasma->ss_progress[(m)+plasma->ss_ld*(n)] = (val);         \
+    } while (0)
+// yields the CPU until that slot equals val. Both need `plasma` in scope.
+#define ss_cond_wait(m, n, val)                                     \
+    do {                                                            \
+        while (plasma->ss_progress[(m)+plasma->ss_ld*(n)] != (val)) \
+            sched_yield();                                          \
+    } while (0)
+
+
+/***************************************************************************//**
+ *  Parallel bulge chasing, column-wise, with static scheduling.
+ *  Runs the core_zhbtype{1,2,3}cb kernels over the band matrix held in A
+ *  (order N, block size NB, leading dimension LDA), then stores the diagonal
+ *  in D and the first off-diagonal in E. Only uplo = PlasmaLower is supported.
+ *  work must provide one workspace per OpenMP thread; failures are reported
+ *  through sequence and request.
+ **/
+void plasma_pzheb2trd_static( plasma_enum_t uplo, int N, int NB, int Vblksiz,
+                              plasma_complex64_t *A, int LDA,
+                              plasma_complex64_t *V, plasma_complex64_t *TAU,
+                              double *D, double *E, int WANTZ,
+                              plasma_workspace_t work,
+                              plasma_sequence_t *sequence, plasma_request_t *request)
+{
+    plasma_context_t *plasma = plasma_context_self();
+    if (plasma == NULL) {
+        plasma_error("PLASMA not initialized");
+        plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
+        return;
+    }
+
+    // Check sequence status.
+    if (sequence->status != PlasmaSuccess) {
+        plasma_request_fail(sequence, request, PlasmaErrorSequence);
+        return;
+    }
+
+    if ( uplo != PlasmaLower ) {
+        plasma_request_fail(sequence, request, PlasmaErrorNotSupported);
+        return;
+    }
+
+    // Quick return
+    if (N == 0) {
+        return;
+    }
+
+    /*
+     * General case:
+     *
+     * When V is stored in the V vector, tasks overlap, so shift has to be 4
+     * and the group size always a multiple of 2 (or shift = 5 when it is not
+     * a multiple of 2). For example, with grs = 1, task 2 of sweep 2 can run
+     * together with task 6 of sweep 1, but task 2 of sweep 2 would overwrite
+     * the V of task 5 of sweep 1, which task 6 still uses. Keeping the group
+     * size a multiple of 2 guarantees that task 2 of sweep 2 never runs with
+     * task 6 of sweep 1.
+     * Alternatively, if V is allocated as V(N,2) and sweeps alternate between
+     * the two columns (odd sweeps in V(N,1), even sweeps in V(N,2)), there is
+     * no overlap and shift is 3.
+     * When V is stored in matrix style, shift can go back to 3, which is the
+     * value #defined for this file.
+     */
+
+    /* Some tuning for the bulge chasing code
+     * see technical report for details */
+    int nbtiles = plasma_ceildiv(N,NB);
+    int colblktile = 1;
+    int grsiz = 1;
+    int maxrequiredcores = imax( nbtiles/colblktile, 1 );
+    int colpercore = colblktile*NB;
+    int thgrsiz = N;
+
+    // Initialize static scheduler progress table
+    int cores_num = 1;
+    #pragma omp parallel
+    #pragma omp master
+    {
+        cores_num = omp_get_num_threads();  // one writer only: no data race
+    }
+    int size = 2*nbtiles+shift+cores_num+10;
+    plasma->ss_progress = (volatile int *)malloc(size*sizeof(int));
+    if (plasma->ss_progress == NULL) {
+        plasma_error("malloc() failed");
+        plasma_request_fail(sequence, request, PlasmaErrorOutOfMemory);
+        return;
+    }
+    for(int index = 0; index < size; index++) plasma->ss_progress[index] = 0;
+    plasma->ss_ld = (size);
+
+    // main bulge chasing code
+    int ii = shift/grsiz;
+    int  stepercol =  ii*grsiz == shift ? ii:ii+1;
+    ii       = (N-1)/thgrsiz;
+    int thgrnb  = ii*thgrsiz == (N-1) ? ii:ii+1;
+    int allcoresnb = imin( cores_num, maxrequiredcores );
+
+    #pragma omp parallel
+    {
+        int coreid, sweepid, myid, stt, st, ed, stind, edind;
+        int blklastind, colpt,  thgrid, thed;
+        int i,j,m,k;
+
+        int my_core_id = omp_get_thread_num();
+        plasma_complex64_t  *WORK = work.spaces[my_core_id];
+
+        for (thgrid = 1; thgrid<=thgrnb; thgrid++){
+            stt  = (thgrid-1)*thgrsiz+1;
+            thed = imin( (stt + thgrsiz -1), (N-1));
+            for (i = stt; i <= N-1; i++){
+                ed = imin(i,thed);
+                if(stt>ed) break;
+                for (m = 1; m <=stepercol; m++){
+                    st=stt;
+                    for (sweepid = st; sweepid <=ed; sweepid++){
+
+                        for (k = 1; k <=grsiz; k++){
+                            myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k;
+                            if(myid%2 ==0){
+                                colpt      = (myid/2)*NB+1+sweepid-1;
+                                stind      = colpt-NB+1;
+                                edind      = imin(colpt,N);
+                                blklastind = colpt;
+                            } else {
+                                colpt      = ((myid+1)/2)*NB + 1 +sweepid -1 ;
+                                stind      = colpt-NB+1;
+                                edind      = imin(colpt,N);
+                                if( (stind>=edind-1) && (edind==N) )
+                                    blklastind=N;
+                                else
+                                    blklastind=0;
+                            }
+                            coreid = (stind/colpercore)%allcoresnb;
+
+                            if(my_core_id==coreid) {
+                                if(myid==1) {
+
+                                    ss_cond_wait(myid+shift-1, 0, sweepid-1);
+                                    core_zhbtype1cb(N, NB, A, LDA, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, WANTZ, WORK);
+                                    ss_cond_set(myid, 0, sweepid);
+
+                                    if(blklastind >= (N-1)) {
+                                        for (j = 1; j <= shift; j++)
+                                            ss_cond_set(myid+j, 0, sweepid);
+                                    }
+                                } else {
+                                    ss_cond_wait(myid-1,       0, sweepid);
+                                    ss_cond_wait(myid+shift-1, 0, sweepid-1);
+                                    if(myid%2 == 0)
+                                        core_zhbtype2cb(N, NB, A, LDA, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, WANTZ, WORK);
+                                    else
+                                        core_zhbtype3cb(N, NB, A, LDA, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, WANTZ, WORK);
+
+                                    ss_cond_set(myid, 0, sweepid);
+                                    if(blklastind >= (N-1)) {
+                                        for (j = 1; j <= shift+allcoresnb; j++)
+                                            ss_cond_set(myid+j, 0, sweepid);
+                                    }
+                                } /* END if myid==1 */
+                            } /* END if my_core_id==coreid */
+
+                            if(blklastind >= (N-1)) {
+                                stt++;
+                                break;
+                            }
+                        } /* END for k=1:grsiz */
+                    } /* END for sweepid=st:ed */
+                } /* END for m=1:stepercol */
+            } /* END for i=1:N-1 */
+         } /* END for thgrid=1:thgrnb */
+    }
+    /* finalize static sched */
+    free((void*)plasma->ss_progress);
+    plasma->ss_progress = NULL;  // do not leave a dangling pointer in the context
+
+    /*================================================
+     *  Store the resulting diagonal and off-diagonal in D and E, which are
+     *  always real. In the complex case the off-diagonal elements are not
+     *  necessarily real. With Householder elimination, ZLARFG gives a real
+     *  output, so every diagonal/off-diagonal element except the last one is
+     *  already real and only the last one would need its abs taken.
+     *  NOTE(review): the code below applies creal() to every entry -- confirm.
+     *================================================*/
+    // sequential code here so only core 0 will work
+    if( uplo == PlasmaLower ) {
+        for (int i=0; i < N-1 ; i++) {
+            D[i] = creal(A[i*LDA]);
+            E[i] = creal(A[i*LDA+1]);
+        }
+        D[N-1] = creal(A[(N-1)*LDA]);
+    } else { /* PlasmaUpper: rejected above, so unreachable; not tested yet */
+        for (int i=0; i<N-1; i++) {
+            D[i] = creal(A[i*LDA+NB]);
+            E[i] = creal(A[i*LDA+NB-1]);
+        }
+        D[N-1] = creal(A[(N-1)*LDA+NB]);
+    } /* end PlasmaUpper */
+
+    return;
+}
diff --git a/compute/pzhecpy_tile2lapack_band.c b/compute/pzhecpy_tile2lapack_band.c
new file mode 100644
index 00000000..f12220dc
--- /dev/null
+++ b/compute/pzhecpy_tile2lapack_band.c
@@ -0,0 +1,103 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> s d c
+ *
+ **/
+
+#include "plasma_async.h"
+#include "plasma_context.h"
+#include "plasma_descriptor.h"
+#include "plasma_internal.h"
+#include "plasma_types.h"
+#include "plasma_workspace.h"
+#include "core_blas.h"
+
+// A(m, n): address of tile (m, n) of descriptor A, as a complex pointer.
+// AB(m_, n_): address of row m_ in column n_*nb of AB (leading dimension ldab).
+#define A(m, n) ((plasma_complex64_t*) plasma_tile_addr(A, m, n))
+#define AB(m_, n_) (&(AB[(m_) + ldab*((n_)*nb)]))
+
+/***************************************************************************//**
+ *  Parallel copy of a band matrix from full n-by-n tile storage to band
+ *  storage (n-by-ldab). The routine is internal and the space needed is the
+ *  same for Lower and Upper, so the result is ALWAYS a Lower band: a Lower
+ *  input is copied as is, an Upper input is conjugate-transposed during the
+ *  copy. The bulge chasing then always works with a Lower band matrix.
+ **/
+void plasma_pzhecpy_tile2lapack_band(plasma_enum_t uplo,
+                                     plasma_desc_t A,
+                                     plasma_complex64_t *AB, int ldab,
+                                     plasma_sequence_t *sequence,
+                                     plasma_request_t *request)
+{
+    // Return if failed sequence.
+    if (sequence->status != PlasmaSuccess) {
+        return;
+    }
+
+    // Only PlasmaLower and PlasmaUpper trigger a copy.
+    if (uplo != PlasmaLower && uplo != PlasmaUpper) {
+        return;
+    }
+
+    int nb = A.mb;                   // referenced by the AB() macro
+    int band_ld = ldab-1;            // leading dimension given to zlacpy
+    int ntiles = imin(A.mt, A.nt);   // number of diagonal tiles
+
+    for (int j = 0; j < ntiles; j++) {
+        int rows_j = plasma_tile_mview(A, j);
+        int cols_j = plasma_tile_nview(A, j);
+        int lda_j  = plasma_tile_mmain(A, j);
+        int has_next = (j < ntiles-1);
+
+        if (uplo == PlasmaLower) {
+            // Lower to Lower: diagonal tile (j, j).
+            core_omp_zlacpy(
+                PlasmaLower, PlasmaNoTrans,
+                rows_j, cols_j,
+                A(j, j), lda_j,
+                AB(0, j), band_ld,
+                sequence, request);
+
+            if (has_next) {
+                // Upper part of the sub-diagonal tile (j+1, j).
+                int rows_next = plasma_tile_mview(A, j+1);
+                int lda_next  = plasma_tile_mmain(A, j+1);
+                core_omp_zlacpy(
+                    PlasmaUpper, PlasmaNoTrans,
+                    rows_next, cols_j,
+                    A(j+1, j), lda_next,
+                    AB(nb, j), band_ld,
+                    sequence, request);
+            }
+        }
+        else {
+            // Upper to Lower: conjugate-transpose diagonal tile (j, j).
+            core_omp_zlacpy(
+                PlasmaUpper, PlasmaConjTrans,
+                rows_j, cols_j,
+                A(j, j), lda_j,
+                AB(0, j), band_ld,
+                sequence, request);
+
+            if (has_next) {
+                // Lower part of the super-diagonal tile (j, j+1).
+                int cols_next = plasma_tile_nview(A, j+1);
+                core_omp_zlacpy(
+                    PlasmaLower, PlasmaConjTrans,
+                    rows_j, cols_next,
+                    A(j, j+1), lda_j,
+                    AB(nb, j), band_ld,
+                    sequence, request);
+            }
+        }
+    }
+}
+#undef AB
+#undef A
diff --git a/compute/zheevd.c b/compute/zheevd.c
new file mode 100644
index 00000000..91770553
--- /dev/null
+++ b/compute/zheevd.c
@@ -0,0 +1,527 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> s d c
+ *
+ **/
+
+#include "plasma.h"
+#include "plasma_async.h"
+#include "plasma_context.h"
+#include "plasma_descriptor.h"
+#include "plasma_internal.h"
+#include "plasma_tuning.h"
+#include "plasma_types.h"
+#include "plasma_workspace.h"
+#include <string.h>
+#include "bulge.h"
+
+#include <omp.h>
+#include "core_lapack.h"
+
+/***************************************************************************//**
+ *
+ * @ingroup plasma_heevd
+ *
+ *  Computes all eigenvalues and, optionally,
+ *  eigenvectors of a complex Hermitian matrix A. The matrix A is
+ *  first reduced to tridiagonal form using a two-stage
+ *  approach:
+ *  First stage: reduction to band form;
+ *  Second stage: reduction from band to tridiagonal form.
+ *
+ *******************************************************************************
+ *
+ * @param[in] eigt
+ *          Intended usage:
+ *          = PlasmaEigVal:    computes eigenvalues only;
+ *          = PlasmaEigValVec: computes eigenvalues and eigenvectors.
+ *
+ * @param[in] uplo
+ *          Specifies whether the matrix A is upper triangular or
+ *          lower triangular:
+ *          = PlasmaUpper: Upper triangle of A is stored;
+ *          = PlasmaLower: Lower triangle of A is stored.
+ *
+ * @param[in] n
+ *          The order of the matrix A. n >= 0.
+ *
+ * @param[in,out] pA
+ *          On entry, the symmetric (or Hermitian) matrix pA.
+ *          If uplo = PlasmaUpper, the leading n-by-n upper triangular
+ *          part of pA contains the upper triangular part of the matrix
+ *          A, and the strictly lower triangular part of pA is not
+ *          referenced.
+ *          If uplo = PlasmaLower, the leading n-by-n lower triangular
+ *          part of pA contains the lower triangular part of the matrix
+ *          A, and the strictly upper triangular part of pA is not
+ *          referenced.
+ *          On exit, the lower triangle (if uplo = PlasmaLower) or the
+ *          upper triangle (if uplo = PlasmaUpper) of pA, including the
+ *          diagonal, is destroyed.
+ *
+ * @param[in] lda
+ *          The leading dimension of the array A. lda >= max(1,n).
+ *
+ * @param[out] W
+ *          On exit, if info = 0, the eigenvalues.
+ *
+ * @param[in, out] T
+ *          On exit, auxiliary factorization data produced by the reduction.
+ *          Matrix in T is allocated inside this function and needs to be
+ *          destroyed by plasma_desc_destroy.
+ *
+ * @param[out] pQ
+ *          On exit, if eigt = PlasmaEigValVec and info = 0, the eigenvectors.
+ *
+ * @param[in] ldq
+ *          The leading dimension of the array pQ. ldq >= max(1,n).
+ *
+ *******************************************************************************
+ *
+ * @retval PlasmaSuccess successful exit
+ * @retval < 0 if -i, the i-th argument had an illegal value
+ *
+ *******************************************************************************
+ *
+ * @sa plasma_zheevd
+ * @sa plasma_cheevd
+ * @sa plasma_dheevd
+ * @sa plasma_sheevd
+ *
+ ******************************************************************************/
+int plasma_zheevd(plasma_enum_t eigt, plasma_enum_t uplo, int n,
+                  plasma_complex64_t *pA, int lda,
+                  plasma_desc_t *T,
+                  double *W,
+                  plasma_complex64_t *pQ, int ldq)
+{
+    // Get PLASMA context.
+    plasma_context_t *plasma = plasma_context_self();
+    if (plasma == NULL) {
+        plasma_fatal_error("PLASMA not initialized");
+        return PlasmaErrorNotInitialized;
+    }
+
+    // Check input arguments; the error code is minus the argument position.
+    if (eigt != PlasmaEigVal && eigt != PlasmaEigValVec) {
+        plasma_error("illegal value of eigt");
+        return -1;
+    }
+    if (uplo != PlasmaLower && uplo != PlasmaUpper) {
+        plasma_error("illegal value of uplo");
+        return -2;
+    }
+    if (n < 0) {
+        plasma_error("illegal value of n");
+        return -3;
+    }
+    if (lda < imax(1, n)) {
+        plasma_error("illegal value of lda");
+        return -5;
+    }
+    if (ldq < imax(1, n)) {
+        plasma_error("illegal value of ldq");
+        return -9;
+    }
+
+    // Quick return (nothing has been allocated yet, T included).
+    if (n == 0)
+        return PlasmaSuccess;
+
+    // Set tiling parameters.
+    int ib = plasma->ib;
+    int nb = plasma->nb;
+
+    // Create tile matrix.
+    plasma_desc_t A;
+    int retval;
+    retval = plasma_desc_general_create(PlasmaComplexDouble, nb, nb,
+                                        n, n, 0, 0, n, n, &A);
+    if (retval != PlasmaSuccess) {
+        plasma_error("plasma_desc_general_create() failed");
+        return retval;
+    }
+
+    // Prepare descriptor T (allocated here, destroyed by the caller).
+    retval = plasma_descT_create(A, ib, PlasmaFlatHouseholder, T);
+    if (retval != PlasmaSuccess) {
+        plasma_error("plasma_descT_create() failed");
+        plasma_desc_destroy(&A);  // do not leak the tile copy of A
+        return retval;
+    }
+
+    // Allocate workspace.
+    plasma_workspace_t work;
+    size_t lwork = ib*nb + 4*nb*nb;  // geqrt: tau + work
+    retval = plasma_workspace_create(&work, lwork, PlasmaComplexDouble);
+    if (retval != PlasmaSuccess) {
+        plasma_error("plasma_workspace_create() failed");
+        plasma_desc_destroy(&A);  // T is left to the caller, as documented
+        return retval;
+    }
+
+    // Initialize sequence.
+    plasma_sequence_t sequence;
+    retval = plasma_sequence_init(&sequence);
+
+    // Initialize request.
+    plasma_request_t request;
+    retval = plasma_request_init(&request);
+
+    // asynchronous block
+    #pragma omp parallel
+    #pragma omp master
+    {
+        // Translate to tile layout.
+        plasma_omp_zge2desc(pA, lda, A, &sequence, &request);
+    }
+
+    // Warning !!! plasma_omp_zheevd is not a fully async function.
+    // It contains both async and sync functions.
+    plasma_omp_zheevd(eigt, uplo, A, *T, W, pQ, ldq, work, &sequence, &request);
+
+    #pragma omp parallel
+    #pragma omp master
+    {
+        // Translate back to LAPACK layout.
+        plasma_omp_zdesc2ge(A, pA, lda, &sequence, &request);
+    }
+
+    plasma_workspace_destroy(&work);
+
+    // Free matrix A in tile layout.
+    plasma_desc_destroy(&A);
+
+    // Return status.
+    return sequence.status;
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup plasma_heevd
+ *
+ *  Computes all eigenvalues and,
+ *  optionally, eigenvectors of a complex Hermitian matrix A using a
+ *  two-stage approach:
+ *  First stage: reduction to band tridiagonal form;
+ *  Second stage: reduction from band to tridiagonal form.
+ *
+ *  May return before the computation is finished.
+ *  Allows for pipelining of operations at runtime.
+ *
+ *******************************************************************************
+ *
+ * @param[in] eigt
+ *          Intended usage:
+ *          = PlasmaEigVal:    computes eigenvalues only;
+ *          = PlasmaEigValVec: computes eigenvalues and eigenvectors.
+ *
+ * @param[in] uplo
+ *          Specifies whether the matrix A is upper triangular or
+ *          lower triangular:
+ *          = PlasmaUpper: Upper triangle of A is stored;
+ *          = PlasmaLower: Lower triangle of A is stored.
+ *
+ * @param[in,out] A
+ *          Descriptor of matrix A.
+ *          A is stored in the tile layout.
+ *
+ * @param[out] W
+ *          On exit, if info = 0, the eigenvalues.
+ *
+ * @param[out] T
+ *          Descriptor of matrix T.
+ *          On exit, auxiliary factorization data, required by the auxiliary
+ *          QR factorization kernels to
+ *          solve the system of equations.
+ *
+ * @param[out] Q
+ *          On exit, if eigt = PlasmaEigValVec and info = 0, the eigenvectors.
+ *
+ * @param[in] ldq
+ *          The leading dimension of the eigenvectors matrix Q. ldq >= max(1,n).
+ *
+ * @param[in] sequence
+ *          Identifies the sequence of function calls that this call belongs to
+ *          (for completion checks and exception handling purposes).
+ *
+ * @param[out] request
+ *          Identifies this function call (for exception handling purposes).
+ *
+ *******************************************************************************
+ *
+ * @sa plasma_zheevd
+ * @sa plasma_omp_cheevd
+ * @sa plasma_omp_dsyev
+ * @sa plasma_omp_ssyev
+ *
+ ******************************************************************************/
+void plasma_omp_zheevd(plasma_enum_t eigt, plasma_enum_t uplo,
+                      plasma_desc_t A, plasma_desc_t T,
+                      double *W,
+                      plasma_complex64_t *pQ, int ldq,
+                      plasma_workspace_t work,
+                      plasma_sequence_t *sequence, plasma_request_t *request)
+{
+    // This routine returns void: failures are reported via sequence/request.
+    // Get PLASMA context.
+    plasma_context_t *plasma = plasma_context_self();
+    if (plasma == NULL) {
+        plasma_error("PLASMA not initialized");
+        plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
+        return;
+    }
+
+    // Check input arguments.
+    if (eigt != PlasmaEigVal && eigt != PlasmaEigValVec) {
+        plasma_error("illegal value of eigt");
+        plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
+        return;
+    }
+    if (uplo != PlasmaLower && uplo != PlasmaUpper) {
+        plasma_error("illegal value of uplo");
+        plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
+        return;
+    }
+    if (plasma_desc_check(A) != PlasmaSuccess) {
+        plasma_error("invalid A");
+        plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
+        return;
+    }
+    if (plasma_desc_check(T) != PlasmaSuccess) {
+        plasma_error("invalid T");
+        plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
+        return;
+    }
+    if (sequence == NULL) {
+        plasma_fatal_error("NULL sequence");
+        plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
+        return;
+    }
+    if (request == NULL) {
+        plasma_fatal_error("NULL request");
+        plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
+        return;
+    }
+
+    // quick return
+    if (imin(A.m, A.n) == 0)
+        return;
+
+    int n  = A.m;
+    int nb   = imin(A.mb, A.m);
+    int lda_band = 2*nb+1;
+
+    // Allocate zero-filled workspace (calloc) for the LAPACK band storage of A
+    // and for the off-diagonal E produced by the tridiagonalisation.
+    plasma_complex64_t *A_band =
+        (plasma_complex64_t *)calloc((size_t)lda_band*n, sizeof(plasma_complex64_t));
+    if (A_band == NULL) {
+        plasma_error("memory allocation(A_band) failed");
+        plasma_request_fail(sequence, request, PlasmaErrorOutOfMemory);
+        return;
+    }
+    double *E = (double *)calloc((size_t)n, sizeof(double));
+    if (E == NULL) {
+        plasma_error("malloc(E) failed");
+        plasma_request_fail(sequence, request, PlasmaErrorOutOfMemory);
+        free(A_band);
+        return;
+    }
+
+    //===================
+    // Reduction to band
+    //===================
+    double start = omp_get_wtime();
+    #pragma omp parallel
+    #pragma omp master
+    {
+        plasma_pzhe2hb(uplo,
+                       A, T,
+                       work,
+                       sequence, request);
+
+        // Copy tile band to lapack band
+        plasma_pzhecpy_tile2lapack_band (uplo,
+                                         A,
+                                         A_band, lda_band,
+                                         sequence, request);
+    }
+    double stop = omp_get_wtime();
+    double time = stop-start;
+    printf("\n N=%d:  1-stage time = %lf\t", n, time);
+
+    //====================
+    //  Bulge chasing
+    //====================
+
+    plasma_complex64_t *TAU2 = NULL;
+    plasma_complex64_t *V2 = NULL;
+    plasma_complex64_t *T2 = NULL;
+    int Vblksiz;  //Blocking used when applying V2 to the matrix Q
+    int blkcnt;  // Number of diamond tile or tile of Vs
+    int ldt, ldv;
+    int blguplo = PlasmaLower;
+
+    // wantz is forwarded to the bulge chasing kernels: 0 when only the
+    // eigenvalues are requested, 2 when the eigenvectors are wanted as well.
+    int wantz = (eigt == PlasmaEigVal) ? 0 : 2;
+
+    Vblksiz = nb/4;
+    ldt     = Vblksiz;
+    // All buffers below come from calloc() and therefore start zero-filled.
+    if( eigt == PlasmaEigValVec ) {
+        findVTsiz(n, nb, Vblksiz, &blkcnt, &ldv);
+        TAU2= (plasma_complex64_t *)
+            calloc((size_t)blkcnt*Vblksiz, sizeof(plasma_complex64_t));
+        V2  = (plasma_complex64_t *)
+            calloc((size_t)ldv*blkcnt*Vblksiz, sizeof(plasma_complex64_t));
+        T2  = (plasma_complex64_t *)
+            calloc((size_t)ldt*blkcnt*Vblksiz, sizeof(plasma_complex64_t));
+        if ( (TAU2 == NULL) || (V2 == NULL) || (T2 == NULL) ) {
+            plasma_error("calloc() failed");
+            plasma_request_fail(sequence, request, PlasmaErrorOutOfMemory);
+            free(TAU2);
+            free(V2);
+            free(T2);
+            free(E);
+            free(A_band);
+            return;
+        }
+    }
+    else {
+        TAU2   = (plasma_complex64_t *)
+            calloc((size_t)2*n, sizeof(plasma_complex64_t));
+        V2     = (plasma_complex64_t *)
+            calloc((size_t)2*n, sizeof(plasma_complex64_t ));
+        if ( (TAU2 == NULL) || (V2 == NULL) ) {
+            plasma_error("calloc() failed");
+            plasma_request_fail(sequence, request, PlasmaErrorOutOfMemory);
+            free(TAU2);
+            free(V2);
+            free(E);
+            free(A_band);
+            return;
+        }
+    }
+
+    // Main bulge chasing kernel.
+    // Contains internal omp parallel section
+    start = omp_get_wtime();
+    plasma_pzhbtrd_static(blguplo, n, nb, Vblksiz,
+                            A_band, lda_band,
+                            V2, TAU2,
+                            W, E,
+                            wantz,
+                            work,
+                            sequence, request);
+    stop = omp_get_wtime();
+    time = stop-start;
+    printf("2-stage timing = %lf\t", time);
+
+    //=======================================
+    //  calling eigensolver
+    //=======================================
+
+    // call eigensolver using lapack routine for our resulting tridiag [W E]
+    start = omp_get_wtime();
+    if(eigt == PlasmaEigVal){
+        LAPACKE_zstedc( LAPACK_COL_MAJOR,
+                        'N',
+                        n, W, E, pQ, ldq );
+    } else {
+        LAPACKE_zstedc( LAPACK_COL_MAJOR,
+                        'I',
+                        n, W, E, pQ, ldq );
+    }
+    stop = omp_get_wtime();
+    time = stop-start;
+    printf("Eigenvalue time = %lf\t", time);
+
+    start = omp_get_wtime();
+    if (eigt == PlasmaEigValVec) {
+        /*=======================================
+         *  apply Q2 from the bulge
+         *=======================================*/
+        // compute T2
+        #pragma omp parallel
+        {
+            plasma_pzlarft_blgtrd(n, nb, Vblksiz,
+                                  V2, T2, TAU2,
+                                  sequence, request);
+        }
+
+        // apply Q2 from Left
+        #pragma omp parallel
+        {
+            plasma_pzunmqr_blgtrd(PlasmaLeft,  PlasmaNoTrans,
+                                  n, nb, n,
+                                  Vblksiz, wantz,
+                                  V2, T2, TAU2,
+                                  pQ, ldq,
+                                  work,
+                                  sequence, request);
+        }
+
+        /*=======================================
+         *  apply Q1 from the first stage
+         *=======================================*/
+        // CASE nb >= n: Q1 doesn't need to be applied, only bulge chasing was done.
+        if( nb < n ){
+            plasma_desc_t Q;
+            int retval = plasma_desc_general_create(PlasmaComplexDouble, nb, nb,
+                                                    n, n, 0, 0, n, n, &Q);
+            if (retval != PlasmaSuccess) {
+                plasma_error("plasma_desc_general_create() failed");
+                plasma_request_fail(sequence, request, retval);
+                goto cleanup;
+            }
+
+            #pragma omp parallel
+            #pragma omp master
+            {
+                // Translate to tile layout.
+                plasma_pzge2desc(pQ, ldq, Q, sequence, request);
+                // Accumulate the transformations from the first stage
+                if(uplo==PlasmaLower){
+                    plasma_pzunmqr(PlasmaLeft, PlasmaNoTrans,
+                                   plasma_desc_view(A, A.mb, 0, A.m-A.mb, A.n-A.nb),
+                                   plasma_desc_view(T, T.mb, 0, T.m-T.mb, T.n-T.nb),
+                                   plasma_desc_view(Q, Q.mb, 0, Q.m-Q.mb, Q.n),
+                                   work,
+                                   sequence, request);
+                }
+                else {
+                    plasma_pzunmlq (PlasmaLeft, Plasma_ConjTrans,
+                                    plasma_desc_view(A, 0, A.nb, A.m-A.mb, A.n-A.nb),
+                                    plasma_desc_view(T, 0, T.nb, T.m-T.mb, T.n-T.nb),
+                                    plasma_desc_view(Q, Q.mb, 0, Q.m-Q.mb, Q.n),
+                                    work,
+                                    sequence, request);
+                }
+
+                // Translate back to LAPACK layout.
+                plasma_pzdesc2ge(Q, pQ, ldq, sequence, request);
+            }
+
+            plasma_desc_destroy(&Q);
+        } // END of ( nb < N )
+    }
+    stop = omp_get_wtime();
+    time = stop-start;
+    printf("Eigenvector timing = %lf\n", time);
+
+cleanup:  // reached on normal completion and when creating Q failed
+    free(T2);  // still NULL (a no-op) when only eigenvalues were requested
+    free(V2);
+    free(TAU2);
+    free(E);
+    free(A_band);
+}
+
diff --git a/core_blas/core_zhbtype1cb.c b/core_blas/core_zhbtype1cb.c
new file mode 100644
index 00000000..8939cf21
--- /dev/null
+++ b/core_blas/core_zhbtype1cb.c
@@ -0,0 +1,139 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+
+
+#include "core_blas.h"
+#include "plasma_types.h"
+#include "core_lapack.h"
+#include "bulge.h"
+#include <string.h>
+
+#define A(m,n)   (A + LDA * (n) + ((m)-(n)))
+#define V(m)     (V + (m))
+#define TAU(m)   (TAU + (m))
+
+/***************************************************************************//**
+ *
+ * @ingroup core_hbtype1cb
+ *
+ *  Is a kernel that will operate on a region (triangle) of data
+ *  bounded by st and ed. This kernel eliminates a column by a column-wise
+ *  annihilation, then it applies a left+right update on the hermitian triangle.
+ *  Note that the column to be eliminated is located at st-1.
+ *
+ *  All detail are available on technical report or SC11 paper.
+ *  Azzam Haidar, Hatem Ltaief, and Jack Dongarra. 2011.
+ *  Parallel reduction to condensed forms for symmetric eigenvalue problems
+ *  using aggregated fine-grained and memory-aware kernels. In Proceedings
+ *  of 2011 International Conference for High Performance Computing,
+ *  Networking, Storage and Analysis (SC '11). ACM, New York, NY, USA, ,
+ *  Article 8 , 11 pages.
+ *  http://doi.acm.org/10.1145/2063384.2063394
+ *
+ *******************************************************************************
+ *
+ * @param[in] N
+ *          The order of the matrix A.
+ *
+ * @param[in] NB
+ *          The size of the band.
+ *
+ * @param[in, out] A
+ *          A pointer to the matrix A of size (2*NB+1)-by-N.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the matrix A. LDA >= max(1,2*NB+1)
+ *
+ * @param[out] V
+ *          PLASMA_Complex64_t array, dimension N if eigenvalue only
+ *          requested or (LDV*blkcnt*Vblksiz) if Eigenvectors requested
+ *          The Householder reflectors are stored in this array.
+ *
+ * @param[out] TAU
+ *          PLASMA_Complex64_t array, dimension (N).
+ *          The scalar factors of the Householder reflectors are stored
+ *          in this array.
+ *
+ * @param[in] st
+ *          A pointer to the start index where this kernel will operate.
+ *
+ * @param[in] ed
+ *          A pointer to the end index where this kernel will operate.
+ *
+ * @param[in] sweep
+ *          The sweep number that is eliminated. it serve to calculate the
+ *          pointer to the position where to store the Vs and Ts.
+ *
+ * @param[in] Vblksiz
+ *          constant which correspond to the blocking used when applying the Vs.
+ *          it serve to calculate the pointer to the position where to store the
+ *          Vs and Ts.
+ *
+ * @param[in] WANTZ
+ *          constant which indicate if Eigenvalue are requested or both
+ *          Eigenvalue/Eigenvectors.
+ *
+ * @param[in] WORK
+ *          Workspace of size nb.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval PLASMA_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+/***************************************************************************
+ *          TYPE 1-BAND Lower-columnwise-Householder
+ ***************************************************************************/
+void core_zhbtype1cb(int N, int NB,
+                     plasma_complex64_t *A, int LDA,
+                     plasma_complex64_t *V, plasma_complex64_t *TAU,
+                     int st, int ed, int sweep, int Vblksiz, int WANTZ,
+                     plasma_complex64_t *WORK)
+{
+    // Offsets of this reflector inside V and TAU; tile_off and blk are
+    // extra outputs of findVTpos() that this kernel does not use.
+    int v_off, tau_off, tile_off, blk;
+
+    if( WANTZ != 0 ) {
+        // Eigenvectors requested: the position is computed by findVTpos().
+        findVTpos(N, NB, Vblksiz, sweep, st,
+                  &v_off, &tau_off, &tile_off, &blk);
+    }
+    else {
+        // Eigenvalues only: offset st in the N-sized half picked by (sweep+1)%2.
+        v_off   = ((sweep+1)%2)*N + st;
+        tau_off = v_off;
+    }
+
+    const int ldx   = LDA-1;     // leading dimension handed to core_zlarfy
+    const int nrows = ed-st+1;   // number of rows st..ed handled here
+    const size_t tail = (nrows-1)*sizeof(plasma_complex64_t);
+
+    // Move rows st+1..ed of column st-1 into V below a unit head; zero them in A.
+    *V(v_off) = 1.;
+    memcpy( V(v_off+1), A(st+1, st-1), tail );
+    memset( A(st+1, st-1), 0, tail );
+
+    // Eliminate the column at st-1.
+    LAPACKE_zlarfg_work(nrows, A(st, st-1), V(v_off+1), 1, TAU(tau_off) );
+
+    // Apply left and right on A(st:ed,st:ed).
+    core_zlarfy(nrows, A(st,st), ldx, V(v_off), TAU(tau_off), WORK);
+}
+/***************************************************************************/
+#undef A
+#undef V
+#undef TAU
+
+
diff --git a/core_blas/core_zhbtype2cb.c b/core_blas/core_zhbtype2cb.c
new file mode 100644
index 00000000..b37b7d94
--- /dev/null
+++ b/core_blas/core_zhbtype2cb.c
@@ -0,0 +1,164 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "bulge.h"
+#include <string.h>
+#include "core_blas.h"
+#include "plasma_types.h"
+#include "plasma_internal.h"
+#include "core_lapack.h"
+
+#define A(m,n)   (A + LDA * (n) + ((m)-(n)))
+#define V(m)     (V + (m))
+#define TAU(m)   (TAU + (m))
+
+/***************************************************************************//**
+ *
+ * @ingroup CORE_PLASMA_Complex64_t
+ *
+ *  CORE_zhbtype2cb is a kernel that will operate on a region (triangle) of data
+ *  bounded by st and ed. This kernel apply the right update remaining from the
+ *  type1 and this later will create a bulge so it eliminate the first column of
+ *  the created bulge and do the corresponding Left update.
+ *
+ *  All detail are available on technical report or SC11 paper.
+ *  Azzam Haidar, Hatem Ltaief, and Jack Dongarra. 2011.
+ *  Parallel reduction to condensed forms for symmetric eigenvalue problems
+ *  using aggregated fine-grained and memory-aware kernels. In Proceedings
+ *  of 2011 International Conference for High Performance Computing,
+ *  Networking, Storage and Analysis (SC '11). ACM, New York, NY, USA, ,
+ *  Article 8 , 11 pages.
+ *  http://doi.acm.org/10.1145/2063384.2063394
+ *
+ *******************************************************************************
+ *
+ * @param[in] N
+ *          The order of the matrix A.
+ *
+ * @param[in] NB
+ *          The size of the band.
+ *
+ * @param[in, out] A
+ *          A pointer to the matrix A of size (2*NB+1)-by-N.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the matrix A. LDA >= max(1,2*NB+1)
+ *
+ * @param[in, out] V
+ *          PLASMA_Complex64_t array, dimension N if eigenvalue only
+ *          requested or (LDV*blkcnt*Vblksiz) if Eigenvectors requested
+ *          The Householder reflectors of the previous type 1 are used here
+ *          to continue update then new one are generated to eliminate the
+ *          bulge and stored in this array.
+ *
+ * @param[in, out] TAU
+ *          PLASMA_Complex64_t array, dimension (N).
+ *          The scalar factors of the Householder reflectors of the previous
+ *          type 1 are used here to continue update then new one are generated
+ *          to eliminate the bulge and stored in this array.
+ *
+ * @param[in] st
+ *          A pointer to the start index where this kernel will operate.
+ *
+ * @param[in] ed
+ *          A pointer to the end index where this kernel will operate.
+ *
+ * @param[in] sweep
+ *          The sweep number that is eliminated. it serve to calculate the
+ *          pointer to the position where to store the Vs and Ts.
+ *
+ * @param[in] Vblksiz
+ *          constant which correspond to the blocking used when applying the Vs.
+ *          it serve to calculate the pointer to the position where to store the
+ *          Vs and Ts.
+ *
+ * @param[in] WANTZ
+ *          constant which indicate if Eigenvalue are requested or both
+ *          Eigenvalue/Eigenvectors.
+ *
+ * @param[in] WORK
+ *          Workspace of size nb.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval PLASMA_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+
+/***************************************************************************
+ *          TYPE 2-BAND Lower-columnwise-Householder
+ ***************************************************************************/
+void core_zhbtype2cb(int N, int NB,
+                     plasma_complex64_t *A, int LDA,
+                     plasma_complex64_t *V, plasma_complex64_t *TAU,
+                     int st, int ed, int sweep, int Vblksiz, int WANTZ,
+                     plasma_complex64_t *WORK)
+{
+    int blkid, vpos, taupos, tpos;
+
+    // Locate the reflector stored for row st; it is applied from the right.
+    if( WANTZ != 0 ) {
+        findVTpos(N, NB, Vblksiz, sweep, st,
+                  &vpos, &taupos, &tpos, &blkid);
+    }
+    else {
+        vpos   = ((sweep+1)%2)*N + st;
+        taupos = vpos;
+    }
+
+    const int ldx   = LDA-1;
+    const int first = ed+1;                // first row below the block
+    const int last  = imin(ed+NB, N-1);    // last row touched by the update
+    const int ncols = ed-st+1;             // columns st..ed
+    const int nrows = last-first+1;        // rows first..last
+
+    // No row below the block: neither a right update nor a bulge.
+    if( nrows <= 0 )
+        return;
+
+    // Apply the remaining right update coming from the top block.
+    LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaRight),
+                        nrows, ncols, V(vpos), *(TAU(taupos)),
+                        A(first, st), ldx, WORK);
+
+    // With a single row there is no bulge column to eliminate.
+    if( nrows == 1 )
+        return;
+
+    // Position where the reflector for row `first` is stored.
+    if( WANTZ != 0 ) {
+        findVTpos(N,NB,Vblksiz,sweep,first, &vpos, &taupos, &tpos, &blkid);
+    }
+    else {
+        vpos   = ((sweep+1)%2)*N + first;
+        taupos = vpos;
+    }
+
+    // Move the first column of the created bulge into V, below a unit head.
+    *V(vpos) = 1.;
+    memcpy(V(vpos+1), A(first+1, st), (nrows-1)*sizeof(plasma_complex64_t));
+    memset(A(first+1, st), 0, (nrows-1)*sizeof(plasma_complex64_t));
+
+    // Eliminate the column at st.
+    LAPACKE_zlarfg_work( nrows, A(first, st), V(vpos+1), 1, TAU(taupos) );
+
+    // Apply left on A(first:last, st+1:ed): one column fewer than above,
+    // because column st is the one that has just been removed.
+    plasma_complex64_t ctmp = conj(*TAU(taupos));
+    LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaLeft),
+                        nrows, ncols-1, V(vpos), ctmp, A(first, st+1), ldx, WORK);
+}
+/***************************************************************************/
+#undef A
+#undef V
+#undef TAU
diff --git a/core_blas/core_zhbtype3cb.c b/core_blas/core_zhbtype3cb.c
new file mode 100644
index 00000000..0e3d53b4
--- /dev/null
+++ b/core_blas/core_zhbtype3cb.c
@@ -0,0 +1,124 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+
+#include "core_blas.h"
+#include "plasma_types.h"
+#include "core_lapack.h"
+#include "bulge.h"
+
+#define A(m,n)   (A + LDA * (n) + ((m)-(n)))
+#define V(m)     (V + (m))
+#define TAU(m)   (TAU + (m))
+
+/***************************************************************************//**
+ *
+ * @ingroup CORE_plasma_complex64_t
+ *
+ *  CORE_zhbtype3cb is a kernel that will operate on a region (triangle) of data
+ *  bounded by st and ed. This kernel apply a left+right update on the hermitian
+ *  triangle.  Note that this kernel is very similar to type1 but does not do an
+ *  elimination.
+ *
+ *  All detail are available on technical report or SC11 paper.
+ *  Azzam Haidar, Hatem Ltaief, and Jack Dongarra. 2011.
+ *  Parallel reduction to condensed forms for symmetric eigenvalue problems
+ *  using aggregated fine-grained and memory-aware kernels. In Proceedings
+ *  of 2011 International Conference for High Performance Computing,
+ *  Networking, Storage and Analysis (SC '11). ACM, New York, NY, USA, ,
+ *  Article 8 , 11 pages.
+ *  http://doi.acm.org/10.1145/2063384.2063394
+ *
+ *******************************************************************************
+ *
+ * @param[in] N
+ *          The order of the matrix A.
+ *
+ * @param[in] NB
+ *          The size of the band.
+ *
+ * @param[in, out] A
+ *          A pointer to the matrix A of size (2*NB+1)-by-N.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the matrix A. LDA >= max(1,2*NB+1)
+ *
+ * @param[in] V
+ *          plasma_complex64_t array, dimension N if eigenvalue only
+ *          requested or (LDV*blkcnt*Vblksiz) if Eigenvectors requested
+ *          The Householder reflectors are stored in this array.
+ *
+ * @param[in] TAU
+ *          plasma_complex64_t array, dimension (N).
+ *          The scalar factors of the Householder reflectors are stored
+ *          in this array.
+ *
+ * @param[in] st
+ *          A pointer to the start index where this kernel will operate.
+ *
+ * @param[in] ed
+ *          A pointer to the end index where this kernel will operate.
+ *
+ * @param[in] sweep
+ *          The sweep number that is eliminated. it serve to calculate the
+ *          pointer to the position where to store the Vs and Ts.
+ *
+ * @param[in] Vblksiz
+ *          constant which correspond to the blocking used when applying the Vs.
+ *          it serve to calculate the pointer to the position where to store the
+ *          Vs and Ts.
+ *
+ * @param[in] WANTZ
+ *          constant which indicate if Eigenvalue are requested or both
+ *          Eigenvalue/Eigenvectors.
+ *
+ * @param[in] WORK
+ *          Workspace of size nb.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval PLASMA_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+
+/***************************************************************************//**
+ *          TYPE 3-BAND Lower-columnwise-Householder
+ ***************************************************************************/
+void core_zhbtype3cb(int N, int NB,
+                     plasma_complex64_t *A, int LDA,
+                     const plasma_complex64_t *V, const plasma_complex64_t *TAU,
+                     int st, int ed, int sweep, int Vblksiz, int WANTZ,
+                     plasma_complex64_t *WORK)
+{
+    int blkid, vpos, taupos, tpos;
+
+    // Locate the reflector stored for row st in V and TAU.
+    if( WANTZ != 0 ) {
+        findVTpos(N, NB, Vblksiz, sweep, st,
+                  &vpos, &taupos, &tpos, &blkid);
+    }
+    else {
+        vpos   = ((sweep+1)%2)*N + st;
+        taupos = vpos;
+    }
+
+    const int ldx   = LDA-1;
+    const int order = ed-st+1;
+
+    // Apply left and right on A(st:ed,st:ed); no elimination is done here.
+    core_zlarfy(order, A(st,st), ldx, V(vpos), TAU(taupos), WORK);
+}
+/***************************************************************************/
+#undef A
+#undef V
+#undef TAU
diff --git a/core_blas/core_zherfb.c b/core_blas/core_zherfb.c
new file mode 100644
index 00000000..e50efdfa
--- /dev/null
+++ b/core_blas/core_zherfb.c
@@ -0,0 +1,225 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+
+#include "core_blas.h"
+#include "plasma_types.h"
+#include "plasma_internal.h"
+#include "core_lapack.h"
+
+#include <omp.h>
+
+/***************************************************************************//**
+ *
+ * @ingroup core_herfb
+ *
+ *  Overwrites the symmetric complex n-by-n tile C with
+ *
+ *    Q**T*C*Q
+ *
+ *  where Q is a complex unitary matrix defined as the product of k
+ *  elementary reflectors
+ *
+ *    Q = H(1) H(2) . . . H(k)
+ *
+ *  as returned by CORE_zgeqrt. Only PlasmaLower supported!
+ *
+ *******************************************************************************
+ *
+ * @param[in] uplo
+ *         - PlasmaLower : the upper part of the symmetric matrix C
+ *                         is not referenced.
+ *         - PlasmaUpper : the lower part of the symmetric matrix C
+ *                         is not referenced (not supported).
+ * @param[in] n
+ *          The number of rows/columns of the tile C.  n >= 0.
+ *
+ * @param[in] k
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q. k >= 0.
+ *
+ * @param[in] ib
+ *         The inner-blocking size.  ib >= 0.
+
+ * @param[in] A
+ *         The i-th column must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         CORE_zgeqrt in the first k columns of its array argument A.
+ *
+ * @param[in] lda
+ *         The leading dimension of the array A. lda >= max(1,n).
+ *
+ * @param[in] T
+ *         The ib-by-k triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] ldt
+ *         The leading dimension of the array T. ldt >= ib.
+ *
+ * @param[in,out] C
+ *         On entry, the symmetric n-by-n tile C.
+ *         On exit, C is overwritten by Q**T*C*Q.
+ *
+ * @param[in] ldc
+ *         The leading dimension of the array C. ldc >= max(1,n).
+ *
+ * @param[in,out] work
+ *         On exit, if INFO = 0, work(1) returns the optimal ldwork.
+ *
+ * @param[in] ldwork
+ *         The dimension of the array work. ldwork >= max(1,n);
+ *
+ *******************************************************************************
+ *
+ * @retval  PlasmaSuccess successful exit
+ * @retval  < 0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+int core_zherfb(plasma_enum_t uplo,
+                 int n, int k, int ib,
+                 const plasma_complex64_t *A,    int lda,
+                 const plasma_complex64_t *T,    int ldt,
+                       plasma_complex64_t *C,    int ldc,
+                       plasma_complex64_t *work, int ldwork )
+{
+    // Check input arguments.
+    if (uplo != PlasmaUpper && uplo != PlasmaLower) {
+        coreblas_error("Illegal value of uplo");
+        return -1;
+    }
+    if (n < 0) {
+        coreblas_error("Illegal value of n");
+        return -2;
+    }
+    if (k < 0) {
+        coreblas_error("Illegal value of k");
+        return -3;
+    }
+    if (ib < 0) {
+        coreblas_error("Illegal value of ib");
+        return -4;
+    }
+    // The leading dimensions of A, T and C are only validated when the
+    // matching extent (n or ib) is non-zero.
+    if (n > 0 && lda < n) {
+        coreblas_error("Illegal value of lda");
+        return -6;
+    }
+    if (ib > 0 && ldt < ib) {
+        coreblas_error("Illegal value of ldt");
+        return -8;
+    }
+    if (n > 0 && ldc < n) {
+        coreblas_error("Illegal value of ldc");
+        return -10;
+    }
+    if (ldwork < imax(1,n)) {
+        coreblas_error("Illegal value of ldwork");
+        return -12;
+    }
+
+    // Quick return
+    if (n == 0 || k == 0 || ib == 0)
+        return PlasmaSuccess;
+
+    // work is used as two consecutive areas: its first n columns (leading
+    // dimension ldwork) receive the full copy of C, and what follows is
+    // handed to the unmqr/unmlq kernels as their own workspace.
+    plasma_complex64_t *scratch = work + n*ldwork;
+
+    if (uplo == PlasmaLower) {
+        // Rebuild the full block in work from the lower part of C.
+        for (int col = 0; col < n; col++) {
+            // Diagonal entry.
+            work[col + col*ldwork] = C[col + col*ldc];
+            for (int row = col+1; row < n; row++) {
+                plasma_complex64_t below = C[row + col*ldc];
+                work[row + col*ldwork] = below;
+                work[col + row*ldwork] = conj( below );
+            }
+        }
+
+        // Left
+        core_zunmqr(PlasmaLeft, Plasma_ConjTrans, n, n, k, ib,
+                    A, lda, T, ldt, work, ldwork, scratch, ldwork);
+        // Right
+        core_zunmqr(PlasmaRight, PlasmaNoTrans, n, n, k, ib,
+                    A, lda, T, ldt, work, ldwork, scratch, ldwork);
+
+        // Copy back the final result to the lower part of C.
+        LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, lapack_const(PlasmaLower),
+                             n, n, work, ldwork, C, ldc );
+    }
+    else {
+        // Rebuild the full block in work from the upper part of C.
+        for (int col = 0; col < n; col++) {
+            for (int row = 0; row < col; row++) {
+                plasma_complex64_t above = C[row + col*ldc];
+                work[row + col*ldwork] = above;
+                work[col + row*ldwork] = conj( above );
+            }
+            // Diagonal entry.
+            work[col + col*ldwork] = C[col + col*ldc];
+        }
+
+        // Right
+        core_zunmlq(PlasmaRight, Plasma_ConjTrans, n, n, k, ib,
+                    A, lda, T, ldt, work, ldwork, scratch, ldwork);
+        // Left
+        core_zunmlq(PlasmaLeft, PlasmaNoTrans, n, n, k, ib,
+                    A, lda, T, ldt, work, ldwork, scratch, ldwork);
+
+        // Copy back the final result to the upper part of C.
+        LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, lapack_const(PlasmaUpper),
+                             n, n, work, ldwork, C, ldc );
+    }
+
+    return PlasmaSuccess;
+}
+
+/******************************************************************************/
+void core_omp_zherfb(plasma_enum_t uplo,
+                     int n, int k, int ib,
+                     const plasma_complex64_t *A, int lda,
+                     const plasma_complex64_t *T, int ldt,
+                           plasma_complex64_t *C, int ldc,
+                     plasma_workspace_t work,
+                     plasma_sequence_t *sequence, plasma_request_t *request)
+{
+    // OpenMP depends on lda == n == nb, ldc == nb, ldt == ib.
+    #pragma omp task depend(in:A[0:lda*k]) \
+                     depend(in:T[0:ib*k]) \
+                     depend(inout:C[0:ldc*n])
+    {
+        // The kernel is skipped once the sequence has already failed.
+        if (sequence->status == PlasmaSuccess) {
+            // Pick the workspace slot indexed by the number of the thread
+            // that executes this task.
+            plasma_complex64_t *scratch =
+                (plasma_complex64_t*)work.spaces[omp_get_thread_num()];
+
+            // Call the kernel with ldwork == n.
+            int status = core_zherfb(uplo,
+                                     n, k, ib,
+                                     A, lda, T, ldt,
+                                     C, ldc,
+                                     scratch, n);
+            if (status != PlasmaSuccess) {
+                // The kernel returns -i for a bad i-th argument.
+                plasma_error_with_code("Error in call to COREBLAS in argument",
+                                       -status);
+                plasma_request_fail(sequence, request,
+                                    PlasmaErrorIllegalValue);
+            }
+        }
+    }
+}
diff --git a/core_blas/core_zlarfy.c b/core_blas/core_zlarfy.c
new file mode 100644
index 00000000..efaacdc0
--- /dev/null
+++ b/core_blas/core_zlarfy.c
@@ -0,0 +1,99 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+
+#include "core_blas.h"
+#include "plasma_types.h"
+#include "core_lapack.h"
+
+#undef REAL
+#define COMPLEX
+
+/***************************************************************************//**
+ *
+ * @ingroup CORE_plasma_complex64_t
+ *
+ *  core_zlarfy applies an elementary reflector, or Householder matrix, H,
+ *  to an N-by-N Hermitian matrix A, from both the left and the right.
+ *
+ *  H is represented in the form
+ *
+ *     H = I - tau * v * v'
+ *
+ *  where  tau  is a scalar and  v  is a vector.
+ *
+ *  If tau is zero, then H is taken to be the unit matrix.
+ *
+ *******************************************************************************
+ *
+ * @param[in] N
+ *          The number of rows and columns of the matrix A.  N >= 0.
+ *
+ * @param[in,out] A
+ *          COMPLEX*16 array, dimension (LDA, N)
+ *          On entry, the Hermitian matrix A.
+ *          On exit, A is overwritten by H * A * H'.
+ *
+ * @param[in] LDA
+ *         The leading dimension of the array A.  LDA >= max(1,N).
+ *
+ * @param[in] V
+ *          The vector v, of length N, that defines the Householder reflector H.
+ *
+ * @param[in] TAU
+ *          The value tau.
+ *
+ * @param[out] WORK
+ *          Workspace of length N.
+ *
+ ******************************************************************************/
+void core_zlarfy(int N,
+            plasma_complex64_t *A, int LDA,
+            const plasma_complex64_t *V,
+            const plasma_complex64_t *TAU,
+            plasma_complex64_t *WORK)
+{
+    static plasma_complex64_t zzero =  0.0;
+    static plasma_complex64_t zmone = -1.0;
+
+    int j;
+    plasma_complex64_t dtmp;
+
+    /* X = tau * A * V, stored in WORK; */
+    /* zhemv is told to use the lower triangle of A. */
+    cblas_zhemv(CblasColMajor, CblasLower,
+                N, CBLAS_SADDR(*TAU), A, LDA,
+                V, 1, CBLAS_SADDR(zzero), WORK, 1);
+
+    /* dtmp = X' * V, i.e. what cblas_zdotc_sub(N, WORK, 1, V, 1, &dtmp) computes. */
+    dtmp = 0.;
+    for (j = 0; j < N ; j++)
+        dtmp = dtmp + conj(WORK[j]) * V[j];
+
+    /* dtmp = -1/2 * (X' * V) * tau; note the minus sign is folded in here. */
+    dtmp = -dtmp * 0.5 * (*TAU);
+
+    /* W = X - 1/2 * tau * (X' * V) * V = X + dtmp * V, stored in WORK. */
+    cblas_zaxpy(N, CBLAS_SADDR(dtmp),
+                V, 1, WORK, 1);
+
+    /*
+     * Hermitian rank-2 update on the lower triangle of A (alpha = -1):
+     *    A := A - W * V' - V * W'
+     */
+    cblas_zher2(CblasColMajor, CblasLower, N,
+                CBLAS_SADDR(zmone), WORK, 1,
+                                    V,    1,
+                                    A,    LDA);
+
+    return;
+}
+#undef COMPLEX
diff --git a/core_blas/core_ztsmlq_corner.c b/core_blas/core_ztsmlq_corner.c
new file mode 100644
index 00000000..dac68fb4
--- /dev/null
+++ b/core_blas/core_ztsmlq_corner.c
@@ -0,0 +1,252 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+
+#include "core_blas.h"
+#include "plasma_types.h"
+#include "plasma_internal.h"
+#include "core_lapack.h"
+
+#include <omp.h>
+
+/***************************************************************************//**
+ *
+ * @ingroup core_tsmlq_corner
+ *
+ * Applies left and right transformations as depicted below:
+ * |I -VTV'| * | A1  A2 | * |I - VT'V'|
+ *             | A2' A3 |
+ * where A1 and A3 are Hermitian matrices.
+ * Only the upper part of A1 and A3 is referenced.
+ * This is an adhoc implementation, can be further optimized...
+ *
+ *******************************************************************************
+ *
+ * @param[in] m1
+ *         The number of rows of the tile A1. m1 >= 0.
+ *
+ * @param[in] n1
+ *         The number of columns of the tile A1. n1 >= 0.
+ *
+ * @param[in] m2
+ *         The number of rows of the tile A2. m2 >= 0.
+ *
+ * @param[in] n2
+ *         The number of columns of the tile A2. n2 >= 0.
+ *
+ * @param[in] m3
+ *         The number of rows of the tile A3. m3 >= 0.
+ *
+ * @param[in] n3
+ *         The number of columns of the tile A3. n3 >= 0.
+ *
+ * @param[in] k
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q.
+ *
+ * @param[in] ib
+ *         The inner-blocking size.  ib >= 0.
+ *
+ * @param[in,out] A1
+ *         On entry, the m1-by-n1 tile A1.
+ *         On exit, A1 is overwritten by the application of Q.
+ *
+ * @param[in] lda1
+ *         The leading dimension of the array A1. lda1 >= max(1,m1).
+ *
+ * @param[in,out] A2
+ *         On entry, the m2-by-n2 tile A2.
+ *         On exit, A2 is overwritten by the application of Q.
+ *
+ * @param[in] lda2
+ *         The leading dimension of the tile A2. lda2 >= max(1,m2).
+ *
+ * @param[in,out] A3
+ *         On entry, the m3-by-n3 tile A3; on exit, its upper part is overwritten.
+ *
+ * @param[in] lda3
+ *         The leading dimension of the tile A3. lda3 >= max(1,m3).
+ *
+ * @param[in] V
+ *         The i-th row must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         CORE_ZTSLQT in the first k rows of its array argument V.
+ *
+ * @param[in] ldv
+ *         The leading dimension of the array V. ldv >= max(1,k).
+ *
+ * @param[in] T
+ *         The ib-by-n1 triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] ldt
+ *         The leading dimension of the array T. ldt >= ib.
+ *
+ * @param[out] work
+ *         Workspace array. With nb = n1, the rebuilt A1 starts at work,
+ *         A2' at work + nb*ldwork, the rebuilt A3 at work + 2*nb*ldwork,
+ *         and work + 3*nb*ldwork is passed to core_ztsmlq as its own
+ *         workspace.
+ *
+ * @param[in] ldwork
+ *         The leading dimension of the array work; also passed as the
+ *         leading dimension of the workspace handed to core_ztsmlq.
+ *
+ *******************************************************************************
+ *
+ * @retval PlasmaSuccess successful exit
+ * @retval < 0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+int core_ztsmlq_corner(int m1, int n1, int m2, int n2,
+                       int m3, int n3, int k, int ib,
+                             plasma_complex64_t *A1, int lda1,
+                             plasma_complex64_t *A2, int lda2,
+                             plasma_complex64_t *A3, int lda3,
+                       const plasma_complex64_t *V,  int ldv,
+                       const plasma_complex64_t *T,  int ldt,
+                       plasma_complex64_t *work, int ldwork)
+{
+    plasma_enum_t side;
+    plasma_enum_t trans;
+    int i, j;
+
+    // Check input arguments: A1 must be square.
+    if ( m1 != n1 ) {
+        coreblas_error("Illegal value of M1, N1");
+        return -1;
+    }
+    int nb = n1;  // column stride between the panels kept in work
+    // Rebuild the full Hermitian block from the upper triangle: work <- A1
+    for (i = 0; i < m1; i++)
+        for (j = i; j < n1; j++){
+            *(work + i + j*ldwork) = *(A1 + i + j*lda1);
+            if (j > i){
+                *(work + j + i*ldwork) =  conj( *(work + i + j*ldwork) );
+            }
+        }
+
+    //  Copy the conjugate transpose of A2: work+nb*ldwork <- A2'
+    for (j = 0; j < n2; j++)
+        for (i = 0; i < m2; i++){
+            *(work + j + (i + nb) * ldwork) = conj( *(A2 + i + j*lda2) );
+        }
+
+    side = PlasmaRight;
+    trans = Plasma_ConjTrans;
+
+    //  Right application on |A1 A2|  (A1 copy in work, A2 in place)
+    core_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
+                work, ldwork, A2, lda2,
+                V, ldv, T, ldt,
+                work+3*nb*ldwork, ldwork);
+
+    //  Rebuild the full Hermitian block from the upper triangle: work+2*nb*ldwork <- A3
+    for (i = 0; i < m3; i++)
+        for (j = i; j < n3; j++){
+            *(work + i + (j + 2*nb) * ldwork) = *(A3 + i + j*lda3);
+            if (j > i){
+                *(work + j + (i + 2*nb) * ldwork) =  conj ( *(work + i + (j + 2*nb) * ldwork) );
+            }
+        }
+
+    //  Right application on | A2' A3 |  (both operands live in work)
+    core_ztsmlq(side, trans, n2, m2, m3, n3, k, ib,
+                work+nb*ldwork, ldwork, work+2*nb*ldwork, ldwork,
+                V, ldv, T, ldt,
+                work + 3*nb*ldwork, ldwork);
+
+    side = PlasmaLeft;
+    trans = PlasmaNoTrans;
+
+    //========================================================
+    //  Left application on | A1  |   (both operands in work)
+    //                      | A2' |
+    //========================================================
+    core_ztsmlq(side, trans, m1, n1, n2, m2, k, ib,
+                work, ldwork, work+nb*ldwork, ldwork,
+                V, ldv, T, ldt,
+                work + 3*nb*ldwork, ldwork);
+
+    //========================================================
+    //  Copy back the final result to the upper part of A1
+    //  (entries with j >= i only): A1 = work
+    //========================================================
+    for (i = 0; i < m1; i++)
+        for (j = i; j < n1; j++)
+            *(A1 + i + j*lda1) = *(work + i + j*ldwork);
+
+    //========================================================
+    //  Left application on | A2 |   (A2 is updated in place)
+    //                      | A3 |
+    //========================================================
+    core_ztsmlq(side, trans, m2, n2, m3, n3, k, ib,
+                A2, lda2, work+2*nb*ldwork, ldwork,
+                V, ldv, T, ldt,
+                work + 3*nb*ldwork, ldwork);
+
+    //========================================================
+    //  Copy back the final result to the upper part of A3
+    //  (entries with j >= i only): A3 = work+2*nb*ldwork
+    //========================================================
+    for (i = 0; i < m3; i++)
+        for (j = i; j < n3; j++)
+            *(A3 + i + j*lda3) = *(work + i + (j+ 2*nb) * ldwork);
+
+    return PlasmaSuccess;
+}
+
+void core_omp_ztsmlq_corner(int m1, int n1, int m2, int n2,
+                            int m3, int n3, int k, int ib,
+                                  plasma_complex64_t *A1, int lda1,
+                                  plasma_complex64_t *A2, int lda2,
+                                  plasma_complex64_t *A3, int lda3,
+                            const plasma_complex64_t *V,  int ldv,
+                            const plasma_complex64_t *T,  int ldt,
+                            plasma_workspace_t work,
+                            plasma_sequence_t *sequence, plasma_request_t *request)
+{
+    int nb = n1;
+    // Runs core_ztsmlq_corner as an OpenMP task. The depend ranges assume
+    // m1 == nb, n1 == nb, m2 == nb, n2 == nb, m3 == nb, n3 == nb.
+    #pragma omp task depend(inout:A1[0:nb*nb]) \
+                     depend(inout:A2[0:nb*nb]) \
+                     depend(inout:A3[0:nb*nb]) \
+                     depend(in:V[0:nb*nb]) \
+                     depend(in:T[0:ib*nb])
+    {
+        if (sequence->status == PlasmaSuccess) {  // otherwise the task body does nothing
+            int tid = omp_get_thread_num();
+            plasma_complex64_t *W   =
+                ((plasma_complex64_t*)work.spaces[tid]);  // slot indexed by thread number
+
+            int ldwork = nb;  // leading dimension handed to the kernel for W
+
+            // Call the kernel.
+            int info = core_ztsmlq_corner(m1, n1, m2, n2, m3, n3, k, ib,
+                                          A1, lda1,
+                                          A2, lda2,
+                                          A3, lda3,
+                                          V, ldv,
+                                          T, ldt,
+                                          W, ldwork);
+
+            if (info != PlasmaSuccess) {  // kernel returned an error code
+                plasma_error_with_code("Error in call to COREBLAS in argument",
+                                       -info);
+                plasma_request_fail(sequence, request,
+                                    PlasmaErrorIllegalValue);
+            }
+        }
+    }
+}
+
diff --git a/core_blas/core_ztsmlq_hetra1.c b/core_blas/core_ztsmlq_hetra1.c
new file mode 100644
index 00000000..14829ddb
--- /dev/null
+++ b/core_blas/core_ztsmlq_hetra1.c
@@ -0,0 +1,196 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+
+#include "core_blas.h"
+#include "plasma_types.h"
+#include "plasma_internal.h"
+#include "core_lapack.h"
+
+#include <omp.h>
+
+/***************************************************************************//**
+ *
+ * @ingroup core_tsmlq_hetra1
+ *
+ * This kernel applies a Right transformation on | A1' A2 |
+ * and does not handle the transpose of A1.
+ * Needs therefore to make the explicit transpose of A1 before
+ * and after the application of the block of reflectors
+ * Can be further optimized by changing accordingly the underneath
+ * kernel ztsrfb!
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *         - PlasmaLeft  : apply Q or Q**H from the Left;
+ *         - PlasmaRight : apply Q or Q**H from the Right.
+ *
+ * @param[in] trans
+ *         - PlasmaNoTrans   :  apply Q;
+ *         - PlasmaConjTrans :  apply Q**H.
+ *
+ * @param[in] m1
+ *         The number of rows of the tile A1. m1 >= 0.
+ *
+ * @param[in] n1
+ *         The number of columns of the tile A1. n1 >= 0.
+ *
+ * @param[in] m2
+ *         The number of rows of the tile A2. m2 >= 0.
+ *         m2 = m1 if side == PlasmaRight.
+ *
+ * @param[in] n2
+ *         The number of columns of the tile A2. n2 >= 0.
+ *         n2 = n1 if side == PlasmaLeft.
+ *
+ * @param[in] k
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q.
+ *
+ * @param[in] ib
+ *         The inner-blocking size.  ib >= 0.
+ *
+ * @param[in,out] A1
+ *         On entry, the m1-by-n1 tile A1.
+ *         On exit, A1 is overwritten by the application of Q.
+ *
+ * @param[in] lda1
+ *         The leading dimension of the array A1. lda1 >= max(1,m1).
+ *
+ * @param[in,out] A2
+ *         On entry, the m2-by-n2 tile A2.
+ *         On exit, A2 is overwritten by the application of Q.
+ *
+ * @param[in] lda2
+ *         The leading dimension of the tile A2. lda2 >= max(1,m2).
+ *
+ * @param[in] V
+ *         The i-th row must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         CORE_ZTSLQT in the first k rows of its array argument V.
+ *
+ * @param[in] ldv
+ *         The leading dimension of the array V. ldv >= max(1,k).
+ *
+ * @param[in] T
+ *         The ib-by-n1 triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] ldt
+ *         The leading dimension of the array T. ldt >= ib.
+ *
+ * @param[out] work
+ *         Workspace array of size
+ *             ldwork-by-m1 if side == PlasmaLeft
+ *             ldwork-by-ib if side == PlasmaRight
+ *
+ * @param[in] ldwork
+ *         The leading dimension of the array work.
+ *             ldwork >= max(1,ib) if side == PlasmaLeft
+ *             ldwork >= max(1,n1) if side == PlasmaRight
+ *
+ *******************************************************************************
+ *
+ * @retval PlasmaSuccess successful exit
+ * @retval < 0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+int core_ztsmlq_hetra1(plasma_enum_t side, plasma_enum_t trans,
+                       int m1, int n1, int m2, int n2, int k, int ib,
+                             plasma_complex64_t *A1, int lda1,
+                             plasma_complex64_t *A2, int lda2,
+                       const plasma_complex64_t *V,  int ldv,
+                       const plasma_complex64_t *T,  int ldt,
+                       plasma_complex64_t *work, int ldwork)
+{
+    int i, j;
+
+    // Check input arguments: the in-place transpose below needs a square A1.
+    if ( (m1 != n1) ) {
+        coreblas_error("illegal value of m1, n1");
+        return -3;
+    }
+
+    // In-place conjugate transpose of A1; work[0] is the swap temporary.
+    for (j = 0; j < n1; j++){
+        A1[j + j*lda1] = conj(A1[j + j*lda1]);
+        // Swap entries (i,j) and (j,i), conjugating both.
+        for (i = j+1; i < m1; i++){
+            *work = *(A1 + i + j*lda1);
+            *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
+            *(A1 + j + i*lda1) = conj(*work);
+        }
+    }
+
+    core_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,  // operates on the transposed A1 and on A2
+                A1, lda1, A2, lda2,
+                V,  ldv,  T,  ldt,
+                work, ldwork);
+
+    // Conjugate-transpose A1 in place a second time, undoing the first one.
+    for (j = 0; j < n1; j++){
+        A1[j + j*lda1] = conj(A1[j + j*lda1]);
+        // Swap entries (i,j) and (j,i), conjugating both.
+        for (i = j+1; i < m1; i++){
+            *work = *(A1 + i + j*lda1);
+            *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
+            *(A1 + j + i*lda1) = conj(*work);
+        }
+    }
+
+    return PlasmaSuccess;
+}
+
+/******************************************************************************/
+void core_omp_ztsmlq_hetra1(plasma_enum_t side, plasma_enum_t trans,
+                            int m1, int n1, int m2, int n2, int k, int ib,
+                                  plasma_complex64_t *A1, int lda1,
+                                  plasma_complex64_t *A2, int lda2,
+                            const plasma_complex64_t *V,  int ldv,
+                            const plasma_complex64_t *T,  int ldt,
+                            plasma_workspace_t work,
+                            plasma_sequence_t *sequence, plasma_request_t *request)
+{
+    int nb = n1;
+    // Runs core_ztsmlq_hetra1 as a task; depends assume m1 == nb, n1 == nb, m2 == nb, n2 == nb.
+    #pragma omp task depend(inout:A1[0:nb*nb]) \
+                     depend(inout:A2[0:nb*nb]) \
+                     depend(in:V[0:nb*nb]) \
+                     depend(in:T[0:ib*nb])
+    {
+        if (sequence->status == PlasmaSuccess) {  // otherwise the task body does nothing
+            int tid = omp_get_thread_num();
+            plasma_complex64_t *W   =
+                ((plasma_complex64_t*)work.spaces[tid]);  // slot indexed by thread number
+
+            int ldwork = side == PlasmaLeft ? ib : nb;  // ib for left application, nb otherwise
+
+            // Call the kernel.
+            int info = core_ztsmlq_hetra1(side, trans,
+                                          m1, n1, m2, n2, k, ib,
+                                          A1, lda1,
+                                          A2, lda2,
+                                          V, ldv,
+                                          T, ldt,
+                                          W, ldwork);
+
+            if (info != PlasmaSuccess) {  // kernel returned an error code
+                plasma_error_with_code("Error in call to COREBLAS in argument",
+                                       -info);
+                plasma_request_fail(sequence, request,
+                                    PlasmaErrorIllegalValue);
+            }
+        }
+    }
+}
+
diff --git a/core_blas/core_ztsmqr_corner.c b/core_blas/core_ztsmqr_corner.c
new file mode 100644
index 00000000..02e22c4e
--- /dev/null
+++ b/core_blas/core_ztsmqr_corner.c
@@ -0,0 +1,248 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+
+#include "core_blas.h"
+#include "plasma_types.h"
+#include "plasma_internal.h"
+#include "core_lapack.h"
+
+#include <omp.h>
+
+/***************************************************************************//**
+ *
+ * @ingroup core_tsmqr_corner
+ *
+ * This kernel applies left and right transformations as depicted below:
+ * |I -VT'V'| * | A1 A2'| * |I - VTV'|
+ *              | A2 A3 |
+ * where A1 and A3 are Hermitian matrices.
+ * Only the lower part is referenced.
+ * This is an adhoc implementation, can be further optimized...
+ *
+ *******************************************************************************
+ *
+ * @param[in] m1
+ *         The number of rows of the tile A1. m1 >= 0.
+ *
+ * @param[in] n1
+ *         The number of columns of the tile A1. n1 >= 0.
+ *
+ * @param[in] m2
+ *         The number of rows of the tile A2. m2 >= 0.
+ *
+ * @param[in] n2
+ *         The number of columns of the tile A2. n2 >= 0.
+ *
+ * @param[in] m3
+ *         The number of rows of the tile A3. m3 >= 0.
+ *
+ * @param[in] n3
+ *         The number of columns of the tile A3. n3 >= 0.
+ *
+ * @param[in] k
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q.
+ *
+ * @param[in] ib
+ *         The inner-blocking size.  ib >= 0.
+ *
+ * @param[in,out] A1
+ *         On entry, the m1-by-n1 tile A1.
+ *         On exit, A1 is overwritten by the application of Q.
+ *
+ * @param[in] lda1
+ *         The leading dimension of the array A1. lda1 >= max(1,m1).
+ *
+ * @param[in,out] A2
+ *         On entry, the m2-by-n2 tile A2.
+ *         On exit, A2 is overwritten by the application of Q.
+ *
+ * @param[in] lda2
+ *         The leading dimension of the tile A2. lda2 >= max(1,m2).
+ *
+ * @param[in,out] A3
+ *         On entry, the m3-by-n3 tile A3; on exit, its lower part is overwritten.
+ *
+ * @param[in] lda3
+ *         The leading dimension of the tile A3. lda3 >= max(1,m3).
+ *
+ * @param[in] V
+ *         The i-th row must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         core_ZTSQRT in the first k columns of its array argument V.
+ *
+ * @param[in] ldv
+ *         The leading dimension of the array V. ldv >= max(1,k).
+ *
+ * @param[in] T
+ *         The ib-by-n1 triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] ldt
+ *         The leading dimension of the array T. ldt >= ib.
+ *
+ * @param[out] work
+ *         Workspace array. With nb = n1, the rebuilt A1 starts at work,
+ *         A2' at work + nb*ldwork, the rebuilt A3 at work + 2*nb*ldwork,
+ *         and work + 3*nb*ldwork is passed to core_ztsmqr as its own
+ *         workspace.
+ *
+ * @param[in] ldwork
+ *         The leading dimension of the array work; also passed as the
+ *         leading dimension of the workspace handed to core_ztsmqr.
+ *
+ *******************************************************************************
+ *
+ * @retval PlasmaSuccess successful exit
+ * @retval < 0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+int core_ztsmqr_corner(int m1, int n1, int m2, int n2,
+                       int m3, int n3, int k, int ib,
+                              plasma_complex64_t *A1, int lda1,
+                              plasma_complex64_t *A2, int lda2,
+                              plasma_complex64_t *A3, int lda3,
+                        const plasma_complex64_t *V, int ldv,
+                        const plasma_complex64_t *T, int ldt,
+                        plasma_complex64_t *work, int ldwork)
+{
+    int i, j;
+    plasma_enum_t side, trans;
+    
+    // Check input arguments: A1 must be square.
+    if ( m1 != n1 ) {
+        coreblas_error("Illegal value of m1, n1");
+        return -1;
+    }
+    int nb = n1;  // column stride between the panels kept in work
+    //  Rebuild the full Hermitian block from the lower triangle: work <- A1
+    for (j = 0; j < n1; j++)
+        for (i = j; i < m1; i++){
+            *(work + i + j*ldwork) = *(A1 + i + j*lda1);
+            if (i > j){
+                *(work + j + i*ldwork) =  conj( *(work + i + j*ldwork) );
+            }
+        }
+    
+    //  Copy the conjugate transpose of A2: work+nb*ldwork <- A2'
+    for (j = 0; j < n2; j++)
+        for (i = 0; i < m2; i++){
+            *(work + j + (i + nb) * ldwork) = conj( *(A2 + i + j*lda2) );
+        }
+
+    side  = PlasmaLeft;
+    trans = Plasma_ConjTrans;
+
+    //==============================================
+    //  Left application on |A1|   (A1 copy in work,
+    //                      |A2|    A2 in place)
+    //=============================================
+    core_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
+                work, ldwork, A2, lda2,
+                V, ldv, T, ldt,
+                work + 3*nb*ldwork, ldwork);
+
+    //  Rebuild the full Hermitian block from the lower triangle: work+2*nb*ldwork <- A3
+    for (j = 0; j < n3; j++)
+        for (i = j; i < m3; i++){
+            *(work + i + (j + 2*nb) * ldwork) = *(A3 + i + j*lda3);
+            if (i != j){  // i >= j in this loop, so this means i > j
+                *(work + j + (i + 2*nb) * ldwork) =  conj( *(work + i + (j + 2*nb) * ldwork) );
+            }
+        }
+    //===========================================
+    //  Left application on | A2'|   (both operands in work)
+    //                      | A3 |
+    //==========================================
+    core_ztsmqr(side, trans, n2, m2, m3, n3, k, ib,
+                work+nb*ldwork, ldwork, work+2*nb*ldwork, ldwork,
+                V, ldv, T, ldt,
+                work + 3*nb*ldwork, ldwork);
+
+    side  = PlasmaRight;
+    trans = PlasmaNoTrans;
+
+    //  Right application on | A1 A2' |  (both operands in work)
+    core_ztsmqr(side, trans, m1, n1, n2, m2, k, ib,
+                work, ldwork, work+nb*ldwork, ldwork,
+                V, ldv, T, ldt,
+                work + 3*nb*ldwork, ldwork);
+
+    //  Copy back the final result to the lower part of A1
+    //  (entries with i >= j only): A1 = work
+    for (j = 0; j < n1; j++)
+        for (i = j; i < m1; i++)
+            *(A1 + i + j*lda1) = *(work + i + j*ldwork);
+
+    //  Right application on | A2 A3 |  (A2 is updated in place)
+    core_ztsmqr(side, trans, m2, n2, m3, n3, k, ib,
+                A2, lda2, work+2*nb*ldwork, ldwork,
+                V,  ldv,  T, ldt,
+                work + 3*nb*ldwork, ldwork);
+
+    //=======================================================
+    //  Copy back the final result to the lower part of A3
+    //  (entries with i >= j only): A3 = work+2*nb*ldwork
+    //=======================================================
+    for (j = 0; j < n3; j++)
+        for (i = j; i < m3; i++)
+            *(A3 + i + j*lda3) = *(work + i + (j+ 2*nb) * ldwork);
+    
+    return PlasmaSuccess;
+}
+
+/*****************************************************************************/
+void core_omp_ztsmqr_corner(int m1, int n1, int m2, int n2,
+                            int m3, int n3, int k, int ib,
+                                  plasma_complex64_t *A1, int lda1,
+                                  plasma_complex64_t *A2, int lda2,
+                                  plasma_complex64_t *A3, int lda3,
+                            const plasma_complex64_t *V,  int ldv,
+                            const plasma_complex64_t *T,  int ldt,
+                            plasma_workspace_t work,
+                            plasma_sequence_t *sequence, plasma_request_t *request)
+{
+    
+    // Runs core_ztsmqr_corner as an OpenMP task. The depend ranges assume
+    // m1 == nb, n1 == nb, m2 == nb, n2 == nb, m3 == nb, n3 == nb.
+    int nb = n1;
+    #pragma omp task depend(inout:A1[0:nb*nb]) \
+                     depend(inout:A2[0:nb*nb]) \
+                     depend(inout:A3[0:nb*nb]) \
+                     depend(in:V[0:nb*nb]) \
+                     depend(in:T[0:ib*nb])
+    {
+        if (sequence->status == PlasmaSuccess) {  // otherwise the task body does nothing
+            int tid = omp_get_thread_num();
+            plasma_complex64_t *W   =
+                ((plasma_complex64_t*)work.spaces[tid]);  // slot indexed by thread number
+
+            int ldwork = nb;  // leading dimension handed to the kernel for W
+
+            // Call the kernel.
+            int info = core_ztsmqr_corner(m1, n1, m2, n2, m3, n3, k, ib,
+                                          A1, lda1,
+                                          A2, lda2,
+                                          A3, lda3,
+                                          V, ldv,
+                                          T, ldt,
+                                          W, ldwork);
+            if (info != PlasmaSuccess) {  // kernel returned an error code
+                plasma_error_with_code("Error in call to COREBLAS in argument",
+                                       -info);
+                plasma_request_fail(sequence, request,
+                                    PlasmaErrorIllegalValue);
+            }
+        }
+    }
+}
diff --git a/core_blas/core_ztsmqr_hetra1.c b/core_blas/core_ztsmqr_hetra1.c
new file mode 100644
index 00000000..50d8dda0
--- /dev/null
+++ b/core_blas/core_ztsmqr_hetra1.c
@@ -0,0 +1,199 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> c d s
+ *
+ **/
+
+#include "core_blas.h"
+#include "plasma_types.h"
+#include "plasma_internal.h"
+#include "core_lapack.h"
+
+#include <omp.h>
+
+/***************************************************************************//**
+ *
+ * @ingroup core_tsmqr_hetra1
+ *
+ *  This kernel applies a left transformation on | A1'|
+ *                                               | A2 |
+ *
+ * Needs therefore to make the explicit transpose of A1 before
+ * and after the application of the block of reflectors
+ * Can be further optimized by changing accordingly the underneath
+ * kernel ztsrfb!
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *         - PlasmaLeft  : apply Q or Q**H from the Left;
+ *         - PlasmaRight : apply Q or Q**H from the Right.
+ *
+ * @param[in] trans
+ *         - PlasmaNoTrans   :  apply Q;
+ *         - PlasmaConjTrans :  apply Q**H.
+ *
+ * @param[in] m1
+ *         The number of rows of the tile A1. m1 >= 0.
+ *
+ * @param[in] n1
+ *         The number of columns of the tile A1. n1 >= 0.
+ *
+ * @param[in] m2
+ *         The number of rows of the tile A2. m2 >= 0.
+ *         m2 = m1 if side == PlasmaRight.
+ *
+ * @param[in] n2
+ *         The number of columns of the tile A2. n2 >= 0.
+ *         n2 = n1 if side == PlasmaLeft.
+ *
+ * @param[in] k
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q.
+ *
+ * @param[in] ib
+ *         The inner-blocking size.  ib >= 0.
+ *
+ * @param[in,out] A1
+ *         On entry, the m1-by-n1 tile A1.
+ *         On exit, A1 is overwritten by the application of Q.
+ *
+ * @param[in] lda1
+ *         The leading dimension of the array A1. lda1 >= max(1,m1).
+ *
+ * @param[in,out] A2
+ *         On entry, the m2-by-n2 tile A2.
+ *         On exit, A2 is overwritten by the application of Q.
+ *
+ * @param[in] lda2
+ *         The leading dimension of the tile A2. lda2 >= max(1,m2).
+ *
+ * @param[in] V
+ *         The i-th row must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         core_ZTSQRT in the first k columns of its array argument V.
+ *
+ * @param[in] ldv
+ *         The leading dimension of the array V. ldv >= max(1,k).
+ *
+ * @param[in] T
+ *         The ib-by-n1 triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] ldt
+ *         The leading dimension of the array T. ldt >= ib.
+ *
+ * @param[out] work
+ *         Workspace array of size
+ *             ldwork-by-n1 if side == PlasmaLeft
+ *             ldwork-by-ib if side == PlasmaRight
+ *
+ * @param[in] ldwork
+ *         The leading dimension of the array work.
+ *             ldwork >= max(1,ib) if side == PlasmaLeft
+ *             ldwork >= max(1,m1) if side == PlasmaRight
+ *
+ *******************************************************************************
+ *
+ * @retval PlasmaSuccess successful exit
+ * @retval < 0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+int core_ztsmqr_hetra1(plasma_enum_t side, plasma_enum_t trans,
+                        int m1, int n1, int m2, int n2, int k, int ib,
+                              plasma_complex64_t *A1, int lda1,
+                              plasma_complex64_t *A2, int lda2,
+                        const plasma_complex64_t *V,  int ldv,
+                        const plasma_complex64_t *T,  int ldt,
+                              plasma_complex64_t *work, int ldwork)
+{
+    int i, j;
+
+    // Check input arguments: the in-place transpose below needs a square A1.
+    if ( (m1 != n1) ) {
+        coreblas_error("Illegal value of m1, n1");
+        return -3;
+    }
+
+    // In-place conjugate transpose of A1; work[0] is the swap temporary.
+    for (j = 0; j < n1; j++){
+        A1[j + j*lda1] = conj(A1[j + j*lda1]);
+        // Swap entries (i,j) and (j,i), conjugating both.
+        for (i = j+1; i < m1; i++){
+            *work = *(A1 + i + j*lda1);
+            *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
+            *(A1 + j + i*lda1) = conj(*work);
+        }
+    }
+
+    core_ztsmqr(side, trans,  // operates on the transposed A1 and on A2
+                m1, n1, m2, n2, k, ib,
+                A1, lda1,
+                A2, lda2,
+                V, ldv,
+                T, ldt,
+                work, ldwork);
+
+    // Conjugate-transpose A1 in place a second time, undoing the first one.
+    for (j = 0; j < n1; j++){
+        A1[j + j*lda1] = conj(A1[j + j*lda1]);
+        // Swap entries (i,j) and (j,i), conjugating both.
+        for (i = j+1; i < m1; i++){
+            *work = *(A1 + i + j*lda1);
+            *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
+            *(A1 + j + i*lda1) = conj(*work);
+        }
+    }
+
+    return PlasmaSuccess;
+}
+
+/******************************************************************************/
+void core_omp_ztsmqr_hetra1(plasma_enum_t side, plasma_enum_t trans,
+                            int m1, int n1, int m2, int n2, int k, int ib,
+                                  plasma_complex64_t *A1, int lda1,
+                                  plasma_complex64_t *A2, int lda2,
+                            const plasma_complex64_t *V,  int ldv,
+                            const plasma_complex64_t *T,  int ldt,
+                            plasma_workspace_t work,
+                            plasma_sequence_t *sequence, plasma_request_t *request)
+{
+    int nb = n1;
+    // Runs core_ztsmqr_hetra1 as a task; depends assume m1 == nb, n1 == nb, m2 == nb, n2 == nb.
+    #pragma omp task depend(inout:A1[0:nb*nb]) \
+                     depend(inout:A2[0:nb*nb]) \
+                     depend(in:V[0:nb*nb]) \
+                     depend(in:T[0:ib*nb])
+    {
+        if (sequence->status == PlasmaSuccess) {  // otherwise the task body does nothing
+            int tid = omp_get_thread_num();
+            plasma_complex64_t *W   =
+                ((plasma_complex64_t*)work.spaces[tid]);  // slot indexed by thread number
+
+            int ldwork = side == PlasmaLeft ? ib : nb;  // ib for left application, nb otherwise
+
+            // Call the kernel.
+            int info = core_ztsmqr_hetra1(side, trans,
+                                   m1, n1, m2, n2, k, ib,
+                                   A1, lda1,
+                                   A2, lda2,
+                                   V, ldv,
+                                   T, ldt,
+                                   W, ldwork);
+
+            if (info != PlasmaSuccess) {  // kernel returned an error code
+                plasma_error_with_code("Error in call to COREBLAS in argument",
+                                       -info);
+                plasma_request_fail(sequence, request,
+                                    PlasmaErrorIllegalValue);
+            }
+        }
+    }
+}
diff --git a/test/test_zheevd.c b/test/test_zheevd.c
new file mode 100644
index 00000000..1a55d85d
--- /dev/null
+++ b/test/test_zheevd.c
@@ -0,0 +1,189 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> s d c
+ *
+ **/
+
+#include "test.h"
+#include "flops.h"
+#include "core_blas.h"
+#include "core_lapack.h"
+#include "plasma.h"
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <omp.h>
+
+#define COMPLEX
+
+#undef  REAL
+#define COMPLEX
+
+/***************************************************************************//**
+ *
+ * @brief Tests ZHEEVD.
+ *
+ * @param[in,out] param - array of parameters; results are stored back into it
+ * @param[in]     run   - whether to run the test
+ *
+ * Always marks in param which parameters this routine uses.
+ * If run is false, returns right after marking the parameters.
+ * If run is true, runs and times plasma_zheevd and stores time and gflops
+ * (plus error, ortho and success when testing is enabled) in param.
+ ******************************************************************************/
+void test_zheevd(param_value_t param[], bool run)
+{
+    //================================================================
+    // Mark which parameters are used.
+    //================================================================
+    param[PARAM_EIGT  ].used = true;
+    param[PARAM_UPLO  ].used = true;
+    param[PARAM_DIM   ].used = PARAM_USE_N;
+    param[PARAM_PADA  ].used = true;
+    param[PARAM_NB    ].used = true;
+    param[PARAM_IB    ].used = true;
+    param[PARAM_HMODE ].used = true;
+    if (! run)
+        return;
+
+    //================================================================
+    // Set parameters.
+    //================================================================
+    plasma_enum_t eigt = plasma_eigt_const(param[PARAM_EIGT].c);
+    plasma_enum_t uplo = plasma_uplo_const(param[PARAM_UPLO].c);
+
+    int n = param[PARAM_DIM].dim.n;
+
+    int lda = imax(1, n + param[PARAM_PADA].i);
+
+    int    test = param[PARAM_TEST].c == 'y';
+    double tol  = param[PARAM_TOL].d * LAPACKE_dlamch('E');
+
+    //================================================================
+    // Set tuning parameters.
+    //================================================================
+    plasma_set(PlasmaNb, param[PARAM_NB].i);
+    // NOTE(review): ib is derived from nb; PARAM_IB and PARAM_HMODE are marked
+    // used above but never read in this routine -- confirm this is intended.
+    plasma_set(PlasmaIb, param[PARAM_NB].i/4);
+
+    //================================================================
+    // Allocate and initialize arrays.
+    //================================================================
+    plasma_complex64_t *A = (plasma_complex64_t *)malloc(
+        (size_t)n*lda*sizeof(plasma_complex64_t));
+
+    plasma_complex64_t *Aref = NULL;
+    plasma_complex64_t *Q    = NULL;
+    double             *Wref = NULL;
+    plasma_complex64_t *work = NULL;
+    double             *W = (double*)malloc((size_t)n*sizeof(double));
+    int seed[] = {0, 0, 0, 1};
+    if (test) {
+        Wref = (double*)malloc((size_t)n*sizeof(double));
+        work = (plasma_complex64_t *)malloc(
+            (size_t)3*n*sizeof(plasma_complex64_t));
+
+        for (int i=0; i< n; i++){
+            Wref[i] = (double )i+1;
+        }
+
+        int    mode  = 0;
+        double dmax  = 1.0;
+        double rcond = 1.0e6;
+        LAPACKE_zlatms_work(LAPACK_COL_MAJOR, n, n,
+                           'S', seed,
+                           'H', Wref, mode, rcond,
+                            dmax, n, n,
+                           'N', A, lda, work);
+
+        // Sort the eigenvalues
+        LAPACKE_dlasrt_work( 'I', n, Wref);
+
+        // Copy A into Aref
+        Aref = (plasma_complex64_t *)malloc(
+            (size_t)n*lda*sizeof(plasma_complex64_t));
+        LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,
+                            'A', n, n, A, lda, Aref, lda);
+    } else {
+        LAPACKE_zlarnv(1, seed, (size_t)lda*n, A);
+    }
+
+    int ldq = lda;
+    if (eigt == PlasmaEigValVec) {
+        Q = (plasma_complex64_t *)malloc(
+            (size_t)n*ldq*sizeof(plasma_complex64_t));
+    }
+
+    //================================================================
+    // Prepare the descriptor for matrix T.
+    //================================================================
+    plasma_desc_t T;
+
+    //================================================================
+    // Run and time PLASMA.
+    //================================================================
+    plasma_time_t start = omp_get_wtime();
+
+    plasma_zheevd(eigt, uplo, n, A, lda, &T, W, Q, ldq);
+    //LAPACKE_zheevd( LAPACK_COL_MAJOR,
+    //               'N', 'L',  n, A, lda, W);
+    plasma_time_t stop = omp_get_wtime();
+    plasma_time_t time = stop-start;
+
+    param[PARAM_TIME].d = time;
+    param[PARAM_GFLOPS].d = flops_zgeqrf(n, n) / time / 1e9;
+
+    if (test) {
+        // Sum the relative errors of the eigenvalue magnitudes, then scale.
+        double error = 0;
+        for (int i = 0; i < n; i++){
+            error  += fabs(fabs(W[i])-fabs(Wref[i]))/fabs(Wref[i]);
+        }
+
+        error /= n*40 ;
+
+        // Orthogonality test; only meaningful when eigenvectors were computed.
+        double ortho = 0.;
+        if (eigt == PlasmaEigValVec) {
+            double done  =  1.0;
+            double mdone = -1.0;
+            // Build the identity matrix; the size is computed in size_t.
+            plasma_complex64_t *Id = (plasma_complex64_t *)malloc(
+                (size_t)n*n*sizeof(plasma_complex64_t));
+            LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', n, n, 0., 1., Id, n);
+
+            // Id = Q^H*Q - Id. Q is stored with leading dimension ldq, not n.
+            cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, n, n,
+                        done, Q, ldq, mdone, Id, n);
+            double normQ = LAPACKE_zlanhe_work(LAPACK_COL_MAJOR, 'I', 'U', n, Id, n, (double*)work);
+            ortho = normQ/n;
+            free(Id);
+        }
+        param[PARAM_ERROR].d = error;
+        param[PARAM_ORTHO].d = ortho;
+        param[PARAM_SUCCESS].i = (error < tol && ortho < tol);
+    }
+    //================================================================
+    // Free arrays.
+    //================================================================
+    // plasma_desc_destroy(&T);
+    free(A);
+    free(W);
+    free(work);
+    if (test) {
+        free(Aref);
+        free(Wref);
+    }
+    if (eigt == PlasmaEigValVec) free(Q);
+}

From 5626abc69f07ecd91dd63f158f66b11c4f263d93 Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Fri, 17 Jan 2025 16:12:02 -0500
Subject: [PATCH 09/12] add eig routines to headers, CMake, codegen subs.py

---
 CMakeLists.txt               | 15 +++++++
 include/plasma_core_blas_z.h | 79 ++++++++++++++++++++++++++++++++++++
 include/plasma_internal_z.h  | 14 +++++++
 include/plasma_z.h           | 15 +++++++
 test/test.c                  |  5 +++
 test/test_z.h                |  1 +
 tools/subs.py                | 12 ++++--
 7 files changed, 137 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6bc02698..afda6a14 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -362,6 +362,10 @@ set( plasma_src
     compute/pzgeswp.c
     compute/pzgetrf.c
     compute/pzgetri_aux.c
+    compute/pzhbtrd_static.c
+    compute/pzhe2hb.c
+    compute/pzheb2trd_static.c
+    compute/pzhecpy_tile2lapack_band.c
     compute/pzhemm.c
     compute/pzher2k.c
     compute/pzherk.c
@@ -426,6 +430,7 @@ set( plasma_src
     compute/zgetri.c
     compute/zgetri_aux.c
     compute/zgetrs.c
+    compute/zheevd.c
     compute/zhemm.c
     compute/zher2k.c
     compute/zherk.c
@@ -510,9 +515,13 @@ set( plasma_core_blas_src
     core_blas/core_zgessq.c
     core_blas/core_zgeswp.c
     core_blas/core_zgetrf.c
+    core_blas/core_zhbtype1cb.c
+    core_blas/core_zhbtype2cb.c
+    core_blas/core_zhbtype3cb.c
     core_blas/core_zhegst.c
     core_blas/core_zhemm.c
     core_blas/core_zher2k.c
+    core_blas/core_zherfb.c
     core_blas/core_zherk.c
     core_blas/core_zhessq.c
     core_blas/core_zheswp.c
@@ -524,6 +533,7 @@ set( plasma_core_blas_src
     core_blas/core_zlansy.c
     core_blas/core_zlantr.c
     core_blas/core_zlarfb_gemm.c
+    core_blas/core_zlarfy.c
     core_blas/core_zlascl.c
     core_blas/core_zlaset.c
     core_blas/core_zlauum.c
@@ -542,7 +552,11 @@ set( plasma_core_blas_src
     core_blas/core_ztrtri.c
     core_blas/core_ztslqt.c
     core_blas/core_ztsmlq.c
+    core_blas/core_ztsmlq_corner.c
+    core_blas/core_ztsmlq_hetra1.c
     core_blas/core_ztsmqr.c
+    core_blas/core_ztsmqr_corner.c
+    core_blas/core_ztsmqr_hetra1.c
     core_blas/core_ztsqrt.c
     core_blas/core_zttlqt.c
     core_blas/core_zttmlq.c
@@ -604,6 +618,7 @@ set( plasma_test_src
     test/test_zgetri.c
     test/test_zgetri_aux.c
     test/test_zgetrs.c
+    test/test_zheevd.c
     test/test_zhemm.c
     test/test_zher2k.c
     test/test_zherk.c
diff --git a/include/plasma_core_blas_z.h b/include/plasma_core_blas_z.h
index 317d0229..f8ca397f 100644
--- a/include/plasma_core_blas_z.h
+++ b/include/plasma_core_blas_z.h
@@ -91,6 +91,27 @@ void plasma_core_zgetrf(
     volatile int *max_idx, volatile plasma_complex64_t *max_val,
     volatile int *info, plasma_barrier_t *barrier);
 
+void plasma_core_zhbtype1cb(
+    int n, int nb,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *V, plasma_complex64_t *tau,
+    int st, int ed, int sweep, int Vblksiz, int wantz,
+    plasma_complex64_t *work);
+
+void plasma_core_zhbtype2cb(
+    int n, int nb,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *V, plasma_complex64_t *tau,
+    int st, int ed, int sweep, int vblksiz, int wantz,
+    plasma_complex64_t *work);
+
+void plasma_core_zhbtype3cb(
+    int n, int nb,
+    plasma_complex64_t *A, int lda,
+    const plasma_complex64_t *V, const plasma_complex64_t *tau,
+    int st, int ed, int sweep, int vblksiz, int wantz,
+    plasma_complex64_t *work);
+
 int plasma_core_zhegst(
     int itype, plasma_enum_t uplo,
     int n,
@@ -181,6 +202,13 @@ int plasma_core_zlarfb_gemm(
     plasma_complex64_t *C, int LDC,
     plasma_complex64_t *WORK, int LDWORK);
 
+void plasma_core_zlarfy(
+    int n,
+    plasma_complex64_t *A, int lda,
+    const plasma_complex64_t *V,
+    const plasma_complex64_t *tau,
+    plasma_complex64_t *work);
+
 void plasma_core_zlascl(
     plasma_enum_t uplo,
     double cfrom, double cto,
@@ -446,6 +474,15 @@ void plasma_core_omp_zher2k(
     double beta,                    plasma_complex64_t *C, int ldc,
     plasma_sequence_t *sequence, plasma_request_t *request);
 
+void plasma_core_omp_zherfb(
+    plasma_enum_t uplo,
+    int n, int k, int ib,
+    const plasma_complex64_t *A, int lda,
+    const plasma_complex64_t *T, int ldt,
+    plasma_complex64_t *C, int ldc,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
 void plasma_core_omp_zherk(
     plasma_enum_t uplo, plasma_enum_t trans,
     int n, int k,
@@ -662,6 +699,27 @@ void plasma_core_omp_ztsmlq(
     plasma_workspace_t work,
     plasma_sequence_t *sequence, plasma_request_t *request);
 
+void plasma_core_omp_ztsmlq_hetra1(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_ztsmlq_corner(
+    int m1, int n1, int m2, int n2,
+    int m3, int n3, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+          plasma_complex64_t *A3, int lda3,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
 void plasma_core_omp_ztsmqr(
     plasma_enum_t side, plasma_enum_t trans,
     int m1, int n1, int m2, int n2, int k, int ib,
@@ -672,6 +730,27 @@ void plasma_core_omp_ztsmqr(
     plasma_workspace_t work,
     plasma_sequence_t *sequence, plasma_request_t *request);
 
+void plasma_core_omp_ztsmqr_corner(
+    int m1, int n1, int m2, int n2,
+    int m3, int n3, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+          plasma_complex64_t *A3, int lda3,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_core_omp_ztsmqr_hetra1(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
 void plasma_core_omp_ztsqrt(
     int m, int n, int ib,
     plasma_complex64_t *A1, int lda1,
diff --git a/include/plasma_internal_z.h b/include/plasma_internal_z.h
index 7be57888..67ae5304 100644
--- a/include/plasma_internal_z.h
+++ b/include/plasma_internal_z.h
@@ -123,6 +123,20 @@ void plasma_pzgecpy_tile2lapack_band(
     plasma_complex64_t *pA_band, int lda_band,
     plasma_sequence_t *sequence, plasma_request_t *request);
 
+void plasma_pzhbtrd_static(
+    plasma_enum_t uplo, int n, int nb, int Vblksiz,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *V, plasma_complex64_t *tau,
+    double *d, double *e, int wantz,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
+void plasma_pzhe2hb(
+    plasma_enum_t uplo,
+    plasma_desc_t A, plasma_desc_t T,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
 void plasma_pzhecpy_tile2lapack_band(
     plasma_enum_t uplo, plasma_desc_t A,
     plasma_complex64_t *AB, int ldab,
diff --git a/include/plasma_z.h b/include/plasma_z.h
index 4f1a856c..32254f52 100644
--- a/include/plasma_z.h
+++ b/include/plasma_z.h
@@ -133,6 +133,13 @@ int plasma_zgetrs(
     plasma_complex64_t *pA, int lda, int *ipiv,
     plasma_complex64_t *pB, int ldb);
 
+int plasma_zheevd(
+    plasma_enum_t job, plasma_enum_t uplo, int N,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t *T,
+    double *Lambda,
+    plasma_complex64_t *pQ, int ldq);
+
 int plasma_zhemm(
     plasma_enum_t side, plasma_enum_t uplo,
     int m, int n,
@@ -487,6 +494,14 @@ void plasma_omp_zgetrs(
     plasma_desc_t B,
     plasma_sequence_t *sequence, plasma_request_t *request);
 
+void plasma_omp_zheevd(
+    plasma_enum_t job, plasma_enum_t uplo,
+    plasma_desc_t A, plasma_desc_t T,
+    double *Lambda,
+    plasma_complex64_t *pQ, int ldq,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request);
+
 void plasma_omp_zhemm(
     plasma_enum_t side, plasma_enum_t uplo,
     plasma_complex64_t alpha, plasma_desc_t A,
diff --git a/test/test.c b/test/test.c
index 093506bc..d614b24f 100644
--- a/test/test.c
+++ b/test/test.c
@@ -133,6 +133,11 @@ struct routines_t routines[] =
     { "cgetrs", test_cgetrs },
     { "sgetrs", test_sgetrs },
 
+    { "zheevd", test_zheevd },
+    //{ "dheevd", test_dheevd },
+    //{ "csyevd", test_csyevd },
+    //{ "ssyevd", test_ssyevd },
+
     { "zhemm", test_zhemm },
     { "", NULL },
     { "chemm", test_chemm },
diff --git a/test/test_z.h b/test/test_z.h
index c5e47410..d5430429 100644
--- a/test/test_z.h
+++ b/test/test_z.h
@@ -35,6 +35,7 @@ void test_zgetrf(param_value_t param[], bool run);
 void test_zgetri(param_value_t param[], bool run);
 void test_zgetri_aux(param_value_t param[], bool run);
 void test_zgetrs(param_value_t param[], bool run);
+void test_zheevd(param_value_t param[], bool run);
 void test_zhemm(param_value_t param[], bool run);
 void test_zher2k(param_value_t param[], bool run);
 void test_zherk(param_value_t param[], bool run);
diff --git a/tools/subs.py b/tools/subs.py
index aad67cd2..37d19602 100644
--- a/tools/subs.py
+++ b/tools/subs.py
@@ -509,11 +509,15 @@ def title( table ):
     ('float',                'double',               'float',                'double'              ),
 
     # ----- PLASMA / MAGMA functions, alphabetic order
-    ('sy2sb',                'sy2sb',                'he2hb',                'he2hb'               ),
+    ('ssy2sb',               'dsy2sb',               'che2hb',               'zhe2hb'              ),
+    ('ssyb2trd',             'dsyb2trd',             'cheb2trd',             'zheb2trd'            ),  # todo sb/hb
 
-    ('sgbtype1cb',           'dgbtype1cb',           'cgbtype1cb',           'zgbtype1cb'          ),
-    ('sgbtype2cb',           'dgbtype2cb',           'cgbtype2cb',           'zgbtype2cb'          ),
-    ('sgbtype3cb',           'dgbtype3cb',           'cgbtype3cb',           'zgbtype3cb'          ),
+    ('sgbtype',              'dgbtype',              'cgbtype',              'zgbtype'             ),
+    ('sbrd',                 'dbrd',                 'cbrd',                 'zbrd'                ),
+    ('psgb2lapack_band',     'pdgb2lapack_band',     'pcgb2lapack_band',     'pzgb2lapack_band'    ),
+
+    ('ssbtype',              'dsbtype',              'chbtype',              'zhbtype'             ),
+    ('strd',                 'dtrd',                 'ctrd',                 'ztrd'                ),
 
     ('psdesc2ge',            'pddesc2ge',            'pcdesc2ge',            'pzdesc2ge'           ),
     ('psge2desc',            'pdge2desc',            'pcge2desc',            'pzge2desc'           ),

From 766a8228af9fc4ee502664280e426818ff58ba72 Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Wed, 15 Jan 2025 17:16:11 -0500
Subject: [PATCH 10/12] update eig routines to fix conventions

---
 compute/pzhe2hb.c                  | 343 ++++++++++++++---------------
 compute/pzheb2trd_static.c         | 289 ++++++++++++------------
 compute/pzhecpy_tile2lapack_band.c | 113 +++++-----
 compute/zheevd.c                   | 241 ++++++++++----------
 core_blas/core_zhbtype1cb.c        |  75 ++++---
 core_blas/core_zhbtype2cb.c        |  90 ++++----
 core_blas/core_zhbtype3cb.c        |  71 +++---
 core_blas/core_zherfb.c            | 140 ++++++------
 core_blas/core_zlarfy.c            |  19 +-
 core_blas/core_ztsmlq_corner.c     | 117 +++++-----
 core_blas/core_ztsmlq_hetra1.c     |  70 +++---
 core_blas/core_ztsmqr_corner.c     | 127 ++++++-----
 core_blas/core_ztsmqr_hetra1.c     |  80 +++----
 test/test_zheevd.c                 |  87 ++++----
 14 files changed, 946 insertions(+), 916 deletions(-)

diff --git a/compute/pzhe2hb.c b/compute/pzhe2hb.c
index 2ed83598..cc9aaf7d 100644
--- a/compute/pzhe2hb.c
+++ b/compute/pzhe2hb.c
@@ -15,7 +15,7 @@
 #include "plasma_descriptor.h"
 #include "plasma_types.h"
 #include "plasma_internal.h"
-#include "core_blas_z.h"
+#include "plasma_core_blas_z.h"
 
 #define A(m, n) ((plasma_complex64_t*) plasma_tile_addr(A, m, n))
 #define T(m, n) ((plasma_complex64_t*) plasma_tile_addr(T, m, n))
@@ -31,9 +31,8 @@ void plasma_pzhe2hb(plasma_enum_t uplo,
     if (sequence->status != PlasmaSuccess)
         return;
 
-
-    // Case nb>n  only 1 tile 
-    if(A.mt > A.m)
+    // Case nb>n  only 1 tile
+    if (A.mt > A.m)
         return;
 
     // Set inner blocking from the plasma context
@@ -46,180 +45,180 @@ void plasma_pzhe2hb(plasma_enum_t uplo,
     int ib = plasma->ib;
 
     if (uplo == PlasmaLower) {
-       for (int k = 0; k < A.nt-1; k++){
-           int nvak = plasma_tile_nview(A, k+1);
-           int ldak = plasma_tile_mmain(A, k+1);
-           core_omp_zgeqrt(
-               nvak, A.nb, ib,
-               A(k+1, k), ldak,
-               T(k+1, k), T.mb,
-               work,
-               sequence, request);
-
-           // LEFT and RIGHT on the symmetric diagonal block
-           core_omp_zherfb(
-               PlasmaLower,
-               nvak, nvak, ib,
-               A(k+1,   k), ldak,
-               T(k+1,   k), T.mb,
-               A(k+1, k+1), ldak,
-               work,
-               sequence, request);
-
-           // RIGHT on the remaining tiles until the bottom 
-           for (int m = k+2; m < A.mt ; m++) {
-               int mvam = plasma_tile_mview(A, m);
-               int ldam = plasma_tile_mmain(A, m);
-               core_omp_zunmqr(
-                   PlasmaRight, PlasmaNoTrans,
-                   mvam, A.nb, nvak, ib,
-                   A(k+1,   k), ldak,
-                   T(k+1,   k), T.mb,
-                   A(m  , k+1), ldam,
-                   work,
-                   sequence, request);
-           }
+        for (int k = 0; k < A.nt-1; ++k) {
+            int nvak = plasma_tile_nview(A, k+1);
+            int ldak = plasma_tile_mmain(A, k+1);
+            plasma_core_omp_zgeqrt(
+                nvak, A.nb, ib,
+                A(k+1, k), ldak,
+                T(k+1, k), T.mb,
+                work,
+                sequence, request);
+
+            // LEFT and RIGHT on the symmetric diagonal block
+            plasma_core_omp_zherfb(
+                PlasmaLower,
+                nvak, nvak, ib,
+                A(k+1,   k), ldak,
+                T(k+1,   k), T.mb,
+                A(k+1, k+1), ldak,
+                work,
+                sequence, request);
+
+            // RIGHT on the remaining tiles until the bottom
+            for (int m = k+2; m < A.mt ; ++m) {
+                int mvam = plasma_tile_mview(A, m);
+                int ldam = plasma_tile_mmain(A, m);
+                plasma_core_omp_zunmqr(
+                    PlasmaRight, PlasmaNoTrans,
+                    mvam, A.nb, nvak, ib,
+                    A(k+1,   k), ldak,
+                    T(k+1,   k), T.mb,
+                    A(m  , k+1), ldam,
+                    work,
+                    sequence, request);
+            }
+
+            for (int m = k+2; m < A.mt; ++m) {
+                int mvam = plasma_tile_mview(A, m);
+                int ldam = plasma_tile_mmain(A, m);
+                plasma_core_omp_ztsqrt(
+                    mvam, A.nb, ib,
+                    A(k+1, k), ldak,
+                    A(m  , k), ldam,
+                    T(m  , k), T.mb,
+                    work,
+                    sequence, request);
+
+                // LEFT
+                for (int i = k+2; i < m; ++i) {
+                    int ldai = plasma_tile_mmain(A, i);
+                    plasma_core_omp_ztsmqr_hetra1(
+                        PlasmaLeft, Plasma_ConjTrans,
+                        A.mb, A.nb, mvam, A.nb, A.nb, ib,
+                        A(i, k+1), ldai,
+                        A(m,   i), ldam,
+                        A(m,   k), ldam,
+                        T(m,   k), T.mb,
+                        work,
+                        sequence, request);
+                }
+
+                // RIGHT
+                for (int j = m+1; j < A.mt ; ++j) {
+                    int mvaj = plasma_tile_mview(A, j);
+                    int ldaj = plasma_tile_mmain(A, j);
+                    plasma_core_omp_ztsmqr(
+                        PlasmaRight, PlasmaNoTrans,
+                        mvaj, A.nb, mvaj, mvam, A.nb, ib,
+                        A(j, k+1), ldaj,
+                        A(j,   m), ldaj,
+                        A(m,   k), ldam,
+                        T(m,   k), T.mb,
+                        work,
+                        sequence, request);
+                }
 
-           for (int m = k+2; m < A.mt; m++) {
-               int mvam = plasma_tile_mview(A, m);
-               int ldam = plasma_tile_mmain(A, m);
-               core_omp_ztsqrt(
-                   mvam, A.nb, ib,
-                   A(k+1, k), ldak,
-                   A(m  , k), ldam,
-                   T(m  , k), T.mb,
-                   work,
-                   sequence, request);
-               
-               // LEFT 
-               for (int i = k+2; i < m; i++) {
-                   int ldai = plasma_tile_mmain(A, i);
-                   core_omp_ztsmqr_hetra1(
-                       PlasmaLeft, Plasma_ConjTrans,
-                       A.mb, A.nb, mvam, A.nb, A.nb, ib,
-                       A(i, k+1), ldai,
-                       A(m,   i), ldam,
-                       A(m,   k), ldam,
-                       T(m,   k), T.mb,
-                       work,
-                       sequence, request);
-               }
-
-               // RIGHT 
-               for (int j = m+1; j < A.mt ; j++) {
-                   int mvaj = plasma_tile_mview(A, j);
-                   int ldaj = plasma_tile_mmain(A, j);
-                   core_omp_ztsmqr(
-                       PlasmaRight, PlasmaNoTrans,
-                       mvaj, A.nb, mvaj, mvam, A.nb, ib,
-                       A(j, k+1), ldaj,
-                       A(j,   m), ldaj,
-                       A(m,   k), ldam,
-                       T(m,   k), T.mb,
-                       work,
-                       sequence, request);
-               }
-               
-               // LEFT->RIGHT 
-               core_omp_ztsmqr_corner(
-                   A.nb, A.nb, mvam, A.nb,
-                   mvam, mvam, A.nb, ib,
-                   A(k+1, k+1), ldak,
-                   A(m  , k+1), ldam,
-                   A(m  ,   m), ldam,
-                   A(m  ,   k), ldam,
-                   T(m  ,   k), T.mb,
-                   work,
-                   sequence, request);
+                // LEFT->RIGHT
+                plasma_core_omp_ztsmqr_corner(
+                    A.nb, A.nb, mvam, A.nb,
+                    mvam, mvam, A.nb, ib,
+                    A(k+1, k+1), ldak,
+                    A(m  , k+1), ldam,
+                    A(m  ,   m), ldam,
+                    A(m  ,   k), ldam,
+                    T(m  ,   k), T.mb,
+                    work,
+                    sequence, request);
            }
        }
     }
     else {
-       for (int k = 0; k < A.nt-1; k++){
-           int nvak = plasma_tile_nview(A, k+1);
-           int ldak  = plasma_tile_mmain(A, k);
-           int ldak1 = plasma_tile_mmain(A, k+1);
-           core_omp_zgelqt(
-               A.nb, nvak, ib,
-               A(k, k+1), ldak,
-               T(k, k+1), T.mb,
-               work,
-               sequence, request);
-           
-           // RIGHT and LEFT on the symmetric diagonal block
-           core_omp_zherfb(
-               PlasmaUpper,
-               nvak, nvak, ib,
-               A(k,   k+1), ldak,
-               T(k,   k+1), T.mb,
-               A(k+1, k+1), ldak1,
-               work,
-               sequence, request);
-
-           // LEFT on the remaining tiles until the left side
-           for (int n = k+2; n < A.nt ; n++) {
-               int nvan = plasma_tile_nview(A, n);
-               core_omp_zunmlq(
-                   PlasmaLeft, PlasmaNoTrans,
-                   A.nb, nvan, nvak, ib,
-                   A(k,   k+1), ldak,
-                   T(k,   k+1), T.mb,
-                   A(k+1,   n), ldak1,
-                   work,
-                   sequence, request);
-           }
+        for (int k = 0; k < A.nt-1; ++k) {
+            int nvak = plasma_tile_nview(A, k+1);
+            int ldak  = plasma_tile_mmain(A, k);
+            int ldak1 = plasma_tile_mmain(A, k+1);
+            plasma_core_omp_zgelqt(
+                A.nb, nvak, ib,
+                A(k, k+1), ldak,
+                T(k, k+1), T.mb,
+                work,
+                sequence, request);
+
+            // RIGHT and LEFT on the symmetric diagonal block
+            plasma_core_omp_zherfb(
+                PlasmaUpper,
+                nvak, nvak, ib,
+                A(k,   k+1), ldak,
+                T(k,   k+1), T.mb,
+                A(k+1, k+1), ldak1,
+                work,
+                sequence, request);
+
+            // LEFT on the remaining tiles until the left side
+            for (int n = k+2; n < A.nt ; ++n) {
+                int nvan = plasma_tile_nview(A, n);
+                plasma_core_omp_zunmlq(
+                    PlasmaLeft, PlasmaNoTrans,
+                    A.nb, nvan, nvak, ib,
+                    A(k,   k+1), ldak,
+                    T(k,   k+1), T.mb,
+                    A(k+1,   n), ldak1,
+                    work,
+                    sequence, request);
+            }
+
+            for (int n = k+2; n < A.nt; ++n) {
+                int nvan = plasma_tile_nview(A, n);
+                int ldan = plasma_tile_nmain(A, n);
+                plasma_core_omp_ztslqt(
+                    A.nb, nvan, ib,
+                    A(k, k+1), ldak,
+                    A(k,   n), ldak,
+                    T(k,   n), T.mb,
+                    work,
+                    sequence, request);
+
+                // RIGHT
+                for (int i = k+2; i < n; ++i) {
+                    int ldai = plasma_tile_nmain(A, i);
+
+                    plasma_core_omp_ztsmlq_hetra1(
+                        PlasmaRight, Plasma_ConjTrans,
+                        A.mb, A.nb, A.nb, nvan, A.nb, ib,
+                        A(k+1, i), ldak1,
+                        A(i,   n), ldai,
+                        A(k,   n), ldak,
+                        T(k,   n), T.mb,
+                        work,
+                        sequence, request);
+                }
+
+                // LEFT
+                for (int j = n+1; j < A.nt ; ++j) {
+                    int nvaj = plasma_tile_nview(A, j);
+                    plasma_core_omp_ztsmlq(
+                        PlasmaLeft, PlasmaNoTrans,
+                        A.nb, nvaj, nvan, nvaj, A.nb, ib,
+                        A(k+1, j), ldak1,
+                        A(n,   j), ldan,
+                        A(k,   n), ldak,
+                        T(k,   n), T.mb,
+                        work,
+                        sequence, request);
+                }
 
-           for (int n = k+2; n < A.nt; n++) {
-               int nvan = plasma_tile_nview(A, n);
-               int ldan = plasma_tile_nmain(A, n);
-               core_omp_ztslqt(
-                   A.nb, nvan, ib,
-                   A(k, k+1), ldak,
-                   A(k,   n), ldak,
-                   T(k,   n), T.mb,
-                   work,
-                   sequence, request);
-               
-               // RIGHT 
-               for (int i = k+2; i < n; i++) {
-                   int ldai = plasma_tile_nmain(A, i);
-
-                   core_omp_ztsmlq_hetra1(
-                       PlasmaRight, Plasma_ConjTrans,
-                       A.mb, A.nb, A.nb, nvan, A.nb, ib,
-                       A(k+1, i), ldak1,
-                       A(i,   n), ldai,
-                       A(k,   n), ldak,
-                       T(k,   n), T.mb,
-                       work,
-                       sequence, request);
-               }
-
-               // LEFT 
-               for (int j = n+1; j < A.nt ; j++) {
-                   int nvaj = plasma_tile_nview(A, j);
-                   core_omp_ztsmlq(
-                       PlasmaLeft, PlasmaNoTrans,
-                       A.nb, nvaj, nvan, nvaj, A.nb, ib,
-                       A(k+1, j), ldak1,
-                       A(n,   j), ldan,
-                       A(k,   n), ldak,
-                       T(k,   n), T.mb,
-                       work,
-                       sequence, request);
-               }
-
-               // RIGHT->LEFT
-               core_omp_ztsmlq_corner(
-                   A.nb, A.nb, A.nb, nvan,
-                   nvan, nvan, A.nb, ib,
-                   A(k+1, k+1), ldak1,
-                   A(k+1,   n), ldak1,
-                   A(n  ,   n), ldan,
-                   A(k  ,   n), ldak,
-                   T(k  ,   n), T.mb,
-                   work,
-                   sequence, request);
+                // RIGHT->LEFT
+                plasma_core_omp_ztsmlq_corner(
+                    A.nb, A.nb, A.nb, nvan,
+                    nvan, nvan, A.nb, ib,
+                    A(k+1, k+1), ldak1,
+                    A(k+1,   n), ldak1,
+                    A(n  ,   n), ldan,
+                    A(k  ,   n), ldak,
+                    T(k  ,   n), T.mb,
+                    work,
+                    sequence, request);
            }
        }
     }
diff --git a/compute/pzheb2trd_static.c b/compute/pzheb2trd_static.c
index 0ae4e041..1acb5c55 100755
--- a/compute/pzheb2trd_static.c
+++ b/compute/pzheb2trd_static.c
@@ -16,8 +16,9 @@
 #include "plasma_internal.h"
 #include "plasma_types.h"
 #include "plasma_workspace.h"
+#include "plasma_core_blas.h"
 #include "bulge.h"
-#include "core_blas.h"
+
 #include <omp.h>
 #include <sched.h>
 #include <string.h>
@@ -31,207 +32,209 @@
 
 #define shift 3
 
-#define ss_cond_set(m, n, val)                  \
-    {                                                   \
+#define ss_cond_set(m, n, val) \
+    { \
         plasma->ss_progress[(m)+plasma->ss_ld*(n)] = (val); \
     }
 
-
 #define ss_cond_wait(m, n, val) \
-    {                                                           \
+    { \
         while (plasma->ss_progress[(m)+plasma->ss_ld*(n)] != (val)) \
-            sched_yield();                                          \
+            sched_yield(); \
     }
 
 
-//  Parallel bulge chasing column-wise - static scheduling
-
-void plasma_pzheb2trd_static( plasma_enum_t uplo, int N, int NB, int Vblksiz,
-			 plasma_complex64_t *A, int LDA,
-			 plasma_complex64_t *V, plasma_complex64_t *TAU,
-			 double *D, double *E, int WANTZ,
-			 plasma_workspace_t work,
-			 plasma_sequence_t *sequence, plasma_request_t *request) 
+//  Parallel bulge chasing column-wise, static scheduling
+void plasma_pzheb2trd_static(
+    plasma_enum_t uplo, int n, int NB, int Vblksiz,
+    plasma_complex64_t *A, int LDA,
+    plasma_complex64_t *V, plasma_complex64_t *tau,
+    double *D, double *E, int wantz,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request)
 {
 
     plasma_context_t *plasma = plasma_context_self();
     if (plasma == NULL) {
         plasma_error("PLASMA not initialized");
         return;
-    }    
-    
+    }
+
     // Check sequence status.
     if (sequence->status != PlasmaSuccess) {
         plasma_request_fail(sequence, request, PlasmaErrorSequence);
         return;
     }
 
-    if ( uplo != PlasmaLower ) {
+    if (uplo != PlasmaLower) {
         plasma_request_fail(sequence, request, PlasmaErrorNotSupported);
         return;
     }
-    
-    
+
     // Quick return
-    if (N == 0) {
+    if (n == 0) {
         return;
     }
 
-    /*
-     * General case:
-     *
-     * As I store V in the V vector there are overlap between
-     * tasks so shift is now 4 where group need to be always
-     * multiple of 2 (or shift=5 if not multiple of 2),
-     * because as example if grs=1 task 2 from
-     * sweep 2 can run with task 6 sweep 1., but task 2 sweep 2
-     * will overwrite the V of tasks 5 sweep 1 which are used by
-     * task 6, so keep in mind that group need to be multiple of 2,
-     * and thus tasks 2 sweep 2 will never run with task 6 sweep 1.
-     * OR if we allocate V as V(N,2) and we switch between the storing of
-     * sweep's like odd in V(N,1) and even in V(N,2) then no overlap and so
-     * shift is 3.
-     * when storing V in matrix style, shift could be back to 3.
-     * */
-    
-    /* Some tunning for the bulge chasing code
-     * see technical report for details */
-    int nbtiles = plasma_ceildiv(N,NB);
+    // General case:
+    //
+    // As I store V in the V vector there are overlap between
+    // tasks so shift is now 4 where group need to be always
+    // multiple of 2 (or shift = 5 if not multiple of 2),
+    // because as example if grs = 1 task 2 from
+    // sweep 2 can run with task 6 sweep 1., but task 2 sweep 2
+    // will overwrite the V of tasks 5 sweep 1 which are used by
+    // task 6, so keep in mind that group need to be multiple of 2,
+    // and thus tasks 2 sweep 2 will never run with task 6 sweep 1.
+    // OR if we allocate V as V(n,2) and we switch between the storing of
+    // sweep's like odd in V(n,1) and even in V(n,2) then no overlap and so
+    // shift is 3.
+    // when storing V in matrix style, shift could be back to 3.
+
+    // Some tuning for the bulge chasing code;
+    // see technical report for details.
+    int nbtiles = plasma_ceildiv(n,NB);
     int colblktile = 1;
-    int grsiz = 1;    
+    int grsiz = 1;
     int maxrequiredcores = imax( nbtiles/colblktile, 1 );
     int colpercore = colblktile*NB;
-    int thgrsiz = N;
-    
-    
-    // Initialize static scheduler progress table
+    int thgrsiz = n;
+
+
+    // Initialize static scheduler progress table.
     int cores_num;
-    #pragma omp parallel 
+    #pragma omp parallel
     {
         cores_num  = omp_get_num_threads();
     }
-    int size = 2*nbtiles+shift+cores_num+10;
+    int size = 2*nbtiles + shift + cores_num + 10;
     plasma->ss_progress = (volatile int *)malloc(size*sizeof(int));
-    for(int index = 0; index < size; index++) plasma->ss_progress[index] = 0;
+    for (int index = 0; index < size; ++index)
+        plasma->ss_progress[index] = 0;
     plasma->ss_ld = (size);
-    
-    // main bulge chasing code 
+
+    // main bulge chasing code
     int ii = shift/grsiz;
-    int  stepercol =  ii*grsiz == shift ? ii:ii+1;
-    ii       = (N-1)/thgrsiz;
-    int thgrnb  = ii*thgrsiz == (N-1) ? ii:ii+1;
+    int stepercol = ii*grsiz == shift ? ii : ii + 1;
+    ii = (n - 1)/thgrsiz;
+    int thgrnb = ii*thgrsiz == (n - 1) ? ii : ii + 1;
     int allcoresnb = imin( cores_num, maxrequiredcores );
 
     #pragma omp parallel
-    {   
+    {
         int coreid, sweepid, myid, stt, st, ed, stind, edind;
         int blklastind, colpt,  thgrid, thed;
         int i,j,m,k;
 
         int my_core_id = omp_get_thread_num();
-        plasma_complex64_t  *WORK = work.spaces[my_core_id];
+        plasma_complex64_t *my_work = work.spaces[my_core_id];
 
-        for (thgrid = 1; thgrid<=thgrnb; thgrid++){
-            stt  = (thgrid-1)*thgrsiz+1;
-            thed = imin( (stt + thgrsiz -1), (N-1));
-            for (i = stt; i <= N-1; i++){
+        for (thgrid = 1; thgrid <= thgrnb; ++thgrid) {
+            stt  = (thgrid - 1)*thgrsiz + 1;
+            thed = imin( stt + thgrsiz - 1, n - 1 );
+            for (i = stt; i <= n - 1; ++i) {
                 ed = imin(i,thed);
-                if(stt>ed) break;
-                for (m = 1; m <=stepercol; m++){
-                    st=stt;
-                    for (sweepid = st; sweepid <=ed; sweepid++){
-                        
-                        for (k = 1; k <=grsiz; k++){
-                            myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k;
-                            if(myid%2 ==0){
-                                colpt      = (myid/2)*NB+1+sweepid-1;
-                                stind      = colpt-NB+1;
-                                edind      = imin(colpt,N);
+                if (stt > ed)
+                    break;
+                for (m = 1; m <= stepercol; ++m) {
+                    st = stt;
+                    for (sweepid = st; sweepid <= ed; ++sweepid) {
+                        for (k = 1; k <= grsiz; ++k) {
+                            myid = (i - sweepid)*(stepercol*grsiz) + (m - 1)*grsiz + k;
+                            if (myid % 2 == 0) {
+                                colpt = (myid/2)*NB + 1 + sweepid - 1;
+                                stind = colpt - NB + 1;
+                                edind = imin(colpt,n);
                                 blklastind = colpt;
-                            } else {
-                                colpt      = ((myid+1)/2)*NB + 1 +sweepid -1 ;
-                                stind      = colpt-NB+1;
-                                edind      = imin(colpt,N);
-                                if( (stind>=edind-1) && (edind==N) )
-                                    blklastind=N;
+                            }
+                            else {
+                                colpt = ((myid + 1)/2)*NB + 1 + sweepid - 1;
+                                stind = colpt - NB + 1;
+                                edind = imin(colpt,n);
+                                if ((stind >= edind - 1) && (edind == n))
+                                    blklastind = n;
                                 else
-                                    blklastind=0;
+                                    blklastind = 0;
                             }
-                            coreid = (stind/colpercore)%allcoresnb;
-                            
-                            if(my_core_id==coreid) {
-                                if(myid==1) {
-                                    
-                                    ss_cond_wait(myid+shift-1, 0, sweepid-1);
-                                    core_zhbtype1cb(N, NB, A, LDA, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, WANTZ, WORK);
+                            coreid = (stind / colpercore) % allcoresnb;
+
+                            if (my_core_id == coreid) {
+                                if (myid == 1) {
+                                    ss_cond_wait(myid + shift - 1, 0, sweepid - 1);
+                                    plasma_core_zhbtype1cb(
+                                        n, NB, A, LDA, V, tau,
+                                        stind - 1, edind - 1, sweepid - 1,
+                                        Vblksiz, wantz, my_work);
                                     ss_cond_set(myid, 0, sweepid);
-                                    
-                                    if(blklastind >= (N-1)) {
-                                        for (j = 1; j <= shift; j++)
-                                            ss_cond_set(myid+j, 0, sweepid);
+
+                                    if (blklastind >= (n - 1)) {
+                                        for (j = 1; j <= shift; ++j)
+                                            ss_cond_set(myid + j, 0, sweepid);
+                                    }
+                                }
+                                else {
+                                    ss_cond_wait(myid - 1,       0, sweepid);
+                                    ss_cond_wait(myid + shift - 1, 0, sweepid - 1);
+                                    if (myid%2 == 0) {
+                                        plasma_core_zhbtype2cb(
+                                            n, NB, A, LDA, V, tau,
+                                            stind - 1, edind - 1, sweepid - 1,
+                                            Vblksiz, wantz, my_work);
+                                    }
+                                    else {
+                                        plasma_core_zhbtype3cb(
+                                            n, NB, A, LDA, V, tau,
+                                            stind - 1, edind - 1, sweepid - 1,
+                                            Vblksiz, wantz, my_work);
                                     }
-                                } else {
-                                    ss_cond_wait(myid-1,       0, sweepid);
-                                    ss_cond_wait(myid+shift-1, 0, sweepid-1);
-                                    if(myid%2 == 0)
-                                        core_zhbtype2cb(N, NB, A, LDA, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, WANTZ, WORK);
-                                    else
-                                        core_zhbtype3cb(N, NB, A, LDA, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, WANTZ, WORK);
-                                    
+
                                     ss_cond_set(myid, 0, sweepid);
-                                    if(blklastind >= (N-1)) {
-                                        for (j = 1; j <= shift+allcoresnb; j++)
-                                            ss_cond_set(myid+j, 0, sweepid);
+                                    if (blklastind >= (n - 1)) {
+                                        for (j = 1; j <= shift + allcoresnb; ++j)
+                                            ss_cond_set(myid + j, 0, sweepid);
                                     }
-                                } /* END if myid==1 */
-                            } /* END if my_core_id==coreid */
-                            
-                            if(blklastind >= (N-1)) {
-                                stt++;
+                                } // end if myid == 1
+                            } // end if my_core_id == coreid
+
+                            if (blklastind >= (n - 1)) {
+                                ++stt;
                                 break;
                             }
-                        } /* END for k=1:grsiz */
-                    } /* END for sweepid=st:ed */
-                } /* END for m=1:stepercol */
-            } /* END for i=1:N-1 */
-         } /* END for thgrid=1:thgrnb */
+                        } // end for k = 1:grsiz
+                    } // end for sweepid = st:ed
+                } // end for m = 1:stepercol
+            } // end for i = 1:n - 1
+         } // end for thgrid = 1:thgrnb
     }
-    /* finalize static sched */
+    // finalize static sched
     free((void*)plasma->ss_progress);
-    
-    /*================================================
-     *  store resulting diag and lower diag D and E
-     *  note that D and E are always real
-     *================================================*/
-    /*
-     * STORE THE RESULTING diagonal/off-diagonal in D AND E
-     */
-    /* Make diagonal and superdiagonal elements real,
-     * storing them in D and E
-     */
-    /* In complex case, the off diagonal element are
-     * not necessary real. we have to make off-diagonal
-     * elements real and copy them to E.
-     * When using HouseHolder elimination,
-     * the ZLARFG give us a real as output so, all the
-     * diagonal/off-diagonal element except the last one are already
-     * real and thus we need only to take the abs of the last
-     * one.
-     *  */
-    // sequential code here so only core 0 will work 
-    if( uplo == PlasmaLower ) {
-        for (int i=0; i < N-1 ; i++) {
+
+    //================================================
+    // Make the resulting diagonal and super-diagonal elements real,
+    // storing them in D and E, respectively.
+    //================================================
+    // In the complex case, the off-diagonal elements are
+    // not necessarily real. We have to make the off-diagonal
+    // elements real and copy them to E.
+    // When using Householder elimination,
+    // ZLARFG gives us a real value as output, so all the
+    // diagonal/off-diagonal elements except the last one are already
+    // real and thus we need only to take the abs of the last one.
+    // @todo Where is abs?
+    //
+    // Sequential code here so only core 0 will work.
+    if (uplo == PlasmaLower) {
+        for (int i = 0; i < n - 1; ++i) {
             D[i] = creal(A[i*LDA]);
-            E[i] = creal(A[i*LDA+1]);
+            E[i] = creal(A[i*LDA + 1]);
         }
-        D[N-1] = creal(A[(N-1)*LDA]);
-    } else { /* PlasmaUpper not tested yet */
-        for (int i=0; i<N-1; i++) {
-            D[i] = creal(A[i*LDA+NB]);
-            E[i] = creal(A[i*LDA+NB-1]);
+        D[n - 1] = creal(A[(n - 1)*LDA]);
+    }
+    else { // PlasmaUpper not yet tested
+        for (int i = 0; i < n - 1; ++i) {
+            D[i] = creal(A[i*LDA + NB]);
+            E[i] = creal(A[i*LDA + NB - 1]);
         }
-        D[N-1] = creal(A[(N-1)*LDA+NB]);
-    } /* end PlasmaUpper */
-    
-    return;
+        D[n - 1] = creal(A[(n - 1)*LDA + NB]);
+    }
 }
diff --git a/compute/pzhecpy_tile2lapack_band.c b/compute/pzhecpy_tile2lapack_band.c
index f12220dc..0d4fba78 100644
--- a/compute/pzhecpy_tile2lapack_band.c
+++ b/compute/pzhecpy_tile2lapack_band.c
@@ -16,86 +16,85 @@
 #include "plasma_internal.h"
 #include "plasma_types.h"
 #include "plasma_workspace.h"
-#include "core_blas.h"
-
-
+#include "plasma_core_blas.h"
 
 #define A(m, n) (plasma_complex64_t*)plasma_tile_addr(A, m, n)
 #define AB(m_, n_) &(AB[(m_) + ldab*((n_)*nb) ])
 
 /***************************************************************************//**
- *  Parallel copy of a band matrix from full nxn tile storage to band storage (nxldab).
- *  As this function is internal and the space is the same for either Lower or Upper so
- *  ALWAYS it convert to Lower band and then the bulge chasing will
- *  always work with a Lower band matrix
+ * Parallel copy of a band matrix from full n x n tile storage to band
+ * storage (n x ldab). As this function is internal and the space is the
+ * same for either Lower or Upper, it ALWAYS converts to lower band and
+ * then the bulge chasing will always work with a lower band matrix.
  **/
-
-void plasma_pzhecpy_tile2lapack_band(plasma_enum_t uplo,
-                                  plasma_desc_t A,
-                                  plasma_complex64_t *AB, int ldab,
-                                  plasma_sequence_t *sequence, plasma_request_t *request)
+void plasma_pzhecpy_tile2lapack_band(
+    plasma_enum_t uplo,
+    plasma_desc_t A,
+    plasma_complex64_t *AB, int ldab,
+    plasma_sequence_t *sequence, plasma_request_t *request)
 {
-
-
     // Return if failed sequence.
     if (sequence->status != PlasmaSuccess)
         return;
 
     int nb = A.mb;
 
-    /*=============================================
-     * NOTE :
-     * this function transform the Lower/Upper Tile
-     * band matrix to LOWER Band storage matrix.
-     * For Lower it copy it directly.
-     * For Upper it conjtransposed during the copy.
-     *=============================================*/
-    
+    //=============================================
+    // NOTE:
+    // This function transforms the Lower/Upper tile
+    // band matrix to a LOWER band storage matrix.
+    // For Lower, it is copied directly.
+    // For Upper, it is conjugate-transposed during the copy.
+    //=============================================
     int ldx = ldab-1;
     int minmn = imin(A.mt, A.nt);
-    /* copy Lower to Lower */
-    if ( uplo == PlasmaLower ) {
-       for (int j = 0; j < minmn; j++) {
-         int mvaj = plasma_tile_mview(A, j);
-         int nvaj = plasma_tile_nview(A, j);    
-         int ldaj = plasma_tile_mmain(A, j);
+    if (uplo == PlasmaLower) {
+        // copy Lower to Lower
+        for (int j = 0; j < minmn; ++j) {
+            int mvaj = plasma_tile_mview(A, j);
+            int nvaj = plasma_tile_nview(A, j);
+            int ldaj = plasma_tile_mmain(A, j);
 
-           core_omp_zlacpy(PlasmaLower, PlasmaNoTrans,
-                           mvaj, nvaj, 
-                           A(j, j), ldaj, AB(0, j), ldx,
-                           sequence, request);
+            plasma_core_omp_zlacpy(
+                PlasmaLower, PlasmaNoTrans,
+                mvaj, nvaj,
+                A(j, j), ldaj, AB(0, j), ldx,
+                sequence, request);
 
-           if( j<minmn-1 ) {
-               mvaj = plasma_tile_mview(A, j+1);
-               ldaj = plasma_tile_mmain(A, j+1);
+            if (j < minmn - 1) {
+                mvaj = plasma_tile_mview(A, j+1);
+                ldaj = plasma_tile_mmain(A, j+1);
 
-               core_omp_zlacpy(PlasmaUpper, PlasmaNoTrans,
-                               mvaj, nvaj,
-                               A(j+1, j), ldaj, AB(nb, j), ldx,
-                               sequence, request);
-           }
-       }
+                plasma_core_omp_zlacpy(
+                    PlasmaUpper, PlasmaNoTrans,
+                    mvaj, nvaj,
+                    A(j+1, j), ldaj, AB(nb, j), ldx,
+                    sequence, request);
+            }
+        }
     }
-    /* conjtranspose Upper when copying it to Lower */
-    else if ( uplo == PlasmaUpper ) {
-        for (int j = 0; j < minmn; j++) {
+    else if (uplo == PlasmaUpper) {
+        // conj-transpose Upper when copying it to Lower
+        for (int j = 0; j < minmn; ++j) {
             int mvaj = plasma_tile_mview(A, j);
-            int nvaj = plasma_tile_nview(A, j);    
+            int nvaj = plasma_tile_nview(A, j);
             int ldaj = plasma_tile_mmain(A, j);
-            
-            core_omp_zlacpy(PlasmaUpper, PlasmaConjTrans,
-                           mvaj, nvaj,
-                           A(j, j), ldaj, AB(0, j), ldx,
-                           sequence, request);
 
-           if(j<minmn-1){
-               nvaj = plasma_tile_nview(A, j+1);    
+            plasma_core_omp_zlacpy(
+                PlasmaUpper, PlasmaConjTrans,
+                mvaj, nvaj,
+                A(j, j), ldaj, AB(0, j), ldx,
+                sequence, request);
+
+            if (j < minmn - 1) {
+                nvaj = plasma_tile_nview(A, j+1);
 
-               core_omp_zlacpy(PlasmaLower, PlasmaConjTrans,
-                               mvaj, nvaj,
-                               A(j, j+1), ldaj, AB(nb, j), ldx,
-                               sequence, request);
-           }
+                plasma_core_omp_zlacpy(
+                    PlasmaLower, PlasmaConjTrans,
+                    mvaj, nvaj,
+                    A(j, j+1), ldaj, AB(nb, j), ldx,
+                    sequence, request);
+            }
         }
     }
 }
diff --git a/compute/zheevd.c b/compute/zheevd.c
index 91770553..4ba4c8f5 100644
--- a/compute/zheevd.c
+++ b/compute/zheevd.c
@@ -2,7 +2,7 @@
  *
  * @file
  *
- *  plasma is a software package provided by:
+ *  PLASMA is a software package provided by:
  *  University of Tennessee, US,
  *  University of Manchester, UK.
  *
@@ -18,11 +18,11 @@
 #include "plasma_tuning.h"
 #include "plasma_types.h"
 #include "plasma_workspace.h"
-#include <string.h>
+#include "core_lapack.h"
 #include "bulge.h"
 
+#include <string.h>
 #include <omp.h>
-#include "core_lapack.h"
 
 /***************************************************************************//**
  *
@@ -37,22 +37,22 @@
  *
  *******************************************************************************
  *
- * @param[in] eigt
- *          Intended usage:
- *          = PlasmaEigVal:    computes eigenvalues only;
- *          = PlasmaEigValVec: computes eigenvalues and eigenvectors.
+ * @param[in] job
+ *          Specifies whether to compute eigenvectors.
+ *          - PlasmaNoVec: computes eigenvalues only;
+ *          - PlasmaVec:   computes eigenvalues and eigenvectors.
  *
  * @param[in] uplo
  *          Specifies whether the matrix A is upper triangular or
  *          lower triangular:
- *          = PlasmaUpper: Upper triangle of A is stored;
- *          = PlasmaLower: Lower triangle of A is stored.
+ *          - PlasmaUpper: Upper triangle of A is stored;
+ *          - PlasmaLower: Lower triangle of A is stored.
  *
  * @param[in] n
  *          The order of the matrix A. n >= 0.
  *
  * @param[in,out] pA
- *          On entry, the symmetric (or Hermitian) matrix pA.
+ *          On entry, the Hermitian matrix pA.
  *          If uplo = PlasmaUpper, the leading n-by-n upper triangular
  *          part of pA contains the upper triangular part of the matrix
  *          A, and the strictly lower triangular part of pA is not
@@ -68,16 +68,17 @@
  * @param[in] lda
  *          The leading dimension of the array A. lda >= max(1,n).
  *
- * @param[out] W
+ * @param[out] Lambda
  *          On exit, if info = 0, the eigenvalues.
  *
  * @param[in, out] T
- *          On exit, auxiliary factorization data, required by plasma_zheevd to
+ *          On exit, auxiliary factorization data, required by plasma_zheevd.
  *          Matrix in T is allocated inside this function and needs to be
  *          destroyed by plasma_desc_destroy.
+ *          @todo Shouldn't heevd destroy T? Why is this an argument?
  *
  * @param[out] pQ
- *          On exit, if eigt = PlasmaEigValVec and info = 0, the eigenvectors.
+ *          On exit, if job = PlasmaVec and info = 0, the eigenvectors.
  *
  * @param[in] ldq
  *          The leading dimension of the array pQ. ldq >= max(1,n).
@@ -95,13 +96,13 @@
  * @sa plasma_sheevd
  *
  ******************************************************************************/
-int plasma_zheevd(plasma_enum_t eigt, plasma_enum_t uplo, int n,
-                  plasma_complex64_t *pA, int lda,
-                  plasma_desc_t *T, 
-                  double *W, 
-                  plasma_complex64_t *pQ, int ldq)
+int plasma_zheevd(
+    plasma_enum_t job, plasma_enum_t uplo, int n,
+    plasma_complex64_t *pA, int lda,
+    plasma_desc_t *T,
+    double *Lambda,
+    plasma_complex64_t *pQ, int ldq)
 {
-
     // Get PLASMA context.
     plasma_context_t *plasma = plasma_context_self();
     if (plasma == NULL) {
@@ -109,9 +110,9 @@ int plasma_zheevd(plasma_enum_t eigt, plasma_enum_t uplo, int n,
         return PlasmaErrorNotInitialized;
     }
 
-    // Check input arguments 
-    if (eigt != PlasmaEigVal && eigt != PlasmaEigValVec) {
-        plasma_error("illegal value of eigt");
+    // Check input arguments
+    if (job != PlasmaNoVec && job != PlasmaVec) {
+        plasma_error("illegal value of job");
         return -1;
     }
     if (uplo != PlasmaLower && uplo != PlasmaUpper) {
@@ -131,7 +132,7 @@ int plasma_zheevd(plasma_enum_t eigt, plasma_enum_t uplo, int n,
         return -9;
     }
 
-    // Quick return 
+    // Quick return
     if (n == 0)
         return PlasmaSuccess;
 
@@ -172,19 +173,19 @@ int plasma_zheevd(plasma_enum_t eigt, plasma_enum_t uplo, int n,
     // Initialize request.
     plasma_request_t request;
     retval = plasma_request_init(&request);
- 
+
     // asynchronous block
     #pragma omp parallel
     #pragma omp master
     {
         // Translate to tile layout.
         plasma_omp_zge2desc(pA, lda, A, &sequence, &request);
-
     }
-    
-    // Warning !!! plasma_omp_zheevd is not fully async function.
-    // It contains both async and syn functions.
-    plasma_omp_zheevd(eigt, uplo, A, *T, W, pQ, ldq, work, &sequence, &request);
+
+    // Warning !!! plasma_omp_zheevd is not a fully async function.
+    // It contains both async and sync functions.
+    plasma_omp_zheevd(job, uplo, A, *T, Lambda, pQ, ldq, work,
+                      &sequence, &request);
 
     #pragma omp parallel
     #pragma omp master
@@ -217,32 +218,31 @@ int plasma_zheevd(plasma_enum_t eigt, plasma_enum_t uplo, int n,
  *
  *******************************************************************************
  *
- * @param[in] eigt
- *          Intended usage:
- *          = PlasmaEigVal:    computes eigenvalues only;
- *          = PlasmaEigValVec: computes eigenvalues and eigenvectors.
+ * @param[in] job
+ *          Specifies whether to compute eigenvectors.
+ *          - PlasmaNoVec: computes eigenvalues only;
+ *          - PlasmaVec:   computes eigenvalues and eigenvectors.
  *
  * @param[in] uplo
  *          Specifies whether the matrix A is upper triangular or
  *          lower triangular:
- *          = PlasmaUpper: Upper triangle of A is stored;
- *          = PlasmaLower: Lower triangle of A is stored.
+ *          - PlasmaUpper: Upper triangle of A is stored;
+ *          - PlasmaLower: Lower triangle of A is stored.
  *
  * @param[in,out] A
  *          Descriptor of matrix A.
  *          A is stored in the tile layout.
  *
- * @param[out] W
+ * @param[out] Lambda
  *          On exit, if info = 0, the eigenvalues.
  *
  * @param[out] T
  *          Descriptor of matrix T.
- *          On exit, auxiliary factorization data, required by QR factorization auxilary 
- *          kernels to
- *          solve the system of equations.
+ *          On exit, auxiliary factorization data, required by QR factorization
+ *          auxiliary kernels to apply Q.
  *
  * @param[out] Q
- *          On exit, if eigt = PlasmaEigValVec and info = 0, the eigenvectors.
+ *          On exit, if job = PlasmaVec and info = 0, the eigenvectors.
  *
  * @param[in] ldq
  *          The leading dimention of the eigenvectors matrix Q. ldq >= max(1,n).
@@ -262,14 +262,14 @@ int plasma_zheevd(plasma_enum_t eigt, plasma_enum_t uplo, int n,
  * @sa plasma_omp_ssyev
  *
  ******************************************************************************/
-void plasma_omp_zheevd(plasma_enum_t eigt, plasma_enum_t uplo,
-                      plasma_desc_t A, plasma_desc_t T,
-                      double *W,
-                      plasma_complex64_t *pQ, int ldq,
-                      plasma_workspace_t work,
-                      plasma_sequence_t *sequence, plasma_request_t *request)
+void plasma_omp_zheevd(
+    plasma_enum_t job, plasma_enum_t uplo,
+    plasma_desc_t A, plasma_desc_t T,
+    double *Lambda,
+    plasma_complex64_t *pQ, int ldq,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request)
 {
-    
     // Get PLASMA context.
     plasma_context_t *plasma = plasma_context_self();
     if (plasma == NULL) {
@@ -277,10 +277,10 @@ void plasma_omp_zheevd(plasma_enum_t eigt, plasma_enum_t uplo,
         plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
         return;
     }
-    
+
     // Check input arguments.
-    if (eigt != PlasmaEigVal && eigt != PlasmaEigValVec) {
-        plasma_error("illegal value of eigt");
+    if (job != PlasmaNoVec && job != PlasmaVec) {
+        plasma_error("illegal value of job");
         plasma_request_fail(sequence, request, PlasmaErrorIllegalValue);
         return;
     }
@@ -316,7 +316,7 @@ void plasma_omp_zheevd(plasma_enum_t eigt, plasma_enum_t uplo,
 
     int n  = A.m;
     int nb   = imin(A.mb, A.m);
-    int lda_band = 2*nb+1;
+    int lda_band = 2*nb + 1;
 
     //Allocate workspace for band storage of the band matrix
     // A and for the off diagonal after tridiagonalisation
@@ -351,41 +351,40 @@ void plasma_omp_zheevd(plasma_enum_t eigt, plasma_enum_t uplo,
         plasma_pzhecpy_tile2lapack_band (uplo,
                                          A,
                                          A_band, lda_band,
-                                         sequence, request);  
+                                         sequence, request);
     }
     double stop = omp_get_wtime();
-    double time = stop-start;
-    printf("\n N=%d:  1-stage time = %lf\t", n, time);
+    double time = stop - start;
+    //printf("\n N=%d:  1-stage time = %lf\t", n, time);
 
     //====================
     //  Bulge chasing
     //====================
-
     plasma_complex64_t *TAU2 = NULL;
     plasma_complex64_t *V2 = NULL;
     plasma_complex64_t *T2 = NULL;
-    int Vblksiz;  //Blocking used when applying V2 to the matrix Q
-    int blkcnt;  // Number of diamond tile or tile of Vs
+    int Vblksiz;  // Blocking used when applying V2 to the matrix Q
+    int blkcnt;   // Number of diamond tile or tile of Vs
     int ldt, ldv;
     int wantz   = 0;
     int blguplo = PlasmaLower;
-    
-    if( eigt == PlasmaEigVal )
-        wantz=0;
+
+    if (job == PlasmaNoVec)
+        wantz = 0;
     else
-        wantz=2;
-    
+        wantz = 2;
+
     Vblksiz = nb/4;
     ldt     = Vblksiz;
-    if( eigt == PlasmaEigValVec ) {
+    if (job == PlasmaVec) {
         findVTsiz(n, nb, Vblksiz, &blkcnt, &ldv);
-        TAU2= (plasma_complex64_t *)
+        TAU2 = (plasma_complex64_t *)
             calloc((size_t)blkcnt*Vblksiz, sizeof(plasma_complex64_t));
         V2  = (plasma_complex64_t *)
             calloc((size_t)ldv*blkcnt*Vblksiz, sizeof(plasma_complex64_t));
         T2  = (plasma_complex64_t *)
             calloc((size_t)ldt*blkcnt*Vblksiz, sizeof(plasma_complex64_t));
-        if ( (TAU2 == NULL) || (V2 == NULL) || (T2 == NULL) ) {
+        if (TAU2 == NULL || V2 == NULL || T2 == NULL) {
             plasma_error("calloc() failed");
             free(TAU2);
             free(V2);
@@ -401,7 +400,7 @@ void plasma_omp_zheevd(plasma_enum_t eigt, plasma_enum_t uplo,
             calloc((size_t)2*n, sizeof(plasma_complex64_t));
         V2     = (plasma_complex64_t *)
             calloc((size_t)2*n, sizeof(plasma_complex64_t ));
-        if ( (TAU2 == NULL) || (V2 == NULL) ) {
+        if (TAU2 == NULL || V2 == NULL) {
             plasma_error("calloc() failed");
             free(TAU2);
             free(V2);
@@ -410,50 +409,47 @@ void plasma_omp_zheevd(plasma_enum_t eigt, plasma_enum_t uplo,
         memset(TAU2, 0, 2*n*sizeof(plasma_complex64_t));
         memset(V2,   0, 2*n*sizeof(plasma_complex64_t));
     }
-                
+
     // Main bulge chasing kernel.
-    // Contains internal omp parallel section 
+    // Contains internal omp parallel section
     start = omp_get_wtime();
     plasma_pzhbtrd_static(blguplo, n, nb, Vblksiz,
-                            A_band, lda_band,
-                            V2, TAU2,
-                            W, E,
-                            wantz,
-                            work,
-                            sequence, request);
+                          A_band, lda_band,
+                          V2, TAU2,
+                          Lambda, E,
+                          wantz,
+                          work,
+                          sequence, request);
     stop = omp_get_wtime();
-    time = stop-start;
-    printf("2-stage timing = %lf\t", time);
-    
+    time = stop - start;
+    //printf("2-stage timing = %lf\t", time);
+
     //=======================================
-    //  calling eigensolver
+    // Tridiagonal eigensolver
     //=======================================
-
-    // call eigensolver using lapack routine for our resulting tridiag [W E] 
+    // call eigensolver using lapack routine for our resulting tridiag,
+    // [Lambda E]
     start = omp_get_wtime();
-    if(eigt == PlasmaEigVal){
-        LAPACKE_zstedc( LAPACK_COL_MAJOR,
-                        'N',
-                        n, W, E, pQ, ldq );
-    } else {
-        LAPACKE_zstedc( LAPACK_COL_MAJOR,
-                        'I',
-                                 n, W, E, pQ, ldq );
+    if (job == PlasmaNoVec) {
+        LAPACKE_zstedc( LAPACK_COL_MAJOR, 'N', n, Lambda, E, pQ, ldq );
+    }
+    else {
+        LAPACKE_zstedc( LAPACK_COL_MAJOR, 'I', n, Lambda, E, pQ, ldq );
     }
     stop = omp_get_wtime();
-    time = stop-start;
-    printf("Eigenvalue time = %lf\t", time);
+    time = stop - start;
+    //printf("Eigenvalue time = %lf\t", time);
 
     start = omp_get_wtime();
-    if (eigt == PlasmaEigValVec) {
-        /*=======================================
-         *  apply Q2 from the bulge
-         *=======================================*/
-        // compute T2 
+    if (job == PlasmaVec) {
+        //=======================================
+        // Apply Q2 from the bulge.
+        //=======================================
+        // compute T2
         #pragma omp parallel
         {
             plasma_pzlarft_blgtrd(n, nb, Vblksiz,
-                                  V2, T2, TAU2, 
+                                  V2, T2, TAU2,
                                   sequence, request);
         }
 
@@ -461,67 +457,62 @@ void plasma_omp_zheevd(plasma_enum_t eigt, plasma_enum_t uplo,
         #pragma omp parallel
         {
             plasma_pzunmqr_blgtrd(PlasmaLeft,  PlasmaNoTrans,
-                                  n, nb, n, 
+                                  n, nb, n,
                                   Vblksiz, wantz,
                                   V2, T2, TAU2,
                                   pQ, ldq,
                                   work,
                                   sequence, request);
         }
-        
-        
-        /*=======================================
-         *  apply Q1 from the first stage 
-         *=======================================*/
-        // CASE nb>N, Q1 doesn't need to be applied,
-        //only bulge chasing has been done
-        if( nb < n ){
-            
+
+        //=======================================
+        // Apply Q1 from the first stage.
+        //=======================================
+        // If nb >= n, Q1 doesn't need to be applied;
+        // only bulge chasing has been done.
+        if (nb < n) {
             plasma_desc_t Q;
             plasma_desc_general_create(PlasmaComplexDouble, nb, nb,
-                                       n, n, 0, 0, n, n, &Q);            
+                                       n, n, 0, 0, n, n, &Q);
 
             #pragma omp parallel
             #pragma omp master
             {
                 // Translate to tile layout.
                 plasma_pzge2desc(pQ, ldq, Q, sequence, request);
-                
-                // Accumulate the transformations from the first stage 
-                if(uplo==PlasmaLower){
+
+                // Accumulate the transformations from the first stage
+                if (uplo == PlasmaLower) {
                     plasma_pzunmqr(PlasmaLeft, PlasmaNoTrans,
-                                   plasma_desc_view(A, A.mb, 0, A.m-A.mb, A.n-A.nb),
-                                   plasma_desc_view(T, T.mb, 0, T.m-T.mb, T.n-T.nb),
-                                   plasma_desc_view(Q, Q.mb, 0, Q.m-Q.mb, Q.n),
+                                   plasma_desc_view(A, A.mb, 0, A.m - A.mb, A.n - A.nb),
+                                   plasma_desc_view(T, T.mb, 0, T.m - T.mb, T.n - T.nb),
+                                   plasma_desc_view(Q, Q.mb, 0, Q.m - Q.mb, Q.n),
                                    work,
                                    sequence, request);
-
                 }
                 else {
                     plasma_pzunmlq (PlasmaLeft, Plasma_ConjTrans,
-                                    plasma_desc_view(A, 0, A.nb, A.m-A.mb, A.n-A.nb),
-                                    plasma_desc_view(T, 0, T.nb, T.m-T.mb, T.n-T.nb),
-                                    plasma_desc_view(Q, Q.mb, 0, Q.m-Q.mb, Q.n),
+                                    plasma_desc_view(A, 0, A.nb, A.m - A.mb, A.n - A.nb),
+                                    plasma_desc_view(T, 0, T.nb, T.m - T.mb, T.n - T.nb),
+                                    plasma_desc_view(Q, Q.mb, 0, Q.m - Q.mb, Q.n),
                                     work,
                                     sequence, request);
                 }
-                
+
                 // Translate back to LAPACK layout.
                 plasma_pzdesc2ge(Q, pQ, ldq, sequence, request);
             }
 
             plasma_desc_destroy(&Q);
-        } // END of ( nb < N ) 
+        } // end (nb < n)
     }
     stop = omp_get_wtime();
-    time = stop-start;
-    printf("Eigenvector timing = %lf\n", time);
-    
-    if( eigt == PlasmaEigValVec ){ free(T2);}
+    time = stop - start;
+    //printf("Eigenvector timing = %lf\n", time);
+
+    free(T2);
     free(V2);
     free(TAU2);
     free(E);
     free(A_band);
-    return;
 }
-
diff --git a/core_blas/core_zhbtype1cb.c b/core_blas/core_zhbtype1cb.c
index 8939cf21..99ecbae2 100644
--- a/core_blas/core_zhbtype1cb.c
+++ b/core_blas/core_zhbtype1cb.c
@@ -11,15 +11,15 @@
  **/
 
 
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "plasma_types.h"
 #include "core_lapack.h"
 #include "bulge.h"
 #include <string.h>
 
-#define A(m,n)   (A + LDA * (n) + ((m)-(n)))
+#define A(m, n)  (A + lda*(n) + ((m) - (n)))
 #define V(m)     (V + (m))
-#define TAU(m)   (TAU + (m))
+#define tau(m)   (tau + (m))
 
 /***************************************************************************//**
  *
@@ -41,25 +41,25 @@
  *
  *******************************************************************************
  *
- * @param[in] N
+ * @param[in] n
  *          The order of the matrix A.
  *
- * @param[in] NB
+ * @param[in] nb
  *          The size of the band.
  *
  * @param[in, out] A
- *          A pointer to the matrix A of size (2*NB+1)-by-N.
+ *          A pointer to the matrix A of size (2*nb + 1)-by-n.
  *
- * @param[in] LDA
- *          The leading dimension of the matrix A. LDA >= max(1,2*NB+1)
+ * @param[in] lda
+ *          The leading dimension of the matrix A. lda >= max(1, 2*nb + 1)
  *
  * @param[out] V
- *          PLASMA_Complex64_t array, dimension N if eigenvalue only
+ *          plasma_complex64_t array, dimension n if eigenvalue only
  *          requested or (LDV*blkcnt*Vblksiz) if Eigenvectors requested
  *          The Householder reflectors are stored in this array.
  *
- * @param[out] TAU
- *          PLASMA_Complex64_t array, dimension (N).
+ * @param[out] tau
+ *          plasma_complex64_t array, dimension (n).
  *          The scalar factors of the Householder reflectors are stored
  *          in this array.
  *
@@ -78,11 +78,11 @@
  *          it serve to calculate the pointer to the position where to store the
  *          Vs and Ts.
  *
- * @param[in] WANTZ
+ * @param[in] wantz
  *          constant which indicate if Eigenvalue are requested or both
  *          Eigenvalue/Eigenvectors.
  *
- * @param[in] WORK
+ * @param[in] work
  *          Workspace of size nb.
  *
  *******************************************************************************
@@ -95,45 +95,44 @@
 /***************************************************************************
  *          TYPE 1-BAND Lower-columnwise-Householder
  ***************************************************************************/
-void core_zhbtype1cb(int N, int NB,
-                     plasma_complex64_t *A, int LDA,
-                     plasma_complex64_t *V, plasma_complex64_t *TAU,
-                     int st, int ed, int sweep, int Vblksiz, int WANTZ,
-                     plasma_complex64_t *WORK)
+void plasma_core_zhbtype1cb(
+    int n, int nb,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *V, plasma_complex64_t *tau,
+    int st, int ed, int sweep, int Vblksiz, int wantz,
+    plasma_complex64_t *work)
 {
-    int len, LDX;
+    int len, ldx;
     int blkid, vpos, taupos, tpos;
 
-    /* find the pointer to the Vs and Ts as stored by the bulgechasing
-     * note that in case no eigenvector required V and T are stored
-     * on a vector of size N
-     * */
-     if( WANTZ == 0 ) {
-         vpos   = ((sweep+1)%2)*N + st;
-         taupos = ((sweep+1)%2)*N + st;
-     } else {
-         findVTpos(N, NB, Vblksiz, sweep, st,
-                   &vpos, &taupos, &tpos, &blkid);
-     }
+    // Find the pointer to the Vs and Ts as stored by the bulge chasing.
+    // Note that when no eigenvectors are required, V and T are stored
+    // in a vector of size n.
+    if (wantz == 0) {
+        vpos   = ((sweep + 1)%2)*n + st;
+        taupos = ((sweep + 1)%2)*n + st;
+    }
+    else {
+        findVTpos(n, nb, Vblksiz, sweep, st,
+                  &vpos, &taupos, &tpos, &blkid);
+    }
 
-    LDX = LDA-1;
+    ldx = lda-1;
     len = ed-st+1;
     *V(vpos) = 1.;
 
     memcpy( V(vpos+1), A(st+1, st-1), (len-1)*sizeof(plasma_complex64_t) );
     memset( A(st+1, st-1), 0, (len-1)*sizeof(plasma_complex64_t) );
 
-    /* Eliminate the col  at st-1 */
-    LAPACKE_zlarfg_work(len, A(st, st-1), V(vpos+1), 1, TAU(taupos) );
+    // Eliminate the col at st-1.
+    LAPACKE_zlarfg_work(len, A(st, st-1), V(vpos+1), 1, tau(taupos) );
 
-    /* Apply left and right on A(st:ed,st:ed) */
-    core_zlarfy(len, A(st,st), LDX, V(vpos), TAU(taupos), WORK);
-
-    return;
+    // Apply left and right on A(st:ed, st:ed).
+    plasma_core_zlarfy(len, A(st, st), ldx, V(vpos), tau(taupos), work);
 }
 /***************************************************************************/
 #undef A
 #undef V
-#undef TAU
+#undef tau
 
 
diff --git a/core_blas/core_zhbtype2cb.c b/core_blas/core_zhbtype2cb.c
index b37b7d94..97c977d8 100644
--- a/core_blas/core_zhbtype2cb.c
+++ b/core_blas/core_zhbtype2cb.c
@@ -10,15 +10,16 @@
  *
  **/
 #include "bulge.h"
-#include <string.h>
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "plasma_types.h"
 #include "plasma_internal.h"
 #include "core_lapack.h"
 
-#define A(m,n)   (A + LDA * (n) + ((m)-(n)))
+#include <string.h>
+
+#define A(m, n)  (A + lda*(n) + ((m) - (n)))
 #define V(m)     (V + (m))
-#define TAU(m)   (TAU + (m))
+#define tau(m)   (tau + (m))
 
 /***************************************************************************//**
  *
@@ -40,27 +41,27 @@
  *
  *******************************************************************************
  *
- * @param[in] N
+ * @param[in] n
  *          The order of the matrix A.
  *
- * @param[in] NB
+ * @param[in] nb
  *          The size of the band.
  *
- * @param[in, out] A
- *          A pointer to the matrix A of size (2*NB+1)-by-N.
+ * @param[in,out] A
+ *          A pointer to the matrix A of size (2*nb + 1)-by-n.
  *
- * @param[in] LDA
- *          The leading dimension of the matrix A. LDA >= max(1,2*NB+1)
+ * @param[in] lda
+ *          The leading dimension of the matrix A. lda >= max(1, 2*nb + 1)
  *
  * @param[in, out] V
- *          PLASMA_Complex64_t array, dimension N if eigenvalue only
+ *          plasma_complex64_t array, dimension n if eigenvalue only
  *          requested or (LDV*blkcnt*Vblksiz) if Eigenvectors requested
  *          The Householder reflectors of the previous type 1 are used here
  *          to continue update then new one are generated to eliminate the
  *          bulge and stored in this array.
  *
- * @param[in, out] TAU
- *          PLASMA_Complex64_t array, dimension (N).
+ * @param[in, out] tau
+ *          plasma_complex64_t array, dimension (n).
  *          The scalar factors of the Householder reflectors of the previous
  *          type 1 are used here to continue update then new one are generated
  *          to eliminate the bulge and stored in this array.
@@ -80,11 +81,11 @@
  *          it serve to calculate the pointer to the position where to store the
  *          Vs and Ts.
  *
- * @param[in] WANTZ
+ * @param[in] wantz
  *          constant which indicate if Eigenvalue are requested or both
  *          Eigenvalue/Eigenvectors.
  *
- * @param[in] WORK
+ * @param[in] work
  *          Workspace of size nb.
  *
  *******************************************************************************
@@ -98,42 +99,46 @@
 /***************************************************************************
  *          TYPE 2-BAND Lower-columnwise-Householder
  ***************************************************************************/
-void core_zhbtype2cb(int N, int NB,
-                     plasma_complex64_t *A, int LDA,
-                     plasma_complex64_t *V, plasma_complex64_t *TAU,
-                     int st, int ed, int sweep, int Vblksiz, int WANTZ,
-                     plasma_complex64_t *WORK)
+void plasma_core_zhbtype2cb(
+    int n, int nb,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *V, plasma_complex64_t *tau,
+    int st, int ed, int sweep, int Vblksiz, int wantz,
+    plasma_complex64_t *work)
 {
     plasma_complex64_t ctmp;
-    int J1, J2, len, lem, LDX;
+    int J1, J2, len, lem, ldx;
     int blkid, vpos, taupos, tpos;
 
-    if( WANTZ == 0 ) {
-        vpos   = ((sweep+1)%2)*N + st;
-        taupos = ((sweep+1)%2)*N + st;
-    } else {
-        findVTpos(N, NB, Vblksiz, sweep, st,
+    if (wantz == 0) {
+        vpos   = ((sweep + 1)%2)*n + st;
+        taupos = ((sweep + 1)%2)*n + st;
+    }
+    else {
+        findVTpos(n, nb, Vblksiz, sweep, st,
                   &vpos, &taupos, &tpos, &blkid);
     }
 
-    LDX = LDA-1;
+    ldx = lda-1;
     J1  = ed+1;
-    J2  = imin(ed+NB,N-1);
+    J2  = imin(ed+nb, n-1);
     len = ed-st+1;
     lem = J2-J1+1;
 
-    if( lem > 0 ) {
-        /* Apply remaining right commming from the top block */
+    if (lem > 0) {
+        // Apply the remaining right update coming from the top block.
         LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaRight),
-                            lem, len, V(vpos), *(TAU(taupos)), A(J1, st), LDX, WORK);
+                            lem, len, V(vpos), *(tau(taupos)),
+                            A(J1, st), ldx, work);
     }
 
-    if( lem > 1 ) {
-        if( WANTZ == 0 ) {
-            vpos   = ((sweep+1)%2)*N + J1;
-            taupos = ((sweep+1)%2)*N + J1;
-        } else {
-            findVTpos(N,NB,Vblksiz,sweep,J1, &vpos, &taupos, &tpos, &blkid);
+    if (lem > 1) {
+        if (wantz == 0 ) {
+            vpos   = ((sweep+1)%2)*n + J1;
+            taupos = ((sweep+1)%2)*n + J1;
+        }
+        else {
+            findVTpos(n, nb, Vblksiz, sweep, J1, &vpos, &taupos, &tpos, &blkid);
         }
 
         /* Remove the first column of the created bulge */
@@ -143,22 +148,21 @@ void core_zhbtype2cb(int N, int NB,
         memset(A(J1+1, st), 0, (lem-1)*sizeof(plasma_complex64_t));
 
         /* Eliminate the col at st */
-        LAPACKE_zlarfg_work( lem, A(J1, st), V(vpos+1), 1, TAU(taupos) );
+        LAPACKE_zlarfg_work( lem, A(J1, st), V(vpos+1), 1, tau(taupos) );
 
         /*
-         * Apply left on A(J1:J2,st+1:ed)
+         * Apply left on A(J1:J2, st+1:ed)
          * We decrease len because we start at col st+1 instead of st.
          * col st is the col that has been revomved;
          */
         len = len-1;
 
-        ctmp = conj(*TAU(taupos));
+        ctmp = conj(*tau(taupos));
         LAPACKE_zlarfx_work(LAPACK_COL_MAJOR, lapack_const(PlasmaLeft),
-                            lem, len, V(vpos), ctmp, A(J1, st+1), LDX, WORK);
+                            lem, len, V(vpos), ctmp, A(J1, st+1), ldx, work);
     }
-    return;
 }
 /***************************************************************************/
 #undef A
 #undef V
-#undef TAU
+#undef tau
diff --git a/core_blas/core_zhbtype3cb.c b/core_blas/core_zhbtype3cb.c
index 0e3d53b4..64883328 100644
--- a/core_blas/core_zhbtype3cb.c
+++ b/core_blas/core_zhbtype3cb.c
@@ -10,54 +10,54 @@
  *
  **/
 
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "plasma_types.h"
 #include "core_lapack.h"
 #include "bulge.h"
 
-#define A(m,n)   (A + LDA * (n) + ((m)-(n)))
+#define A(m, n)  (A + lda*(n) + ((m) - (n)))
 #define V(m)     (V + (m))
-#define TAU(m)   (TAU + (m))
+#define tau(m)   (tau + (m))
 
 /***************************************************************************//**
  *
  * @ingroup CORE_plasma_complex64_t
  *
  *  CORE_zhbtype3cb is a kernel that will operate on a region (triangle) of data
- *  bounded by st and ed. This kernel apply a left+right update on the hermitian
- *  triangle.  Note that this kernel is very similar to type1 but does not do an
+ *  bounded by st and ed. This kernel applies a left+right update on the Hermitian
+ *  triangle. Note that this kernel is very similar to type1 but does not do an
  *  elimination.
  *
- *  All detail are available on technical report or SC11 paper.
+ *  All details are available in the technical report or SC11 paper.
  *  Azzam Haidar, Hatem Ltaief, and Jack Dongarra. 2011.
  *  Parallel reduction to condensed forms for symmetric eigenvalue problems
  *  using aggregated fine-grained and memory-aware kernels. In Proceedings
  *  of 2011 International Conference for High Performance Computing,
- *  Networking, Storage and Analysis (SC '11). ACM, New York, NY, USA, ,
- *  Article 8 , 11 pages.
+ *  Networking, Storage and Analysis (SC '11). ACM, New York, NY, USA,
+ *  Article 8, 11 pages.
  *  http://doi.acm.org/10.1145/2063384.2063394
  *
  *******************************************************************************
  *
- * @param[in] N
+ * @param[in] n
  *          The order of the matrix A.
  *
- * @param[in] NB
+ * @param[in] nb
  *          The size of the band.
  *
- * @param[in, out] A
- *          A pointer to the matrix A of size (2*NB+1)-by-N.
+ * @param[in,out] A
+ *          A pointer to the matrix A of size (2*nb+1)-by-n.
  *
- * @param[in] LDA
- *          The leading dimension of the matrix A. LDA >= max(1,2*NB+1)
+ * @param[in] lda
+ *          The leading dimension of the matrix A. lda >= max(1, 2*nb+1)
  *
  * @param[in] V
- *          plasma_complex64_t array, dimension N if eigenvalue only
+ *          plasma_complex64_t array, dimension n if eigenvalue only
  *          requested or (LDV*blkcnt*Vblksiz) if Eigenvectors requested
  *          The Householder reflectors are stored in this array.
  *
- * @param[in] TAU
- *          plasma_complex64_t array, dimension (N).
+ * @param[in] tau
+ *          plasma_complex64_t array, dimension (n).
  *          The scalar factors of the Householder reflectors are stored
  *          in this array.
  *
@@ -76,11 +76,11 @@
  *          it serve to calculate the pointer to the position where to store the
  *          Vs and Ts.
  *
- * @param[in] WANTZ
+ * @param[in] wantz
  *          constant which indicate if Eigenvalue are requested or both
  *          Eigenvalue/Eigenvectors.
  *
- * @param[in] WORK
+ * @param[in] work
  *          Workspace of size nb.
  *
  *******************************************************************************
@@ -94,31 +94,32 @@
 /***************************************************************************//**
  *          TYPE 3-BAND Lower-columnwise-Householder
  ***************************************************************************/
-void core_zhbtype3cb(int N, int NB,
-                     plasma_complex64_t *A, int LDA,
-                     const plasma_complex64_t *V, const plasma_complex64_t *TAU,
-                     int st, int ed, int sweep, int Vblksiz, int WANTZ,
-                     plasma_complex64_t *WORK)
+void plasma_core_zhbtype3cb(
+    int n, int nb,
+    plasma_complex64_t *A, int lda,
+    const plasma_complex64_t *V, const plasma_complex64_t *tau,
+    int st, int ed, int sweep, int Vblksiz, int wantz,
+    plasma_complex64_t *work)
 {
-    int len, LDX;
+    int len, ldx;
     int blkid, vpos, taupos, tpos;
 
-    if( WANTZ == 0 ) {
-        vpos   = ((sweep+1)%2)*N + st;
-        taupos = ((sweep+1)%2)*N + st;
-    } else {
-        findVTpos(N, NB, Vblksiz, sweep, st,
+    if (wantz == 0) {
+        vpos   = ((sweep+1)%2)*n + st;
+        taupos = ((sweep+1)%2)*n + st;
+    }
+    else {
+        findVTpos(n, nb, Vblksiz, sweep, st,
                   &vpos, &taupos, &tpos, &blkid);
     }
 
-    LDX = LDA-1;
+    ldx = lda-1;
     len = ed-st+1;
 
-    /* Apply left and right on A(st:ed,st:ed)*/
-    core_zlarfy(len, A(st,st), LDX, V(vpos), TAU(taupos), WORK);
-    return;
+    // Apply left and right on A(st:ed, st:ed)
+    plasma_core_zlarfy(len, A(st, st), ldx, V(vpos), tau(taupos), work);
 }
 /***************************************************************************/
 #undef A
 #undef V
-#undef TAU
+#undef tau
diff --git a/core_blas/core_zherfb.c b/core_blas/core_zherfb.c
index e50efdfa..727683a4 100644
--- a/core_blas/core_zherfb.c
+++ b/core_blas/core_zherfb.c
@@ -10,7 +10,7 @@
  *
  **/
 
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "plasma_types.h"
 #include "plasma_internal.h"
 #include "core_lapack.h"
@@ -21,9 +21,9 @@
  *
  * @ingroup core_herfb
  *
- *  Overwrites the symmetric complex n-by-n tile C with
+ *  Overwrites the Hermitian complex n-by-n tile C with
  *
- *    Q**T*C*Q
+ *    Q^H*C*Q
  *
  *  where Q is a complex unitary matrix defined as the product of k
  *  elementary reflectors
@@ -35,19 +35,19 @@
  *******************************************************************************
  *
  * @param[in] uplo
- *         - PlasmaLower : the upper part of the symmetric matrix C
+ *         - PlasmaLower : the upper part of the Hermitian matrix C
  *                         is not referenced.
- *         - PlasmaUpper : the lower part of the symmetric matrix C
+ *         - PlasmaUpper : the lower part of the Hermitian matrix C
  *                         is not referenced (not supported).
  * @param[in] n
- *          The number of rows/columns of the tile C.  n >= 0.
+ *          The number of rows/columns of the tile C. n >= 0.
  *
  * @param[in] k
  *         The number of elementary reflectors whose product defines
  *         the matrix Q. k >= 0.
  *
  * @param[in] ib
- *         The inner-blocking size.  ib >= 0.
+ *         The inner-blocking size. ib >= 0.
 
  * @param[in] A
  *         The i-th column must contain the vector which defines the
@@ -66,8 +66,8 @@
  *         The leading dimension of the array T. ldt >= ib.
  *
  * @param[in,out] C
- *         On entry, the symmetric n-by-n tile C.
- *         On exit, C is overwritten by Q**T*C*Q.
+ *         On entry, the Hermitian n-by-n tile C.
+ *         On exit, C is overwritten by Q^H*C*Q.
  *
  * @param[in] ldc
  *         The leading dimension of the array C. ldc >= max(1,m).
@@ -84,62 +84,62 @@
  * @retval  < 0 if -i, the i-th argument had an illegal value
  *
  ******************************************************************************/
-int core_zherfb(plasma_enum_t uplo,
-                 int n, int k, int ib,
-                 const plasma_complex64_t *A,    int lda,
-                 const plasma_complex64_t *T,    int ldt,
-                       plasma_complex64_t *C,    int ldc,
-                       plasma_complex64_t *work, int ldwork )
+int plasma_core_zherfb(
+    plasma_enum_t uplo,
+    int n, int k, int ib,
+    const plasma_complex64_t *A,    int lda,
+    const plasma_complex64_t *T,    int ldt,
+          plasma_complex64_t *C,    int ldc,
+          plasma_complex64_t *work, int ldwork )
 {
     plasma_complex64_t tmp;
     int i, j;
 
-    // Check input arguments. 
-    if ((uplo != PlasmaUpper) && (uplo != PlasmaLower)) {
-        coreblas_error("Illegal value of uplo");
+    // Check input arguments.
+    if (uplo != PlasmaUpper && uplo != PlasmaLower) {
+        plasma_coreblas_error("Illegal value of uplo");
         return -1;
     }
     if (n < 0) {
-        coreblas_error("Illegal value of n");
+        plasma_coreblas_error("Illegal value of n");
         return -2;
     }
     if (k < 0) {
-        coreblas_error("Illegal value of k");
+        plasma_coreblas_error("Illegal value of k");
         return -3;
     }
     if (ib < 0) {
-        coreblas_error("Illegal value of ib");
+        plasma_coreblas_error("Illegal value of ib");
         return -4;
     }
-    if ( (lda < imax(1,n)) && (n > 0) ) {
-        coreblas_error("Illegal value of lda");
+    if (lda < imax(1,n) && n > 0) {
+        plasma_coreblas_error("Illegal value of lda");
         return -6;
     }
-    if ( (ldt < imax(1,ib)) && (ib > 0) ) {
-        coreblas_error("Illegal value of ldt");
+    if (ldt < imax(1,ib) && ib > 0) {
+        plasma_coreblas_error("Illegal value of ldt");
         return -8;
     }
-    if ( (ldc < imax(1,n)) && (n > 0) ) {
-        coreblas_error("Illegal value of ldc");
+    if (ldc < imax(1,n) && n > 0) {
+        plasma_coreblas_error("Illegal value of ldc");
         return -10;
     }
     if (ldwork < imax(1,n)) {
-        coreblas_error("Illegal value of ldwork");
+        plasma_coreblas_error("Illegal value of ldwork");
         return -12;
     }
 
     // Quick return
-    if ((n == 0) || (k == 0) ||
-        (ib == 0))
+    if (n == 0 || k == 0 || ib == 0)
         return PlasmaSuccess;
 
     int nb = n;
 
     if (uplo == PlasmaLower) {
-        // Rebuild the symmetric block: work <- C
-        for (j = 0; j < n; j++) {
+        // Rebuild the Hermitian block: work <- C
+        for (j = 0; j < n; ++j) {
             *(work + j + j * ldwork) =  *(C + ldc*j + j);
-            for (i = j+1; i < n; i++){
+            for (i = j+1; i < n; ++i) {
                 tmp = *(C + i + j*ldc);
                 *(work + i + j * ldwork) = tmp;
                 *(work + j + i * ldwork) = conj( tmp );
@@ -147,56 +147,65 @@ int core_zherfb(plasma_enum_t uplo,
         }
 
         // Left
-        core_zunmqr(PlasmaLeft, Plasma_ConjTrans, n, n, k, ib,
-                    A, lda, T, ldt, work, ldwork, work+nb*ldwork, ldwork);
+        plasma_core_zunmqr(
+            PlasmaLeft, Plasma_ConjTrans, n, n, k, ib,
+            A, lda, T, ldt, work, ldwork, work+nb*ldwork, ldwork);
         // Right
-        core_zunmqr(PlasmaRight, PlasmaNoTrans, n, n, k, ib,
-                    A, lda, T, ldt, work, ldwork, work+nb*ldwork, ldwork);
-        
+        plasma_core_zunmqr(
+            PlasmaRight, PlasmaNoTrans, n, n, k, ib,
+            A, lda, T, ldt, work, ldwork, work+nb*ldwork, ldwork);
+
         //====================================================
         // Copy back the final result to the lower part of C
         //===================================================
-        LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, lapack_const(PlasmaLower), n, n, work, ldwork, C, ldc );
+        LAPACKE_zlacpy_work(
+            LAPACK_COL_MAJOR, lapack_const(PlasmaLower),
+            n, n, work, ldwork, C, ldc );
     }
     else {
         //===================================================
-        // Rebuild the symmetric block: work <- C
+        // Rebuild the Hermitian block: work <- C
         //===================================================
-        for (j = 0; j < n; j++) {
-            for (i = 0; i < j; i++){
+        for (j = 0; j < n; ++j) {
+            for (i = 0; i < j; ++i) {
                 tmp = *(C + i + j*ldc);
                 *(work + i + j * ldwork) = tmp;
                 *(work + j + i * ldwork) = conj( tmp );
             }
             *(work + j + j * ldwork) =  *(C + ldc*j + j);
         }
-        
-        // Right 
-        core_zunmlq(PlasmaRight, Plasma_ConjTrans, n, n, k, ib,
-                    A, lda, T, ldt, work, ldwork, work+nb*ldwork, ldwork);
-        // Left 
-        core_zunmlq(PlasmaLeft, PlasmaNoTrans, n, n, k, ib,
-                    A, lda, T, ldt, work, ldwork, work+nb*ldwork, ldwork);
-        
+
+        // Right
+        plasma_core_zunmlq(
+            PlasmaRight, Plasma_ConjTrans, n, n, k, ib,
+            A, lda, T, ldt, work, ldwork, work+nb*ldwork, ldwork);
+        // Left
+        plasma_core_zunmlq(
+            PlasmaLeft, PlasmaNoTrans, n, n, k, ib,
+            A, lda, T, ldt, work, ldwork, work+nb*ldwork, ldwork);
+
         //===================================================
         // Copy back the final result to the upper part of C
         //==================================================
-        LAPACKE_zlacpy_work( LAPACK_COL_MAJOR, lapack_const(PlasmaUpper), n, n, work, ldwork, C, ldc );
+        LAPACKE_zlacpy_work(
+            LAPACK_COL_MAJOR, lapack_const(PlasmaUpper),
+            n, n, work, ldwork, C, ldc );
     }
     return PlasmaSuccess;
 }
 
 /******************************************************************************/
-void core_omp_zherfb(plasma_enum_t uplo,
-                     int n, int k, int ib,
-                     const plasma_complex64_t *A, int lda,
-                     const plasma_complex64_t *T, int ldt,
-                           plasma_complex64_t *C, int ldc,
-                     plasma_workspace_t work,
-                     plasma_sequence_t *sequence, plasma_request_t *request)
+void plasma_core_omp_zherfb(
+    plasma_enum_t uplo,
+    int n, int k, int ib,
+    const plasma_complex64_t *A, int lda,
+    const plasma_complex64_t *T, int ldt,
+          plasma_complex64_t *C, int ldc,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request)
 {
     // OpenMP depends on lda == n == nb, ldc == nb, ldt == ib.
-        #pragma omp task depend(in:A[0:lda*k]) \
+    #pragma omp task depend(in:A[0:lda*k]) \
                      depend(in:T[0:ib*k]) \
                      depend(inout:C[0:ldc*n])
     {
@@ -208,12 +217,13 @@ void core_omp_zherfb(plasma_enum_t uplo,
             int ldwork = n;
 
             // Call the kernel.
-            int info = core_zherfb(uplo,
-                                   n, k, ib,
-                                   A, lda,
-                                   T, ldt,
-                                   C, ldc,
-                                   W, ldwork);
+            int info = plasma_core_zherfb(
+                uplo,
+                n, k, ib,
+                A, lda,
+                T, ldt,
+                C, ldc,
+                W, ldwork);
             if (info != PlasmaSuccess) {
                 plasma_error_with_code("Error in call to COREBLAS in argument",
                                        -info);
diff --git a/core_blas/core_zlarfy.c b/core_blas/core_zlarfy.c
index efaacdc0..4b3cdc86 100644
--- a/core_blas/core_zlarfy.c
+++ b/core_blas/core_zlarfy.c
@@ -10,7 +10,7 @@
  *
  **/
 
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "plasma_types.h"
 #include "core_lapack.h"
 
@@ -55,14 +55,15 @@
  *          Workspace.
  *
  ******************************************************************************/
-void core_zlarfy(int N,
-            plasma_complex64_t *A, int LDA,
-            const plasma_complex64_t *V,
-            const plasma_complex64_t *TAU,
-            plasma_complex64_t *WORK)
+void plasma_core_zlarfy(
+    int N,
+    plasma_complex64_t *A, int LDA,
+    const plasma_complex64_t *V,
+    const plasma_complex64_t *TAU,
+    plasma_complex64_t *WORK)
 {
-    static plasma_complex64_t zzero =  0.0;
-    static plasma_complex64_t zmone = -1.0;
+    const plasma_complex64_t zzero =  0.0;
+    const plasma_complex64_t zmone = -1.0;
 
     int j;
     plasma_complex64_t dtmp;
@@ -75,7 +76,7 @@ void core_zlarfy(int N,
 
     /* cblas_zdotc_sub(N, WORK, 1, V, 1, &dtmp);*/
     dtmp = 0.;
-    for (j = 0; j < N ; j++)
+    for (j = 0; j < N; ++j)
         dtmp = dtmp + conj(WORK[j]) * V[j];
 
     /* Compute 1/2 X'*V*t = 1/2*dtmp*tau  */
diff --git a/core_blas/core_ztsmlq_corner.c b/core_blas/core_ztsmlq_corner.c
index dac68fb4..0fed1bf1 100644
--- a/core_blas/core_ztsmlq_corner.c
+++ b/core_blas/core_ztsmlq_corner.c
@@ -10,7 +10,7 @@
  *
  **/
 
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "plasma_types.h"
 #include "plasma_internal.h"
 #include "core_lapack.h"
@@ -107,37 +107,38 @@
  * @retval < 0 if -i, the i-th argument had an illegal value
  *
  ******************************************************************************/
-int core_ztsmlq_corner(int m1, int n1, int m2, int n2,
-                       int m3, int n3, int k, int ib,
-                             plasma_complex64_t *A1, int lda1,
-                             plasma_complex64_t *A2, int lda2,
-                             plasma_complex64_t *A3, int lda3,
-                       const plasma_complex64_t *V,  int ldv,
-                       const plasma_complex64_t *T,  int ldt,
-                       plasma_complex64_t *work, int ldwork)
+int plasma_core_ztsmlq_corner(
+    int m1, int n1, int m2, int n2,
+    int m3, int n3, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+          plasma_complex64_t *A3, int lda3,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_complex64_t *work, int ldwork)
 {
     plasma_enum_t side;
     plasma_enum_t trans;
     int i, j;
 
     // Check input arguments
-    if ( m1 != n1 ) {
-        coreblas_error("Illegal value of M1, N1");
+    if (m1 != n1) {
+        plasma_coreblas_error("Illegal value of M1, N1");
         return -1;
     }
     int nb = n1;
     // Rebuild the symmetric block: work <- A1
-    for (i = 0; i < m1; i++)
-        for (j = i; j < n1; j++){
+    for (i = 0; i < m1; ++i)
+        for (j = i; j < n1; ++j) {
             *(work + i + j*ldwork) = *(A1 + i + j*lda1);
-            if (j > i){
+            if (j > i) {
                 *(work + j + i*ldwork) =  conj( *(work + i + j*ldwork) );
             }
         }
 
     //  Copy the transpose of A2: work+nb*ldwork <- A2'
-    for (j = 0; j < n2; j++)
-        for (i = 0; i < m2; i++){
+    for (j = 0; j < n2; ++j)
+        for (i = 0; i < m2; ++i) {
             *(work + j + (i + nb) * ldwork) = conj( *(A2 + i + j*lda2) );
         }
 
@@ -145,25 +146,27 @@ int core_ztsmlq_corner(int m1, int n1, int m2, int n2,
     trans = Plasma_ConjTrans;
 
     //  Right application on |A1 A2|
-    core_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
-                work, ldwork, A2, lda2,
-                V, ldv, T, ldt,
-                work+3*nb*ldwork, ldwork);
+    plasma_core_ztsmlq(
+        side, trans, m1, n1, m2, n2, k, ib,
+        work, ldwork, A2, lda2,
+        V, ldv, T, ldt,
+        work+3*nb*ldwork, ldwork);
 
     //  Rebuild the symmetric block: work+2*nb*ldwork <- A3
-    for (i = 0; i < m3; i++)
-        for (j = i; j < n3; j++){
+    for (i = 0; i < m3; ++i)
+        for (j = i; j < n3; ++j) {
             *(work + i + (j + 2*nb) * ldwork) = *(A3 + i + j*lda3);
-            if (j > i){
+            if (j > i) {
                 *(work + j + (i + 2*nb) * ldwork) =  conj ( *(work + i + (j + 2*nb) * ldwork) );
             }
         }
 
     //  Right application on | A2' A3 |
-    core_ztsmlq(side, trans, n2, m2, m3, n3, k, ib,
-                work+nb*ldwork, ldwork, work+2*nb*ldwork, ldwork,
-                V, ldv, T, ldt,
-                work + 3*nb*ldwork, ldwork);
+    plasma_core_ztsmlq(
+        side, trans, n2, m2, m3, n3, k, ib,
+        work+nb*ldwork, ldwork, work+2*nb*ldwork, ldwork,
+        V, ldv, T, ldt,
+        work + 3*nb*ldwork, ldwork);
 
     side = PlasmaLeft;
     trans = PlasmaNoTrans;
@@ -172,48 +175,51 @@ int core_ztsmlq_corner(int m1, int n1, int m2, int n2,
     //  Left application on | A1  |
     //                      | A2' |
     //========================================================
-    core_ztsmlq(side, trans, m1, n1, n2, m2, k, ib,
-                work, ldwork, work+nb*ldwork, ldwork,
-                V, ldv, T, ldt,
-                work + 3*nb*ldwork, ldwork);
+    plasma_core_ztsmlq(
+        side, trans, m1, n1, n2, m2, k, ib,
+        work, ldwork, work+nb*ldwork, ldwork,
+        V, ldv, T, ldt,
+        work + 3*nb*ldwork, ldwork);
 
     //========================================================
     //  Copy back the final result to the upper part of A1
     //  A1 = work
     //========================================================
-    for (i = 0; i < m1; i++)
-        for (j = i; j < n1; j++)
+    for (i = 0; i < m1; ++i)
+        for (j = i; j < n1; ++j)
             *(A1 + i + j*lda1) = *(work + i + j*ldwork);
 
     //========================================================
     //  Left application on | A2 |
     //                     | A3 |
     //========================================================
-    core_ztsmlq(side, trans, m2, n2, m3, n3, k, ib,
-                A2, lda2, work+2*nb*ldwork, ldwork,
-                V, ldv, T, ldt,
-                work + 3*nb*ldwork, ldwork);
+    plasma_core_ztsmlq(
+        side, trans, m2, n2, m3, n3, k, ib,
+        A2, lda2, work+2*nb*ldwork, ldwork,
+        V, ldv, T, ldt,
+        work + 3*nb*ldwork, ldwork);
 
     //========================================================
     //  Copy back the final result to the upper part of A3
     //  A3 = work+2*nb*ldwork
     //========================================================
-    for (i = 0; i < m3; i++)
-        for (j = i; j < n3; j++)
+    for (i = 0; i < m3; ++i)
+        for (j = i; j < n3; ++j)
             *(A3 + i + j*lda3) = *(work + i + (j+ 2*nb) * ldwork);
 
     return PlasmaSuccess;
 }
 
-void core_omp_ztsmlq_corner(int m1, int n1, int m2, int n2,
-                            int m3, int n3, int k, int ib,
-                                  plasma_complex64_t *A1, int lda1,
-                                  plasma_complex64_t *A2, int lda2,
-                                  plasma_complex64_t *A3, int lda3,
-                            const plasma_complex64_t *V,  int ldv,
-                            const plasma_complex64_t *T,  int ldt,
-                            plasma_workspace_t work,
-                            plasma_sequence_t *sequence, plasma_request_t *request)
+void plasma_core_omp_ztsmlq_corner(
+    int m1, int n1, int m2, int n2,
+    int m3, int n3, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+          plasma_complex64_t *A3, int lda3,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request)
 {
     int nb = n1;
     // assuming m1 == nb, n1 == nb, m2 == nb, n2 == nb
@@ -232,13 +238,14 @@ void core_omp_ztsmlq_corner(int m1, int n1, int m2, int n2,
             int ldwork = nb;
 
             // call the kernel
-            int info = core_ztsmlq_corner(m1, n1, m2, n2, m3, n3, k, ib,
-                                          A1, lda1,
-                                          A2, lda2,
-                                          A3, lda3,
-                                          V, ldv,
-                                          T, ldt,
-                                          W, ldwork);
+            int info = plasma_core_ztsmlq_corner(
+                m1, n1, m2, n2, m3, n3, k, ib,
+                A1, lda1,
+                A2, lda2,
+                A3, lda3,
+                V, ldv,
+                T, ldt,
+                W, ldwork);
 
             if (info != PlasmaSuccess) {
                 plasma_error_with_code("Error in call to COREBLAS in argument",
diff --git a/core_blas/core_ztsmlq_hetra1.c b/core_blas/core_ztsmlq_hetra1.c
index 14829ddb..e0e2592a 100644
--- a/core_blas/core_ztsmlq_hetra1.c
+++ b/core_blas/core_ztsmlq_hetra1.c
@@ -10,7 +10,7 @@
  *
  **/
 
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "plasma_types.h"
 #include "plasma_internal.h"
 #include "core_lapack.h"
@@ -105,43 +105,45 @@
  * @retval < 0 if -i, the i-th argument had an illegal value
  *
  ******************************************************************************/
-int core_ztsmlq_hetra1(plasma_enum_t side, plasma_enum_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib,
-                             plasma_complex64_t *A1, int lda1,
-                             plasma_complex64_t *A2, int lda2,
-                       const plasma_complex64_t *V,  int ldv,
-                       const plasma_complex64_t *T,  int ldt,
-                       plasma_complex64_t *work, int ldwork)
+int plasma_core_ztsmlq_hetra1(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_complex64_t *work, int ldwork)
 {
     int i, j;
 
     // Check input arguments
-    if ( (m1 != n1) ) {
-        coreblas_error("illegal value of m1, n1");
+    if (m1 != n1) {
+        plasma_coreblas_error("illegal value of m1, n1");
         return -3;
     }
 
     // in-place transposition of A1
-    for (j = 0; j < n1; j++){
+    for (j = 0; j < n1; ++j) {
         A1[j + j*lda1] = conj(A1[j + j*lda1]);
 
-        for (i = j+1; i < m1; i++){
+        for (i = j+1; i < m1; ++i) {
             *work = *(A1 + i + j*lda1);
             *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
             *(A1 + j + i*lda1) = conj(*work);
         }
     }
 
-    core_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
-                A1, lda1, A2, lda2,
-                V,  ldv,  T,  ldt,
-                work, ldwork);
+    plasma_core_ztsmlq(
+        side, trans, m1, n1, m2, n2, k, ib,
+        A1, lda1, A2, lda2,
+        V,  ldv,  T,  ldt,
+        work, ldwork);
 
     // in-place transposition of A1
-    for (j = 0; j < n1; j++){
+    for (j = 0; j < n1; ++j) {
         A1[j + j*lda1] = conj(A1[j + j*lda1]);
 
-        for (i = j+1; i < m1; i++){
+        for (i = j+1; i < m1; ++i) {
             *work = *(A1 + i + j*lda1);
             *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
             *(A1 + j + i*lda1) = conj(*work);
@@ -152,14 +154,15 @@ int core_ztsmlq_hetra1(plasma_enum_t side, plasma_enum_t trans,
 }
 
 /******************************************************************************/
-void core_omp_ztsmlq_hetra1(plasma_enum_t side, plasma_enum_t trans,
-                            int m1, int n1, int m2, int n2, int k, int ib,
-                                  plasma_complex64_t *A1, int lda1,
-                                  plasma_complex64_t *A2, int lda2,
-                            const plasma_complex64_t *V,  int ldv,
-                            const plasma_complex64_t *T,  int ldt,
-                            plasma_workspace_t work,
-                            plasma_sequence_t *sequence, plasma_request_t *request)
+void plasma_core_omp_ztsmlq_hetra1(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request)
 {
     int nb = n1;
     // assuming m1 == nb, n1 == nb, m2 == nb, n2 == nb
@@ -176,13 +179,14 @@ void core_omp_ztsmlq_hetra1(plasma_enum_t side, plasma_enum_t trans,
             int ldwork = side == PlasmaLeft ? ib : nb;
 
             // call the kernel
-            int info = core_ztsmlq_hetra1(side, trans,
-                                          m1, n1, m2, n2, k, ib,
-                                          A1, lda1,
-                                          A2, lda2,
-                                          V, ldv,
-                                          T, ldt,
-                                          W, ldwork);
+            int info = plasma_core_ztsmlq_hetra1(
+                side, trans,
+                m1, n1, m2, n2, k, ib,
+                A1, lda1,
+                A2, lda2,
+                V, ldv,
+                T, ldt,
+                W, ldwork);
 
             if (info != PlasmaSuccess) {
                 plasma_error_with_code("Error in call to COREBLAS in argument",
diff --git a/core_blas/core_ztsmqr_corner.c b/core_blas/core_ztsmqr_corner.c
index 02e22c4e..87a2b301 100644
--- a/core_blas/core_ztsmqr_corner.c
+++ b/core_blas/core_ztsmqr_corner.c
@@ -10,7 +10,7 @@
  *
  **/
 
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "plasma_types.h"
 #include "plasma_internal.h"
 #include "core_lapack.h"
@@ -78,7 +78,7 @@
  * @param[in] V
  *         The i-th row must contain the vector which defines the
  *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         core_ZTSQRT in the first k columns of its array argument V.
+ *         plasma_core_ztsqrt in the first k columns of its array argument V.
  *
  * @param[in] ldv
  *         The leading dimension of the array V. ldv >= max(1,K).
@@ -107,36 +107,37 @@
  * @retval < 0 if -i, the i-th argument had an illegal value
  *
  ******************************************************************************/
-int core_ztsmqr_corner(int m1, int n1, int m2, int n2,
-                       int m3, int n3, int k, int ib,
-                              plasma_complex64_t *A1, int lda1,
-                              plasma_complex64_t *A2, int lda2,
-                              plasma_complex64_t *A3, int lda3,
-                        const plasma_complex64_t *V, int ldv,
-                        const plasma_complex64_t *T, int ldt,
-                        plasma_complex64_t *work, int ldwork)
+int plasma_core_ztsmqr_corner(
+    int m1, int n1, int m2, int n2,
+    int m3, int n3, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+          plasma_complex64_t *A3, int lda3,
+    const plasma_complex64_t *V, int ldv,
+    const plasma_complex64_t *T, int ldt,
+    plasma_complex64_t *work, int ldwork)
 {
     int i, j;
     plasma_enum_t side, trans;
-    
+
     // Check input arguments.
-    if ( m1 != n1 ) {
-        coreblas_error("Illegal value of m1, n1");
+    if (m1 != n1) {
+        plasma_coreblas_error("Illegal value of m1, n1");
         return -1;
     }
     int nb = n1;
     //  Rebuild the symmetric block: work <- A1
-    for (j = 0; j < n1; j++)
-        for (i = j; i < m1; i++){
+    for (j = 0; j < n1; ++j)
+        for (i = j; i < m1; ++i) {
             *(work + i + j*ldwork) = *(A1 + i + j*lda1);
-            if (i > j){
+            if (i > j) {
                 *(work + j + i*ldwork) =  conj( *(work + i + j*ldwork) );
             }
         }
-    
+
     //  Copy the transpose of A2: work+nb*ldwork <- A2'
-    for (j = 0; j < n2; j++)
-        for (i = 0; i < m2; i++){
+    for (j = 0; j < n2; ++j)
+        for (i = 0; i < m2; ++i) {
             *(work + j + (i + nb) * ldwork) = conj( *(A2 + i + j*lda2) );
         }
 
@@ -147,16 +148,17 @@ int core_ztsmqr_corner(int m1, int n1, int m2, int n2,
     //  Left application on |A1|
     //                      |A2|
     //=============================================
-    core_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
-                work, ldwork, A2, lda2,
-                V, ldv, T, ldt,
-                work + 3*nb*ldwork, ldwork);
+    plasma_core_ztsmqr(
+        side, trans, m1, n1, m2, n2, k, ib,
+        work, ldwork, A2, lda2,
+        V, ldv, T, ldt,
+        work + 3*nb*ldwork, ldwork);
 
     //  Rebuild the symmetric block: work+2*nb*ldwork <- A3
-    for (j = 0; j < n3; j++)
-        for (i = j; i < m3; i++){
+    for (j = 0; j < n3; ++j)
+        for (i = j; i < m3; ++i) {
             *(work + i + (j + 2*nb) * ldwork) = *(A3 + i + j*lda3);
-            if (i != j){
+            if (i != j) {
                 *(work + j + (i + 2*nb) * ldwork) =  conj( *(work + i + (j + 2*nb) * ldwork) );
             }
         }
@@ -164,55 +166,59 @@ int core_ztsmqr_corner(int m1, int n1, int m2, int n2,
     //  Left application on | A2'|
     //                      | A3 |
     //==========================================
-    core_ztsmqr(side, trans, n2, m2, m3, n3, k, ib,
-                work+nb*ldwork, ldwork, work+2*nb*ldwork, ldwork,
-                V, ldv, T, ldt,
-                work + 3*nb*ldwork, ldwork);
+    plasma_core_ztsmqr(
+        side, trans, n2, m2, m3, n3, k, ib,
+        work+nb*ldwork, ldwork, work+2*nb*ldwork, ldwork,
+        V, ldv, T, ldt,
+        work + 3*nb*ldwork, ldwork);
 
     side  = PlasmaRight;
     trans = PlasmaNoTrans;
 
     //  Right application on | A1 A2' |
-    core_ztsmqr(side, trans, m1, n1, n2, m2, k, ib,
-                work, ldwork, work+nb*ldwork, ldwork,
-                V, ldv, T, ldt,
-                work + 3*nb*ldwork, ldwork);
+    plasma_core_ztsmqr(
+        side, trans, m1, n1, n2, m2, k, ib,
+        work, ldwork, work+nb*ldwork, ldwork,
+        V, ldv, T, ldt,
+        work + 3*nb*ldwork, ldwork);
 
     //  Copy back the final result to the lower part of A1
     //  A1 = work
-    for (j = 0; j < n1; j++)
-        for (i = j; i < m1; i++)
+    for (j = 0; j < n1; ++j)
+        for (i = j; i < m1; ++i)
             *(A1 + i + j*lda1) = *(work + i + j*ldwork);
 
     //  Right application on | A2 A3 |
-    core_ztsmqr(side, trans, m2, n2, m3, n3, k, ib,
-                A2, lda2, work+2*nb*ldwork, ldwork,
-                V,  ldv,  T, ldt,
-                work + 3*nb*ldwork, ldwork);
+    plasma_core_ztsmqr(
+        side, trans, m2, n2, m3, n3, k, ib,
+        A2, lda2, work+2*nb*ldwork, ldwork,
+        V,  ldv,  T, ldt,
+        work + 3*nb*ldwork, ldwork);
 
     //=======================================================
     //  Copy back the final result to the lower part of A3
     //  A3 = work+2*nb*ldwork
     //=======================================================
-    for (j = 0; j < n3; j++)
-        for (i = j; i < m3; i++)
+    for (j = 0; j < n3; ++j)
+        for (i = j; i < m3; ++i)
             *(A3 + i + j*lda3) = *(work + i + (j+ 2*nb) * ldwork);
-    
+
     return PlasmaSuccess;
 }
 
 /*****************************************************************************/
-void core_omp_ztsmqr_corner(int m1, int n1, int m2, int n2,
-                            int m3, int n3, int k, int ib,
-                                  plasma_complex64_t *A1, int lda1,
-                                  plasma_complex64_t *A2, int lda2,
-                                  plasma_complex64_t *A3, int lda3,
-                            const plasma_complex64_t *V,  int ldv,
-                            const plasma_complex64_t *T,  int ldt,
-                            plasma_workspace_t work,
-                            plasma_sequence_t *sequence, plasma_request_t *request)
+void plasma_core_omp_ztsmqr_corner(
+    int m1, int n1, int m2, int n2,
+    int m3, int n3, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+          plasma_complex64_t *A3, int lda3,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request)
 {
-    
+
     // omp depends assume m1 == nb, n1 == nb, m2 == nb, n2 == nb,
     // m3 == nb, n3 == nb.
     int nb = n1;
@@ -230,13 +236,14 @@ void core_omp_ztsmqr_corner(int m1, int n1, int m2, int n2,
             int ldwork = nb;
 
             // Call the kernel.
-            int info = core_ztsmqr_corner(m1, n1, m2, n2, m3, n3, k, ib,
-                                          A1, lda1,
-                                          A2, lda2,
-                                          A3, lda3,
-                                          V, ldv,
-                                          T, ldt,
-                                          W, ldwork);
+            int info = plasma_core_ztsmqr_corner(
+                m1, n1, m2, n2, m3, n3, k, ib,
+                A1, lda1,
+                A2, lda2,
+                A3, lda3,
+                V, ldv,
+                T, ldt,
+                W, ldwork);
             if (info != PlasmaSuccess) {
                 plasma_error_with_code("Error in call to COREBLAS in argument",
                                        -info);
diff --git a/core_blas/core_ztsmqr_hetra1.c b/core_blas/core_ztsmqr_hetra1.c
index 50d8dda0..273b8c43 100644
--- a/core_blas/core_ztsmqr_hetra1.c
+++ b/core_blas/core_ztsmqr_hetra1.c
@@ -10,7 +10,7 @@
  *
  **/
 
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "plasma_types.h"
 #include "plasma_internal.h"
 #include "core_lapack.h"
@@ -77,7 +77,7 @@
  * @param[in] V
  *         The i-th row must contain the vector which defines the
  *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         core_ZTSQRT in the first k columns of its array argument V.
+ *         plasma_core_ztsqrt in the first k columns of its array argument V.
  *
  * @param[in] ldv
  *         The leading dimension of the array V. ldv >= max(1,K).
@@ -106,46 +106,48 @@
  * @retval < 0 if -i, the i-th argument had an illegal value
  *
  ******************************************************************************/
-int core_ztsmqr_hetra1(plasma_enum_t side, plasma_enum_t trans,
-                        int m1, int n1, int m2, int n2, int k, int ib,
-                              plasma_complex64_t *A1, int lda1,
-                              plasma_complex64_t *A2, int lda2,
-                        const plasma_complex64_t *V,  int ldv,
-                        const plasma_complex64_t *T,  int ldt,
-                              plasma_complex64_t *work, int ldwork)
+int plasma_core_ztsmqr_hetra1(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+          plasma_complex64_t *work, int ldwork)
 {
     int i, j;
 
     // Check input arguments.
-    if ( (m1 != n1) ) {
-        coreblas_error("Illegal value of m1, n1");
+    if (m1 != n1) {
+        plasma_coreblas_error("Illegal value of m1, n1");
         return -3;
     }
 
-    // in-place transposition of A1 
-    for (j = 0; j < n1; j++){
+    // in-place transposition of A1
+    for (j = 0; j < n1; ++j) {
         A1[j + j*lda1] = conj(A1[j + j*lda1]);
 
-        for (i = j+1; i < m1; i++){
+        for (i = j+1; i < m1; ++i) {
             *work = *(A1 + i + j*lda1);
             *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
             *(A1 + j + i*lda1) = conj(*work);
         }
     }
 
-    core_ztsmqr(side, trans,
-                m1, n1, m2, n2, k, ib,
-                A1, lda1,
-                A2, lda2,
-                V, ldv,
-                T, ldt,
-                work, ldwork);
+    plasma_core_ztsmqr(
+        side, trans,
+        m1, n1, m2, n2, k, ib,
+        A1, lda1,
+        A2, lda2,
+        V, ldv,
+        T, ldt,
+        work, ldwork);
 
     // in-place transposition of A1
-    for (j = 0; j < n1; j++){
+    for (j = 0; j < n1; ++j) {
         A1[j + j*lda1] = conj(A1[j + j*lda1]);
 
-        for (i = j+1; i < m1; i++){
+        for (i = j+1; i < m1; ++i) {
             *work = *(A1 + i + j*lda1);
             *(A1 + i + j*lda1) = conj(*(A1 + j + i*lda1));
             *(A1 + j + i*lda1) = conj(*work);
@@ -156,14 +158,15 @@ int core_ztsmqr_hetra1(plasma_enum_t side, plasma_enum_t trans,
 }
 
 /******************************************************************************/
-void core_omp_ztsmqr_hetra1(plasma_enum_t side, plasma_enum_t trans,
-                            int m1, int n1, int m2, int n2, int k, int ib,
-                                  plasma_complex64_t *A1, int lda1,
-                                  plasma_complex64_t *A2, int lda2,
-                            const plasma_complex64_t *V,  int ldv,
-                            const plasma_complex64_t *T,  int ldt,
-                            plasma_workspace_t work,
-                            plasma_sequence_t *sequence, plasma_request_t *request)
+void plasma_core_omp_ztsmqr_hetra1(
+    plasma_enum_t side, plasma_enum_t trans,
+    int m1, int n1, int m2, int n2, int k, int ib,
+          plasma_complex64_t *A1, int lda1,
+          plasma_complex64_t *A2, int lda2,
+    const plasma_complex64_t *V,  int ldv,
+    const plasma_complex64_t *T,  int ldt,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request)
 {
     int nb = n1;
     // omp depends assume m1 == nb, n1 == nb, m2 == nb, n2 == nb.
@@ -180,13 +183,14 @@ void core_omp_ztsmqr_hetra1(plasma_enum_t side, plasma_enum_t trans,
             int ldwork = side == PlasmaLeft ? ib : nb;
 
             // Call the kernel.
-            int info = core_ztsmqr_hetra1(side, trans,
-                                   m1, n1, m2, n2, k, ib,
-                                   A1, lda1,
-                                   A2, lda2,
-                                   V, ldv,
-                                   T, ldt,
-                                   W, ldwork);
+            int info = plasma_core_ztsmqr_hetra1(
+                side, trans,
+                m1, n1, m2, n2, k, ib,
+                A1, lda1,
+                A2, lda2,
+                V, ldv,
+                T, ldt,
+                W, ldwork);
 
             if (info != PlasmaSuccess) {
                 plasma_error_with_code("Error in call to COREBLAS in argument",
diff --git a/test/test_zheevd.c b/test/test_zheevd.c
index 1a55d85d..36a269d7 100644
--- a/test/test_zheevd.c
+++ b/test/test_zheevd.c
@@ -12,7 +12,7 @@
 
 #include "test.h"
 #include "flops.h"
-#include "core_blas.h"
+#include "plasma_core_blas.h"
 #include "core_lapack.h"
 #include "plasma.h"
 
@@ -24,8 +24,6 @@
 #include <math.h>
 #include <omp.h>
 
-#define COMPLEX
-
 #undef  REAL
 #define COMPLEX
 
@@ -43,11 +41,10 @@
  ******************************************************************************/
 void test_zheevd(param_value_t param[], bool run)
 {
-
     //================================================================
     // Mark which parameters are used.
     //================================================================
-    param[PARAM_EIGT  ].used = true;
+    param[PARAM_JOB   ].used = true;
     param[PARAM_UPLO  ].used = true;
     param[PARAM_DIM   ].used = PARAM_USE_N;
     param[PARAM_PADA  ].used = true;
@@ -60,9 +57,9 @@ void test_zheevd(param_value_t param[], bool run)
     //================================================================
     // Set parameters.
     //================================================================
-    plasma_enum_t eigt = plasma_eigt_const(param[PARAM_EIGT].c);
+    plasma_enum_t job  = plasma_job_const(param[PARAM_JOB].c);
     plasma_enum_t uplo = plasma_uplo_const(param[PARAM_UPLO].c);
- 
+
     int n = param[PARAM_DIM].dim.n;
 
     int lda = imax(1, n + param[PARAM_PADA].i);
@@ -76,56 +73,56 @@ void test_zheevd(param_value_t param[], bool run)
     plasma_set(PlasmaNb, param[PARAM_NB].i);
     plasma_set(PlasmaIb, param[PARAM_NB].i/4);
 
-
     //================================================================
     // Allocate and initialize arrays.
     //================================================================
     plasma_complex64_t *A = (plasma_complex64_t *)malloc(
         (size_t)n*lda*sizeof(plasma_complex64_t));
-    
+
     plasma_complex64_t *Aref = NULL;
     plasma_complex64_t *Q    = NULL;
-    double             *Wref = NULL;
+    double             *Lambda_ref = NULL;
     plasma_complex64_t *work = NULL;
-    double             *W = (double*)malloc((size_t)n*sizeof(double));
+    double             *Lambda = (double*)malloc((size_t)n*sizeof(double));
     int seed[] = {0, 0, 0, 1};
     if (test) {
-        Wref = (double*)malloc((size_t)n*sizeof(double));
+        Lambda_ref = (double*)malloc((size_t)n*sizeof(double));
         work = (plasma_complex64_t *)malloc(
             (size_t)3*n*sizeof(plasma_complex64_t));
-        
-        for (int i=0; i< n; i++){
-            Wref[i] = (double )i+1;
+
+        for (int i = 0; i < n; ++i) {
+            Lambda_ref[i] = (double)i + 1;
         }
-        
+
         int    mode  = 0;
         double dmax  = 1.0;
         double rcond = 1.0e6;
         LAPACKE_zlatms_work(LAPACK_COL_MAJOR, n, n,
                            'S', seed,
-                           'H', Wref, mode, rcond,
+                           'H', Lambda_ref, mode, rcond,
                             dmax, n, n,
                            'N', A, lda, work);
 
         // Sort the eigenvalues
-        LAPACKE_dlasrt_work( 'I', n, Wref);
+        LAPACKE_dlasrt_work( 'I', n, Lambda_ref );
 
         // Copy A into Aref
         Aref = (plasma_complex64_t *)malloc(
             (size_t)n*lda*sizeof(plasma_complex64_t));
         LAPACKE_zlacpy_work(LAPACK_COL_MAJOR,
                             'A', n, n, A, lda, Aref, lda);
-    } else {
+    }
+    else {
         LAPACKE_zlarnv(1, seed, (size_t)lda*n, A);
     }
 
 
     int ldq = lda;
-    if (eigt == PlasmaEigValVec) {
+    if (job == PlasmaVec) {  // allocate Q only when eigenvectors are requested
         Q = (plasma_complex64_t *)malloc(
-            (size_t)n*ldq*sizeof(plasma_complex64_t)); 
+            (size_t)n*ldq*sizeof(plasma_complex64_t));
     }
-    
+
     //================================================================
     // Prepare the descriptor for matrix T.
     //================================================================
@@ -136,54 +133,58 @@ void test_zheevd(param_value_t param[], bool run)
     //================================================================
     plasma_time_t start = omp_get_wtime();
 
-    plasma_zheevd(eigt, uplo, n, A, lda, &T, W, Q, ldq);
+    plasma_zheevd(job, uplo, n, A, lda, &T, Lambda, Q, ldq);
     //LAPACKE_zheevd( LAPACK_COL_MAJOR,
-    //               'N', 'L',  n, A, lda, W);
+    //               'N', 'L',  n, A, lda, Lambda);
     plasma_time_t stop = omp_get_wtime();
-    plasma_time_t time = stop-start;
-    
+    plasma_time_t time = stop - start;
+
     param[PARAM_TIME].d = time;
     param[PARAM_GFLOPS].d = flops_zgeqrf(n, n) / time / 1e9;
 
     if (test) {
-        
-        // check the correctness of  the eigenvalues values
+        // Check the correctness of the eigenvalues.
         double error = 0;
-        for (int i = 0; i < n; i++){
-            error  += fabs(fabs(W[i])-fabs(Wref[i]))/fabs(Wref[i]);
+        for (int i = 0; i < n; ++i) {
+            error += fabs( Lambda[i] - Lambda_ref[i] )
+                     / fabs( Lambda_ref[i] );
         }
 
-        error /= n*40 ;
-        // Othorgonality test 
+        error /= n*40;
+        // Orthogonality test
         double done  =  1.0;
         double mdone = -1.0;
-        
-        // Build the idendity matrix 
-        plasma_complex64_t *Id = (plasma_complex64_t *) malloc(n*n*sizeof(plasma_complex64_t));
+
+        // Build the identity matrix
+        plasma_complex64_t *Id
+            = (plasma_complex64_t *) malloc(n*n*sizeof(plasma_complex64_t));
         LAPACKE_zlaset_work(LAPACK_COL_MAJOR, 'A', n, n, 0., 1., Id, n);
 
         double ortho = 0.;
-        if (eigt == PlasmaEigValVec) {
-            // Perform Id - Q'Q
-            cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, n, n, done, Q, n, mdone, Id, n);
-            double normQ = LAPACKE_zlanhe_work(LAPACK_COL_MAJOR, 'I', 'U', n, Id, n, (double*)work);
+        if (job == PlasmaVec) {
+            // Perform Id - Q^H Q
+            cblas_zherk(
+                CblasColMajor, CblasUpper, CblasConjTrans,
+                n, n, done, Q, n, mdone, Id, n);
+            double normQ = LAPACKE_zlanhe_work(
+                LAPACK_COL_MAJOR, 'I', 'U', n, Id, n, (double*)work);
             ortho = normQ/n;
         }
         param[PARAM_ERROR].d = error;
         param[PARAM_ORTHO].d = ortho;
         param[PARAM_SUCCESS].i = (error < tol && ortho < tol);
-        
+
     }
     //================================================================
     // Free arrays.
     //================================================================
     // plasma_desc_destroy(&T);
     free(A);
-    free(W);
+    free(Q);
+    free(Lambda);
     free(work);
     if (test) {
         free(Aref);
-        free(Wref);
+        free(Lambda_ref);
     }
-    if (eigt == PlasmaEigValVec) free(Q);
 }

From bc0ea3b82199cc5562ba5d48dd5be54f83f97941 Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Fri, 17 Jan 2025 20:13:59 -0500
Subject: [PATCH 11/12] add pzhbtrd_static.c

---
 compute/pzhbtrd_static.c | 204 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 204 insertions(+)
 create mode 100755 compute/pzhbtrd_static.c

diff --git a/compute/pzhbtrd_static.c b/compute/pzhbtrd_static.c
new file mode 100755
index 00000000..5d8eb602
--- /dev/null
+++ b/compute/pzhbtrd_static.c
@@ -0,0 +1,204 @@
+/**
+ *
+ * @file
+ *
+ *  PLASMA is a software package provided by:
+ *  University of Tennessee, US,
+ *  University of Manchester, UK.
+ *
+ * @precisions normal z -> s d c
+ *
+ **/
+
+#include "plasma_async.h"
+#include "plasma_context.h"
+#include "plasma_descriptor.h"
+#include "plasma_internal.h"
+#include "plasma_types.h"
+#include "plasma_workspace.h"
+#include "bulge.h"
+#include "core_blas.h"
+#include <omp.h>
+#include <sched.h>
+#include <string.h>
+
+#undef REAL
+#define COMPLEX
+
+/***************************************************************************//**
+ *  Static scheduler
+ **/
+
+#define shift 3
+
+#define ss_cond_set(m, n, val)                  \
+    {                                                   \
+        plasma->ss_progress[(m)+plasma->ss_ld*(n)] = (val); \
+    }
+
+
+#define ss_cond_wait(m, n, val) \
+    {                                                           \
+        while (plasma->ss_progress[(m)+plasma->ss_ld*(n)] != (val)) \
+            sched_yield();                                          \
+    }
+
+
+//  Parallel bulge chasing column-wise - static scheduling
+
+void plasma_pzhbtrd_static( plasma_enum_t uplo, int n, int nb, int Vblksiz,
+			 plasma_complex64_t *A, int lda,
+			 plasma_complex64_t *V, plasma_complex64_t *TAU,
+			 double *D, double *E, int wantz,
+			 plasma_workspace_t work,
+			 plasma_sequence_t *sequence, plasma_request_t *request) 
+{
+
+    plasma_context_t *plasma = plasma_context_self();
+    if (plasma == NULL) {
+        plasma_error("PLASMA not initialized");
+        return;
+    }    
+    
+    // Check sequence status.
+    if (sequence->status != PlasmaSuccess) {
+        plasma_request_fail(sequence, request, PlasmaErrorSequence);
+        return;
+    }
+
+    if ( uplo != PlasmaLower ) {
+        plasma_request_fail(sequence, request, PlasmaErrorNotSupported);
+        return;
+    }
+    
+    
+    // Quick return
+    if (n == 0) {
+        return;
+    }
+
+
+    int nbtiles = plasma_ceildiv(n,nb);
+    int colblktile = 1;
+    int grsiz = 1;    
+    int maxrequiredcores = imax( nbtiles/colblktile, 1 );
+    int colpercore = colblktile*nb;
+    int thgrsiz = n;
+    
+    
+    // Initialize static scheduler progress table
+    int cores_num;
+    #pragma omp parallel 
+    {
+        cores_num  = omp_get_num_threads();
+    }
+    int size = 2*nbtiles+shift+cores_num+10;
+    plasma->ss_progress = (volatile int *)malloc(size*sizeof(int));
+    for(int index = 0; index < size; index++) plasma->ss_progress[index] = 0;
+    plasma->ss_ld = (size);
+    
+    // main bulge chasing code 
+    int ii = shift/grsiz;
+    int  stepercol =  ii*grsiz == shift ? ii:ii+1;
+    ii       = (n-1)/thgrsiz;
+    int thgrnb  = ii*thgrsiz == (n-1) ? ii:ii+1;
+    int allcoresnb = imin( cores_num, maxrequiredcores );
+
+    #pragma omp parallel
+    {   
+        int coreid, sweepid, myid, stt, st, ed, stind, edind;
+        int blklastind, colpt,  thgrid, thed;
+        int i,j,m,k;
+
+        int my_core_id = omp_get_thread_num();
+        plasma_complex64_t  *WORK = work.spaces[my_core_id];
+
+        for (thgrid = 1; thgrid<=thgrnb; thgrid++){
+            stt  = (thgrid-1)*thgrsiz+1;
+            thed = imin( (stt + thgrsiz -1), (n-1));
+            for (i = stt; i <= n-1; i++){
+                ed = imin(i,thed);
+                if(stt>ed) break;
+                for (m = 1; m <=stepercol; m++){
+                    st=stt;
+                    for (sweepid = st; sweepid <=ed; sweepid++){
+                        
+                        for (k = 1; k <=grsiz; k++){
+                            myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k;
+                            if(myid%2 ==0){
+                                colpt      = (myid/2)*nb+1+sweepid-1;
+                                stind      = colpt-nb+1;
+                                edind      = imin(colpt,n);
+                                blklastind = colpt;
+                            } else {
+                                colpt      = ((myid+1)/2)*nb + 1 +sweepid -1 ;
+                                stind      = colpt-nb+1;
+                                edind      = imin(colpt,n);
+                                if( (stind>=edind-1) && (edind==n) )
+                                    blklastind=n;
+                                else
+                                    blklastind=0;
+                            }
+                            coreid = (stind/colpercore)%allcoresnb;
+                            
+                            if(my_core_id==coreid) {
+                                if(myid==1) {
+                                    
+                                    ss_cond_wait(myid+shift-1, 0, sweepid-1);
+                                    core_zhbtype1cb(n, nb, A, lda, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, wantz, WORK);
+                                    ss_cond_set(myid, 0, sweepid);
+                                    
+                                    if(blklastind >= (n-1)) {
+                                        for (j = 1; j <= shift; j++)
+                                            ss_cond_set(myid+j, 0, sweepid);
+                                    }
+                                } else {
+                                    ss_cond_wait(myid-1,       0, sweepid);
+                                    ss_cond_wait(myid+shift-1, 0, sweepid-1);
+                                    if(myid%2 == 0)
+                                        core_zhbtype2cb(n, nb, A, lda, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, wantz, WORK);
+                                    else
+                                        core_zhbtype3cb(n, nb, A, lda, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, wantz, WORK);
+                                    
+                                    ss_cond_set(myid, 0, sweepid);
+                                    if(blklastind >= (n-1)) {
+                                        for (j = 1; j <= shift+allcoresnb; j++)
+                                            ss_cond_set(myid+j, 0, sweepid);
+                                    }
+                                } // if myid==1 
+                            } // if my_core_id==coreid
+                            
+                            if(blklastind >= (n-1)) {
+                                stt++;
+                                break;
+                            }
+                        } // for k=1:grsiz
+                    } // for sweepid=st:ed 
+                } // for m=1:stepercol
+            } // for i=1:N-1
+        } // for thgrid=1:thgrnb
+    }
+
+    free((void*)plasma->ss_progress);
+    
+    //================================================
+    //  store resulting diag and lower diag D and E
+    //  note that D and E are always real
+    //================================================
+
+    // sequential code here so only core 0 will work 
+    if( uplo == PlasmaLower ) {
+        for (int i=0; i < n-1 ; i++) {
+            D[i] = creal(A[i*lda]);
+            E[i] = creal(A[i*lda+1]);
+        }
+        D[n-1] = creal(A[(n-1)*lda]);
+    } else { 
+        for (int i=0; i<n-1; i++) {
+            D[i] = creal(A[i*lda+nb]);
+            E[i] = creal(A[i*lda+nb-1]);
+        }
+        D[n-1] = creal(A[(n-1)*lda+nb]);
+    } 
+    return;
+}

From 8fb71e5c4f3040a8c75c31d240e0cb7cd79bd525 Mon Sep 17 00:00:00 2001
From: Mark Gates <mgates3@icl.utk.edu>
Date: Fri, 17 Jan 2025 20:14:19 -0500
Subject: [PATCH 12/12] updates

---
 compute/pzhbtrd_static.c    | 209 +++++++++++++++++++-----------------
 compute/pzhe2hb.c           |   2 +-
 compute/pzheb2trd_static.c  |  34 +++---
 compute/zheevd.c            |   6 +-
 core_blas/core_zhbtype1cb.c |   2 +-
 core_blas/core_zhbtype2cb.c |   2 +-
 core_blas/core_zhbtype3cb.c |  20 ++--
 core_blas/core_zlarfy.c     |  78 +++++++-------
 8 files changed, 179 insertions(+), 174 deletions(-)

diff --git a/compute/pzhbtrd_static.c b/compute/pzhbtrd_static.c
index 5d8eb602..fa40b514 100755
--- a/compute/pzhbtrd_static.c
+++ b/compute/pzhbtrd_static.c
@@ -16,8 +16,9 @@
 #include "plasma_internal.h"
 #include "plasma_types.h"
 #include "plasma_workspace.h"
+#include "plasma_core_blas.h"
 #include "bulge.h"
-#include "core_blas.h"
+
 #include <omp.h>
 #include <sched.h>
 #include <string.h>
@@ -36,169 +37,177 @@
         plasma->ss_progress[(m)+plasma->ss_ld*(n)] = (val); \
     }
 
-
 #define ss_cond_wait(m, n, val) \
     {                                                           \
         while (plasma->ss_progress[(m)+plasma->ss_ld*(n)] != (val)) \
             sched_yield();                                          \
     }
 
-
 //  Parallel bulge chasing column-wise - static scheduling
-
-void plasma_pzhbtrd_static( plasma_enum_t uplo, int n, int nb, int Vblksiz,
-			 plasma_complex64_t *A, int lda,
-			 plasma_complex64_t *V, plasma_complex64_t *TAU,
-			 double *D, double *E, int wantz,
-			 plasma_workspace_t work,
-			 plasma_sequence_t *sequence, plasma_request_t *request) 
+void plasma_pzhbtrd_static(
+    plasma_enum_t uplo, int n, int nb, int Vblksiz,
+    plasma_complex64_t *A, int lda,
+    plasma_complex64_t *V, plasma_complex64_t *tau,
+    double *D, double *E, int wantz,
+    plasma_workspace_t work,
+    plasma_sequence_t *sequence, plasma_request_t *request)
 {
-
     plasma_context_t *plasma = plasma_context_self();
     if (plasma == NULL) {
         plasma_error("PLASMA not initialized");
         return;
-    }    
-    
+    }
+
     // Check sequence status.
     if (sequence->status != PlasmaSuccess) {
         plasma_request_fail(sequence, request, PlasmaErrorSequence);
         return;
     }
 
-    if ( uplo != PlasmaLower ) {
+    if (uplo != PlasmaLower) {
         plasma_request_fail(sequence, request, PlasmaErrorNotSupported);
         return;
     }
-    
-    
+
     // Quick return
     if (n == 0) {
         return;
     }
 
-
-    int nbtiles = plasma_ceildiv(n,nb);
+    int nbtiles = plasma_ceildiv(n, nb);
     int colblktile = 1;
-    int grsiz = 1;    
+    int grsiz = 1;
     int maxrequiredcores = imax( nbtiles/colblktile, 1 );
     int colpercore = colblktile*nb;
     int thgrsiz = n;
-    
-    
+
     // Initialize static scheduler progress table
     int cores_num;
-    #pragma omp parallel 
+    #pragma omp parallel
     {
         cores_num  = omp_get_num_threads();
     }
-    int size = 2*nbtiles+shift+cores_num+10;
+    int size = 2*nbtiles + shift + cores_num + 10;
     plasma->ss_progress = (volatile int *)malloc(size*sizeof(int));
-    for(int index = 0; index < size; index++) plasma->ss_progress[index] = 0;
+    for (int index = 0; index < size; ++index) {
+        plasma->ss_progress[index] = 0;
+    }
     plasma->ss_ld = (size);
-    
-    // main bulge chasing code 
+
+    // main bulge chasing code
     int ii = shift/grsiz;
-    int  stepercol =  ii*grsiz == shift ? ii:ii+1;
-    ii       = (n-1)/thgrsiz;
-    int thgrnb  = ii*thgrsiz == (n-1) ? ii:ii+1;
+    int stepercol = ii*grsiz == shift ? ii:ii + 1;
+    ii = (n - 1)/thgrsiz;
+    int thgrnb    = ii*thgrsiz == (n - 1) ? ii:ii + 1;
     int allcoresnb = imin( cores_num, maxrequiredcores );
 
     #pragma omp parallel
-    {   
+    {
         int coreid, sweepid, myid, stt, st, ed, stind, edind;
         int blklastind, colpt,  thgrid, thed;
-        int i,j,m,k;
+        int i, j, m, k;
 
         int my_core_id = omp_get_thread_num();
-        plasma_complex64_t  *WORK = work.spaces[my_core_id];
-
-        for (thgrid = 1; thgrid<=thgrnb; thgrid++){
-            stt  = (thgrid-1)*thgrsiz+1;
-            thed = imin( (stt + thgrsiz -1), (n-1));
-            for (i = stt; i <= n-1; i++){
-                ed = imin(i,thed);
-                if(stt>ed) break;
-                for (m = 1; m <=stepercol; m++){
-                    st=stt;
-                    for (sweepid = st; sweepid <=ed; sweepid++){
-                        
-                        for (k = 1; k <=grsiz; k++){
-                            myid = (i-sweepid)*(stepercol*grsiz) +(m-1)*grsiz + k;
-                            if(myid%2 ==0){
-                                colpt      = (myid/2)*nb+1+sweepid-1;
-                                stind      = colpt-nb+1;
-                                edind      = imin(colpt,n);
+        plasma_complex64_t *my_work = work.spaces[my_core_id];
+
+        for (thgrid = 1; thgrid <= thgrnb; ++thgrid) {
+            stt  = (thgrid - 1)*thgrsiz + 1;
+            thed = imin( stt + thgrsiz - 1, n - 1 );
+            for (i = stt; i <= n - 1; ++i) {
+                ed = imin(i, thed);
+                if (stt > ed) break;
+                for (m = 1; m <= stepercol; ++m) {
+                    st = stt;
+                    for (sweepid = st; sweepid <= ed; ++sweepid) {
+                        for (k = 1; k <= grsiz; ++k) {
+                            myid = (i - sweepid)*(stepercol*grsiz) + (m - 1)*grsiz + k;
+                            if (myid % 2 == 0) {
+                                colpt      = (myid/2)*nb + 1 + sweepid - 1;
+                                stind      = colpt - nb + 1;
+                                edind      = imin(colpt, n);
                                 blklastind = colpt;
-                            } else {
-                                colpt      = ((myid+1)/2)*nb + 1 +sweepid -1 ;
-                                stind      = colpt-nb+1;
-                                edind      = imin(colpt,n);
-                                if( (stind>=edind-1) && (edind==n) )
-                                    blklastind=n;
+                            }
+                            else {
+                                colpt      = ((myid + 1)/2)*nb + 1 + sweepid - 1;
+                                stind      = colpt - nb + 1;
+                                edind      = imin(colpt, n);
+                                if (stind >= edind - 1 && edind == n)
+                                    blklastind = n;
                                 else
-                                    blklastind=0;
+                                    blklastind = 0;
                             }
-                            coreid = (stind/colpercore)%allcoresnb;
-                            
-                            if(my_core_id==coreid) {
-                                if(myid==1) {
-                                    
-                                    ss_cond_wait(myid+shift-1, 0, sweepid-1);
-                                    core_zhbtype1cb(n, nb, A, lda, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, wantz, WORK);
+                            coreid = (stind / colpercore) % allcoresnb;
+
+                            if (my_core_id == coreid) {
+                                if (myid == 1) {
+                                    ss_cond_wait(myid + shift - 1, 0, sweepid - 1);
+                                    plasma_core_zhbtype1cb(
+                                        n, nb, A, lda, V, tau,
+                                        stind - 1, edind - 1, sweepid - 1,
+                                        Vblksiz, wantz, my_work);
                                     ss_cond_set(myid, 0, sweepid);
-                                    
-                                    if(blklastind >= (n-1)) {
-                                        for (j = 1; j <= shift; j++)
-                                            ss_cond_set(myid+j, 0, sweepid);
+
+                                    if (blklastind >= n - 1) {
+                                        for (j = 1; j <= shift; ++j)
+                                            ss_cond_set(myid + j, 0, sweepid);
+                                    }
+                                }
+                                else {
+                                    ss_cond_wait(myid - 1,         0, sweepid);
+                                    ss_cond_wait(myid + shift - 1, 0, sweepid - 1);
+                                    if (myid % 2 == 0) {
+                                        plasma_core_zhbtype2cb(
+                                            n, nb, A, lda, V, tau,
+                                            stind - 1, edind - 1, sweepid - 1,
+                                            Vblksiz, wantz, my_work);
+                                    }
+                                    else {
+                                        plasma_core_zhbtype3cb(
+                                            n, nb, A, lda, V, tau,
+                                            stind - 1, edind - 1, sweepid - 1,
+                                            Vblksiz, wantz, my_work);
                                     }
-                                } else {
-                                    ss_cond_wait(myid-1,       0, sweepid);
-                                    ss_cond_wait(myid+shift-1, 0, sweepid-1);
-                                    if(myid%2 == 0)
-                                        core_zhbtype2cb(n, nb, A, lda, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, wantz, WORK);
-                                    else
-                                        core_zhbtype3cb(n, nb, A, lda, V, TAU, stind-1, edind-1, sweepid-1, Vblksiz, wantz, WORK);
-                                    
+
                                     ss_cond_set(myid, 0, sweepid);
-                                    if(blklastind >= (n-1)) {
-                                        for (j = 1; j <= shift+allcoresnb; j++)
-                                            ss_cond_set(myid+j, 0, sweepid);
+                                    if (blklastind >= n - 1) {
+                                        for (j = 1; j <= shift + allcoresnb; ++j)
+                                            ss_cond_set(myid + j, 0, sweepid);
                                     }
-                                } // if myid==1 
-                            } // if my_core_id==coreid
-                            
-                            if(blklastind >= (n-1)) {
-                                stt++;
+                                } // if myid == 1
+                            } // if my_core_id == coreid
+
+                            if (blklastind >= n - 1) {
+                                ++stt;
                                 break;
                             }
-                        } // for k=1:grsiz
-                    } // for sweepid=st:ed 
-                } // for m=1:stepercol
-            } // for i=1:N-1
-        } // for thgrid=1:thgrnb
+                        } // for k = 1:grsiz
+                    } // for sweepid = st:ed
+                } // for m = 1:stepercol
+            } // for i = 1:n - 1
+        } // for thgrid = 1:thgrnb
     }
 
     free((void*)plasma->ss_progress);
-    
+
     //================================================
     //  store resulting diag and lower diag D and E
     //  note that D and E are always real
     //================================================
 
-    // sequential code here so only core 0 will work 
-    if( uplo == PlasmaLower ) {
-        for (int i=0; i < n-1 ; i++) {
+    // sequential code here so only core 0 will work
+    if (uplo == PlasmaLower) {
+        for (int i = 0; i < n - 1 ; ++i) {
             D[i] = creal(A[i*lda]);
-            E[i] = creal(A[i*lda+1]);
+            E[i] = creal(A[i*lda + 1]);
         }
-        D[n-1] = creal(A[(n-1)*lda]);
-    } else { 
-        for (int i=0; i<n-1; i++) {
-            D[i] = creal(A[i*lda+nb]);
-            E[i] = creal(A[i*lda+nb-1]);
+        D[n - 1] = creal(A[(n - 1)*lda]);
+    }
+    else {
+        for (int i = 0; i < n - 1; ++i) {
+            D[i] = creal(A[i*lda + nb]);
+            E[i] = creal(A[i*lda + nb - 1]);
         }
-        D[n-1] = creal(A[(n-1)*lda+nb]);
-    } 
+        D[n - 1] = creal(A[(n - 1)*lda + nb]);
+    }
     return;
 }
diff --git a/compute/pzhe2hb.c b/compute/pzhe2hb.c
index cc9aaf7d..c96e4f90 100644
--- a/compute/pzhe2hb.c
+++ b/compute/pzhe2hb.c
@@ -31,7 +31,7 @@ void plasma_pzhe2hb(plasma_enum_t uplo,
     if (sequence->status != PlasmaSuccess)
         return;
 
-    // Case nb>n  only 1 tile
+    // Case nb > n: only 1 tile
     if (A.mt > A.m)
         return;
 
diff --git a/compute/pzheb2trd_static.c b/compute/pzheb2trd_static.c
index 1acb5c55..d0468b10 100755
--- a/compute/pzheb2trd_static.c
+++ b/compute/pzheb2trd_static.c
@@ -46,8 +46,8 @@
 
 //  Parallel bulge chasing column-wise, static scheduling
 void plasma_pzheb2trd_static(
-    plasma_enum_t uplo, int n, int NB, int Vblksiz,
-    plasma_complex64_t *A, int LDA,
+    plasma_enum_t uplo, int n, int nb, int Vblksiz,
+    plasma_complex64_t *A, int lda,
     plasma_complex64_t *V, plasma_complex64_t *tau,
     double *D, double *E, int wantz,
     plasma_workspace_t work,
@@ -93,11 +93,11 @@ void plasma_pzheb2trd_static(
 
     // Some tunning for the bulge chasing code;
     // see technical report for details.
-    int nbtiles = plasma_ceildiv(n,NB);
+    int nbtiles = plasma_ceildiv(n,nb);
     int colblktile = 1;
     int grsiz = 1;
     int maxrequiredcores = imax( nbtiles/colblktile, 1 );
-    int colpercore = colblktile*NB;
+    int colpercore = colblktile*nb;
     int thgrsiz = n;
 
 
@@ -142,14 +142,14 @@ void plasma_pzheb2trd_static(
                         for (k = 1; k <= grsiz; ++k) {
                             myid = (i - sweepid)*(stepercol*grsiz) + (m - 1)*grsiz + k;
                             if (myid % 2 == 0) {
-                                colpt = (myid/2)*NB + 1 + sweepid - 1;
-                                stind = colpt - NB + 1;
+                                colpt = (myid/2)*nb + 1 + sweepid - 1;
+                                stind = colpt - nb + 1;
                                 edind = imin(colpt,n);
                                 blklastind = colpt;
                             }
                             else {
-                                colpt = ((myid + 1)/2)*NB + 1 + sweepid - 1;
-                                stind = colpt - NB + 1;
+                                colpt = ((myid + 1)/2)*nb + 1 + sweepid - 1;
+                                stind = colpt - nb + 1;
                                 edind = imin(colpt,n);
                                 if ((stind >= edind - 1) && (edind == n))
                                     blklastind = n;
@@ -162,7 +162,7 @@ void plasma_pzheb2trd_static(
                                 if (myid == 1) {
                                     ss_cond_wait(myid + shift - 1, 0, sweepid - 1);
                                     plasma_core_zhbtype1cb(
-                                        n, NB, A, LDA, V, tau,
+                                        n, nb, A, lda, V, tau,
                                         stind - 1, edind - 1, sweepid - 1,
                                         Vblksiz, wantz, my_work);
                                     ss_cond_set(myid, 0, sweepid);
@@ -177,13 +177,13 @@ void plasma_pzheb2trd_static(
                                     ss_cond_wait(myid + shift - 1, 0, sweepid - 1);
                                     if (myid%2 == 0) {
                                         plasma_core_zhbtype2cb(
-                                            n, NB, A, LDA, V, tau,
+                                            n, nb, A, lda, V, tau,
                                             stind - 1, edind - 1, sweepid - 1,
                                             Vblksiz, wantz, my_work);
                                     }
                                     else {
                                         plasma_core_zhbtype3cb(
-                                            n, NB, A, LDA, V, tau,
+                                            n, nb, A, lda, V, tau,
                                             stind - 1, edind - 1, sweepid - 1,
                                             Vblksiz, wantz, my_work);
                                     }
@@ -225,16 +225,16 @@ void plasma_pzheb2trd_static(
     // Sequential code here so only core 0 will work.
     if (uplo == PlasmaLower) {
         for (int i = 0; i < n - 1; ++i) {
-            D[i] = creal(A[i*LDA]);
-            E[i] = creal(A[i*LDA + 1]);
+            D[i] = creal(A[i*lda]);
+            E[i] = creal(A[i*lda + 1]);
         }
-        D[n - 1] = creal(A[(n - 1)*LDA]);
+        D[n - 1] = creal(A[(n - 1)*lda]);
     }
     else { // PlasmaUpper not yet tested
         for (int i = 0; i < n - 1; ++i) {
-            D[i] = creal(A[i*LDA + NB]);
-            E[i] = creal(A[i*LDA + NB - 1]);
+            D[i] = creal(A[i*lda + nb]);
+            E[i] = creal(A[i*lda + nb - 1]);
         }
-        D[n - 1] = creal(A[(n - 1)*LDA + NB]);
+        D[n - 1] = creal(A[(n - 1)*lda + nb]);
     }
 }
diff --git a/compute/zheevd.c b/compute/zheevd.c
index 4ba4c8f5..626f0904 100644
--- a/compute/zheevd.c
+++ b/compute/zheevd.c
@@ -355,7 +355,7 @@ void plasma_omp_zheevd(
     }
     double stop = omp_get_wtime();
     double time = stop - start;
-    //printf("\n N=%d:  1-stage time = %lf\t", n, time);
+    //printf("\n n=%d:  1-stage time = %lf\t", n, time);
 
     //====================
     //  Bulge chasing
@@ -468,7 +468,7 @@ void plasma_omp_zheevd(
         //=======================================
         // Apply Q1 from the first stage .
         //=======================================
-        // If nb > N, Q1 doesn't need to be applied,
+        // If nb > n, Q1 doesn't need to be applied,
         // only bulge chasing has been done
         if (nb < n) {
             plasma_desc_t Q;
@@ -504,7 +504,7 @@ void plasma_omp_zheevd(
             }
 
             plasma_desc_destroy(&Q);
-        } // end (nb < N)
+        } // end (nb < n)
     }
     stop = omp_get_wtime();
     time = stop - start;
diff --git a/core_blas/core_zhbtype1cb.c b/core_blas/core_zhbtype1cb.c
index 99ecbae2..03a02682 100644
--- a/core_blas/core_zhbtype1cb.c
+++ b/core_blas/core_zhbtype1cb.c
@@ -55,7 +55,7 @@
  *
  * @param[out] V
  *          PLASMA_Complex64_t array, dimension n if eigenvalue only
- *          requested or (LDV*blkcnt*Vblksiz) if Eigenvectors requested
+ *          requested or (ldv*blkcnt*Vblksiz) if Eigenvectors requested
  *          The Householder reflectors are stored in this array.
  *
  * @param[out] tau
diff --git a/core_blas/core_zhbtype2cb.c b/core_blas/core_zhbtype2cb.c
index 97c977d8..81e08a31 100644
--- a/core_blas/core_zhbtype2cb.c
+++ b/core_blas/core_zhbtype2cb.c
@@ -55,7 +55,7 @@
  *
  * @param[in, out] V
  *          plasma_complex64_t array, dimension n if eigenvalue only
- *          requested or (LDV*blkcnt*Vblksiz) if Eigenvectors requested
+ *          requested or (ldv*blkcnt*Vblksiz) if Eigenvectors requested
  *          The Householder reflectors of the previous type 1 are used here
  *          to continue update then new one are generated to eliminate the
  *          bulge and stored in this array.
diff --git a/core_blas/core_zhbtype3cb.c b/core_blas/core_zhbtype3cb.c
index 64883328..81ca3cf8 100644
--- a/core_blas/core_zhbtype3cb.c
+++ b/core_blas/core_zhbtype3cb.c
@@ -46,14 +46,14 @@
  *          The size of the band.
  *
  * @param[in,out] A
- *          A pointer to the matrix A of size (2*nb+1)-by-n.
+ *          A pointer to the matrix A of size (2*nb + 1)-by-n.
  *
  * @param[in] lda
- *          The leading dimension of the matrix A. lda >= max(1, 2*nb+1)
+ *          The leading dimension of the matrix A. lda >= max(1, 2*nb + 1)
  *
  * @param[in] V
- *          plasma_complex64_t array, dimension n if eigenvalue only
- *          requested or (LDV*blkcnt*Vblksiz) if Eigenvectors requested
+ *          plasma_complex64_t array, dimension n if only eigenvalues are
+ *          requested, or (ldv*blkcnt*Vblksiz) if eigenvectors are requested.
  *          The Householder reflectors are stored in this array.
  *
  * @param[in] tau
@@ -68,17 +68,17 @@
  *          A pointer to the end index where this kernel will operate.
  *
  * @param[in] sweep
- *          The sweep number that is eliminated. it serve to calculate the
+ *          The sweep number that is eliminated. It serves to calculate the
  *          pointer to the position where to store the Vs and Ts.
  *
  * @param[in] Vblksiz
- *          constant which correspond to the blocking used when applying the Vs.
- *          it serve to calculate the pointer to the position where to store the
- *          Vs and Ts.
+ *          Constant that corresponds to the blocking used when applying the Vs.
+ *          It serves to calculate the pointer to the position where to store
+ *          the Vs and Ts.
  *
  * @param[in] wantz
- *          constant which indicate if Eigenvalue are requested or both
- *          Eigenvalue/Eigenvectors.
+ *          Specifies whether only eigenvalues are requested or both
+ *          eigenvalues and eigenvectors.
  *
  * @param[in] work
  *          Workspace of size nb.
diff --git a/core_blas/core_zlarfy.c b/core_blas/core_zlarfy.c
index 4b3cdc86..0ac3747f 100644
--- a/core_blas/core_zlarfy.c
+++ b/core_blas/core_zlarfy.c
@@ -22,45 +22,45 @@
  * @ingroup CORE_plasma_complex64_t
  *
  *  CORE_zlarfy applies an elementary reflector, or Householder matrix, H,
- *  to a N-by-N hermitian matrix C, from both the left and the right.
+ *  to an n-by-n Hermitian matrix C, from both the left and the right.
  *
  *  H is represented in the form
  *
- *     H = I - tau * v * v'
+ *     H = I - tau v v^H
  *
- *  where  tau  is a scalar and  v  is a vector.
+ *  where tau is a scalar and v is a vector.
  *
  *  If tau is zero, then H is taken to be the unit matrix.
  *
  *******************************************************************************
  *
- * @param[in] N
- *          The number of rows and columns of the matrix C.  N >= 0.
+ * @param[in] n
+ *          The number of rows and columns of the matrix C. n >= 0.
  *
  * @param[in,out] A
- *          COMPLEX*16 array, dimension (LDA, N)
+ *          COMPLEX*16 array, dimension (lda, n)
  *          On entry, the Hermetian matrix A.
- *          On exit, A is overwritten by H * A * H'.
+ *          On exit, A is overwritten by H A H^H.
  *
- * @param[in] LDA
- *         The leading dimension of the array A.  LDA >= max(1,N).
+ * @param[in] lda
+ *         The leading dimension of the array A. lda >= max(1,n).
  *
- * @param[in] V
- *          The vector V that contains the Householder reflectors.
+ * @param[in] v
+ *          The vector v that contains the Householder reflectors.
  *
- * @param[in] TAU
+ * @param[in] tau
  *          The value tau.
  *
- * @param[out] WORK
- *          Workspace.
+ * @param[out] work
+ *          Workspace of size n.
  *
  ******************************************************************************/
 void plasma_core_zlarfy(
-    int N,
-    plasma_complex64_t *A, int LDA,
-    const plasma_complex64_t *V,
-    const plasma_complex64_t *TAU,
-    plasma_complex64_t *WORK)
+    int n,
+    plasma_complex64_t *A, int lda,
+    const plasma_complex64_t *v,
+    const plasma_complex64_t *tau,
+    plasma_complex64_t *work)
 {
     const plasma_complex64_t zzero =  0.0;
     const plasma_complex64_t zmone = -1.0;
@@ -68,33 +68,29 @@ void plasma_core_zlarfy(
     int j;
     plasma_complex64_t dtmp;
 
-    /* Compute dtmp = X'*V */
-    /* X = AVtau */
+    // Compute dtmp = x^H v
+    // x = A v tau
     cblas_zhemv(CblasColMajor, CblasLower,
-                N, CBLAS_SADDR(*TAU), A, LDA,
-                V, 1, CBLAS_SADDR(zzero), WORK, 1);
+                n, CBLAS_SADDR(*tau), A, lda,
+                v, 1, CBLAS_SADDR(zzero), work, 1);
 
-    /* cblas_zdotc_sub(N, WORK, 1, V, 1, &dtmp);*/
+    // cblas_zdotc_sub(n, work, 1, v, 1, &dtmp);
     dtmp = 0.;
-    for (j = 0; j < N; ++j)
-        dtmp = dtmp + conj(WORK[j]) * V[j];
+    for (j = 0; j < n; ++j)
+        dtmp = dtmp + conj(work[j]) * v[j];
 
-    /* Compute 1/2 X'*V*t = 1/2*dtmp*tau  */
-    dtmp = -dtmp * 0.5 * (*TAU);
+    // Compute 1/2 x^H v tau = 1/2 dtmp tau
+    dtmp = -dtmp * 0.5 * (*tau);
 
-   /* Compute W=X-1/2VX'Vt = X - dtmp*V */
-    cblas_zaxpy(N, CBLAS_SADDR(dtmp),
-                V, 1, WORK, 1);
+    // Compute w = x - 1/2 v x^H v t = x - dtmp v
+    cblas_zaxpy(n, CBLAS_SADDR(dtmp),
+                v, 1, work, 1);
 
-    /*
-     * Performs the symmetric rank 2 operation
-     *    A := alpha*x*y' + alpha*y*x' + A
-     */
-    cblas_zher2(CblasColMajor, CblasLower, N,
-                CBLAS_SADDR(zmone), WORK, 1,
-                                    V,    1,
-                                    A,    LDA);
-
-    return;
+    // Performs the Hermitian rank 2 operation
+    // A := alpha x y^H + conj(alpha) y x^H + A
+    cblas_zher2(CblasColMajor, CblasLower, n,
+                CBLAS_SADDR(zmone), work, 1,
+                                    v,    1,
+                                    A,    lda);
 }
 #undef COMPLEX