diff --git a/common.h b/common.h index b3fc6d7ded..ee1ace7707 100644 --- a/common.h +++ b/common.h @@ -1,935 +1,935 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#ifndef COMMON_H -#define COMMON_H - -#ifdef __cplusplus -extern "C" { - /* Assume C declarations for C++ */ -#endif /* __cplusplus */ - -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif - -#ifndef __USE_XOPEN -#define __USE_XOPEN -#endif - -#ifndef __USE_SVID -#define __USE_SVID -#endif - -#ifdef BUILD_KERNEL -#include "config_kernel.h" -#else -#include "config.h" -#endif - -#undef ENABLE_SSE_EXCEPTION - -#if defined(SMP_SERVER) || defined(SMP_ONDEMAND) -#define SMP -#endif - -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) -#define WINDOWS_ABI -#define OS_WINDOWS - -#ifdef DOUBLE -#define DOUBLE_DEFINED DOUBLE -#undef DOUBLE -#endif -#endif - -#if !defined(NOINCLUDE) && !defined(ASSEMBLER) -#include -#include -#include - -#if !defined(_MSC_VER) -#include -#elif _MSC_VER < 1900 -#define snprintf _snprintf -#endif -#include - -#if defined(OS_LINUX) || defined(OS_QNX) -#include -#include -#endif - -#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID) -#include -#endif - -#ifdef OS_ANDROID -#define NO_SYSV_IPC -//Android NDK only supports complex.h since Android 5.0 -#if __ANDROID_API__ < 21 -#define FORCE_OPENBLAS_COMPLEX_STRUCT -#endif -#endif - -#if defined(OS_HAIKU) || defined(OS_QNX) -#define NO_SYSV_IPC -#endif - -#ifdef OS_WINDOWS -#ifdef ATOM -#define GOTO_ATOM ATOM -#undef ATOM -#endif -#include -#include -#ifdef GOTO_ATOM -#define ATOM GOTO_ATOM -#undef GOTO_ATOM -#endif -#elif !defined(OS_EMBEDDED) -#include -#ifndef NO_SYSV_IPC -#include -#endif -#include -#include -#include -#include -#if defined(SMP) || defined(USE_LOCKING) -#include -#endif -#else -#include -#include -#endif - -#if defined(OS_SUNOS) -#include -#endif - -#ifdef __DECC -#include -#include -#endif - -#if defined(ARCH_IA64) && defined(ENABLE_SSE_EXCEPTION) -#include -#endif - -#endif - -#if defined(OS_WINDOWS) && defined(DOUBLE_DEFINED) -#define DOUBLE DOUBLE_DEFINED -#undef DOUBLE_DEFINED -#endif - -#undef DEBUG_INFO -#define SMP_DEBUG -#undef MALLOC_DEBUG -#undef SMP_ALLOC_DEBUG - -#ifndef ZERO -#ifdef XDOUBLE -#define ZERO 0.e0L -#elif defined DOUBLE -#define ZERO 0.e0 -#else -#define ZERO 0.e0f -#endif -#endif - -#ifndef ONE -#ifdef XDOUBLE -#define ONE 1.e0L -#elif defined DOUBLE -#define ONE 1.e0 -#else -#define ONE 1.e0f -#endif -#endif - -#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) - -#define ALLOCA_ALIGN 63UL - -#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) - -#ifdef NEEDBUNDERSCORE -#define BLASFUNC(FUNC) FUNC##_ -#else -#define BLASFUNC(FUNC) FUNC -#endif - -#undef USE_PTHREAD_LOCK -#undef USE_PTHREAD_SPINLOCK - -#if defined(USE_PTHREAD_LOCK) && defined(USE_PTHREAD_SPINLOCK) -#error "You can't specify both LOCK operation!" -#endif - -#if defined(SMP) || defined(USE_LOCKING) -#define USE_PTHREAD_LOCK -#undef USE_PTHREAD_SPINLOCK -#endif - -#ifdef OS_WINDOWS -#undef USE_PTHREAD_LOCK -#undef USE_PTHREAD_SPINLOCK -#endif - -#if defined(USE_PTHREAD_LOCK) -#define LOCK_COMMAND(x) pthread_mutex_lock(x) -#define UNLOCK_COMMAND(x) pthread_mutex_unlock(x) -#elif defined(USE_PTHREAD_SPINLOCK) -#ifndef ASSEMBLER -typedef volatile int pthread_spinlock_t; -int pthread_spin_lock (pthread_spinlock_t *__lock); -int pthread_spin_unlock (pthread_spinlock_t *__lock); -#endif -#define LOCK_COMMAND(x) pthread_spin_lock(x) -#define UNLOCK_COMMAND(x) pthread_spin_unlock(x) -#else -#define LOCK_COMMAND(x) blas_lock(x) -#define UNLOCK_COMMAND(x) blas_unlock(x) -#endif - -#define GOTO_SHMID 0x510510 - -#if 0 -#ifndef __CUDACC__ -#define __global__ -#define __device__ -#define __host__ -#define __shared__ -#endif -#endif - -#ifndef ASSEMBLER - -#ifdef QUAD_PRECISION -typedef struct { - unsigned long x[2]; -} xdouble; -#elif defined EXPRECISION -#define xdouble long double -#else -#define xdouble double -#endif - -#if defined(OS_WINDOWS) && defined(__64BIT__) -typedef long long BLASLONG; -typedef unsigned long long BLASULONG; -#else -typedef long BLASLONG; -typedef unsigned long BLASULONG; -#endif - -#ifndef bfloat16 -#include -typedef uint16_t bfloat16; -#define BFLOAT16CONVERSION 1 -#endif - -#ifdef USE64BITINT -typedef BLASLONG blasint; -#if defined(OS_WINDOWS) && defined(__64BIT__) -#define blasabs(x) llabs(x) -#else -#define blasabs(x) labs(x) -#endif -#else -typedef int blasint; -#define blasabs(x) abs(x) -#endif -#else -#ifdef USE64BITINT -#define INTSHIFT 3 -#define INTSIZE 8 -#else -#define INTSHIFT 2 -#define INTSIZE 4 -#endif -#endif - -#ifdef XDOUBLE -#define FLOAT xdouble -#ifdef QUAD_PRECISION -#define XFLOAT xidouble -#endif -#ifdef QUAD_PRECISION -#define SIZE 32 -#define BASE_SHIFT 5 -#define ZBASE_SHIFT 6 -#else -#define SIZE 16 -#define BASE_SHIFT 4 -#define ZBASE_SHIFT 5 -#endif -#elif defined(DOUBLE) -#define FLOAT double -#define SIZE 8 -#define BASE_SHIFT 3 -#define ZBASE_SHIFT 4 -#elif defined(BFLOAT16) -#define IFLOAT bfloat16 -#define XFLOAT IFLOAT -#define FLOAT float -#define SIZE 2 -#define BASE_SHIFT 1 -#define ZBASE_SHIFT 2 -#else -#define FLOAT float -#define SIZE 4 -#define BASE_SHIFT 2 -#define ZBASE_SHIFT 3 -#endif - -#ifndef XFLOAT -#define XFLOAT FLOAT -#endif - -#ifndef IFLOAT -#define IFLOAT FLOAT -#endif - -#ifndef COMPLEX -#define COMPSIZE 1 -#else -#define COMPSIZE 2 -#endif - - -#define Address_H(x) (((x)+(1<<15))>>16) -#define Address_L(x) ((x)-((Address_H(x))<<16)) - -#ifndef MAX_CPU_NUMBER -#define MAX_CPU_NUMBER 2 -#endif - -#if defined(OS_SUNOS) -#define YIELDING thr_yield() -#endif - -#if defined(OS_WINDOWS) -#if defined(_MSC_VER) && !defined(__clang__) -#define YIELDING YieldProcessor() -#else -#define YIELDING SwitchToThread() -#endif -#endif - -#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5) -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); -#endif - - -#if defined(POWER8) || defined(POWER9) || defined(POWER10) -#ifndef YIELDING -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); -#endif -#endif - - -#if defined(ARCH_X86_64) -#ifndef YIELDING -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); -#endif -#endif - -#if defined(ARCH_RISCV64) -#ifndef YIELDING -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); -#endif -#endif - - -#ifdef __EMSCRIPTEN__ -#define YIELDING -#endif - -#ifndef YIELDING -#define YIELDING sched_yield() -#endif - -/*** -To alloc job_t on heap or stack. -please https://github.com/xianyi/OpenBLAS/issues/246 -***/ -#if defined(OS_WINDOWS) -#define GETRF_MEM_ALLOC_THRESHOLD 32 -#define BLAS3_MEM_ALLOC_THRESHOLD 32 -#endif - -#ifndef GETRF_MEM_ALLOC_THRESHOLD -#define GETRF_MEM_ALLOC_THRESHOLD 80 -#endif - -#ifndef BLAS3_MEM_ALLOC_THRESHOLD -#define BLAS3_MEM_ALLOC_THRESHOLD 32 -#endif - -#ifdef QUAD_PRECISION -#include "common_quad.h" -#endif - -#ifdef ARCH_ALPHA -#include "common_alpha.h" -#endif - -#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) -#if __has_include() -#include -#endif -#endif -#ifndef _CET_ENDBR -#define _CET_ENDBR -#endif - -#ifdef ARCH_X86 -#include "common_x86.h" -#endif - -#ifdef ARCH_X86_64 -#include "common_x86_64.h" -#endif - -#ifdef ARCH_IA64 -#include "common_ia64.h" -#endif - -#ifdef ARCH_POWER -#include "common_power.h" -#endif - -#ifdef sparc -#include "common_sparc.h" -#endif - -#ifdef ARCH_MIPS -#include "common_mips.h" -#endif - - -#ifdef ARCH_RISCV64 -#include "common_riscv64.h" -#endif - -#ifdef ARCH_MIPS64 -#include "common_mips64.h" -#endif - -#ifdef ARCH_ARM -#include "common_arm.h" -#endif - -#ifdef ARCH_ARM64 -#include "common_arm64.h" -#endif - -#ifdef ARCH_ZARCH -#include "common_zarch.h" -#endif - -#ifdef ARCH_LOONGARCH64 -#include "common_loongarch64.h" -#endif - -#ifdef ARCH_E2K -#include "common_e2k.h" -#endif - -#ifdef ARCH_CSKY -#include "common_csky.h" -#endif - -#ifndef ASSEMBLER -#ifdef OS_WINDOWSSTORE -typedef char env_var_t[MAX_PATH]; -#define readenv(p, n) 0 -#else -#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) -typedef char env_var_t[MAX_PATH]; -#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) -#else -typedef char* env_var_t; -#define readenv(p, n) ((p)=getenv(n)) -#endif -#endif - -#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) -#ifdef _POSIX_MONOTONIC_CLOCK -#if defined(__GLIBC_PREREQ) // cut the if condition if two lines, otherwise will fail at __GLIBC_PREREQ(2, 17) -#if __GLIBC_PREREQ(2, 17) // don't require -lrt -#define USE_MONOTONIC -#endif -#elif defined(OS_ANDROID) -#define USE_MONOTONIC -#endif -#endif -/* use similar scale as x86 rdtsc for timeouts to work correctly */ -static inline unsigned long long rpcc(void){ -#ifdef USE_MONOTONIC - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; -#elif !defined(OS_EMBEDDED) - struct timeval tv; - gettimeofday(&tv,NULL); - return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; -#else - return 0; -#endif -} -#define RPCC_DEFINED -#define RPCC64BIT -#endif // !RPCC_DEFINED - -#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) -static __inline void blas_lock(volatile BLASULONG *address){ - - do { - while (*address) {YIELDING;}; - - } while (!__sync_bool_compare_and_swap(address, 0, 1)); -} -#define BLAS_LOCK_DEFINED -#endif - -#ifndef RPCC_DEFINED -#error "rpcc() implementation is missing for your platform" -#endif -#ifndef BLAS_LOCK_DEFINED -#error "blas_lock() implementation is missing for your platform" -#endif -#endif // !ASSEMBLER - -#ifdef OS_LINUX -#include "common_linux.h" -#endif - -#ifdef OS_EMBEDDED -#define DTB_DEFAULT_ENTRIES 64 -#endif - -#define MMAP_ACCESS (PROT_READ | PROT_WRITE) - -#ifdef __NetBSD__ -#define MMAP_POLICY (MAP_PRIVATE | MAP_ANON) -#else -#define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) -#endif - -#ifndef ASSEMBLER -/* C99 supports complex floating numbers natively, which GCC also offers as an - extension since version 3.0. If neither are available, use a compatible - structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ - (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER) - #define OPENBLAS_COMPLEX_C99 - #ifndef __cplusplus - #include - #endif - typedef float _Complex openblas_complex_float; - typedef double _Complex openblas_complex_double; - typedef xdouble _Complex openblas_complex_xdouble; - #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) - #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) - #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) -#else - #define OPENBLAS_COMPLEX_STRUCT - typedef struct { float real, imag; } openblas_complex_float; - typedef struct { double real, imag; } openblas_complex_double; - typedef struct { xdouble real, imag; } openblas_complex_xdouble; - #define openblas_make_complex_float(real, imag) {(real), (imag)} - #define openblas_make_complex_double(real, imag) {(real), (imag)} - #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} -#endif -#endif - -#include "param.h" -#include "common_param.h" - -#ifndef STDERR -#define STDERR stderr -#endif - -#ifndef MASK -#define MASK(a, b) (((a) + ((b) - 1)) & ~((b) - 1)) -#endif - -#if defined(XDOUBLE) || defined(DOUBLE) -#define FLOATRET FLOAT -#else -#ifdef NEED_F2CCONV -#define FLOATRET double -#else -#define FLOATRET float -#endif -#endif - -#ifndef ASSEMBLER -#ifndef NOINCLUDE -/* Inclusion of a standard header file is needed for definition of __STDC_* - predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs - as a side effect of including either or . */ -#include -#endif // NOINCLUDE - -#ifdef XDOUBLE -#define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble -#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i) -#elif defined(DOUBLE) -#define OPENBLAS_COMPLEX_FLOAT openblas_complex_double -#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_double(r,i) -#else -#define OPENBLAS_COMPLEX_FLOAT openblas_complex_float -#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_float(r,i) -#endif - -#if defined(C_PGI) || defined(C_SUN) - #if defined(__STDC_IEC_559_COMPLEX__) - #define CREAL(X) creal(X) - #define CIMAG(X) cimag(X) - #else - #define CREAL(X) (*((FLOAT *)&X + 0)) - #define CIMAG(X) (*((FLOAT *)&X + 1)) - #endif -#else -#ifdef OPENBLAS_COMPLEX_STRUCT -#define CREAL(Z) ((Z).real) -#define CIMAG(Z) ((Z).imag) -#else -#define CREAL __real__ -#define CIMAG __imag__ -#endif -#endif - -#endif // ASSEMBLER - -#ifndef IFLUSH -#define IFLUSH -#endif - -#ifndef IFLUSH_HALF -#define IFLUSH_HALF -#endif - -#if defined(C_GCC) && (( __GNUC__ <= 3) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 2))) -#ifdef USE_OPENMP -#undef USE_OPENMP -#endif -#endif - -#if defined(C_MSVC) -#define inline __inline -#endif - -#ifndef ASSEMBLER - -#ifndef MIN -#define MIN(a,b) (a>b? b:a) -#endif - -#ifndef MAX -#define MAX(a,b) (a 0x60) (a) -= 0x20;} - -#if defined(__FreeBSD__) || defined(__APPLE__) -#define MAP_ANONYMOUS MAP_ANON -#endif - -/* Common Memory Management Routine */ -void blas_set_parameter(void); -int blas_get_cpu_number(void); -void *blas_memory_alloc (int); -void blas_memory_free (void *); -void *blas_memory_alloc_nolock (int); //use malloc without blas_lock -void blas_memory_free_nolock (void *); - -int get_num_procs (void); - -#if defined(OS_LINUX) && defined(SMP) && !defined(NO_AFFINITY) -int get_num_nodes (void); -int get_num_proc (int); -int get_node_equal (void); -#endif - -void goto_set_num_threads(int); - -void gotoblas_affinity_init(void); -void gotoblas_affinity_quit(void); -void gotoblas_dynamic_init(void); -void gotoblas_dynamic_quit(void); -void gotoblas_profile_init(void); -void gotoblas_profile_quit(void); - -int support_avx512(void); - -#ifdef USE_OPENMP - -#ifndef C_MSVC -int omp_in_parallel(void); -int omp_get_num_procs(void); -#else -__declspec(dllimport) int __cdecl omp_in_parallel(void); -__declspec(dllimport) int __cdecl omp_get_num_procs(void); -#endif - -#ifdef HAVE_C11 -#if defined(C_GCC) && ( __GNUC__ < 7) -// workaround for GCC bug 65467 -#ifndef _Atomic -#define _Atomic volatile -#endif -#endif -#include -#else -#ifndef _Atomic -#define _Atomic volatile -#endif -#endif - -#else -#ifdef __ELF__ -int omp_in_parallel (void) __attribute__ ((weak)); -int omp_get_num_procs(void) __attribute__ ((weak)); -#endif -#endif - -static __inline void blas_unlock(volatile BLASULONG *address){ - MB; - *address = 0; -} - -#ifdef OS_WINDOWSSTORE -static __inline int readenv_atoi(char *env) { - return 0; -} -#else -#ifdef OS_WINDOWS -static __inline int readenv_atoi(char *env) { - env_var_t p; - return readenv(p,env) ? 0 : atoi(p); -} -#else -static __inline int readenv_atoi(char *env) { - char *p; - if (( p = getenv(env) )) - return (atoi(p)); - else - return(0); -} -#endif -#endif - -#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) - -static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){ - -#ifndef UNIT - FLOAT ratio, den; - - if ( -#ifdef XDOUBLE - (fabsl(ar)) >= (fabsl(ai)) -#elif defined DOUBLE - (fabs (ar)) >= (fabs (ai)) -#else - (fabsf(ar)) >= (fabsf(ai)) -#endif - ) { - ratio = ai / ar; - den = (FLOAT)(ONE / (ar * (ONE + ratio * ratio))); - ar = den; - ai = -ratio * den; - } else { - ratio = ar / ai; - den = (FLOAT)(ONE /(ai * (ONE + ratio * ratio))); - ar = ratio * den; - ai = -den; - } - b[0] = ar; - b[1] = ai; -#else - b[0] = ONE; - b[1] = ZERO; -#endif - -} -#endif - -#ifdef MALLOC_DEBUG -void *blas_debug_alloc(int); -void *blas_debug_free(void *); -#undef malloc -#undef free -#define malloc(a) blas_debug_alloc(a) -#define free(a) blas_debug_free (a) -#endif - -#ifndef COPYOVERHEAD -#define GEMMRETTYPE int -#else - -typedef struct { - double outercopy; - double innercopy; - double kernel; - double mflops; -} copyoverhead_t; - -#define GEMMRETTYPE copyoverhead_t -#endif -#endif - -#ifndef BUILD_KERNEL -#define KNAME(A, B) A -#else -#define KNAME(A, B) A##B -#endif - -#include "common_interface.h" -#ifdef SANITY_CHECK -#include "common_reference.h" -#endif -#include "common_macro.h" -#include "common_level1.h" -#include "common_level2.h" -#include "common_level3.h" -#include "common_lapack.h" - -#ifdef CBLAS -# define OPENBLAS_CONST /* see comment in cblas.h */ -# include "cblas.h" -#endif - -#ifndef ASSEMBLER -#include "common_stackalloc.h" -#if 0 -#include "symcopy.h" -#endif - -#if defined(SMP_SERVER) && defined(SMP_ONDEMAND) -#error Both SMP_SERVER and SMP_ONDEMAND are specified. -#endif - -#if defined(SMP_SERVER) || defined(SMP_ONDEMAND) -#include "common_thread.h" -#endif - -#endif - -#define INFO_NUM 99 - -#ifndef DEFAULT_CPU_NUMBER -#define DEFAULT_CPU_NUMBER 4 -#endif - -#ifndef IDEBUG_START -#define IDEBUG_START -#endif - -#ifndef IDEBUG_END -#define IDEBUG_END -#endif - -#if !defined(ASSEMBLER) && defined(FUNCTION_PROFILE) - -typedef struct { - int func; - unsigned long long calls, fops, area, cycles, tcycles; -} func_profile_t; - -extern func_profile_t function_profile_table[]; -extern int gotoblas_profile; - -#ifdef XDOUBLE -#define NUMOPT QNUMOPT -#elif defined DOUBLE -#define NUMOPT DNUMOPT -#else -#define NUMOPT SNUMOPT -#endif - -#define FUNCTION_PROFILE_START() { unsigned long long profile_start = rpcc(), profile_end; -#ifdef SMP -#define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ - if (gotoblas_profile) { \ - profile_end = rpcc(); \ - function_profile_table[PROFILE_FUNC_NAME].calls ++; \ - function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ - function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ - function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ - function_profile_table[PROFILE_FUNC_NAME].tcycles += blas_cpu_number * (profile_end - profile_start); \ - } \ - } -#else -#define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ - if (gotoblas_profile) { \ - profile_end = rpcc(); \ - function_profile_table[PROFILE_FUNC_NAME].calls ++; \ - function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ - function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ - function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ - function_profile_table[PROFILE_FUNC_NAME].tcycles += (profile_end - profile_start); \ - } \ - } -#endif - -#else -#define FUNCTION_PROFILE_START() -#define FUNCTION_PROFILE_END(COMP, AREA, OPS) -#endif - -#if 1 -#define PRINT_DEBUG_CNAME -#define PRINT_DEBUG_NAME -#else -#define PRINT_DEBUG_CNAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_CNAME) -#define PRINT_DEBUG_NAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME) -#endif - -#ifdef __cplusplus -} - -#endif /* __cplusplus */ - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_H +#define COMMON_H + +#ifdef __cplusplus +extern "C" { + /* Assume C declarations for C++ */ +#endif /* __cplusplus */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#ifndef __USE_XOPEN +#define __USE_XOPEN +#endif + +#ifndef __USE_SVID +#define __USE_SVID +#endif + +#ifdef BUILD_KERNEL +#include "config_kernel.h" +#else +#include "config.h" +#endif + +#undef ENABLE_SSE_EXCEPTION + +#if defined(SMP_SERVER) || defined(SMP_ONDEMAND) +#define SMP +#endif + +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) +#define WINDOWS_ABI +#define OS_WINDOWS + +#ifdef DOUBLE +#define DOUBLE_DEFINED DOUBLE +#undef DOUBLE +#endif +#endif + +#if !defined(NOINCLUDE) && !defined(ASSEMBLER) +#include +#include +#include + +#if !defined(_MSC_VER) +#include +#elif _MSC_VER < 1900 +#define snprintf _snprintf +#endif +#include + +#if defined(OS_LINUX) || defined(OS_QNX) +#include +#include +#endif + +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID) +#include +#endif + +#ifdef OS_ANDROID +#define NO_SYSV_IPC +//Android NDK only supports complex.h since Android 5.0 +#if __ANDROID_API__ < 21 +#define FORCE_OPENBLAS_COMPLEX_STRUCT +#endif +#endif + +#if defined(OS_HAIKU) || defined(OS_QNX) +#define NO_SYSV_IPC +#endif + +#ifdef OS_WINDOWS +#ifdef ATOM +#define GOTO_ATOM ATOM +#undef ATOM +#endif +#include +#include +#ifdef GOTO_ATOM +#define ATOM GOTO_ATOM +#undef GOTO_ATOM +#endif +#elif !defined(OS_EMBEDDED) +#include +#ifndef NO_SYSV_IPC +#include +#endif +#include +#include +#include +#include +#if defined(SMP) || defined(USE_LOCKING) +#include +#endif +#else +#include +#include +#endif + +#if defined(OS_SUNOS) +#include +#endif + +#ifdef __DECC +#include +#include +#endif + +#if defined(ARCH_IA64) && defined(ENABLE_SSE_EXCEPTION) +#include +#endif + +#endif + +#if defined(OS_WINDOWS) && defined(DOUBLE_DEFINED) +#define DOUBLE DOUBLE_DEFINED +#undef DOUBLE_DEFINED +#endif + +#undef DEBUG_INFO +#define SMP_DEBUG +#undef MALLOC_DEBUG +#undef SMP_ALLOC_DEBUG + +#ifndef ZERO +#ifdef XDOUBLE +#define ZERO 0.e0L +#elif defined DOUBLE +#define ZERO 0.e0 +#else +#define ZERO 0.e0f +#endif +#endif + +#ifndef ONE +#ifdef XDOUBLE +#define ONE 1.e0L +#elif defined DOUBLE +#define ONE 1.e0 +#else +#define ONE 1.e0f +#endif +#endif + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#define ALLOCA_ALIGN 63UL + +#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)) + +#if (NEEDBUNDERSCORE==1) +#define BLASFUNC(FUNC) FUNC##_ +#else +#define BLASFUNC(FUNC) FUNC +#endif + +#undef USE_PTHREAD_LOCK +#undef USE_PTHREAD_SPINLOCK + +#if defined(USE_PTHREAD_LOCK) && defined(USE_PTHREAD_SPINLOCK) +#error "You can't specify both LOCK operation!" +#endif + +#if defined(SMP) || defined(USE_LOCKING) +#define USE_PTHREAD_LOCK +#undef USE_PTHREAD_SPINLOCK +#endif + +#ifdef OS_WINDOWS +#undef USE_PTHREAD_LOCK +#undef USE_PTHREAD_SPINLOCK +#endif + +#if defined(USE_PTHREAD_LOCK) +#define LOCK_COMMAND(x) pthread_mutex_lock(x) +#define UNLOCK_COMMAND(x) pthread_mutex_unlock(x) +#elif defined(USE_PTHREAD_SPINLOCK) +#ifndef ASSEMBLER +typedef volatile int pthread_spinlock_t; +int pthread_spin_lock (pthread_spinlock_t *__lock); +int pthread_spin_unlock (pthread_spinlock_t *__lock); +#endif +#define LOCK_COMMAND(x) pthread_spin_lock(x) +#define UNLOCK_COMMAND(x) pthread_spin_unlock(x) +#else +#define LOCK_COMMAND(x) blas_lock(x) +#define UNLOCK_COMMAND(x) blas_unlock(x) +#endif + +#define GOTO_SHMID 0x510510 + +#if 0 +#ifndef __CUDACC__ +#define __global__ +#define __device__ +#define __host__ +#define __shared__ +#endif +#endif + +#ifndef ASSEMBLER + +#ifdef QUAD_PRECISION +typedef struct { + unsigned long x[2]; +} xdouble; +#elif defined EXPRECISION +#define xdouble long double +#else +#define xdouble double +#endif + +#if defined(OS_WINDOWS) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifndef bfloat16 +#include +typedef uint16_t bfloat16; +#define BFLOAT16CONVERSION 1 +#endif + +#ifdef USE64BITINT +typedef BLASLONG blasint; +#if defined(OS_WINDOWS) && defined(__64BIT__) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif +#else +#ifdef USE64BITINT +#define INTSHIFT 3 +#define INTSIZE 8 +#else +#define INTSHIFT 2 +#define INTSIZE 4 +#endif +#endif + +#ifdef XDOUBLE +#define FLOAT xdouble +#ifdef QUAD_PRECISION +#define XFLOAT xidouble +#endif +#ifdef QUAD_PRECISION +#define SIZE 32 +#define BASE_SHIFT 5 +#define ZBASE_SHIFT 6 +#else +#define SIZE 16 +#define BASE_SHIFT 4 +#define ZBASE_SHIFT 5 +#endif +#elif defined(DOUBLE) +#define FLOAT double +#define SIZE 8 +#define BASE_SHIFT 3 +#define ZBASE_SHIFT 4 +#elif defined(BFLOAT16) +#define IFLOAT bfloat16 +#define XFLOAT IFLOAT +#define FLOAT float +#define SIZE 2 +#define BASE_SHIFT 1 +#define ZBASE_SHIFT 2 +#else +#define FLOAT float +#define SIZE 4 +#define BASE_SHIFT 2 +#define ZBASE_SHIFT 3 +#endif + +#ifndef XFLOAT +#define XFLOAT FLOAT +#endif + +#ifndef IFLOAT +#define IFLOAT FLOAT +#endif + +#ifndef COMPLEX +#define COMPSIZE 1 +#else +#define COMPSIZE 2 +#endif + + +#define Address_H(x) (((x)+(1<<15))>>16) +#define Address_L(x) ((x)-((Address_H(x))<<16)) + +#ifndef MAX_CPU_NUMBER +#define MAX_CPU_NUMBER 2 +#endif + +#if defined(OS_SUNOS) +#define YIELDING thr_yield() +#endif + +#if defined(OS_WINDOWS) +#if defined(_MSC_VER) && !defined(__clang__) +#define YIELDING YieldProcessor() +#else +#define YIELDING SwitchToThread() +#endif +#endif + +#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5) +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#endif + + +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif + + +#if defined(ARCH_X86_64) +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif + +#if defined(ARCH_RISCV64) +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif + + +#ifdef __EMSCRIPTEN__ +#define YIELDING +#endif + +#ifndef YIELDING +#define YIELDING sched_yield() +#endif + +/*** +To alloc job_t on heap or stack. +please https://github.com/xianyi/OpenBLAS/issues/246 +***/ +#if defined(OS_WINDOWS) +#define GETRF_MEM_ALLOC_THRESHOLD 32 +#define BLAS3_MEM_ALLOC_THRESHOLD 32 +#endif + +#ifndef GETRF_MEM_ALLOC_THRESHOLD +#define GETRF_MEM_ALLOC_THRESHOLD 80 +#endif + +#ifndef BLAS3_MEM_ALLOC_THRESHOLD +#define BLAS3_MEM_ALLOC_THRESHOLD 32 +#endif + +#ifdef QUAD_PRECISION +#include "common_quad.h" +#endif + +#ifdef ARCH_ALPHA +#include "common_alpha.h" +#endif + +#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif +#ifndef _CET_ENDBR +#define _CET_ENDBR +#endif + +#ifdef ARCH_X86 +#include "common_x86.h" +#endif + +#ifdef ARCH_X86_64 +#include "common_x86_64.h" +#endif + +#ifdef ARCH_IA64 +#include "common_ia64.h" +#endif + +#ifdef ARCH_POWER +#include "common_power.h" +#endif + +#ifdef sparc +#include "common_sparc.h" +#endif + +#ifdef ARCH_MIPS +#include "common_mips.h" +#endif + + +#ifdef ARCH_RISCV64 +#include "common_riscv64.h" +#endif + +#ifdef ARCH_MIPS64 +#include "common_mips64.h" +#endif + +#ifdef ARCH_ARM +#include "common_arm.h" +#endif + +#ifdef ARCH_ARM64 +#include "common_arm64.h" +#endif + +#ifdef ARCH_ZARCH +#include "common_zarch.h" +#endif + +#ifdef ARCH_LOONGARCH64 +#include "common_loongarch64.h" +#endif + +#ifdef ARCH_E2K +#include "common_e2k.h" +#endif + +#ifdef ARCH_CSKY +#include "common_csky.h" +#endif + +#ifndef ASSEMBLER +#ifdef OS_WINDOWSSTORE +typedef char env_var_t[MAX_PATH]; +#define readenv(p, n) 0 +#else +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) +typedef char env_var_t[MAX_PATH]; +#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) +#else +typedef char* env_var_t; +#define readenv(p, n) ((p)=getenv(n)) +#endif +#endif + +#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) +#ifdef _POSIX_MONOTONIC_CLOCK +#if defined(__GLIBC_PREREQ) // cut the if condition if two lines, otherwise will fail at __GLIBC_PREREQ(2, 17) +#if __GLIBC_PREREQ(2, 17) // don't require -lrt +#define USE_MONOTONIC +#endif +#elif defined(OS_ANDROID) +#define USE_MONOTONIC +#endif +#endif +/* use similar scale as x86 rdtsc for timeouts to work correctly */ +static inline unsigned long long rpcc(void){ +#ifdef USE_MONOTONIC + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; +#elif !defined(OS_EMBEDDED) + struct timeval tv; + gettimeofday(&tv,NULL); + return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; +#else + return 0; +#endif +} +#define RPCC_DEFINED +#define RPCC64BIT +#endif // !RPCC_DEFINED + +#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) +static __inline void blas_lock(volatile BLASULONG *address){ + + do { + while (*address) {YIELDING;}; + + } while (!__sync_bool_compare_and_swap(address, 0, 1)); +} +#define BLAS_LOCK_DEFINED +#endif + +#ifndef RPCC_DEFINED +#error "rpcc() implementation is missing for your platform" +#endif +#ifndef BLAS_LOCK_DEFINED +#error "blas_lock() implementation is missing for your platform" +#endif +#endif // !ASSEMBLER + +#ifdef OS_LINUX +#include "common_linux.h" +#endif + +#ifdef OS_EMBEDDED +#define DTB_DEFAULT_ENTRIES 64 +#endif + +#define MMAP_ACCESS (PROT_READ | PROT_WRITE) + +#ifdef __NetBSD__ +#define MMAP_POLICY (MAP_PRIVATE | MAP_ANON) +#else +#define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) +#endif + +#ifndef ASSEMBLER +/* C99 supports complex floating numbers natively, which GCC also offers as an + extension since version 3.0. If neither are available, use a compatible + structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ +#if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ + (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER) + #define OPENBLAS_COMPLEX_C99 + #ifndef __cplusplus + #include + #endif + typedef float _Complex openblas_complex_float; + typedef double _Complex openblas_complex_double; + typedef xdouble _Complex openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) + #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) +#else + #define OPENBLAS_COMPLEX_STRUCT + typedef struct { float real, imag; } openblas_complex_float; + typedef struct { double real, imag; } openblas_complex_double; + typedef struct { xdouble real, imag; } openblas_complex_xdouble; + #define openblas_make_complex_float(real, imag) {(real), (imag)} + #define openblas_make_complex_double(real, imag) {(real), (imag)} + #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} +#endif +#endif + +#include "param.h" +#include "common_param.h" + +#ifndef STDERR +#define STDERR stderr +#endif + +#ifndef MASK +#define MASK(a, b) (((a) + ((b) - 1)) & ~((b) - 1)) +#endif + +#if defined(XDOUBLE) || defined(DOUBLE) +#define FLOATRET FLOAT +#else +#ifdef NEED_F2CCONV +#define FLOATRET double +#else +#define FLOATRET float +#endif +#endif + +#ifndef ASSEMBLER +#ifndef NOINCLUDE +/* Inclusion of a standard header file is needed for definition of __STDC_* + predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs + as a side effect of including either or . */ +#include +#endif // NOINCLUDE + +#ifdef XDOUBLE +#define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble +#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i) +#elif defined(DOUBLE) +#define OPENBLAS_COMPLEX_FLOAT openblas_complex_double +#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_double(r,i) +#else +#define OPENBLAS_COMPLEX_FLOAT openblas_complex_float +#define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_float(r,i) +#endif + +#if defined(C_PGI) || defined(C_SUN) + #if defined(__STDC_IEC_559_COMPLEX__) + #define CREAL(X) creal(X) + #define CIMAG(X) cimag(X) + #else + #define CREAL(X) (*((FLOAT *)&X + 0)) + #define CIMAG(X) (*((FLOAT *)&X + 1)) + #endif +#else +#ifdef OPENBLAS_COMPLEX_STRUCT +#define CREAL(Z) ((Z).real) +#define CIMAG(Z) ((Z).imag) +#else +#define CREAL __real__ +#define CIMAG __imag__ +#endif +#endif + +#endif // ASSEMBLER + +#ifndef IFLUSH +#define IFLUSH +#endif + +#ifndef IFLUSH_HALF +#define IFLUSH_HALF +#endif + +#if defined(C_GCC) && (( __GNUC__ <= 3) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 2))) +#ifdef USE_OPENMP +#undef USE_OPENMP +#endif +#endif + +#if defined(C_MSVC) +#define inline __inline +#endif + +#ifndef ASSEMBLER + +#ifndef MIN +#define MIN(a,b) (a>b? b:a) +#endif + +#ifndef MAX +#define MAX(a,b) (a 0x60) (a) -= 0x20;} + +#if defined(__FreeBSD__) || defined(__APPLE__) +#define MAP_ANONYMOUS MAP_ANON +#endif + +/* Common Memory Management Routine */ +void blas_set_parameter(void); +int blas_get_cpu_number(void); +void *blas_memory_alloc (int); +void blas_memory_free (void *); +void *blas_memory_alloc_nolock (int); //use malloc without blas_lock +void blas_memory_free_nolock (void *); + +int get_num_procs (void); + +#if defined(OS_LINUX) && defined(SMP) && !defined(NO_AFFINITY) +int get_num_nodes (void); +int get_num_proc (int); +int get_node_equal (void); +#endif + +void goto_set_num_threads(int); + +void gotoblas_affinity_init(void); +void gotoblas_affinity_quit(void); +void gotoblas_dynamic_init(void); +void gotoblas_dynamic_quit(void); +void gotoblas_profile_init(void); +void gotoblas_profile_quit(void); + +int support_avx512(void); + +#ifdef USE_OPENMP + +#ifndef C_MSVC +int omp_in_parallel(void); +int omp_get_num_procs(void); +#else +__declspec(dllimport) int __cdecl omp_in_parallel(void); +__declspec(dllimport) int __cdecl omp_get_num_procs(void); +#endif + +#ifdef HAVE_C11 +#if defined(C_GCC) && ( __GNUC__ < 7) +// workaround for GCC bug 65467 +#ifndef _Atomic +#define _Atomic volatile +#endif +#endif +#include +#else +#ifndef _Atomic +#define _Atomic volatile +#endif +#endif + +#else +#ifdef __ELF__ +int omp_in_parallel (void) __attribute__ ((weak)); +int omp_get_num_procs(void) __attribute__ ((weak)); +#endif +#endif + +static __inline void blas_unlock(volatile BLASULONG *address){ + MB; + *address = 0; +} + +#ifdef OS_WINDOWSSTORE +static __inline int readenv_atoi(char *env) { + return 0; +} +#else +#ifdef OS_WINDOWS +static __inline int readenv_atoi(char *env) { + env_var_t p; + return readenv(p,env) ? 0 : atoi(p); +} +#else +static __inline int readenv_atoi(char *env) { + char *p; + if (( p = getenv(env) )) + return (atoi(p)); + else + return(0); +} +#endif +#endif + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + +static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){ + +#ifndef UNIT + FLOAT ratio, den; + + if ( +#ifdef XDOUBLE + (fabsl(ar)) >= (fabsl(ai)) +#elif defined DOUBLE + (fabs (ar)) >= (fabs (ai)) +#else + (fabsf(ar)) >= (fabsf(ai)) +#endif + ) { + ratio = ai / ar; + den = (FLOAT)(ONE / (ar * (ONE + ratio * ratio))); + ar = den; + ai = -ratio * den; + } else { + ratio = ar / ai; + den = (FLOAT)(ONE /(ai * (ONE + ratio * ratio))); + ar = ratio * den; + ai = -den; + } + b[0] = ar; + b[1] = ai; +#else + b[0] = ONE; + b[1] = ZERO; +#endif + +} +#endif + +#ifdef MALLOC_DEBUG +void *blas_debug_alloc(int); +void *blas_debug_free(void *); +#undef malloc +#undef free +#define malloc(a) blas_debug_alloc(a) +#define free(a) blas_debug_free (a) +#endif + +#ifndef COPYOVERHEAD +#define GEMMRETTYPE int +#else + +typedef struct { + double outercopy; + double innercopy; + double kernel; + double mflops; +} copyoverhead_t; + +#define GEMMRETTYPE copyoverhead_t +#endif +#endif + +#ifndef BUILD_KERNEL +#define KNAME(A, B) A +#else +#define KNAME(A, B) A##B +#endif + +#include "common_interface.h" +#ifdef SANITY_CHECK +#include "common_reference.h" +#endif +#include "common_macro.h" +#include "common_level1.h" +#include "common_level2.h" +#include "common_level3.h" +#include "common_lapack.h" + +#ifdef CBLAS +# define OPENBLAS_CONST /* see comment in cblas.h */ +# include "cblas.h" +#endif + +#ifndef ASSEMBLER +#include "common_stackalloc.h" +#if 0 +#include "symcopy.h" +#endif + +#if defined(SMP_SERVER) && defined(SMP_ONDEMAND) +#error Both SMP_SERVER and SMP_ONDEMAND are specified. +#endif + +#if defined(SMP_SERVER) || defined(SMP_ONDEMAND) +#include "common_thread.h" +#endif + +#endif + +#define INFO_NUM 99 + +#ifndef DEFAULT_CPU_NUMBER +#define DEFAULT_CPU_NUMBER 4 +#endif + +#ifndef IDEBUG_START +#define IDEBUG_START +#endif + +#ifndef IDEBUG_END +#define IDEBUG_END +#endif + +#if !defined(ASSEMBLER) && defined(FUNCTION_PROFILE) + +typedef struct { + int func; + unsigned long long calls, fops, area, cycles, tcycles; +} func_profile_t; + +extern func_profile_t function_profile_table[]; +extern int gotoblas_profile; + +#ifdef XDOUBLE +#define NUMOPT QNUMOPT +#elif defined DOUBLE +#define NUMOPT DNUMOPT +#else +#define NUMOPT SNUMOPT +#endif + +#define FUNCTION_PROFILE_START() { unsigned long long profile_start = rpcc(), profile_end; +#ifdef SMP +#define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ + if (gotoblas_profile) { \ + profile_end = rpcc(); \ + function_profile_table[PROFILE_FUNC_NAME].calls ++; \ + function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ + function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ + function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ + function_profile_table[PROFILE_FUNC_NAME].tcycles += blas_cpu_number * (profile_end - profile_start); \ + } \ + } +#else +#define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ + if (gotoblas_profile) { \ + profile_end = rpcc(); \ + function_profile_table[PROFILE_FUNC_NAME].calls ++; \ + function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ + function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ + function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ + function_profile_table[PROFILE_FUNC_NAME].tcycles += (profile_end - profile_start); \ + } \ + } +#endif + +#else +#define FUNCTION_PROFILE_START() +#define FUNCTION_PROFILE_END(COMP, AREA, OPS) +#endif + +#if 1 +#define PRINT_DEBUG_CNAME +#define PRINT_DEBUG_NAME +#else +#define PRINT_DEBUG_CNAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_CNAME) +#define PRINT_DEBUG_NAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME) +#endif + +#ifdef __cplusplus +} + +#endif /* __cplusplus */ + +#endif