From ca31c32693bbb70cd8eeee5f2be09a7e9d1b363c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:49:22 +0200 Subject: [PATCH] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- cblas.h | 2 +- common.h | 4 +- common_interface.h | 4 +- common_level1.h | 2 +- common_level3.h | 28 +++---- common_macro.h | 94 +++++++++++----------- common_param.h | 230 ++++++++++++++++++++++------------------------------- getarch_2nd.c | 4 +- param.h | 32 ++++---- 9 files changed, 178 insertions(+), 222 deletions(-) diff --git a/cblas.h b/cblas.h index 21f3958..4fc6f86 100644 --- a/cblas.h +++ b/cblas.h @@ -392,7 +392,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE /* convert BFLOAT16 array to double array */ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); /* dot production of BFLOAT16 input arrays, and output as float */ -float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); +float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/common.h b/common.h index ab28726..89eeb19 100644 --- a/common.h +++ b/common.h @@ -260,7 +260,7 @@ typedef unsigned long BLASULONG; #ifndef BFLOAT16 #include typedef uint16_t bfloat16; -#define HALFCONVERSION 1 +#define BFLOAT16CONVERSION 1 #endif #ifdef USE64BITINT @@ -303,7 +303,7 @@ typedef int blasint; #define SIZE 8 #define BASE_SHIFT 3 #define ZBASE_SHIFT 4 -#elif defined(HALF) +#elif defined(BFLOAT16) #define IFLOAT bfloat16 #define XFLOAT IFLOAT #define FLOAT float diff --git a/common_interface.h b/common_interface.h index 35a957a..bee09e8 100644 --- a/common_interface.h +++ b/common_interface.h @@ -54,7 +54,7 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); -float BLASFUNC(shdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); +float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *); @@ -474,7 +474,7 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint /* Level 3 routines */ -void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *, +void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *, bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); diff --git a/common_level1.h b/common_level1.h index 88aa275..7b17962 100644 --- a/common_level1.h +++ b/common_level1.h @@ -46,7 +46,7 @@ float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); -float shdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); +float sbdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); diff --git a/common_level3.h b/common_level3.h index 671a7a0..c4f9435 100644 --- a/common_level3.h +++ b/common_level3.h @@ -55,7 +55,7 @@ void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); -int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, +int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -78,10 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #endif -int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); @@ -505,7 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); -int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); +int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); @@ -534,10 +534,10 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); -int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); @@ -631,10 +631,10 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif -int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 3d6bcd9..605d74a 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,7 +644,7 @@ #define GEADD_K DGEADD_K -#elif defined(HALF) +#elif defined(BFLOAT16) #define D_TO_BF16_K SHDTOBF16_K #define D_BF16_TO_K DBF16TOD_K @@ -662,7 +662,7 @@ #define ASUM_K SASUM_K #define DOTU_K SDOTU_K #define DOTC_K SDOTC_K -#define BF16_DOT_K SHDOT_K +#define BF16_DOT_K SBDOT_K #define AXPYU_K SAXPYU_K #define AXPYC_K SAXPYC_K #define AXPBY_K SAXPBY_K @@ -682,32 +682,32 @@ #define NRM2_K SNRM2_K #define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_L SSYMV_THREAD_L -#define GEMM_BETA SHGEMM_BETA -#define GEMM_KERNEL_N SHGEMM_KERNEL -#define GEMM_KERNEL_L SHGEMM_KERNEL -#define GEMM_KERNEL_R SHGEMM_KERNEL -#define GEMM_KERNEL_B SHGEMM_KERNEL - -#define GEMM_NN SHGEMM_NN -#define GEMM_CN SHGEMM_TN -#define GEMM_TN SHGEMM_TN -#define GEMM_NC SHGEMM_NT -#define GEMM_NT SHGEMM_NT -#define GEMM_CC SHGEMM_TT -#define GEMM_CT SHGEMM_TT -#define GEMM_TC SHGEMM_TT -#define GEMM_TT SHGEMM_TT -#define GEMM_NR SHGEMM_NN -#define GEMM_TR SHGEMM_TN -#define GEMM_CR SHGEMM_TN -#define GEMM_RN SHGEMM_NN -#define GEMM_RT SHGEMM_NT -#define GEMM_RC SHGEMM_NT -#define GEMM_RR SHGEMM_NN -#define GEMM_ONCOPY SHGEMM_ONCOPY -#define GEMM_OTCOPY SHGEMM_OTCOPY -#define GEMM_INCOPY SHGEMM_INCOPY -#define GEMM_ITCOPY SHGEMM_ITCOPY +#define GEMM_BETA SBGEMM_BETA +#define GEMM_KERNEL_N SBGEMM_KERNEL +#define GEMM_KERNEL_L SBGEMM_KERNEL +#define GEMM_KERNEL_R SBGEMM_KERNEL +#define GEMM_KERNEL_B SBGEMM_KERNEL + +#define GEMM_NN SBGEMM_NN +#define GEMM_CN SBGEMM_TN +#define GEMM_TN SBGEMM_TN +#define GEMM_NC SBGEMM_NT +#define GEMM_NT SBGEMM_NT +#define GEMM_CC SBGEMM_TT +#define GEMM_CT SBGEMM_TT +#define GEMM_TC SBGEMM_TT +#define GEMM_TT SBGEMM_TT +#define GEMM_NR SBGEMM_NN +#define GEMM_TR SBGEMM_TN +#define GEMM_CR SBGEMM_TN +#define GEMM_RN SBGEMM_NN +#define GEMM_RT SBGEMM_NT +#define GEMM_RC SBGEMM_NT +#define GEMM_RR SBGEMM_NN +#define GEMM_ONCOPY SBGEMM_ONCOPY +#define GEMM_OTCOPY SBGEMM_OTCOPY +#define GEMM_INCOPY SBGEMM_INCOPY +#define GEMM_ITCOPY SBGEMM_ITCOPY #define SYMM_THREAD_LU SSYMM_THREAD_LU #define SYMM_THREAD_LL SSYMM_THREAD_LL #define SYMM_THREAD_RU SSYMM_THREAD_RU @@ -723,22 +723,22 @@ #define HEMM_THREAD_RU SHEMM_THREAD_RU #define HEMM_THREAD_RL SHEMM_THREAD_RL -#define GEMM_THREAD_NN SHGEMM_THREAD_NN -#define GEMM_THREAD_CN SHGEMM_THREAD_TN -#define GEMM_THREAD_TN SHGEMM_THREAD_TN -#define GEMM_THREAD_NC SHGEMM_THREAD_NT -#define GEMM_THREAD_NT SHGEMM_THREAD_NT -#define GEMM_THREAD_CC SHGEMM_THREAD_TT -#define GEMM_THREAD_CT SHGEMM_THREAD_TT -#define GEMM_THREAD_TC SHGEMM_THREAD_TT -#define GEMM_THREAD_TT SHGEMM_THREAD_TT -#define GEMM_THREAD_NR SHGEMM_THREAD_NN -#define GEMM_THREAD_TR SHGEMM_THREAD_TN -#define GEMM_THREAD_CR SHGEMM_THREAD_TN -#define GEMM_THREAD_RN SHGEMM_THREAD_NN -#define GEMM_THREAD_RT SHGEMM_THREAD_NT -#define GEMM_THREAD_RC SHGEMM_THREAD_NT -#define GEMM_THREAD_RR SHGEMM_THREAD_NN +#define GEMM_THREAD_NN SBGEMM_THREAD_NN +#define GEMM_THREAD_CN SBGEMM_THREAD_TN +#define GEMM_THREAD_TN SBGEMM_THREAD_TN +#define GEMM_THREAD_NC SBGEMM_THREAD_NT +#define GEMM_THREAD_NT SBGEMM_THREAD_NT +#define GEMM_THREAD_CC SBGEMM_THREAD_TT +#define GEMM_THREAD_CT SBGEMM_THREAD_TT +#define GEMM_THREAD_TC SBGEMM_THREAD_TT +#define GEMM_THREAD_TT SBGEMM_THREAD_TT +#define GEMM_THREAD_NR SBGEMM_THREAD_NN +#define GEMM_THREAD_TR SBGEMM_THREAD_TN +#define GEMM_THREAD_CR SBGEMM_THREAD_TN +#define GEMM_THREAD_RN SBGEMM_THREAD_NN +#define GEMM_THREAD_RT SBGEMM_THREAD_NT +#define GEMM_THREAD_RC SBGEMM_THREAD_NT +#define GEMM_THREAD_RR SBGEMM_THREAD_NN #ifdef UNIT @@ -2491,9 +2491,9 @@ #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; -extern BLASLONG shgemm_p; -extern BLASLONG shgemm_q; -extern BLASLONG shgemm_r; +extern BLASLONG sbgemm_p; +extern BLASLONG sbgemm_q; +extern BLASLONG sbgemm_r; extern BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; diff --git a/common_param.h b/common_param.h index 0fe5e6c..3615230 100644 --- a/common_param.h +++ b/common_param.h @@ -47,9 +47,9 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; -#ifdef BUILD_HALF - int shgemm_p, shgemm_q, shgemm_r; - int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; +#ifdef BUILD_BFLOAT16 + int sbgemm_p, sbgemm_q, sbgemm_r; + int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; void (*shstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); @@ -69,8 +69,8 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); float (*shasum_k) (BLASLONG, float *, BLASLONG); float (*shsum_k) (BLASLONG, float *, BLASLONG); int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*shdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); - double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*sbdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); + double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); @@ -78,20 +78,20 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); - int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); + int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); - int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -147,14 +147,14 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); #endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; #endif int exclusive_cache; -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) float (*samax_k) (BLASLONG, float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); @@ -167,11 +167,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); #endif - -#if BUILD_SINGLE +#ifdef BUILD_SINGLE float (*ssum_k) (BLASLONG, float *, BLASLONG); #endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -179,26 +178,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -#endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -#endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif - -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif - -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) #ifdef ARCH_X86_64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); @@ -213,8 +206,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); #endif - -#if (BUILD_SINGLE) || (BUILD_DOUBLE) +#ifdef BUILD_SINGLE int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -236,8 +228,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); -#endif -#if BUILD_SINGLE + int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -264,18 +255,17 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); -#endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) + int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int dgemm_p, dgemm_q, dgemm_r; int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); @@ -286,21 +276,21 @@ BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE double (*dsum_k) (BLASLONG, double *, BLASLONG); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); #endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) +#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -308,15 +298,13 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); #endif - -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); #endif - -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -325,8 +313,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); #endif - -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -473,30 +460,23 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); #endif - -#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) +#ifdef BUILD_COMPLEX int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; + float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG); BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); -#endif -#if BUILD_COMPLEX float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); float (*csum_k) (BLASLONG, float *, BLASLONG); -#endif -#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); -#endif -#if BUILD_COMPLEX int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); -#endif -#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) + int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -510,8 +490,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); -#endif -#if (BUILD_COMPLEX) int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); @@ -523,14 +501,13 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); -#endif -#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -561,8 +538,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); -#endif -#if (BUILD_COMPLEX) int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -646,14 +621,12 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); -#endif -#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) + int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif - -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; @@ -991,35 +964,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); void (*init)(void); int snum_opt, dnum_opt, qnum_opt; - -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); #endif -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); @@ -1031,7 +1003,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); @@ -1043,21 +1015,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); #endif -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); @@ -1069,7 +1041,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); @@ -1081,16 +1053,16 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); #endif -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); #endif } gotoblas_t; @@ -1104,16 +1076,16 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache -#ifdef BUILD_HALF -#define SHGEMM_P gotoblas -> shgemm_p -#define SHGEMM_Q gotoblas -> shgemm_q -#define SHGEMM_R gotoblas -> shgemm_r -#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m -#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n -#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#ifdef BUILD_BFLOAT16 +#define SBGEMM_P gotoblas -> sbgemm_p +#define SBGEMM_Q gotoblas -> sbgemm_q +#define SBGEMM_R gotoblas -> sbgemm_r +#define SBGEMM_UNROLL_M gotoblas -> sbgemm_unroll_m +#define SBGEMM_UNROLL_N gotoblas -> sbgemm_unroll_n +#define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn #endif -#if (BUILD_SINGLE) +#if defined (BUILD_SINGLE) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r @@ -1122,21 +1094,13 @@ extern gotoblas_t *gotoblas; #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn #endif -#if (BUILD_DOUBLE) +#if defined (BUILD_DOUBLE) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn -#if ! (BUILD_SINGLE) -#define SGEMM_P gotoblas -> sgemm_p -#define SGEMM_Q gotoblas -> sgemm_q -#define SGEMM_R gotoblas -> sgemm_r -#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m -#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n -#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn -#endif #endif #define QGEMM_P gotoblas -> qgemm_p @@ -1146,7 +1110,7 @@ extern gotoblas_t *gotoblas; #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r @@ -1163,7 +1127,7 @@ extern gotoblas_t *gotoblas; #endif #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r @@ -1178,14 +1142,6 @@ extern gotoblas_t *gotoblas; #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn #endif -#ifndef BUILD_COMPLEX -#define CGEMM_P gotoblas -> cgemm_p -#define CGEMM_Q gotoblas -> cgemm_q -#define CGEMM_R gotoblas -> cgemm_r -#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m -#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n -#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn -#endif #endif #define XGEMM_P gotoblas -> xgemm_p @@ -1230,16 +1186,16 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif -#ifdef BUILD_HALF -#define SHGEMM_P SHGEMM_DEFAULT_P -#define SHGEMM_Q SHGEMM_DEFAULT_Q -#define SHGEMM_R SHGEMM_DEFAULT_R -#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#ifdef SHGEMM_DEFAULT_UNROLL_MN -#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN +#ifdef BUILD_BFLOAT16 +#define SBGEMM_P SBGEMM_DEFAULT_P +#define SBGEMM_Q SBGEMM_DEFAULT_Q +#define SBGEMM_R SBGEMM_DEFAULT_R +#define SBGEMM_UNROLL_M SBGEMM_DEFAULT_UNROLL_M +#define SBGEMM_UNROLL_N SBGEMM_DEFAULT_UNROLL_N +#ifdef SBGEMM_DEFAULT_UNROLL_MN +#define SBGEMM_UNROLL_MN SBGEMM_DEFAULT_UNROLL_MN #else -#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) +#define SBGEMM_UNROLL_MN MAX((SBGEMM_UNROLL_M), (SBGEMM_UNROLL_N)) #endif #endif @@ -1354,7 +1310,7 @@ extern gotoblas_t *gotoblas; #endif #ifndef COMPLEX -#if (XDOUBLE) +#if defined(XDOUBLE) #define GEMM_P QGEMM_P #define GEMM_Q QGEMM_Q #define GEMM_R QGEMM_R @@ -1378,18 +1334,18 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif (HALF) -#define GEMM_P SHGEMM_P -#define GEMM_Q SHGEMM_Q -#define GEMM_R SHGEMM_R -#define GEMM_UNROLL_M SHGEMM_UNROLL_M -#define GEMM_UNROLL_N SHGEMM_UNROLL_N -#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N +#elif defined(BFLOAT16) +#define GEMM_P SBGEMM_P +#define GEMM_Q SBGEMM_Q +#define GEMM_R SBGEMM_R +#define GEMM_UNROLL_M SBGEMM_UNROLL_M +#define GEMM_UNROLL_N SBGEMM_UNROLL_N +#define GEMM_UNROLL_MN SBGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SBGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SBGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SBGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SBGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SBGEMM_DEFAULT_UNROLL_N #else #define GEMM_P SGEMM_P #define GEMM_Q SGEMM_Q @@ -1404,7 +1360,7 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N #endif #else -#if (XDOUBLE) +#if defined(XDOUBLE) #define GEMM_P XGEMM_P #define GEMM_Q XGEMM_Q #define GEMM_R XGEMM_R @@ -1475,8 +1431,8 @@ extern gotoblas_t *gotoblas; #define GEMM_THREAD gemm_thread_n #endif -#ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) +#ifndef SBGEMM_DEFAULT_R +#define SBGEMM_DEFAULT_R (((BUFFER_SIZE - ((SBGEMM_DEFAULT_P * SBGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SBGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef SGEMM_DEFAULT_R @@ -1518,7 +1474,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_P #ifdef XDOUBLE #define GEMM3M_P XGEMM3M_P -#elif defined (DOUBLE) +#elif defined(DOUBLE) #define GEMM3M_P ZGEMM3M_P #else #define GEMM3M_P CGEMM3M_P @@ -1528,7 +1484,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_Q #ifdef XDOUBLE #define GEMM3M_Q XGEMM3M_Q -#elif defined (DOUBLE) +#elif defined(DOUBLE) #define GEMM3M_Q ZGEMM3M_Q #else #define GEMM3M_Q CGEMM3M_Q @@ -1538,7 +1494,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_R #ifdef XDOUBLE #define GEMM3M_R XGEMM3M_R -#elif defined (DOUBLE) +#elif defined(DOUBLE) #define GEMM3M_R ZGEMM3M_R #else #define GEMM3M_R CGEMM3M_R diff --git a/getarch_2nd.c b/getarch_2nd.c index a1d0cca..c390ef5 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -9,8 +9,8 @@ int main(int argc, char **argv) { if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { - printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); - printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); + printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M); + printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); diff --git a/param.h b/param.h index 1ab982d..f3ddde6 100644 --- a/param.h +++ b/param.h @@ -72,12 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H -#define SHGEMM_DEFAULT_UNROLL_N 4 -#define SHGEMM_DEFAULT_UNROLL_M 8 -#define SHGEMM_DEFAULT_UNROLL_MN 32 -#define SHGEMM_DEFAULT_P 256 -#define SHGEMM_DEFAULT_R 256 -#define SHGEMM_DEFAULT_Q 256 +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 8 +#define SBGEMM_DEFAULT_UNROLL_MN 32 +#define SBGEMM_DEFAULT_P 256 +#define SBGEMM_DEFAULT_R 256 +#define SBGEMM_DEFAULT_Q 256 #ifdef OPTERON #define SNUMOPT 4 @@ -2426,16 +2426,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER10) -#undef SHGEMM_DEFAULT_UNROLL_N -#undef SHGEMM_DEFAULT_UNROLL_M -#undef SHGEMM_DEFAULT_P -#undef SHGEMM_DEFAULT_R -#undef SHGEMM_DEFAULT_Q -#define SHGEMM_DEFAULT_UNROLL_M 16 -#define SHGEMM_DEFAULT_UNROLL_N 8 -#define SHGEMM_DEFAULT_P 832 -#define SHGEMM_DEFAULT_Q 1026 -#define SHGEMM_DEFAULT_R 4096 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_UNROLL_N 8 +#define SBGEMM_DEFAULT_P 832 +#define SBGEMM_DEFAULT_Q 1026 +#define SBGEMM_DEFAULT_R 4096 #endif #if defined(SPARC) && defined(V7) -- 2.7.4