Merge branch 'openblas-open-910' of git://github.com/damonyu1989/OpenBLAS into damonyu1989-openblas-open-910
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 10.dev)
+set(OpenBLAS_PATCH_VERSION 12.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
OpenBLAS ChangeLog
====================================================================
+Version 0.3.12
+ 24-Oct-2020
+
+common:
+ * Fixed missing BLAS/LAPACK functions (inadvertently dropped during
+ the build system restructuring)
+ * Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
+
+POWER:
+ * Added optimized SCOPY/CCOPY kernels for POWER10
+ * Increased and unified the default size of the GEMM BUFFER
+ * Fixed building for POWER10 in DYNAMIC_ARCH mode
+ * POWER10 compatibility test now checks binutils version as well
+ * Cleaned up compiler warnings
+
+x86_64:
+ * Corrected compiler version checks for AVX2 compatibility
+ * Added compiler option -mavx2 for building with flang
+ * Fixed direct SGEMM pathway for small matrix sizes (broken by
+ the code refactoring in 0.3.11)
+ * Fixed unhandled partial register clobbers in several kernels
+ for AXPY, DOT, GEMV_N and GEMV_T flagged by the gcc10 tree-vectorizer
+
+ARMV8:
+ * Improved Apple Vortex support to include cross-compiling
+
+====================================================================
+Version 0.3.11
+ 17-Oct-2020
+
+common:
+ * API change:
+ the newly added BFLOAT16 functions were renamed to use the
+ letter "B" instead of "H" to avoid potential confusion with
+ the IEEE "half precision float" type, i.e. the 0.3.10
+ SHGEMM is now SBGEMM and the corresponding build option
+ was changed from "BUILD_HALF" to "BUILD_BFLOAT16".
+ * Reduced the default BLAS3_MEM_ALLOC_THRESHOLD (used as an upper
+ limit for placing temporary arrays on the stack) to be compatible
+ with a stack size of 1MB (as imposed by the Java runtime library)
+ * Added mixed-precision dot function SBDOT and utility functions
+ sbstobf16, sbdtobf16, sbf16tos and dbf16tod to convert between
+ single or double precision float arrays and bfloat16 arrays
+ * Fixed prototypes of LAPACK_?ggsvp and LAPACK_?ggsvd functions
+ in lapack.h
+ * Fixed underflow and rounding errors in LAPACK SLANV2 and DLANV2
+ (causing miscalculations in e.g. SHSEQR/DHSEQR, LAPACK issue #263)
+ * Fixed workspace calculation in LAPACK ?GELQ (LAPACK issue #415)
+ * Fixed several bugs in the LAPACK testsuite
+ * Improved performance of TRMM and TRSM for certain problem sizes
+ * Fixed infinite recursions and workspace miscalculations in ReLAPACK
+ * CMAKE builds no longer require pkg-config for creating the .pc file
+ * Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as
+ enabling these options
+ * Fixed detection of gfortran when invoked through an mpi wrapper
+ * Improved thread reinitialization performance with OpenMP after a fork
+ * Added support for building only the subset of the library required
+ for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
+ * Optional function name prefixes and suffixes are now correctly
+ reflected in the generated cblas.h
+ * Added CMAKE build support for the LAPACK and multithreading tests
+
+POWER:
+ * Added optimized support for POWER10
+ * Added support for compiling for POWER8 in 32bit mode
+ * Added support for compilation with LLVM/clang
+ * Added support for compilation with NVIDIA/PGI compilers
+ * Fixed building on big-endian POWER8
+ * Fixed miscompilation of ZDOTC by gcc10
+ * Fixed alignment errors in the POWER8 SAXPY kernel
+ * Improved CPU detection on AIX
+ * Supported building with older compilers on POWER9
+
+x86_64:
+ * Added support for Intel Cooperlake
+ * Added autodetection of AMD Renoir/Matisse/Zen3 cpus
+ * Added autodetection of Intel Comet Lake cpus
+ * Reimplemented ?sum, ?dot and daxpy using universal intrinsics
+ * Reset the fpu state before using the fpu on Windows as a workaround
+ for a problem introduced in Windows 10 build 19041 (a.k.a. SDK 2004)
+ * Fixed potentially undefined behaviour in the dot and gemv_t kernels
+ * Fixed a potential segmentation fault in DYNAMIC_ARCH builds
+ * Fixed building for ZEN with PGI/NVIDIA and AMD AOCC compilers
+
+ARMV7:
+ * Fixed cpu detection on BSD-like systems
+
+ARMV8:
+ * Added preliminary support for Apple Vortex cpus
+ * Added support for the Cavium ThunderX3T110 cpu
+ * Fixed cpu detection on BSD-like systems
+ * Fixed compilation in -std=c18 mode
+
+IBM Z:
+ * Added support for compiling with the clang compiler
+ * Improved GEMM performance on Z14
+
+====================================================================
Version 0.3.10
14-Jun-2020
CCOMMON_OPT += -mfpu=vfp
FCOMMON_OPT += -mfpu=vfp
endif
+
+ifdef HAVE_NEON
+CCOMMON_OPT += -mfpu=neon
+FCOMMON_OPT += -mfpu=neon
+endif
endif
ifeq ($(CORE), POWER10)
-COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
endif
#
# This library's version
-VERSION = 0.3.10.dev
+VERSION = 0.3.12.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
-# the below is not yet configurable, use cmake if you need to build only select types
-BUILD_SINGLE = 1
-BUILD_DOUBLE = 1
-BUILD_COMPLEX = 1
-BUILD_COMPLEX16 = 1
+# By default the library contains BLAS functions (and LAPACK if selected) for all input types.
+# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
+# the functions for complex numbers, uncomment the desired type(s) below
+# BUILD_SINGLE = 1
+# BUILD_DOUBLE = 1
+# BUILD_COMPLEX = 1
+# BUILD_COMPLEX16 = 1
+#
# End of user configuration
#
INCLUDED = 1
ifndef TOPDIR
-TOPDIR = .
+TOPDIR = .
endif
# If ARCH is not set, we use the host system's architecture for getarch compile options.
ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET)
GETARCH_FLAGS += -DUSER_TARGET
+# When TARGET is GENERIC and DYNAMIC_ARCH is 1, force NO_EXPRECISION=1 and
+# export it so that sub-makes see the same setting. The exported name must
+# match the assigned one exactly (make variable names are case-sensitive).
+ifeq ($(TARGET), GENERIC)
+ifeq ($(DYNAMIC_ARCH), 1)
+override NO_EXPRECISION=1
+export NO_EXPRECISION
+endif
+endif
endif
# Force fallbacks for 32bit
ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf
else
+HAVE_NEON=
+HAVE_VFP=
+HAVE_VFPV3=
+HAVE_VFPV4=
+HAVE_MMX=
+HAVE_SSE=
+HAVE_SSE2=
+HAVE_SSE3=
+HAVE_SSSE3=
+HAVE_SSE4_1=
+HAVE_SSE4_2=
+HAVE_SSE4A=
+HAVE_SSE5=
+HAVE_AVX=
+HAVE_AVX2=
+HAVE_FMA3=
include $(TOPDIR)/Makefile_kernel.conf
endif
else
GCCDUMPVERSION_PARAM := -dumpversion
endif
+GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
endif
ifneq ($(C_COMPILER), GCC)
DYNAMIC_CORE += POWER9
DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
endif
ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGT5), 1)
else
# Note: no comma after "info" - with a comma make does not recognise the function and prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
endif
-ifeq ($(GCCVERSIONGTEQ11), 1)
+# 1 if the second dot-separated field of the first line of "ld --version" is >= 35, else 0.
+# The ">=" is escaped so the shell passes it to expr instead of treating ">" as a redirection.
+LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
+ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
else ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq ($(GCCMINORVERSIONGTEQ2), 1)
+ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
endif
else
# Note: no comma after "info" - with a comma make does not recognise the function and prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
ifndef BINARY_DEFINED
ifneq ($(OSNAME), AIX)
ifdef BINARY64
+ifneq ($(ARCH), riscv64)
CCOMMON_OPT += -m64
+endif
else
CCOMMON_OPT += -m32
endif
FCOMMON_OPT += -Mrecursive -Kieee
ifeq ($(OSNAME), Linux)
ifeq ($(ARCH), x86_64)
-FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
+# Capture the text before the first "." on the first line of the Fortran compiler's
+# version banner. No backticks here: inside $(shell ...) they would make the shell
+# execute the banner text as a command instead of returning it.
+# NOTE(review): confirm this text is exactly what the AOCC comparison below expects.
+FLANG_VENDOR := $(shell $(FC) --version|cut -f 1 -d "."|head -1)
ifeq ($(FLANG_VENDOR),AOCC)
FCOMMON_OPT += -fno-unroll-loops
endif
else
ifdef BINARY64
ifneq ($(OSNAME), AIX)
+ifneq ($(ARCH), riscv64)
FCOMMON_OPT += -m64
endif
+endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -fdefault-integer-8
export HAVE_SSE4A
export HAVE_SSE5
export HAVE_AVX
+export HAVE_AVX2
+export HAVE_FMA3
export HAVE_VFP
export HAVE_VFPV3
export HAVE_VFPV4
else
LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm
endif
+
+ifdef HAVE_SSE3
+ifndef DYNAMIC_ARCH
+CCOMMON_OPT += -msse3
+FCOMMON_OPT += -msse3
+ifdef HAVE_SSSE3
+CCOMMON_OPT += -mssse3
+FCOMMON_OPT += -mssse3
+endif
+ifdef HAVE_SSE4_1
+CCOMMON_OPT += -msse4.1
+FCOMMON_OPT += -msse4.1
+endif
+endif
+endif
+
endif
ifdef HAVE_SSE3
-ifndef DYNAMIC_ARCH
CCOMMON_OPT += -msse3
FCOMMON_OPT += -msse3
+endif
ifdef HAVE_SSSE3
CCOMMON_OPT += -mssse3
FCOMMON_OPT += -mssse3
endif
+ifdef HAVE_SSE4_1
+CCOMMON_OPT += -msse4.1
+FCOMMON_OPT += -msse4.1
+endif
+ifdef HAVE_AVX
+CCOMMON_OPT += -mavx
+FCOMMON_OPT += -mavx
+endif
+ifdef HAVE_AVX2
+CCOMMON_OPT += -mavx2
+FCOMMON_OPT += -mavx2
endif
+ifdef HAVE_FMA3
+CCOMMON_OPT += -mfma
+FCOMMON_OPT += -mfma
endif
ifeq ($(CORE), SKYLAKEX)
ifndef NO_AVX512
ifeq ($(C_COMPILER), GCC)
# cooperlake support was added in 10.1
-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
-GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
CCOMMON_OPT += -march=cooperlake
FCOMMON_OPT += -march=cooperlake
endif
endif
-ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE))
-ifndef DYNAMIC_ARCH
+ifdef HAVE_AVX2
ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0
-GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
-GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
-ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
+CCOMMON_OPT += -mavx2
+endif
+else
+ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -mavx2
endif
endif
ifeq ($(F_COMPILER), GFORTRAN)
# AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
+GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
-ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
FCOMMON_OPT += -mavx2
endif
+else
+ifeq ($(F_COMPILER), FLANG)
+FCOMMON_OPT += -mavx2
endif
endif
endif
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef AMAX\r
\r
#ifdef COMPLEX\r
#ifdef DOUBLE\r
-#define AMAX BLASFUNC(dzamax)\r
+#define AMAX BLASFUNC(dzamax)\r
#else\r
-#define AMAX BLASFUNC(scamax)\r
+#define AMAX BLASFUNC(scamax)\r
#endif\r
#else\r
#ifdef DOUBLE\r
-#define AMAX BLASFUNC(damax)\r
+#define AMAX BLASFUNC(damax)\r
#else\r
-#define AMAX BLASFUNC(samax)\r
+#define AMAX BLASFUNC(samax)\r
#endif\r
#endif\r
\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz){\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size){\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
-int main(int argc, char *argv[]){\r
+int main(int argc, char *argv[])\r
+{\r
\r
FLOAT *x;\r
blasint m, i;\r
- blasint inc_x=1;\r
+ blasint inc_x = 1;\r
int loops = 1;\r
int l;\r
char *p;\r
\r
+ int from = 1;\r
+ int to = 200;\r
+ int step = 1;\r
\r
- int from = 1;\r
- int to = 200;\r
- int step = 1;\r
-\r
- struct timeval start, stop;\r
- double time1,timeg;\r
+ double time1, timeg;\r
\r
- argc--;argv++;\r
+ argc--;\r
+ argv++;\r
\r
- if (argc > 0) { from = atol(*argv); argc--; argv++;}\r
- if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}\r
- if (argc > 0) { step = atol(*argv); argc--; argv++;}\r
+ if (argc > 0)\r
+ {\r
+ from = atol(*argv);\r
+ argc--;\r
+ argv++;\r
+ }\r
+ if (argc > 0)\r
+ {\r
+ to = MAX(atol(*argv), from);\r
+ argc--;\r
+ argv++;\r
+ }\r
+ if (argc > 0)\r
+ {\r
+ step = atol(*argv);\r
+ argc--;\r
+ argv++;\r
+ }\r
\r
- if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);\r
- if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);\r
+ if ((p = getenv("OPENBLAS_LOOPS")))\r
+ loops = atoi(p);\r
+ if ((p = getenv("OPENBLAS_INCX")))\r
+ inc_x = atoi(p);\r
\r
- fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);\r
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);\r
\r
- if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){\r
- fprintf(stderr,"Out of Memory!!\n");exit(1);\r
+ if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)\r
+ {\r
+ fprintf(stderr, "Out of Memory!!\n");\r
+ exit(1);\r
}\r
\r
#ifdef __linux\r
\r
fprintf(stderr, " SIZE Flops\n");\r
\r
- for(m = from; m <= to; m += step)\r
+ for (m = from; m <= to; m += step)\r
{\r
\r
- timeg=0;\r
-\r
- fprintf(stderr, " %6d : ", (int)m);\r
+ timeg = 0;\r
+ fprintf(stderr, " %6d : ", (int)m);\r
\r
+ for (l = 0; l < loops; l++)\r
+ {\r
\r
- for (l=0; l<loops; l++)\r
- {\r
-\r
- for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){\r
- x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
- }\r
-\r
- gettimeofday( &start, (struct timezone *)0);\r
- AMAX (&m, x, &inc_x);\r
- gettimeofday( &stop, (struct timezone *)0);\r
-\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
-\r
- timeg += time1;\r
+ for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)\r
+ {\r
+ x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;\r
+ }\r
\r
+ begin();\r
+ AMAX(&m, x, &inc_x);\r
+ end();\r
+ timeg += getsec();\r
}\r
\r
timeg /= loops;\r
\r
fprintf(stderr,\r
- " %10.2f MFlops %10.6f sec\n",\r
- COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);\r
-\r
+ " %10.2f MFlops %10.6f sec\n",\r
+ COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);\r
}\r
\r
return 0;\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef AMIN\r
\r
#ifdef COMPLEX\r
#ifdef DOUBLE\r
-#define AMIN BLASFUNC(dzamin)\r
+#define AMIN BLASFUNC(dzamin)\r
#else\r
-#define AMIN BLASFUNC(scamin)\r
+#define AMIN BLASFUNC(scamin)\r
#endif\r
#else\r
#ifdef DOUBLE\r
-#define AMIN BLASFUNC(damin)\r
+#define AMIN BLASFUNC(damin)\r
#else\r
-#define AMIN BLASFUNC(samin)\r
-#endif\r
-#endif\r
-\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz){\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
+#define AMIN BLASFUNC(samin)\r
#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size){\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
#endif\r
\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
-int main(int argc, char *argv[]){\r
+int main(int argc, char *argv[])\r
+{\r
\r
FLOAT *x;\r
blasint m, i;\r
- blasint inc_x=1;\r
+ blasint inc_x = 1;\r
int loops = 1;\r
int l;\r
char *p;\r
\r
- int from = 1;\r
- int to = 200;\r
- int step = 1;\r
+ int from = 1;\r
+ int to = 200;\r
+ int step = 1;\r
\r
- struct timeval start, stop;\r
- double time1,timeg;\r
+ double time1, timeg;\r
\r
- argc--;argv++;\r
+ argc--;\r
+ argv++;\r
\r
- if (argc > 0) { from = atol(*argv); argc--; argv++;}\r
- if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}\r
- if (argc > 0) { step = atol(*argv); argc--; argv++;}\r
+ if (argc > 0)\r
+ {\r
+ from = atol(*argv);\r
+ argc--;\r
+ argv++;\r
+ }\r
+ if (argc > 0)\r
+ {\r
+ to = MAX(atol(*argv), from);\r
+ argc--;\r
+ argv++;\r
+ }\r
+ if (argc > 0)\r
+ {\r
+ step = atol(*argv);\r
+ argc--;\r
+ argv++;\r
+ }\r
\r
- if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);\r
- if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);\r
+ if ((p = getenv("OPENBLAS_LOOPS")))\r
+ loops = atoi(p);\r
+ if ((p = getenv("OPENBLAS_INCX")))\r
+ inc_x = atoi(p);\r
\r
- fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);\r
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);\r
\r
- if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){\r
- fprintf(stderr,"Out of Memory!!\n");exit(1);\r
+ if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)\r
+ {\r
+ fprintf(stderr, "Out of Memory!!\n");\r
+ exit(1);\r
}\r
\r
#ifdef __linux\r
\r
fprintf(stderr, " SIZE Flops\n");\r
\r
- for(m = from; m <= to; m += step)\r
+ for (m = from; m <= to; m += step)\r
{\r
\r
- timeg=0;\r
+ timeg = 0;\r
\r
- fprintf(stderr, " %6d : ", (int)m);\r
+ fprintf(stderr, " %6d : ", (int)m);\r
\r
+ for (l = 0; l < loops; l++)\r
+ {\r
\r
- for (l=0; l<loops; l++)\r
- {\r
-\r
- for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){\r
- x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
- }\r
-\r
- gettimeofday( &start, (struct timezone *)0);\r
-\r
- AMIN (&m, x, &inc_x);\r
+ for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)\r
+ {\r
+ x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;\r
+ }\r
\r
- gettimeofday( &stop, (struct timezone *)0);\r
+ begin();\r
\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ AMIN(&m, x, &inc_x);\r
\r
- timeg += time1;\r
+ end();\r
\r
+ timeg += getsec();\r
}\r
\r
timeg /= loops;\r
\r
fprintf(stderr,\r
- " %10.2f MFlops %10.6f sec\n",\r
- COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);\r
-\r
+ " %10.2f MFlops %10.6f sec\n",\r
+ COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);\r
}\r
\r
return 0;\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef ASUM
#ifdef COMPLEX
#ifdef DOUBLE
-#define ASUM BLASFUNC(dzasum)
+#define ASUM BLASFUNC(dzasum)
#else
-#define ASUM BLASFUNC(scasum)
+#define ASUM BLASFUNC(scasum)
#endif
#else
#ifdef DOUBLE
-#define ASUM BLASFUNC(dasum)
+#define ASUM BLASFUNC(dasum)
#else
-#define ASUM BLASFUNC(sasum)
+#define ASUM BLASFUNC(sasum)
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
-int main(int argc, char *argv[]){
+int main(int argc, char *argv[])
+{
FLOAT *x;
FLOAT result;
blasint m, i;
- blasint inc_x=1;
+ blasint inc_x = 1;
int loops = 1;
int l;
char *p;
- int from = 1;
- int to = 200;
- int step = 1;
-
-#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
- struct timeval start, stop;
- double time1,timeg;
-#else
- struct timespec start = { 0, 0 }, stop = { 0, 0 };
+ int from = 1;
+ int to = 200;
+ int step = 1;
double time1, timeg;
-#endif
- argc--;argv++;
+ argc--;
+ argv++;
- if (argc > 0) { from = atol(*argv); argc--; argv++;}
- if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
- if (argc > 0) { step = atol(*argv); argc--; argv++;}
+ if (argc > 0)
+ {
+ from = atol(*argv);
+ argc--;
+ argv++;
+ }
+ if (argc > 0)
+ {
+ to = MAX(atol(*argv), from);
+ argc--;
+ argv++;
+ }
+ if (argc > 0)
+ {
+ step = atol(*argv);
+ argc--;
+ argv++;
+ }
- if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
- if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
+ if ((p = getenv("OPENBLAS_LOOPS")))
+ loops = atoi(p);
+ if ((p = getenv("OPENBLAS_INCX")))
+ inc_x = atoi(p);
- fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
+ fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
- if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
- fprintf(stderr,"Out of Memory!!\n");exit(1);
+ if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
+ {
+ fprintf(stderr, "Out of Memory!!\n");
+ exit(1);
}
-
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
- for(m = from; m <= to; m += step)
+ for (m = from; m <= to; m += step)
{
- timeg=0;
-
- fprintf(stderr, " %6d : ", (int)m);
+ timeg = 0;
- for (l=0; l<loops; l++)
- {
+ fprintf(stderr, " %6d : ", (int)m);
- for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
- x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
- }
-#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
- gettimeofday( &start, (struct timezone *)0);
-#else
- clock_gettime(CLOCK_REALTIME, &start);
-#endif
- result = ASUM (&m, x, &inc_x);
-#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
- clock_gettime(CLOCK_REALTIME, &stop);
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-#else
- gettimeofday( &stop, (struct timezone *)0);
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
-#endif
-
- timeg += time1;
+ for (l = 0; l < loops; l++)
+ {
+ for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
+ {
+ x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
+ }
+ begin();
+ result = ASUM(&m, x, &inc_x);
+ end();
+ timeg += getsec();
}
-if (loops >1)
- timeg /= loops;
+ if (loops > 1)
+ timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg);
#else
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg);
#endif
-
}
return 0;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef AXPBY
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for (l=0; l<loops; l++)
{
- gettimeofday( &start, (struct timezone *)0);
-
+ begin();
AXPBY (&m, alpha, x, &inc_x, beta, y, &inc_y );
-
- gettimeofday( &stop, (struct timezone *)0);
-
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
- timeg += time1;
-
+ end();
+ timeg += getsec();
}
timeg /= loops;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef AXPY
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y;
int from = 1;
int to = 200;
int step = 1;
-
- struct timespec start, stop;
double time1,timeg;
argc--;argv++;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- clock_gettime( CLOCK_REALTIME, &start);
+ begin();
AXPY (&m, alpha, x, &inc_x, y, &inc_y );
- clock_gettime( CLOCK_REALTIME, &stop);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
+ time1 = getsec();
timeg += time1;
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#ifdef __CYGWIN32__
+#include <sys/time.h>
+#endif
+#include "common.h"
+
+#if defined(__WIN32__) || defined(__WIN64__)
+
+#ifndef DELTA_EPOCH_IN_MICROSECS
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+/*
+ * Replacement for gettimeofday() used when building on Windows.
+ *
+ * tv : receives the current time as seconds and microseconds; may be NULL,
+ *      in which case nothing is written.
+ * tz : ignored.
+ *
+ * Always returns 0.
+ */
+int gettimeofday(struct timeval *tv, void *tz){
+
+  FILETIME ft;
+  unsigned __int64 tmpres = 0;
+
+  if (NULL != tv)
+    {
+      GetSystemTimeAsFileTime(&ft);
+
+      /* assemble the two 32-bit halves into one 64-bit value */
+      tmpres |= ft.dwHighDateTime;
+      tmpres <<= 32;
+      tmpres |= ft.dwLowDateTime;
+
+      /*converting file time to unix epoch*/
+      tmpres /= 10;  /*convert into microseconds*/
+      tmpres -= DELTA_EPOCH_IN_MICROSECS;
+      tv->tv_sec = (long)(tmpres / 1000000UL);
+      tv->tv_usec = (long)(tmpres % 1000000UL);
+    }
+
+  return 0;
+}
+
+#endif
+
+#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
+
+/*
+ * Allocate memory through a SysV shared-memory segment requested with
+ * SHM_HUGETLB. Prints a message and exits with status 1 if shmget() or
+ * shmat() fails. The segment is marked for removal (IPC_RMID) right after
+ * it has been attached, and the attached address is returned.
+ */
+static void *huge_malloc(BLASLONG size){
+  void *mapped;
+  int segment;
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+  segment = shmget(IPC_PRIVATE,
+                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
+                   SHM_HUGETLB | IPC_CREAT |0600);
+  if (segment < 0) {
+    printf( "Memory allocation failed(shmget).\n");
+    exit(1);
+  }
+
+  mapped = shmat(segment, NULL, SHM_RND);
+  if ((BLASLONG)mapped == -1){
+    printf( "Memory allocation failed(shmat).\n");
+    exit(1);
+  }
+
+  shmctl(segment, IPC_RMID, 0);
+
+  return mapped;
+}
+
+
+#define malloc huge_malloc
+
+#endif
+
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+ struct timeval start, stop;
+#else
+ struct timespec start = { 0, 0 }, stop = { 0, 0 };
+#endif
+
+/* Elapsed time in seconds between the file-level `start` and `stop` timestamps. */
+double getsec()
+{
+    double whole = (double)(stop.tv_sec - start.tv_sec);
+#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
+    /* struct timeval: sub-second part is in microseconds */
+    double frac = (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+#else
+    /* struct timespec: sub-second part is in nanoseconds */
+    double frac = (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
+#endif
+    return whole + frac;
+}
+
+/* Record the current time into the file-level `start` timestamp. */
+void begin() {
+#if !defined(__WIN32__) && !defined(__WIN64__) && defined(_POSIX_TIMERS)
+    clock_gettime(CLOCK_REALTIME, &start);
+#else
+    gettimeofday( &start, (struct timezone *)0);
+#endif
+}
+
+/* Record the current time into the file-level `stop` timestamp. */
+void end() {
+#if !defined(__WIN32__) && !defined(__WIN64__) && defined(_POSIX_TIMERS)
+    clock_gettime(CLOCK_REALTIME, &stop);
+#else
+    gettimeofday( &stop, (struct timezone *)0);
+#endif
+}
\ No newline at end of file
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
double fabs(double);
#endif
#endif
-
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-
static __inline double getmflops(int ratio, int m, double secs){
double mm = (double)m;
FLOAT maxerr;
- struct timeval start, stop;
double time1;
argc--;argv++;
SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m);
- gettimeofday( &start, (struct timezone *)0);
+ begin();
POTRF(uplo[uplos], &m, b, &m, &info);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
if (info != 0) {
fprintf(stderr, "Info = %d\n", info);
exit(1);
}
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
- maxerr = 0.;
if (!(uplos & 1)) {
for (j = 0; j < m; j++) {
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef COPY
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1 = 0.0, timeg = 0.0;
long nanos = 0;
time_t seconds = 0;
- struct timespec time_start = { 0, 0 }, time_end = { 0, 0 };
argc--;argv++;
for (l=0; l<loops; l++)
{
- clock_gettime(CLOCK_REALTIME, &time_start);
+ begin();
COPY (&m, x, &inc_x, y, &inc_y );
- clock_gettime(CLOCK_REALTIME, &time_end);
-
- nanos = time_end.tv_nsec - time_start.tv_nsec;
- seconds = time_end.tv_sec - time_start.tv_sec;
-
- time1 = seconds + nanos / 1.e9;
- timeg += time1;
+ end();
+ timeg += getsec();
}
timeg /= loops;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef DOT
-
#ifdef DOUBLE
#define DOT BLASFUNC(ddot)
#else
#define DOT BLASFUNC(sdot)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
result = DOT (&m, x, &inc_x, y, &inc_y );
- gettimeofday( &stop, (struct timezone *)0);
-
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
- timeg += time1;
+ end();
+ timeg += getsec();
}
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef GEEV
FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info );
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
for(m = from; m <= to; m += step){
fprintf(stderr, " %6d : ", (int)m);
- gettimeofday( &start, (struct timezone *)0);
+ begin();
lwork = -1;
#ifndef COMPLEX
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info);
#endif
- gettimeofday( &stop, (struct timezone *)0);
+ end();
if (info) {
fprintf(stderr, "failed to compute eigenvalues .. %d\n", info);
exit(1);
}
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops : %10.2f Sec : %d\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef GEMM
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
IFLOAT *a, *b;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1, timeg;
argc--;argv++;
ldc = m;
fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k);
- gettimeofday( &start, (struct timezone *)0);
+ begin();
for (j=0; j<loops; j++) {
GEMM (&transa, &transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc);
}
- gettimeofday( &stop, (struct timezone *)0);
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ end();
+ time1 = getsec();
timeg = time1/loops;
fprintf(stderr,
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef GEMM
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
- gettimeofday( &stop, (struct timezone *)0);
-
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
- timeg += time1;
-
+ end();
+ timeg += getsec();
}
timeg /= loops;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef GEMV
#endif
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
- gettimeofday( &stop, (struct timezone *)0);
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ end();
+ time1 = getsec();
timeg += time1;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
- gettimeofday( &stop, (struct timezone *)0);
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ end();
+ time1 = getsec();
timeg += time1;
}
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef GER
#endif
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for (l=0; l<loops; l++)
{
- gettimeofday( &start, (struct timezone *)0);
+ begin();
GER (&m, &n, alpha, x, &inc_x, y, &inc_y, a , &m);
- gettimeofday( &stop, (struct timezone *)0);
-
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
- timeg += time1;
-
+ end();
+
+ timeg += getsec();
}
timeg /= loops;
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
double fabs(double);
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *b;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
GESV (&m, &m, a, &m, ipiv, b, &m, &info);
- gettimeofday( &stop, (struct timezone *)0);
-
-
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
-
+ end();
+ time1 = getsec();
fprintf(stderr,
"%10.2f MFlops %10.6f s\n",
COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1);
-
}
return 0;
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef GETRF
#undef GETRI
extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info);
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a,*work;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
exit(1);
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
lwork = -1;
GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
lwork = (blasint)wkopt[0];
GETRI(&m, a, &m, ipiv, work, &lwork, &info);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
if (info) {
fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
exit(1);
}
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops : %10.2f Sec : %d\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef HBMV\r
\r
-\r
#ifdef DOUBLE\r
#define HBMV BLASFUNC(zhbmv)\r
#else\r
#define HBMV BLASFUNC(chbmv)\r
#endif\r
\r
-\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz) {\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size) {\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
int main(int argc, char *argv[]){\r
\r
FLOAT *a, *x, *y;\r
int to = 200;\r
int step = 1;\r
\r
- struct timeval start, stop;\r
double time1,timeg;\r
\r
argc--;argv++;\r
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
}\r
\r
- gettimeofday( &start, (struct timezone *)0);\r
+ begin();\r
\r
HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y );\r
\r
- gettimeofday( &stop, (struct timezone *)0);\r
-\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ end();\r
\r
- timeg += time1;\r
+ timeg += getsec();\r
\r
}\r
\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef HEMM
#define HEMM BLASFUNC(chemm)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef HEMV
-
#ifdef DOUBLE
#define HEMV BLASFUNC(zhemv)
#else
#define HEMV BLASFUNC(chemv)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef HER
-
#ifdef DOUBLE
#define HER BLASFUNC(zher)
#else
#define HER BLASFUNC(cher)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *x;
int from = 1;
int to = 200;
int step = 1;
-
- struct timeval start, stop;
double time1;
argc--;argv++;
x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
HER (&uplo, &m, alpha, x, &incx, a, &m );
- gettimeofday( &stop, (struct timezone *)0);
-
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ end();
- gettimeofday( &start, (struct timezone *)0);
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef HER2
-
#ifdef DOUBLE
#define HER2 BLASFUNC(zher2)
#else
#define HER2 BLASFUNC(cher2)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
-
+ begin();
HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m );
- gettimeofday( &stop, (struct timezone *)0);
-
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ end();
- gettimeofday( &start, (struct timezone *)0);
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef HER2K
#ifdef DOUBLE
#define HER2K BLASFUNC(cher2k)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef HERK
-
#ifdef DOUBLE
#define HERK BLASFUNC(zherk)
#else
#define HERK BLASFUNC(cherk)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *c;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
-
}
return 0;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef HPMV\r
\r
-\r
#ifdef DOUBLE\r
#define HPMV BLASFUNC(zhpmv)\r
#else\r
#define HPMV BLASFUNC(chpmv)\r
#endif\r
\r
-\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz) {\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size) {\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
int main(int argc, char *argv[]){\r
\r
FLOAT *a, *x, *y;\r
int to = 200;\r
int step = 1;\r
\r
- struct timeval start, stop;\r
double time1,timeg;\r
\r
argc--;argv++;\r
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
}\r
\r
- gettimeofday( &start, (struct timezone *)0);\r
+ begin();\r
\r
HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );\r
\r
- gettimeofday( &stop, (struct timezone *)0);\r
+ end();\r
\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ time1 = getsec();\r
\r
timeg += time1;\r
\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef IAMAX
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
IAMAX (&m, x, &inc_x);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef IAMIN\r
\r
#endif\r
#endif\r
\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz){\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size){\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
int main(int argc, char *argv[]){\r
\r
FLOAT *x;\r
int to = 200;\r
int step = 1;\r
\r
- struct timeval start, stop;\r
double time1,timeg;\r
\r
argc--;argv++;\r
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
}\r
\r
- gettimeofday( &start, (struct timezone *)0);\r
+ begin();\r
\r
IAMIN (&m, x, &inc_x);\r
\r
- gettimeofday( &stop, (struct timezone *)0);\r
+ end();\r
\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ time1 = getsec();\r
\r
timeg += time1;\r
\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef IMAX\r
\r
#endif\r
#endif\r
\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz){\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size){\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
int main(int argc, char *argv[]){\r
\r
FLOAT *x;\r
int to = 200;\r
int step = 1;\r
\r
- struct timeval start, stop;\r
double time1,timeg;\r
\r
argc--;argv++;\r
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
}\r
\r
- gettimeofday( &start, (struct timezone *)0);\r
+ begin();\r
\r
IMAX (&m, x, &inc_x);\r
\r
- gettimeofday( &stop, (struct timezone *)0);\r
+ end();\r
\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ time1 = getsec();\r
\r
timeg += time1;\r
\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef IMIN\r
\r
#endif\r
#endif\r
\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz){\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size){\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
int main(int argc, char *argv[]){\r
\r
FLOAT *x;\r
int to = 200;\r
int step = 1;\r
\r
- struct timeval start, stop;\r
double time1,timeg;\r
\r
argc--;argv++;\r
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
}\r
\r
- gettimeofday( &start, (struct timezone *)0);\r
+ begin();\r
\r
IMIN (&m, x, &inc_x);\r
\r
- gettimeofday( &stop, (struct timezone *)0);\r
+ end();\r
\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ time1 = getsec();\r
\r
timeg += time1;\r
\r
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
double fabs(double);
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *b;
FLOAT maxerr;
- struct timeval start, stop;
double time1, time2;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
GETRF (&m, &m, a, &m, ipiv, &info);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
if (info) {
fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1);
}
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
- gettimeofday( &start, (struct timezone *)0);
+ begin();
GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
if (info) {
fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1);
}
- time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time2 = getsec();
maxerr = 0.;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef NAMAX\r
\r
#endif\r
#endif\r
\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz){\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size){\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
int main(int argc, char *argv[]){\r
\r
FLOAT *x;\r
int to = 200;\r
int step = 1;\r
\r
- struct timeval start, stop;\r
double time1,timeg;\r
\r
argc--;argv++;\r
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
}\r
\r
- gettimeofday( &start, (struct timezone *)0);\r
+ begin();\r
\r
NAMAX (&m, x, &inc_x);\r
\r
- gettimeofday( &stop, (struct timezone *)0);\r
+ end();\r
\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ time1 = getsec();\r
\r
timeg += time1;\r
\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef NAMIN\r
\r
#endif\r
#endif\r
\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz){\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size){\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
int main(int argc, char *argv[]){\r
\r
FLOAT *x;\r
int to = 200;\r
int step = 1;\r
\r
- struct timeval start, stop;\r
double time1,timeg;\r
\r
argc--;argv++;\r
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
}\r
\r
- gettimeofday( &start, (struct timezone *)0);\r
+ begin();\r
\r
NAMIN (&m, x, &inc_x);\r
\r
- gettimeofday( &stop, (struct timezone *)0);\r
+ end();\r
\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ time1 = getsec();\r
\r
timeg += time1;\r
\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef NRM2
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
NRM2 (&m, x, &inc_x);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
double fabs(double);
// extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info);
// extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info);
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
int main(int argc, char *argv[]){
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m);
- gettimeofday( &start, (struct timezone *)0);
+ begin();
POTRF(uplo[uplos], &m, b, &m, &info);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
if (info != 0) {
fprintf(stderr, "Potrf info = %d\n", info);
exit(1);
}
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
if ( btest == 'S' )
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
if (info != 0) {
fprintf(stderr, "Potrs info = %d\n", info);
exit(1);
}
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
}
if ( btest == 'I' )
{
- gettimeofday( &start, (struct timezone *)0);
+ begin();
POTRI(uplo[uplos], &m, b, &m, &info);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
if (info != 0) {
fprintf(stderr, "Potri info = %d\n", info);
exit(1);
}
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
}
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef ROT
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for (l=0; l<loops; l++)
{
- gettimeofday( &start, (struct timezone *)0);
+ begin();
ROT (&m, x, &inc_x, y, &inc_y, c, s);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
+#include "bench.h"\r
\r
#undef ROTM\r
\r
#define ROTM BLASFUNC(srotm)\r
#endif\r
\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz)\r
-{\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv) {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size)\r
-{\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =\r
- shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT | 0600)) < 0) {\r
- printf("Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1) {\r
- printf("Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
int main(int argc, char *argv[])\r
{\r
\r
int to = 200;\r
int step = 1;\r
\r
- struct timeval start, stop;\r
+ \r
double time1, timeg;\r
\r
argc--;\r
}\r
\r
for (l = 0; l < loops; l++) {\r
- gettimeofday(&start, (struct timezone *)0);\r
+ begin();\r
\r
ROTM(&m, x, &inc_x, y, &inc_y, param);\r
\r
- gettimeofday(&stop, (struct timezone *)0);\r
+ end();\r
\r
- time1 = (double)(stop.tv_sec - start.tv_sec) +\r
- (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ time1 = getsec();\r
\r
timeg += time1;\r
}\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef SCAL
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SCAL (&m, alpha, x, &inc_x);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
*****************************************************************************/\r
\r
-#include <stdio.h>\r
-#include <stdlib.h>\r
-#ifdef __CYGWIN32__\r
-#include <sys/time.h>\r
-#endif\r
-#include "common.h"\r
-\r
+#include "bench.h"\r
\r
#undef SPMV\r
\r
-\r
#ifndef COMPLEX\r
\r
#ifdef DOUBLE\r
\r
#endif\r
\r
-#if defined(__WIN32__) || defined(__WIN64__)\r
-\r
-#ifndef DELTA_EPOCH_IN_MICROSECS\r
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL\r
-#endif\r
-\r
-int gettimeofday(struct timeval *tv, void *tz){\r
-\r
- FILETIME ft;\r
- unsigned __int64 tmpres = 0;\r
- static int tzflag;\r
-\r
- if (NULL != tv)\r
- {\r
- GetSystemTimeAsFileTime(&ft);\r
-\r
- tmpres |= ft.dwHighDateTime;\r
- tmpres <<= 32;\r
- tmpres |= ft.dwLowDateTime;\r
-\r
- /*converting file time to unix epoch*/\r
- tmpres /= 10; /*convert into microseconds*/\r
- tmpres -= DELTA_EPOCH_IN_MICROSECS;\r
- tv->tv_sec = (long)(tmpres / 1000000UL);\r
- tv->tv_usec = (long)(tmpres % 1000000UL);\r
- }\r
-\r
- return 0;\r
-}\r
-\r
-#endif\r
-\r
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0\r
-\r
-static void *huge_malloc(BLASLONG size){\r
- int shmid;\r
- void *address;\r
-\r
-#ifndef SHM_HUGETLB\r
-#define SHM_HUGETLB 04000\r
-#endif\r
-\r
- if ((shmid =shmget(IPC_PRIVATE,\r
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),\r
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {\r
- printf( "Memory allocation failed(shmget).\n");\r
- exit(1);\r
- }\r
-\r
- address = shmat(shmid, NULL, SHM_RND);\r
-\r
- if ((BLASLONG)address == -1){\r
- printf( "Memory allocation failed(shmat).\n");\r
- exit(1);\r
- }\r
-\r
- shmctl(shmid, IPC_RMID, 0);\r
-\r
- return address;\r
-}\r
-\r
-#define malloc huge_malloc\r
-\r
-#endif\r
-\r
int main(int argc, char *argv[]){\r
\r
FLOAT *a, *x, *y;\r
int to = 200;\r
int step = 1;\r
\r
- struct timeval start, stop;\r
double time1,timeg;\r
\r
argc--;argv++;\r
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){\r
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;\r
}\r
- gettimeofday( &start, (struct timezone *)0);\r
+ begin();\r
\r
SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );\r
\r
- gettimeofday( &stop, (struct timezone *)0);\r
+ end();\r
\r
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;\r
+ time1 = getsec();\r
\r
timeg += time1;\r
\r
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef SPR
#define SPR BLASFUNC(sspr)
#endif
-
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a,*c;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SPR (&uplo, &m, alpha, c, &inc_x, a);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
}
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef SPR2
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a,*b,*c;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
}
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef SWAP
#endif
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SWAP (&m, x, &inc_x, y, &inc_y );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef SYMM
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef SYMV
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef SYR
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x,*a;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SYR (&uplo, &m, alpha, x, &inc_x, a, &m );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef SYR2
#define SYR2 BLASFUNC(ssyr2)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y, *a;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef SYR2K
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef SYRK
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *c;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef TPMV
#endif
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size)
-{
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1) {
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[])
{
int to = 200;
int step = 1;
- struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg;
argc--;argv++;
}
for (l = 0; l < loops; l++) {
- clock_gettime(CLOCK_REALTIME, &start);
+ begin();
TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x);
- clock_gettime(CLOCK_REALTIME, &stop);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
+ time1 = getsec();
timeg += time1;
}
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef TPSV
#endif
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size)
-{
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1) {
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[])
{
int to = 200;
int step = 1;
- struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg;
argc--;argv++;
}
for (l = 0; l < loops; l++) {
- clock_gettime(CLOCK_REALTIME, &start);
+ begin();
TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x);
- clock_gettime(CLOCK_REALTIME, &stop);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
+ time1 = getsec();
timeg += time1;
}
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef TRMM
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *b;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef TRMV
#endif
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size)
-{
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1) {
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[])
{
int to = 200;
int step = 1;
- struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg;
argc--;argv++;
}
for (l = 0; l < loops; l++) {
- clock_gettime(CLOCK_REALTIME, &start);
+ begin();
TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x);
- clock_gettime(CLOCK_REALTIME, &stop);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
+ time1 = getsec();
timeg += time1;
}
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
+#include "bench.h"
#undef TRSM
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *b;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1;
argc--;argv++;
}
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
}
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include <time.h>
-#include "common.h"
-
+#include "bench.h"
#undef GEMV
#undef TRSV
#endif
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *a, *x;
int to = 200;
int step = 1;
- struct timespec time_start, time_end;
time_t seconds = 0;
double time1,timeg;
for(l =0;l< loops;l++){
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start);
-
+ begin();
TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x);
-
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end);
- nanos = time_end.tv_nsec - time_start.tv_nsec;
- seconds = time_end.tv_sec - time_start.tv_sec;
-
- time1 = seconds + nanos /1.e9;
+ end();
+ time1 = getsec();
timeg += time1;
}
-
timeg /= loops;
long long muls = n*(n+1)/2.0;
long long adds = (n - 1.0)*n/2.0;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#define RETURN_BY_STACK 1
-#include "common.h"
+/* Keep RETURN_BY_STACK defined ahead of the project header, as the file did
+ * before this change (it preceded the common.h include). The benchmark calls
+ * DOT(&result, &m, ...) with the result passed by pointer; presumably the
+ * header picks the matching zdotu/cdotu prototype from this macro, so it has
+ * to be visible at include time - NOTE(review): confirm against bench.h. */
+#define RETURN_BY_STACK 1
+#include "bench.h"
#undef DOT
-
#ifdef DOUBLE
#define DOT BLASFUNC(zdotu)
#else
#define DOT BLASFUNC(cdotu)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
DOT (&result, &m, x, &inc_x, y, &inc_y );
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#include <stdio.h>
-#include <stdlib.h>
-#ifdef __CYGWIN32__
-#include <sys/time.h>
-#endif
-#include "common.h"
-
+#include "bench.h"
#undef DOT
#define DOT BLASFUNC(cdotu)
#endif
-
-#if defined(__WIN32__) || defined(__WIN64__)
-
-#ifndef DELTA_EPOCH_IN_MICROSECS
-#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
-#endif
-
-int gettimeofday(struct timeval *tv, void *tz){
-
- FILETIME ft;
- unsigned __int64 tmpres = 0;
- static int tzflag;
-
- if (NULL != tv)
- {
- GetSystemTimeAsFileTime(&ft);
-
- tmpres |= ft.dwHighDateTime;
- tmpres <<= 32;
- tmpres |= ft.dwLowDateTime;
-
- /*converting file time to unix epoch*/
- tmpres /= 10; /*convert into microseconds*/
- tmpres -= DELTA_EPOCH_IN_MICROSECS;
- tv->tv_sec = (long)(tmpres / 1000000UL);
- tv->tv_usec = (long)(tmpres % 1000000UL);
- }
-
- return 0;
-}
-
-#endif
-
-#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
-
-static void *huge_malloc(BLASLONG size){
- int shmid;
- void *address;
-
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
-
- if ((shmid =shmget(IPC_PRIVATE,
- (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
- SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
- printf( "Memory allocation failed(shmget).\n");
- exit(1);
- }
-
- address = shmat(shmid, NULL, SHM_RND);
-
- if ((BLASLONG)address == -1){
- printf( "Memory allocation failed(shmat).\n");
- exit(1);
- }
-
- shmctl(shmid, IPC_RMID, 0);
-
- return address;
-}
-
-#define malloc huge_malloc
-
-#endif
-
int main(int argc, char *argv[]){
FLOAT *x, *y;
int to = 200;
int step = 1;
- struct timeval start, stop;
double time1,timeg;
argc--;argv++;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
- gettimeofday( &start, (struct timezone *)0);
+ begin();
#ifdef RETURN_BY_STACK
DOT (&result , &m, x, &inc_x, y, &inc_y );
#else
result = DOT (&m, x, &inc_x, y, &inc_y );
#endif
- gettimeofday( &stop, (struct timezone *)0);
+ end();
- time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
+ time1 = getsec();
timeg += time1;
void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
/* dot production of BFLOAT16 input arrays, and output as float */
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
+void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
#ifdef __cplusplus
}
if (POWER)
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
+ set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
endif ()
if (X86)
endif ()
endif ()
-if (${CORE} STREQUAL "SKYLAKEX")
+if (${CORE} STREQUAL SKYLAKEX)
if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
endif ()
endif ()
-if (${CORE} STREQUAL "COOPERLAKE")
+if (${CORE} STREQUAL COOPERLAKE)
if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (HAVE_AVX)
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx")
endif ()
+ if (HAVE_SSE)
+ set (CCOMMON_OPT "${CCOMMON_OPT} -msse")
+ endif ()
+ if (HAVE_SSE2)
+ set (CCOMMON_OPT "${CCOMMON_OPT} -msse2")
+ endif ()
if (HAVE_SSE3)
set (CCOMMON_OPT "${CCOMMON_OPT} -msse3")
endif ()
if (HAVE_SSSE3)
set (CCOMMON_OPT "${CCOMMON_OPT} -mssse3")
endif ()
+ if (HAVE_SSE4_1)
+ set (CCOMMON_OPT "${CCOMMON_OPT} -msse4.1")
+ endif ()
endif()
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
if (BUILD_BFLOAT16)
- set(SBGEMVNKERNEL ../arm/gemv_n.c)
- set(SBGEMVTKERNEL ../arm/gemv_t.c)
+ set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
+ set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
set(SHGERKERNEL ../generic/ger.c)
endif ()
endmacro ()
set(NO_EXPRECISION 1)
endif ()
+# Set NO_EXPRECISION when building with DYNAMIC_ARCH and TARGET is GENERIC.
+# The checks are nested rather than joined with AND so that ${TARGET} is only
+# expanded after the TARGET variable has tested true; an unquoted expansion
+# of an unset/empty variable would make the STREQUAL test ill-formed.
+if (DYNAMIC_ARCH)
+if (TARGET)
+if (${TARGET} STREQUAL "GENERIC")
+ set(NO_EXPRECISION 1)
+endif ()
+endif ()
+endif ()
+
if (UTEST_CHECK)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK")
set(SANITY_CHECK 1)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
+elseif ("${TCORE}" STREQUAL "VORTEX")
+ file(APPEND ${TARGET_CONF_TEMP}
+ "#define ARMV8\n"
+ "#define L1_CODE_SIZE\t32768\n"
+ "#define L1_CODE_LINESIZE\t64\n"
+ "#define L1_CODE_ASSOCIATIVE\t4\n"
+ "#define L1_DATA_SIZE\t32768\n"
+ "#define L1_DATA_LINESIZE\t64\n"
+ "#define L1_DATA_ASSOCIATIVE\t4\n"
+ "#define L2_SIZE\t5262144\n"
+ "#define L2_LINESIZE\t64\n"
+ "#define L2_ASSOCIATIVE\t8\n"
+ "#define DTB_DEFAULT_ENTRIES\t64\n"
+ "#define DTB_SIZE\t4096\n")
+ set(SGEMM_UNROLL_M 16)
+ set(SGEMM_UNROLL_N 4)
+ set(DGEMM_UNROLL_M 8)
+ set(DGEMM_UNROLL_N 4)
+ set(CGEMM_UNROLL_M 8)
+ set(CGEMM_UNROLL_N 4)
+ set(ZGEMM_UNROLL_M 4)
+ set(ZGEMM_UNROLL_N 4)
+ set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "POWER6")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
endif ()
endif ()
+ unset (HAVE_AVX2)
+ unset (HAVE_AVX)
+ unset (HAVE_FMA3)
+ unset (HAVE_MMX)
+ unset (HAVE_SSE)
+ unset (HAVE_SSE2)
+ unset (HAVE_SSE3)
+ unset (HAVE_SSSE3)
+ unset (HAVE_SSE4A)
+ unset (HAVE_SSE4_1)
+ unset (HAVE_SSE4_2)
+ unset (HAVE_NEON)
+ unset (HAVE_VFP)
+ unset (HAVE_VFPV3)
+ unset (HAVE_VFPV4)
message(STATUS "Running getarch")
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
endif ()
endif ()
-if (DEFINED TARGET)
- if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512)
-# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
- execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
- if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
- set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
- else()
- set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
- endif()
-# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
-# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
-# endif()
- endif()
- if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
- set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
- endif()
- if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
- if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
- execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
- if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
- set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
- endif()
- elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
- set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
- endif()
- endif()
- if (DEFINED HAVE_SSE3)
- set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
- endif()
-endif()
if (DEFINED TARGET)
+ message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --")
message(STATUS "Targeting the ${TARGET} architecture.")
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif ()
endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
+# Append instruction-set compiler options to KERNEL_DEFINITIONS for an
+# explicitly requested TARGET. This runs after prebuild.cmake has been
+# included, so the HAVE_* checks below see whatever that script defined.
+if (DEFINED TARGET)
+ # COOPERLAKE: -march=cooperlake needs a compiler reporting version >= 10.1,
+ # otherwise fall back to the skylake-avx512 option.
+ if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
+# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
+ execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+ if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
+ else()
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
+ endif()
+# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
+# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
+# endif()
+ endif()
+ if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
+ endif()
+ # HASWELL: add -mavx2 for gcc >= 4.7 and for any Clang flavour.
+ if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2)
+ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
+ execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+ if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
+ endif()
+ # CMake reports the compiler id as "Clang" or "AppleClang"; STREQUAL is
+ # case-sensitive, so a comparison against "CLANG" never matched.
+ elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
+ endif()
+ endif()
+ # Feature-driven options: one flag per HAVE_* variable that is defined,
+ # with the AVX/AVX2/FMA ones additionally gated by NO_AVX / NO_AVX2.
+ if (DEFINED HAVE_AVX)
+ if (NOT NO_AVX)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx")
+ endif()
+ endif()
+ if (DEFINED HAVE_AVX2)
+ if (NOT NO_AVX2)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
+ endif()
+ endif()
+ if (DEFINED HAVE_FMA3)
+ if (NOT NO_AVX2)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
+ endif()
+ endif()
+ if (DEFINED HAVE_SSE)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
+ endif()
+ if (DEFINED HAVE_SSE2)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2")
+ endif()
+ if (DEFINED HAVE_SSE3)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
+ endif()
+ if (DEFINED HAVE_SSSE3)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3")
+ endif()
+ if (DEFINED HAVE_SSE4_1)
+ set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
+ endif()
+endif()
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()
#include "common_mips.h"
#endif
+
#ifdef ARCH_RISCV64
#include "common_riscv64.h"
#endif
#define HUGE_PAGESIZE ( 4 << 20)
#ifndef BUFFERSIZE
-#if defined(CORTEXA57)
-#define BUFFER_SIZE (20 << 20)
-#elif defined(TSV110) || defined(EMAG8180)
#define BUFFER_SIZE (32 << 20)
#else
-#define BUFFER_SIZE (16 << 20)
-#endif
-#else
#define BUFFER_SIZE (32 << BUFFERSIZE)
#endif
void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, blasint *);
+void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float *, bfloat16 *, blasint *,
+ bfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
extern "C" {
#endif
+int sbgemv_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
+int sbgemv_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
+int sbgemv_thread_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int);
+int sbgemv_thread_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int);
int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *);
#elif defined(BFLOAT16)
-#define D_TO_BF16_K SBDTOBF16_K
-#define D_BF16_TO_K DBF16TOD_K
-#define S_TO_BF16_K SBSTOBF16_K
-#define S_BF16_TO_K SBF16TOS_K
+#define D_TO_BF16_K SBDTOBF16_K
+#define D_BF16_TO_K DBF16TOD_K
+#define S_TO_BF16_K SBSTOBF16_K
+#define S_BF16_TO_K SBF16TOS_K
+#define SBGEMV_N SBGEMV_N_K
+#define SBGEMV_T SBGEMV_T_K
#define AMAX_K SAMAX_K
#define AMIN_K SAMIN_K
int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
- int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
- int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
+ int (*sbgemv_n) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
+ int (*sbgemv_t) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
-#elif defined(POWER8) || defined(POWER9) || defined(POWER10)
-#define BUFFER_SIZE ( 64 << 20)
+#elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10)
+#define BUFFER_SIZE ( 64 << 22)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
#define SBDTOBF16_K sbdtobf16_k
#define SBF16TOS_K sbf16tos_k
#define DBF16TOD_K dbf16tod_k
+#define SBGEMV_N_K sbgemv_n
+#define SBGEMV_T_K sbgemv_t
#define SBGEMM_ONCOPY sbgemm_oncopy
#define SBGEMM_OTCOPY sbgemm_otcopy
#define SBDTOBF16_K gotoblas -> sbdtobf16_k
#define SBF16TOS_K gotoblas -> sbf16tos_k
#define DBF16TOD_K gotoblas -> dbf16tod_k
+#define SBGEMV_N_K gotoblas -> sbgemv_n
+#define SBGEMV_T_K gotoblas -> sbgemv_t
#define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy
#define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
printf("#define L1_DATA_SIZE %d \n",value);
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
- printf("#define L2_DATA_SIZE %d \n",value);
+ printf("#define L2_SIZE %d \n",value);
break;
#endif
}
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 6) == 6){
- ret=1; //OS support AVX
+ ret=1; //OS supports saving xmm and ymm registers (6 = (1<<1) | (1<<2))
}
}
return ret;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
- if((ebx & (1<<7)) != 0)
- ret=1; //OS supports AVX2
+ if((ebx & (1<<5)) != 0)
+ ret=1; //CPU supports AVX2
return ret;
#else
return 0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
- if((ebx & 32) != 32){
- ret=0; //OS does not even support AVX2
+ if((ebx & (1<<5)) == 0){
+ ret=0; //cpu does not have avx2 flag
}
- if((ebx & (1<<31)) != 0){
+ if((ebx & (1u<<31)) != 0){ //AVX512VL flag (unsigned literal: 1<<31 overflows a signed int)
xgetbv(0, &eax, &edx);
if((eax & 0xe0) == 0xe0)
- ret=1; //OS supports AVX512VL
- }
+ ret=1; //OS supports saving zmm registers
+ }
return ret;
#else
return 0;
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
HAVE_C11
#endif
+
xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \
xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \
xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \
- xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \
+ xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX)
+
+ifeq ($(BUILD_BFLOAT16),1)
+SBBLASOBJS += \
+ sbgemv_thread_n$(TSUFFIX).$(SUFFIX) \
+ sbgemv_thread_t$(TSUFFIX).$(SUFFIX)
+endif
endif
xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
+ifeq ($(BUILD_BFLOAT16),1)
+sbgemv_thread_n.$(SUFFIX) sbgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h
+ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
+sbgemv_thread_t.$(SUFFIX) sbgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h
+ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
+endif
+
+
include ../../Makefile.tail
--- /dev/null
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
+#ifndef TRANSA
+#define SBGEMV SBGEMV_N
+#else
+#define SBGEMV SBGEMV_T
+#endif
+
+/*
+ * Worker routine installed as queue[].routine by CNAME: runs SBGEMV on one
+ * slice of the problem described by args.
+ *
+ * args     - shared argument block; a/b/c carry the matrix, x and y pointers,
+ *            lda/ldb/ldc carry lda, incx and incy, alpha/beta point to floats.
+ * range_m  - [from, to) row range; read only when TRANSA is not defined.
+ * range_n  - [from, to) column range; read only when TRANSA is defined.
+ * dummy1, dummy2, dummy3 - unused.
+ *
+ * Always returns 0.
+ */
+static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *dummy2, BLASLONG dummy3){
+
+ bfloat16 *a, *x;
+ float *y;
+ BLASLONG lda, incx, incy;
+ BLASLONG m_from, m_to, n_from, n_to;
+
+ a = (bfloat16 *)args->a;
+ x = (bfloat16 *)args->b;
+ y = (float *)args->c;
+
+ lda = args->lda;
+ incx = args->ldb;
+ incy = args->ldc;
+
+ /* Select this thread's slice: rows [m_from, m_to) over all n columns in the
+ * non-transposed build, columns [n_from, n_to) over all m rows otherwise.
+ * a and y are advanced to the start of the slice; x is passed unchanged. */
+#ifndef TRANSA // N
+ m_from = *(range_m + 0);
+ m_to = *(range_m + 1);
+ n_from = 0;
+ n_to = args -> n;
+ a += m_from;
+ y += m_from * incy;
+#else // T
+ m_from = 0;
+ m_to = args->m;
+ n_from = *(range_n + 0);
+ n_to = *(range_n + 1);
+ a += n_from * lda;
+ y += n_from * incy;
+#endif
+
+ /* alpha and beta are stored in args as pointers and dereferenced here. */
+ SBGEMV(m_to - m_from, n_to - n_from, *((FLOAT *)(args->alpha)), a, lda, x, incx, *((FLOAT *)(args->beta)), y, incy);
+
+ return 0;
+}
+
+/*
+ * Threaded SBGEMV driver.
+ *
+ * Splits the problem along m (TRANSA not defined) or n (TRANSA defined) into
+ * `threads` contiguous slices of width_for_split/threads elements, the last
+ * slice taking whatever remains, queues one sbgemv_kernel call per slice and
+ * runs the queue through exec_blas.
+ *
+ * m, n        - matrix dimensions
+ * alpha, beta - scalars, handed to the workers by address via args
+ * a, lda      - bfloat16 matrix and its leading dimension
+ * x, incx     - bfloat16 input vector and its stride
+ * y, incy     - float output vector and its stride
+ * threads     - number of slices/queue entries to create
+ *
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy, int threads)
+{
+ blas_arg_t args;
+ blas_queue_t queue[MAX_CPU_NUMBER];
+ BLASLONG range[MAX_CPU_NUMBER + 1];
+
+#ifndef TRANSA
+ BLASLONG width_for_split = m;
+#else
+ BLASLONG width_for_split = n;
+#endif
+
+ BLASLONG BLOCK_WIDTH = width_for_split/threads;
+
+ int mode = BLAS_BFLOAT16 | BLAS_REAL;
+
+ args.m = m;
+ args.n = n;
+ args.a = (void *)a;
+ args.b = (void *)x;
+ args.c = (void *)y;
+ args.lda = lda;
+ args.ldb = incx;
+ args.ldc = incy;
+ /* The by-value scalars live for the whole call, so their addresses can be
+ * shared with the workers; sbgemv_kernel dereferences them again. */
+ args.alpha = (void *)&alpha;
+ args.beta = (void *)&beta;
+
+ range[0] = 0;
+
+ int thread_idx;
+
+ for (thread_idx=0; thread_idx<threads; thread_idx++) {
+ if (thread_idx != threads-1) {
+ range[thread_idx + 1] = range[thread_idx] + BLOCK_WIDTH;
+ } else {
+ /* last slice: everything not yet handed out */
+ range[thread_idx + 1] = range[thread_idx] + width_for_split;
+ }
+
+ queue[thread_idx].mode = mode;
+ queue[thread_idx].routine = sbgemv_kernel;
+ queue[thread_idx].args = &args;
+#ifndef TRANSA
+ queue[thread_idx].range_m = &range[thread_idx];
+ queue[thread_idx].range_n = NULL;
+#else
+ queue[thread_idx].range_m = NULL;
+ queue[thread_idx].range_n = &range[thread_idx];
+#endif
+ queue[thread_idx].sa = NULL;
+ queue[thread_idx].sb = NULL;
+ queue[thread_idx].next = &queue[thread_idx + 1];
+
+ width_for_split -= BLOCK_WIDTH;
+ }
+
+ if (thread_idx) {
+ queue[0].sa = NULL;
+ queue[0].sb = NULL;
+ /* terminate the linked list at the last entry that was filled in */
+ queue[thread_idx - 1].next = NULL;
+
+ exec_blas(thread_idx, queue);
+ }
+
+ return 0;
+}
break;
}
- mode |= BLAS_LEGACY;
+ if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY;
for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]);
/* Other types in future */
}
}
-if (!sb) fprintf(stderr,"SB not declared!!!\n");
queue->sb=sb;
}
}
routine = queue -> routine;
- if (!(queue -> mode & BLAS_LEGACY)) {
+ if (queue -> mode & BLAS_LEGACY) {
+ legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
+ } else
+ if (queue -> mode & BLAS_PTHREAD) {
+ void (*pthreadcompat)(void *) = queue -> routine;
+ (pthreadcompat)(queue -> args);
+ } else
(routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0);
- } else {
- legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
- }
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
- if((ebx & (1<<7)) != 0)
- ret=1; //OS supports AVX2
+ if((ebx & (1<<5)) != 0)
+ ret=1; //AVX2 flag is set
return ret;
#else
return 0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
- if((ebx & (1<<7)) == 0){
- ret=0; //OS does not even support AVX2
+ if((ebx & (1<<5)) == 0){
+ ret=0; //cpu does not have avx2 flag
}
- if((ebx & (1u<<31)) != 0){
+ if((ebx & (1u<<31)) != 0){ //AVX512VL flag is set (keep the unsigned literal: 1<<31 overflows a signed int)
xgetbv(0, &eax, &edx);
if((eax & 0xe0) == 0xe0)
- ret=1; //OS supports AVX512VL
+ ret=1; //OS supports saving zmm register
}
return ret;
#else
static gotoblas_t *get_coretype(void) {
int implementer, variant, part, arch, revision, midr_el1;
+ char coremsg[128];
+
+#if (!defined OS_LINUX && !defined OS_ANDROID)
+ return NULL;
+#endif
-#if (defined OS_LINUX || defined OS_ANDROID)
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
- char coremsg[128];
+#ifdef __linux
+ FILE *infile;
+ char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
+ p = (char *) NULL ;
+ infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r");
+ if (!infile) return NULL;
+ fgets(buffer, sizeof(buffer), infile);
+ midr_el1=strtoul(buffer,NULL,16);
+ fclose(infile);
+#else
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
openblas_warning(1, coremsg);
return NULL;
- }
-#else
- return NULL;
#endif
-
- get_cpu_ftr(MIDR_EL1, midr_el1);
+ } else {
+ get_cpu_ftr(MIDR_EL1, midr_el1);
+ }
/*
* MIDR_EL1
*
return &gotoblas_FALKOR;
}
break;
+ default:
+ snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
+ openblas_warning(1, coremsg);
}
return NULL;
}
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
extern gotoblas_t gotoblas_POWER9;
#endif
-#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
- || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
-#define HAVE_P10_SUPPORT 1
-#endif
+//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
+// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
+//#define HAVE_P10_SUPPORT 1
+//#endif
#ifdef HAVE_P10_SUPPORT
extern gotoblas_t gotoblas_POWER10;
#endif
int get_num_procs(void) {
static int nums = 0;
+
+#if defined(__GLIBC_PREREQ)
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
-
-#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
$(LIBPREFIX).def : gensymbol
- perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+ perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
libgoto_hpl.def : gensymbol
- perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+ perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
ifeq ($(OSNAME), Darwin)
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
rm -f goto.$(SUFFIX)
osx.def : gensymbol ../Makefile.system ../getarch.c
- perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+ perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
aix.def : gensymbol ../Makefile.system ../getarch.c
- perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+ perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
objcopy.def : gensymbol ../Makefile.system ../getarch.c
- perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+ perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
objconv.def : gensymbol ../Makefile.system ../getarch.c
- perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
+ perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
test : linktest.c
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
zgeadd, dzsum);
-@cblasobjs = (lsame, xerbla);
-@halfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
+@blasobjs = (lsame, xerbla);
+@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@cblasobjsc = (
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
);
@cblasobjss = (
- cblas_sasum, cblas_saxpy,
+ cblas_sasum, cblas_saxpy, cblas_saxpby,
cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm,
cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg,
cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr,
@cblasobjs = ( cblas_xerbla );
-@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
+@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
@exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
cgeqrt, cgeqrt2, cgeqrt3, cgemqrt,
ctpqrt, ctpqrt2, ctpmqrt, ctprfb,
);
-@lapack2objszc = (
+@lapackobjs2zc = (
# ZCLASRC -- Double-single mixed precision complex routines called from
# single, single-extra and double precision complex LAPACK
# routines (i.e. from CLASRC, CXLASRC, ZLASRC).
cpotrs,
);
-@lapack2objsd = (
+@lapackobjs2d = (
# DLASRC -- Double precision real LAPACK routines
# already provided by @lapackobjs:
# dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri,
);
# functions added for lapack-3.6.0
-@lapack2objsc = ( @lapack2objsc,
+@lapackobjs2c = ( @lapackobjs2c,
cgejsv,
cgesvdx,
cgesvj,
csyr2,
cunm22,
);
-@lapackobjs2d = (@lapack2objsd,
+@lapackobjs2d = (@lapackobjs2d,
dbdsvdx,
dgesvdx,
dgetrf2,
dpotrf2,
dsecnd,
);
- @lapack2objss = (@lapack2objss,
+ @lapackobjs2s = (@lapackobjs2s,
sbdsvdx,
second,
sgesvdx,
sorm22,
spotrf2,
);
- @lapack2objsz = (@lapack2objsz,
+ @lapackobjs2z = (@lapackobjs2z,
zgejsv,
zgesvdx,
zgesvj,
zunm22,
);
# functions added for lapack-3.7.0
-@lapack2objss = (@lapack2objss,
+@lapackobjs2s = (@lapackobjs2s,
slarfy,
strevc3,
sgelqt,
stplqt2,
stpmlqt,
);
- @lapack2objsd = (@lapack2objsd,
+ @lapackobjs2d = (@lapackobjs2d,
dlarfy,
dsyconvf,
dtrevc3,
dtplqt2,
dtpmlqt,
);
- @lapack2objsc = (@lapack2objsc,
+ @lapackobjs2c = (@lapackobjs2c,
clarfy,
csyconvf,
ctrevc3,
ctplqt2,
ctpmlqt,
);
- @lapack2objsz = (@lapack2objsz,
+ @lapackobjs2z = (@lapackobjs2z,
zlarfy,
zsyconvf,
ztrevc3,
zlamswlq,
zgemlq,
);
- @lapack2objs = (@lapack2objs,
- sladiv1,
- dladiv1,
+ @lapackobjs2s = (@lapackobjs2s,
+ sladiv1);
+ @lapackobjs2d = (@lapackobjs2d,
+ dladiv1);
+ @lapackobjs = (@lapackobjs,
iparam2stage,
-
# functions added for lapack-3.8.0
-
ilaenv2stage,
);
# functions added for lapack-3.9.0
-@lapack2objsc = (@lapack2objsc,
+@lapackobjs2c = (@lapackobjs2c,
cgesvdq,
- cungtsqr,
- dcombssq,
+ cungtsqr
);
-@lapack2objsd = (@lapack2objsd,
+@lapackobjs2d = (@lapackobjs2d,
+ dcombssq,
dgesvdq,
dorgtsqr,
);
-@lapack2objss = (@lapack2objss,
+@lapackobjs2s = (@lapackobjs2s,
scombssq,
sgesvdq,
sorgtsqr,
);
-@lapack2objsz = (@lapack2objsz,
+@lapackobjs2z = (@lapackobjs2z,
zgesvdq,
zungtsqr
);
dlatzm, dtzrqf);
@lapack_deprecated_objss = (
+ sgelsx,
sgegs,
- sgegv,
+ sgegv,
+ sgeqpf,
+ sggsvd,
+ sggsvp,
+ slahrd,
+ slatzm,
+ stzrqf
);
-
+
+@lapack_deprecated_objsz = (
+ zgegs,
+ zgegv,
+ zgelsx,
+ zgeqpf,
+ zggsvd,
+ zggsvp,
+ zlahrd,
+ zlatzm,
+ ztzrqf
+ );
+
@lapacke_deprecated_objsc = (
LAPACKE_cggsvp,
LAPACKE_cggsvp_work,
my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");
if ($ARGV[12] == 1) {
- @blasobjs = (@blasobjs, @halfblasobjs);
- @cblasobjs = (@cblasobjs, @halfcblasobjs);
+ @blasobjs = (@blasobjs, @bfblasobjs);
+ @cblasobjs = (@cblasobjs, @bfcblasobjs);
}
if ($ARGV[13] == 1) {
@blasobjs = (@blasobjs, @blasobjss);
@cblasobjs = (@cblasobjs, @cblasobjss);
@lapackobjs = (@lapackobjs, @lapackobjss);
- @lapack2objs = (@lapack2objs, @lapack2objss);
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2s);
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc);
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds);
+ @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objss);
+ @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objss);
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s);
@lapackeobjs = (@lapackeobjs, @lapackeobjss);
}
@blasobjs = (@blasobjs, @blasobjsd);
@cblasobjs = (@cblasobjs, @cblasobjsd);
@lapackobjs = (@lapackobjs, @lapackobjsd);
- @lapack2objs = (@lapack2objs, @lapack2objsd);
+ if ($ARGV[13] == 0) {
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2ds);
+ }
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2d, @lapackobjs2dz);
+ @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsd);
+ @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsd);
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d);
@lapackeobjs = (@lapackeobjs, @lapackeobjsd);
}
@blasobjs = (@blasobjs, @blasobjsc);
@cblasobjs = (@cblasobjs, @cblasobjsc);
@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc);
- @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsc);
+ @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsc);
@lapackobjs = (@lapackobjs, @lapackobjsc);
- @lapack2objs = (@lapack2objs, @lapack2objsc, @lapac2objszc);
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2c, @lapackobjs2zc);
+ if ($ARGV[13] == 0) {
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2sc);
+ }
+ @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsc);
+ @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsc);
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c);
@lapackeobjs = (@lapackeobjs, @lapackeobjsc);
}
@blasobjs = (@blasobjs, @blasobjsz);
@cblasobjs = (@cblasobjs, @cblasobjsz);
@gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz);
- @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsz);
+ @cblasgemm3mobjs = (@cblasgemm3mobjs, @cblasgemm3mobjsz);
@lapackobjs = (@lapackobjs, @lapackobjsz);
- @lapack2objs = (@lapack2objs, @lapack2objsz, @lapack2objszc);
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2z);
+ if ($ARGV[15] == 0) {
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2zc);
+ }
+ if ($ARGV[14] == 0) {
+ @lapackobjs2 = (@lapackobjs2, @lapackobjs2dz);
+ }
+ @lapack_deprecated_objs = (@lapack_deprecated_objs, @lapack_deprecated_objsz);
+ @lapacke_deprecated_objs = (@lapacke_deprecated_objs, @lapacke_deprecated_objsz);
@lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_z);
@lapackeobjs = (@lapackeobjs, @lapackeobjsz);
}
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
"pathf90", "pathf95",
"pgf95", "pgf90", "pgf77",
- "flang",
+ "flang", "egfortran",
"ifort");
OUTER:
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
- "-DFMA3"
+ "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
- "-DFMA3"
+ "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#else
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
- "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
+ "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
#define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX"
#endif
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
- "-DFMA3"
+ "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#else
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
- "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
+ "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake"
#define LIBNAME "cooperlake"
#define CORENAME "COOPERLAKE"
#endif
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
- "-DHAVE_AVX -DHAVE_FMA3 -DFMA3"
+ "-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "zen"
#define CORENAME "ZEN"
#endif
#else
#endif
+#ifdef FORCE_VORTEX /* build-time forced target: VORTEX core, reported as architecture ARM64 */
+#define FORCE /* tested later in this file, e.g. `#if !defined(FORCE)` guards the get_features() call */
+#define ARCHITECTURE "ARM64"
+#define SUBARCHITECTURE "VORTEX"
+#define SUBDIRNAME "arm64"
+#define ARCHCONFIG "-DVORTEX " \
+ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME "vortex" /* lower-case name of the target */
+#define CORENAME "VORTEX" /* upper-case name of the target */
+#endif /* FORCE_VORTEX */
+
#ifdef FORCE_ZARCH_GENERIC
#define FORCE
#define ARCHITECTURE "ZARCH"
printf("NUM_CORES=%d\n", get_num_cores());
-#if defined(__arm__) && !defined(FORCE)
+#if defined(__arm__)
+#if !defined(FORCE)
+ fprintf(stderr,"get features!\n");
get_features();
+#else
+ fprintf(stderr,"split archconfig!\n");
+ sprintf(buffer, "%s", ARCHCONFIG);
+
+ p = &buffer[0];
+
+ while (*p) {
+ if ((*p == '-') && (*(p + 1) == 'D')) {
+ p += 2;
+ if (*p != 'H') {
+ while( (*p != ' ') && (*p != '-') && (*p != '\0') && (*p != '\n')) {p++; }
+ if (*p == '-') continue;
+ }
+ while ((*p != ' ') && (*p != '\0')) {
+
+ if (*p == '=') {
+ printf("=");
+ p ++;
+ while ((*p != ' ') && (*p != '\0')) {
+ printf("%c", *p);
+ p ++;
+ }
+ } else {
+ printf("%c", *p);
+ p ++;
+ if ((*p == ' ') || (*p =='\0')) printf("=1\n");
+ }
+ }
+ } else p ++;
+ }
+#endif
#endif
ifeq ($(BUILD_BFLOAT16),1)
SBBLAS1OBJS = sbdot.$(SUFFIX)
+SBBLAS2OBJS = sbgemv.$(SUFFIX)
SBBLAS3OBJS = sbgemm.$(SUFFIX)
SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
endif
ifeq ($(BUILD_BFLOAT16),1)
CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX)
+CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX)
CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX)
CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
endif
SBLAS2OBJS += $(CSBLAS2OBJS)
SBLAS3OBJS += $(CSBLAS3OBJS)
SBBLAS1OBJS += $(CSBBLAS1OBJS)
+SBBLAS2OBJS += $(CSBBLAS2OBJS)
SBBLAS3OBJS += $(CSBBLAS3OBJS)
DBLAS1OBJS += $(CDBLAS1OBJS)
DBLAS2OBJS += $(CDBLAS2OBJS)
endif
SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
-SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS)
+SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS)
DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
endif
FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
-$(info FUNCOBJS = {[$(FUNCOBJS)]} )
+
ifdef EXPRECISION
FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS)
endif
level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
-level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
+level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS)
xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c
$(CC) -c $(CFLAGS) -DCONJ $< -o $(@F)
+ifeq ($(BUILD_BFLOAT16),1)
+sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+endif
+
ifndef USE_NETLIB_GEMV
sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c
$(CC) -c $(CFLAGS) -o $(@F) $<
cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c
$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
+ifeq ($(BUILD_BFLOAT16),1)
+cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c
+ $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
+endif
+
cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c
$(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $<
}
#endif
- //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta);
if ((m==0) || (n==0)) return;
lenx = n;
--- /dev/null
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include "l1param.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#define ERROR_NAME "SBGEMV "
+
+#ifdef SMP /* the threaded drivers are only referenced when SMP is defined */
+static int (*sbgemv_thread[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG, int) = { /* indexed by the decoded trans value (0 or 1); the trailing int receives nthreads at the call site */
+ sbgemv_thread_n, sbgemv_thread_t,
+};
+#endif
+
+#ifndef CBLAS
+
+/*
+ * Fortran-style SBGEMV entry point (all arguments passed by reference).
+ *
+ * a and x point to bfloat16 data, y to single precision data; alpha and beta
+ * are single precision scalars. Presumably this is the BLAS GEMV operation
+ * y := alpha*op(A)*x + beta*y -- the arithmetic itself lives in the
+ * SBGEMV_N / SBGEMV_T kernels this function dispatches to.
+ *
+ * TRANS is case-insensitive: 'N'/'R' select kernel 0, 'T'/'C' select kernel 1.
+ * Invalid arguments are reported through xerbla with the position of the
+ * first offending argument (1, 2, 3, 6, 8 or 11) and the call returns
+ * without touching y.
+ */
+void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasint *LDA, bfloat16 *x, blasint *INCX, float *BETA, float *y, blasint *INCY)
+{
+  char    trans = *TRANS;
+  blasint m     = *M;
+  blasint n     = *N;
+  blasint lda   = *LDA;
+  blasint incx  = *INCX;
+  blasint incy  = *INCY;
+  float   alpha = *ALPHA;
+  float   beta  = *BETA;
+#ifdef SMP
+  int nthreads;
+#endif
+
+  /* Single-threaded kernels, indexed by the decoded transpose flag. */
+  int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = {
+    SBGEMV_N, SBGEMV_T,
+  };
+
+  blasint info = 0;
+  blasint lenx, leny;
+  blasint i = -1;
+
+  PRINT_DEBUG_NAME;
+
+  TOUPPER(trans);
+
+  /* Decode the transpose character into a kernel index (-1 = invalid). */
+  switch (trans) {
+    case 'N':
+    case 'R':
+      i = 0;
+      break;
+    case 'T':
+    case 'C':
+      i = 1;
+      break;
+    default:
+      break;
+  }
+
+  /* Report the lowest-numbered invalid argument. */
+  if (i < 0)                 {info = 1;}
+  else if (m < 0)            {info = 2;}
+  else if (n < 0)            {info = 3;}
+  else if (lda < MAX(1, m))  {info = 6;}
+  else if (incx == 0)        {info = 8;}
+  else if (incy == 0)        {info = 11;}
+
+  trans = i;
+
+  if (info != 0) {
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    return;
+  }
+
+#else
+
+/*
+ * CBLAS SBGEMV entry point (arguments passed by value).
+ *
+ * Column-major calls map NoTrans/ConjNoTrans to kernel 0 and Trans/ConjTrans
+ * to kernel 1. Any other order value is handled as row-major: the kernel
+ * choice is inverted and m and n are exchanged before validation.
+ * Invalid arguments are reported through xerbla exactly as in the Fortran
+ * entry point.
+ */
+void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, float alpha, bfloat16 *a, blasint lda, bfloat16 *x, blasint incx, float beta, float *y, blasint incy)
+{
+  blasint lenx, leny;
+  int trans = -1;
+  blasint info = -1, t;
+#ifdef SMP
+  int nthreads;
+#endif
+
+  /* Single-threaded kernels, indexed by the decoded transpose flag. */
+  int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = {
+    SBGEMV_N, SBGEMV_T,
+  };
+
+  PRINT_DEBUG_CNAME;
+
+  /* Column-major interpretation of TransA (-1 stays for unknown values). */
+  if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) {
+    trans = 0;
+  } else if (TransA == CblasTrans || TransA == CblasConjTrans) {
+    trans = 1;
+  }
+
+  if (order != CblasColMajor) { // Row Major
+    /* A row-major matrix is the transpose of a column-major one. */
+    if (trans >= 0) trans = 1 - trans;
+
+    t = n;
+    n = m;
+    m = t;
+  }
+
+  /* Report the lowest-numbered invalid argument; -1 means "all valid". */
+  if (trans < 0)             {info = 1;}
+  else if (m < 0)            {info = 2;}
+  else if (n < 0)            {info = 3;}
+  else if (lda < MAX(1, m))  {info = 6;}
+  else if (incx == 0)        {info = 8;}
+  else if (incy == 0)        {info = 11;}
+
+  if (info >= 0) {
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    return;
+  }
+
+#endif
+
+  /* Nothing to do for an empty matrix. */
+  if ((m==0) || (n==0)) return;
+
+  /* Vector lengths depend on which kernel was selected. */
+  lenx = trans ? m : n;
+  leny = trans ? n : m;
+
+  /* alpha == 0: only the scaling of y by beta remains. */
+  if (alpha == ZERO) {
+    if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
+    return;
+  }
+
+  IDEBUG_START;
+  FUNCTION_PROFILE_START();
+
+  /* For negative strides move the base pointer to the other end of the vector. */
+  if (incx < 0) {x -= (lenx - 1) * incx;}
+  if (incy < 0) {y -= (leny - 1) * incy;}
+
+#ifdef SMP
+  /* Stay single-threaded unless the relevant dimension exceeds this threshold. */
+  int thread_thres_row = 20480;
+  nthreads = ((trans ? n : m) <= thread_thres_row) ? 1 : num_cpu_avail(1);
+
+  if (nthreads != 1) {
+    (sbgemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads);
+  } else
+#endif
+  {
+    (sbgemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy);
+  }
+
+  FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
+  IDEBUG_END;
+
+  return;
+}
TOPDIR = ..
include $(TOPDIR)/Makefile.system
-ifdef HAVE_SSE3
-CFLAGS += -msse3
-endif
-ifdef HAVE_SSSE3
-CFLAGS += -mssse3
-endif
-
-ifeq ($(C_COMPILER), GCC)
-GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
-endif
-
ifeq ($(ARCH), power)
ifeq ($(C_COMPILER), CLANG)
override CFLAGS += -fno-integrated-as
endif
endif
+
AVX2OPT =
ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0
- GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
- GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
- ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
AVX2OPT = -mavx2
endif
endif
ifeq ($(C_COMPILER), CLANG)
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
- GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
- GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
- ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2), 11)
+ GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2)
+ ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
AVX2OPT = -mavx2
endif
endif
endif
ifdef TARGET_CORE
- ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3))
- override CFLAGS += -msse3 -mssse3
-endif
ifeq ($(TARGET_CORE), COOPERLAKE)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1)
XGEMVTKERNEL = zgemv_t.S
endif
+ifeq ($(BUILD_BFLOAT16),1)
+ifndef SBGEMVNKERNEL
+SBGEMVNKERNEL = ../x86_64/sbgemv_n.c
+endif
+
+ifndef SBGEMVTKERNEL
+SBGEMVTKERNEL = ../x86_64/sbgemv_t.c
+endif
+endif
+
### GER ###
ifndef SGERKERNEL
xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \
xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX)
+ifeq ($(BUILD_BFLOAT16),1)
+SBBLASOBJS += \
+ sbgemv_n$(TSUFFIX).$(SUFFIX) \
+ sbgemv_t$(TSUFFIX).$(SUFFIX)
+endif
+
ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" ""
$(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@
$(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@
+ifeq ($(BUILD_BFLOAT16),1)
+# bfloat16 GEMV kernels (non-transposed and transposed). Each rule builds both
+# the regular object and its profiled (.$(PSUFFIX)) counterpart from the same
+# kernel source. Both target names carry $(TSUFFIX), matching the names listed
+# in SBBLASOBJS and the neighbouring sgemv/xhemv rules.
+$(KDIR)sbgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVNKERNEL)
+	$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
+$(KDIR)sbgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVTKERNEL)
+	$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
+endif
if (inc_x == 1)
{
#if V_SIMD
+#ifdef DOUBLE
+ const int vstep = v_nlanes_f64;
+ const int unrollx2 = n & (-vstep * 2);
+ const int unrollx = n & -vstep;
+ v_f64 vsum0 = v_zero_f64();
+ v_f64 vsum1 = v_zero_f64();
+ while (i < unrollx2)
+ {
+ vsum0 = v_add_f64(vsum0, v_loadu_f64(x));
+ vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep));
+ i += vstep * 2;
+ }
+ vsum0 = v_add_f64(vsum0, vsum1);
+ while (i < unrollx)
+ {
+ vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i));
+ i += vstep;
+ }
+ sumf = v_sum_f64(vsum0);
+#else
const int vstep = v_nlanes_f32;
const int unrollx4 = n & (-vstep * 4);
const int unrollx = n & -vstep;
i += vstep;
}
sumf = v_sum_f32(vsum0);
+#endif
#else
int n1 = n & -4;
for (; i < n1; i += 4)
i++ ;
}
-#if !defined(__POWER__)
+#if !defined(__PPC__)
CREAL(result) = dot[0];
CIMAG(result) = dot[1];
#else
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_power10.c
-DGEMMINCOPY = ../generic/gemm_ncopy_16.c
-DGEMMITCOPY = dgemm_tcopy_16_power8.S
-DGEMMONCOPY = dgemm_ncopy_4_power8.S
-DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMINCOPY =
+DGEMMITCOPY =
+DGEMMONCOPY = dgemm_ncopy_8_power10.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+DGEMMINCOPYOBJ =
+DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c
#
-SAXPYKERNEL = saxpy.c
+SAXPYKERNEL = saxpy_power10.c
DAXPYKERNEL = daxpy_power10.c
-ifneq ($(GCCVERSIONGTEQ9),1)
-CAXPYKERNEL = caxpy_power9.S
-else
-CAXPYKERNEL = caxpy.c
-endif
+CAXPYKERNEL = caxpy_power10.c
ZAXPYKERNEL = zaxpy_power10.c
#
-SCOPYKERNEL = scopy.c
+SCOPYKERNEL = scopy_power10.c
DCOPYKERNEL = dcopy_power10.c
-CCOPYKERNEL = ccopy.c
+CCOPYKERNEL = ccopy_power10.c
ZCOPYKERNEL = zcopy_power10.c
#
-SDOTKERNEL = sdot.c
-DDOTKERNEL = ddot.c
-DSDOTKERNEL = sdot.c
+SDOTKERNEL = sdot_power10.c
+DDOTKERNEL = ddot_power10.c
+DSDOTKERNEL = sdot_power10.c
ifneq ($(GCCVERSIONGTEQ9),1)
CDOTKERNEL = cdot_power9.S
else
--- /dev/null
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1 /* an optimized caxpy_kernel_8 is provided below.
+ *
+ * caxpy_kernel_8: inline-assembly inner kernel that updates y in place from
+ * x, alpha_r and alpha_i (single precision complex AXPY building block).
+ *
+ *   n        number of complex elements to process. The code handles 16
+ *            elements (128 bytes of x and 128 bytes of y) per step and always
+ *            executes at least one step, so n has to be a positive multiple
+ *            of 16.
+ *   x, y     source and destination vectors, 128 bytes consumed per step.
+ *   alpha_r  real part of the scalar, splatted into vs32.
+ *   alpha_i  imaginary part of the scalar, splatted into vs33.
+ *
+ * Without CONJ the sign vector mvec multiplies alpha_i, with CONJ it
+ * multiplies alpha_r instead.
+ */
+static void caxpy_kernel_8 (long n, float *x, float *y,
+ float alpha_r, float alpha_i)
+{
+#if !defined(CONJ)
+ static const float mvec[4] = { -1.0, 1.0, -1.0, 1.0 };
+#else
+ static const float mvec[4] = { 1.0, -1.0, 1.0, -1.0 };
+#endif
+ const float *mvecp = mvec;
+ /* Permute control used by xxperm to exchange the real and imaginary part of
+  * each element. NOTE(review): the original remark here read "We have to load
+  * reverse mask for big endian"; the commented-out initializer below looks like
+  * the alternative byte order -- confirm which endianness each one targets. */
+ /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */
+
+ __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
+ long ytmp; // scratch register: receives a copy of y and is used as the store pointer
+
+ __asm__
+ (
+ "xscvdpspn 32, %7 \n\t" // alpha_r: double -> single precision
+ "xscvdpspn 33, %8 \n\t" // alpha_i: double -> single precision
+ "xxspltw 32, 32, 0 \n\t" // broadcast alpha_r to every word of vs32
+ "xxspltw 33, 33, 0 \n\t" // broadcast alpha_i to every word of vs33
+ "lxvd2x 36, 0, %9 \n\t" // mvec
+
+#if !defined(CONJ)
+ "xvmulsp 33, 33, 36 \n\t" // alpha_i * mvec
+#else
+ "xvmulsp 32, 32, 36 \n\t" // alpha_r * mvec
+#endif
+ "mr %4, %3 \n\t" // ytmp = y (operand 9 shares this register; mvec is already loaded)
+ "dcbt 0, %2 \n\t" // cache touch hint for x
+ "dcbt 0, %3 \n\t" // cache touch hint for y
+
+ "lxvp 40, 0(%2) \n\t" // x0
+ "lxvp 42, 32(%2) \n\t" // x2
+ "lxvp 48, 0(%3) \n\t" // y0
+ "lxvp 50, 32(%3) \n\t" // y2
+
+ "xxperm 52, 40, %x10 \n\t" // exchange real and imag part
+ "xxperm 53, 41, %x10 \n\t" // exchange real and imag part
+ "xxperm 54, 42, %x10 \n\t" // exchange real and imag part
+ "xxperm 55, 43, %x10 \n\t" // exchange real and imag part
+
+ "lxvp 44, 64(%2) \n\t" // x4
+ "lxvp 46, 96(%2) \n\t" // x6
+ "lxvp 34, 64(%3) \n\t" // y4
+ "lxvp 38, 96(%3) \n\t" // y6
+
+ "xxperm 56, 44, %x10 \n\t" // exchange real and imag part
+ "xxperm 57, 45, %x10 \n\t" // exchange real and imag part
+ "xxperm 58, 46, %x10 \n\t" // exchange real and imag part
+ "xxperm 59, 47, %x10 \n\t" // exchange real and imag part
+
+ "addi %2, %2, 128 \n\t" // advance x load pointer past the first 128 bytes
+ "addi %3, %3, 128 \n\t" // advance y load pointer past the first 128 bytes
+
+ "addic. %1, %1, -16 \n\t" // n -= 16, condition recorded in cr0
+ "ble two%= \n\t" // nothing left to preload: go straight to the final step
+
+ ".align 5 \n"
+ "one%=: \n\t" // main loop: compute the loaded data while loading the next 128 bytes
+
+ "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
+ "xvmaddasp 49, 41, 32 \n\t"
+ "lxvp 40, 0(%2) \n\t" // x0
+ "xvmaddasp 50, 42, 32 \n\t"
+ "xvmaddasp 51, 43, 32 \n\t"
+ "lxvp 42, 32(%2) \n\t" // x2
+
+ "xvmaddasp 34, 44, 32 \n\t"
+ "xvmaddasp 35, 45, 32 \n\t"
+ "lxvp 44, 64(%2) \n\t" // x4
+ "xvmaddasp 38, 46, 32 \n\t"
+ "xvmaddasp 39, 47, 32 \n\t"
+ "lxvp 46, 96(%2) \n\t" // x6
+
+ "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
+ "addi %2, %2, 128 \n\t"
+ "xvmaddasp 49, 53, 33 \n\t"
+ "xvmaddasp 50, 54, 33 \n\t"
+ "xvmaddasp 51, 55, 33 \n\t"
+
+ "xvmaddasp 34, 56, 33 \n\t"
+ "xvmaddasp 35, 57, 33 \n\t"
+ "xvmaddasp 38, 58, 33 \n\t"
+ "xvmaddasp 39, 59, 33 \n\t"
+
+ "stxvp 48, 0(%4) \n\t" // write the finished 128 bytes of y through ytmp
+ "stxvp 50, 32(%4) \n\t"
+ "stxvp 34, 64(%4) \n\t"
+ "stxvp 38, 96(%4) \n\t"
+
+ "addi %4, %4, 128 \n\t" // advance the store pointer
+ "xxperm 52, 40, %x10 \n\t" // exchange real and imag part
+ "xxperm 53, 41, %x10 \n\t" // exchange real and imag part
+
+ "lxvp 48, 0(%3) \n\t" // y0
+ "xxperm 54, 42, %x10 \n\t" // exchange real and imag part
+ "xxperm 55, 43, %x10 \n\t" // exchange real and imag part
+ "lxvp 50, 32(%3) \n\t" // y2
+
+ "xxperm 56, 44, %x10 \n\t" // exchange real and imag part
+ "xxperm 57, 45, %x10 \n\t" // exchange real and imag part
+ "lxvp 34, 64(%3) \n\t" // y4
+ "xxperm 58, 46, %x10 \n\t" // exchange real and imag part
+ "xxperm 59, 47, %x10 \n\t" // exchange real and imag part
+ "lxvp 38, 96(%3) \n\t" // y6
+
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t" // n -= 16
+ "bgt one%= \n" // keep looping while elements remain after the preloaded ones
+
+ "two%=: \n\t" // final step: finish the data that is already in registers
+ "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
+ "xvmaddasp 49, 41, 32 \n\t"
+ "xvmaddasp 50, 42, 32 \n\t"
+ "xvmaddasp 51, 43, 32 \n\t"
+
+ "xvmaddasp 34, 44, 32 \n\t"
+ "xvmaddasp 35, 45, 32 \n\t"
+ "xvmaddasp 38, 46, 32 \n\t"
+ "xvmaddasp 39, 47, 32 \n\t"
+
+ "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
+ "xvmaddasp 49, 53, 33 \n\t"
+ "xvmaddasp 50, 54, 33 \n\t"
+ "xvmaddasp 51, 55, 33 \n\t"
+
+ "xvmaddasp 34, 56, 33 \n\t"
+ "xvmaddasp 35, 57, 33 \n\t"
+ "xvmaddasp 38, 58, 33 \n\t"
+ "xvmaddasp 39, 59, 33 \n\t"
+
+ "stxvp 48, 0(%4) \n\t"
+ "stxvp 50, 32(%4) \n\t"
+ "stxvp 34, 64(%4) \n\t"
+ "stxvp 38, 96(%4) \n\t"
+
+ "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" // emitted as an assembler comment listing the operand mapping
+ :
+ "+m" (*y), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y), // 3
+ "=b" (ytmp) // 4
+ :
+ "m" (*x), // 5
+ "m" (*mvecp), // 6
+ "d" (alpha_r), // 7
+ "d" (alpha_i), // 8
+ "4" (mvecp), // 9
+ "wa" (mask) // 10
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
+ "vs56","vs57","vs58","vs59"
+ );
+}
--- /dev/null
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "caxpy_microk_power10.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
+/*
+ * Portable C version of caxpy_kernel_8, compiled only when no optimized
+ * kernel has defined HAVE_KERNEL_8.
+ *
+ * Updates y in place from x and the scalar (da_r, da_i); x and y hold
+ * interleaved real/imaginary values. Two complex elements are handled per
+ * outer step, so n is expected to be even (the caller passes a multiple of
+ * 16). With CONJ defined the alternative sign pattern is applied.
+ */
+static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i)
+{
+  BLASLONG done;   /* complex elements processed so far */
+  BLASLONG base;   /* array index of the first value of the current pair */
+  BLASLONG k;      /* array index of the element being updated */
+
+  for (done = 0, base = 0; done < n; done += 2, base += 4)
+  {
+    /* Two consecutive complex elements: real part first, then imaginary. */
+    for (k = base; k < base + 4; k += 2)
+    {
+#if !defined(CONJ)
+      y[k]   += ( da_r * x[k]   - da_i * x[k+1] ) ;
+      y[k+1] += ( da_r * x[k+1] + da_i * x[k]   ) ;
+#else
+      y[k]   += ( da_r * x[k]   + da_i * x[k+1] ) ;
+      y[k+1] -= ( da_r * x[k+1] - da_i * x[k]   ) ;
+#endif
+    }
+  }
+}
+
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ /*
+  * Complex AXPY driver: y += alpha * x, or alpha * conj(x) when CONJ is
+  * defined, with alpha = da_r + i*da_i.
+  *
+  * n            number of complex elements; nothing is done when n <= 0
+  * da_r, da_i   real and imaginary part of alpha
+  * x, inc_x     source vector (interleaved re/im) and its stride in
+  *              complex elements
+  * y, inc_y     destination vector and its stride in complex elements
+  * dummy0, dummy1, dummy, dummy2 are not read here.
+  *
+  * Always returns 0.
+  */
+ BLASLONG k;
+ BLASLONG xi = 0, yi = 0;
+ BLASLONG bulk;
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x != 1) || (inc_y != 1) )
+ {
+  /* Strided path: convert the strides from complex elements to FLOATs. */
+  inc_x *= 2;
+  inc_y *= 2;
+
+  for (k = 0; k < n; k++)
+  {
+#if !defined(CONJ)
+   y[yi] += ( da_r * x[xi] - da_i * x[xi+1] ) ;
+   y[yi+1] += ( da_r * x[xi+1] + da_i * x[xi] ) ;
+#else
+   y[yi] += ( da_r * x[xi] + da_i * x[xi+1] ) ;
+   y[yi+1] -= ( da_r * x[xi+1] - da_i * x[xi] ) ;
+#endif
+   xi += inc_x ;
+   yi += inc_y ;
+  }
+  return(0);
+ }
+
+ /* Contiguous path: the kernel takes the largest multiple of 16 elements. */
+ bulk = n & -16;
+ if ( bulk )
+ {
+  caxpy_kernel_8 (bulk, x, y, da_r, da_i);
+ }
+
+ /* Scalar tail; x and y share one FLOAT index because both are unit stride. */
+ xi = 2 * bulk;
+ for (k = bulk; k < n; k++)
+ {
+#if !defined(CONJ)
+  y[xi] += ( da_r * x[xi] - da_i * x[xi+1] ) ;
+  y[xi+1] += ( da_r * x[xi+1] + da_i * x[xi] ) ;
+#else
+  y[xi] += ( da_r * x[xi] + da_i * x[xi+1] ) ;
+  y[xi+1] -= ( da_r * x[xi+1] - da_i * x[xi] ) ;
+#endif
+  xi += 2;
+ }
+ return(0);
+}
+
+
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "copy_microk_power10.c"
+#endif
+
+#ifndef HAVE_KERNEL
+
+static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+ /*
+  * Portable C fallback, compiled only when no micro-kernel has defined
+  * HAVE_KERNEL.  Each pass moves eight consecutive FLOATs from x to y and
+  * counts them as four elements of n.  All eight values are read into a
+  * staging buffer before any of them is written back.
+  */
+ FLOAT stage[8];
+ BLASLONG done;
+ BLASLONG k;
+
+ for (done = 0; done < n; done += 4)
+ {
+  for (k = 0; k < 8; k++)
+   stage[k] = x[k];
+
+  for (k = 0; k < 8; k++)
+   y[k] = stage[k];
+
+  x += 8;
+  y += 8;
+ }
+}
+
+
+#endif
+
+
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ /*
+  * Complex COPY driver: copies n complex elements (interleaved re/im
+  * FLOAT pairs) from x to y.
+  *
+  * n          number of complex elements; nothing is done when n <= 0
+  * x, inc_x   source vector and its stride in complex elements
+  * y, inc_y   destination vector and its stride in complex elements
+  *
+  * Always returns 0.
+  */
+ BLASLONG i=0;
+ BLASLONG ix=0,iy=0;
+
+ if ( n <= 0 ) return(0);
+
+ if ( (inc_x == 1) && (inc_y == 1 ))
+ {
+
+  /* Contiguous path: the kernel takes the largest multiple of 64 elements. */
+  BLASLONG n1 = n & -64;
+  if ( n1 > 0 )
+  {
+   copy_kernel(n1, x, y);
+   i=n1;
+   /* two FLOATs per complex element */
+   ix=n1*2;
+   iy=n1*2;
+  }
+
+  /* Scalar tail for the remaining n - n1 elements. */
+  while(i < n)
+  {
+   /* The source is indexed with ix (the original read x[iy] here; the
+      two indices advance in lockstep in this branch, so the value read
+      is unchanged, but ix is the index that belongs to x). */
+   y[iy] = x[ix] ;
+   y[iy+1] = x[ix+1] ;
+   ix+=2;
+   iy+=2;
+   i++ ;
+
+  }
+
+
+ }
+ else
+ {
+
+  /* Strided path: convert the strides from complex elements to FLOATs. */
+  BLASLONG inc_x2 = 2 * inc_x;
+  BLASLONG inc_y2 = 2 * inc_y;
+
+  while(i < n)
+  {
+   y[iy] = x[ix] ;
+   y[iy+1] = x[ix+1] ;
+   ix += inc_x2 ;
+   iy += inc_y2 ;
+   i++ ;
+
+  }
+
+ }
+ return(0);
+
+
+}
+
+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-#define HAVE_KERNEL_64 1
+#define HAVE_KERNEL 1
-static void dcopy_kernel_64 (long n, double *x, double *y)
+static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__
(
"lxvp 60, 448(%2) \n\t"
"lxvp 62, 480(%2) \n\t"
"addi %2, %2, 512 \n\t"
-
+#if !defined(COMPLEX) && !defined(DOUBLE)
+ "addic. %1, %1, -128 \n\t"
+#elif defined(COMPLEX) && defined(DOUBLE)
+ "addic. %1, %1, -32 \n\t"
+#else
"addic. %1, %1, -64 \n\t"
+#endif
"ble two%= \n\t"
".align 5 \n"
"addi %3, %3, 512 \n\t"
"addi %2, %2, 512 \n\t"
+#if !defined(COMPLEX) && !defined(DOUBLE)
+ "addic. %1, %1, -128 \n\t"
+#elif defined(COMPLEX) && defined(DOUBLE)
+ "addic. %1, %1, -32 \n\t"
+#else
"addic. %1, %1, -64 \n\t"
+#endif
"bgt one%= \n"
"two%=: \n\t"
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
- "+b" (y) // 3
+ "+b" (y) // 3
:
"m" (*x)
:
#endif
#ifdef __64BIT__
-#define STACKSIZE 400
#define STACKSIZE 592
#define ALPHA_R_SP 304+192(SP)
#define ALPHA_I_SP 312+192(SP)
#else
-#define STACKSIZE 256
#define STACKSIZE 452
#define ALPHA_R_SP 224+196(SP)
#define ALPHA_I_SP 232+196(SP)
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
-#include "dcopy_microk_power10.c"
+#include "copy_microk_power10.c"
#endif
-#ifndef HAVE_KERNEL_64
+#ifndef HAVE_KERNEL
-static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
+static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i=0;
BLASLONG n1 = n & -64;
if ( n1 > 0 )
{
- dcopy_kernel_64(n1, x, y);
+ copy_kernel(n1, x, y);
i=n1;
}
--- /dev/null
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+
+/*
+ * Inline-assembly dot-product kernel: returns the sum of x[i] * y[i] over
+ * n doubles, handled in blocks of 16.
+ *
+ * The first block of 16 doubles is loaded from both x and y before n is
+ * tested, and n is decremented by 16 per block, so the code expects n to be
+ * a positive multiple of 16.  The ddot driver that includes this kernel
+ * passes n & -16 and skips the call when that is zero.
+ *
+ * Register use: vs32-vs39 are eight partial-sum accumulators, vs40-vs47
+ * hold the current block of x and vs48-vs55 the current block of y.  All of
+ * them, plus cr0, are listed as clobbers.
+ */
+static double ddot_kernel_8 (long n, double *x, double *y)
+{
+ double dot;
+
+ __asm__
+ (
+ /* cache-touch hints for the start of x and y */
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+
+ /* clear the eight accumulators (register XOR with itself) */
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ /* preload the first 128 bytes (16 doubles) of x and of y as register pairs */
+ "lxvp 40, 0(%2) \n\t"
+ "lxvp 42, 32(%2) \n\t"
+ "lxvp 44, 64(%2) \n\t"
+ "lxvp 46, 96(%2) \n\t"
+ "lxvp 48, 0(%3) \n\t"
+ "lxvp 50, 32(%3) \n\t"
+ "lxvp 52, 64(%3) \n\t"
+ "lxvp 54, 96(%3) \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ /* if this was the only block, go straight to the final accumulate */
+ "addic. %1, %1, -16 \n\t"
+ "ble two%= \n\t"
+
+ ".align 5 \n"
+ "one%=: \n\t"
+
+ /* main loop: multiply-accumulate the block already in registers while
+    reloading those registers with the next block */
+ "xvmaddadp 32, 40, 48 \n\t"
+ "xvmaddadp 33, 41, 49 \n\t"
+ "lxvp 40, 0(%2) \n\t"
+ "lxvp 48, 0(%3) \n\t"
+ "xvmaddadp 34, 42, 50 \n\t"
+ "xvmaddadp 35, 43, 51 \n\t"
+ "lxvp 42, 32(%2) \n\t"
+ "lxvp 50, 32(%3) \n\t"
+ "xvmaddadp 36, 44, 52 \n\t"
+ "xvmaddadp 37, 45, 53 \n\t"
+ "lxvp 44, 64(%2) \n\t"
+ "lxvp 52, 64(%3) \n\t"
+ "xvmaddadp 38, 46, 54 \n\t"
+ "xvmaddadp 39, 47, 55 \n\t"
+ "lxvp 46, 96(%2) \n\t"
+ "lxvp 54, 96(%3) \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt one%= \n"
+
+ "two%=: \n\t"
+
+ /* accumulate the last loaded block; no further loads */
+ "xvmaddadp 32, 40, 48 \n\t"
+ "xvmaddadp 33, 41, 49 \n\t"
+ "xvmaddadp 34, 42, 50 \n\t"
+ "xvmaddadp 35, 43, 51 \n\t"
+ "xvmaddadp 36, 44, 52 \n\t"
+ "xvmaddadp 37, 45, 53 \n\t"
+ "xvmaddadp 38, 46, 54 \n\t"
+ "xvmaddadp 39, 47, 55 \n\t"
+
+ /* pairwise reduction of the eight accumulators into vs32 */
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+ /* XXSWAPD_S is a macro defined outside this file; presumably it places
+    the doubleword-swapped vs32 in vs33 so the scalar add below sums both
+    lanes -- TODO confirm against its definition */
+ XXSWAPD_S(33,32)
+
+ "xsadddp %x0, 32, 33 \n"
+
+ "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n"
+ :
+ "=d" (dot), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ /* memory inputs: tell the compiler the asm reads through x and y */
+ "m" (*x),
+ "m" (*y)
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55"
+ );
+
+ return dot;
+}
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "ddot_microk_power10.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_8
+
+static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y)
+{
+ /*
+  * Portable C fallback, compiled only when no micro-kernel has defined
+  * HAVE_KERNEL_8.  Returns the sum of x[i] * y[i] over unit-stride data,
+  * consuming eight elements per pass.
+  */
+ FLOAT acc = 0.0;
+ BLASLONG base;
+
+ for (base = 0; base < n; base += 8)
+ {
+  /* the eight products are summed left to right, then added to acc */
+  acc += y[base] * x[base]
+       + y[base+1] * x[base+1]
+       + y[base+2] * x[base+2]
+       + y[base+3] * x[base+3]
+       + y[base+4] * x[base+4]
+       + y[base+5] * x[base+5]
+       + y[base+6] * x[base+6]
+       + y[base+7] * x[base+7] ;
+ }
+
+ return acc;
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ /*
+  * DOT driver: returns the sum of x[i*inc_x] * y[i*inc_y] for i in [0, n).
+  *
+  * n          number of elements; 0.0 is returned when n <= 0
+  * x, inc_x   first vector and its stride in elements
+  * y, inc_y   second vector and its stride in elements
+  */
+ BLASLONG k = 0;
+ BLASLONG xi = 0, yi = 0;
+ BLASLONG bulk;
+ FLOAT result = 0.0 ;
+ FLOAT even_sum = 0.0;
+ FLOAT odd_sum = 0.0;
+
+ if ( n <= 0 ) return(result);
+
+ if ( (inc_x == 1) && (inc_y == 1) )
+ {
+  /* Contiguous path: the kernel takes the largest multiple of 16 elements. */
+  bulk = n & -16;
+
+  if ( bulk )
+   result = ddot_kernel_8(bulk, x, y);
+
+  /* scalar tail */
+  for (k = bulk; k < n; k++)
+  {
+   result += y[k] * x[k] ;
+  }
+  return(result);
+ }
+
+ /* Strided path: four products per pass, split over two running sums. */
+ bulk = n & -4;
+
+ for ( ; k < bulk; k += 4)
+ {
+  FLOAT p0 = y[yi] * x[xi] ;
+  FLOAT p1 = y[yi+inc_y] * x[xi+inc_x] ;
+
+  FLOAT p2 = y[yi+2*inc_y] * x[xi+2*inc_x] ;
+  FLOAT p3 = y[yi+3*inc_y] * x[xi+3*inc_x] ;
+
+  xi += inc_x*4 ;
+  yi += inc_y*4 ;
+
+  even_sum += p0+p2;
+  odd_sum += p1+p3;
+ }
+
+ /* leftover elements, one at a time, go into the first running sum */
+ for ( ; k < n; k++)
+ {
+  even_sum += y[yi] * x[xi] ;
+  xi += inc_x ;
+  yi += inc_y ;
+ }
+
+ result = even_sum + odd_sum;
+ return(result);
+
+}
+
+
#endif
#ifdef __64BIT__
-#define STACKSIZE 320
#define STACKSIZE 512
#define ALPHA_SP 296+192(SP)
#define FZERO 304+192(SP)
#else
-#define STACKSIZE 240
#define STACKSIZE 440
#define ALPHA_SP 224+200(SP)
#define FZERO 232+200(SP)
#endif
)
{
- BLASLONG N = n;
BLASLONG i1;
#if defined(TRMMKERNEL)
BLASLONG off;
off = -offset;
#endif
v4sf_t valpha = { alpha, alpha };
- N = n >> 2;
- for (i1 = 0; i1 < N; i1++)
+ for (i1 = 0; i1 < (n >> 3); i1++)
{
- BLASLONG i, j, temp;
+ BLASLONG j, temp;
FLOAT *CO;
FLOAT *AO;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
CO = C;
- C += ldc << 2;
+ C += ldc << 3;
AO = A;
PREFETCH1 (A, 128);
PREFETCH1 (A, 256);
- i = m >> 4;
- for (j = 0; j < i; j++)
+ for (j = 0; j < (m >> 3); j++)
{
- FLOAT *BO;
+ FLOAT *BO;
#if defined(TRMMKERNEL)
- REFRESH_POINTERS (16, 4);
+ REFRESH_POINTERS (8, 8);
#else
BO = B;
temp = k;
#endif
v4sf_t *rowC;
v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
BLASLONG l = 0;
- PREFETCH1 (CO, 0);
- PREFETCH1 (CO + ldc, 0);
- PREFETCH1 (CO + ldc + ldc, 0);
- PREFETCH1 (CO + ldc + ldc + ldc, 0);
- PREFETCH1 (CO, 128);
- PREFETCH1 (CO + ldc, 128);
- PREFETCH1 (CO + ldc + ldc, 128);
- PREFETCH1 (CO + ldc + ldc + ldc, 128);
- __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
vec_t *rowA = (vec_t *) & AO[0];
- __vector_pair rowB;
vec_t *rb = (vec_t *) & BO[0];
+ __vector_pair rowB, rowB1;
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
- __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
- __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
- __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
- __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
- __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
- __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
- __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
+ __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
+ __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
+ __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]);
+ __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
+ __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
+ __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
for (l = 1; l < temp; l++)
{
- rowA = (vec_t *) & AO[l << 4];
- rb = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 3];
+ rb = (vec_t *) & BO[l << 3];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
- __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
- __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
- __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
- __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
- __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
- __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
- __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
+ __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
+ __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
}
SAVE_ACC (&acc0, 0);
- SAVE_ACC (&acc2, 4);
- SAVE_ACC (&acc1, 2);
- SAVE_ACC (&acc3, 6);
- SAVE_ACC (&acc4, 8);
- SAVE_ACC (&acc6, 12);
- SAVE_ACC (&acc5, 10);
- SAVE_ACC (&acc7, 14);
- AO += temp << 4;
- BO += temp << 2;
+ SAVE_ACC1 (&acc1, 0);
+ SAVE_ACC (&acc2, 2);
+ SAVE_ACC1 (&acc3, 2);
+ SAVE_ACC (&acc4, 4);
+ SAVE_ACC1 (&acc5, 4);
+ SAVE_ACC (&acc6, 6);
+ SAVE_ACC1 (&acc7, 6);
+ CO += 8;
+ AO += temp << 3;
+ BO += temp << 3;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (8, 8)
+#endif
+ }
+ if (m & 4)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (4, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1, acc2, acc3;
+ BLASLONG l = 0;
+ vec_t *rowA = (vec_t *) & AO[0];
+ __vector_pair rowB, rowB1;
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
+ __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
+ for (l = 1; l < temp; l++)
+ {
+ rowA = (vec_t *) & AO[l << 2];
+ rb = (vec_t *) & BO[l << 3];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
+ __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC1 (&acc1, 0);
+ SAVE_ACC (&acc2, 2);
+ SAVE_ACC1 (&acc3, 2);
+ CO += 4;
+ AO += temp << 2;
+ BO += temp << 3;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (4, 8)
+#endif
+ }
+ if (m & 2)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (2, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ v4sf_t *rowC;
+ v4sf_t result[4];
+ __vector_quad acc0, acc1;
+ BLASLONG l = 0;
+ vec_t *rowA = (vec_t *) & AO[0];
+ __vector_pair rowB, rowB1;
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
+ for (l = 1; l < temp; l++)
+ {
+ rowA = (vec_t *) & AO[l << 1];
+ rb = (vec_t *) & BO[l << 3];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
+ }
+ SAVE_ACC (&acc0, 0);
+ SAVE_ACC1 (&acc1, 0);
+ CO += 2;
+ AO += temp << 1;
+ BO += temp << 3;
#if defined(TRMMKERNEL)
- REFRESH_AFTER_SAVE (16, 4)
+ REFRESH_AFTER_SAVE (2, 8)
#endif
- CO += 16;
}
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 1)
+ {
+ FLOAT *BO;
+#if defined(TRMMKERNEL)
+ REFRESH_POINTERS (1, 8);
+#else
+ BO = B;
+ temp = k;
+#endif
+ BLASLONG l = 0;
+ v4sf_t t = { 0, 0 };
+ v4sf_t t1 = { 0, 0 };
+ v4sf_t t2 = { 0, 0 };
+ v4sf_t t3 = { 0, 0 };
+ for (l = 0; l < temp; l++)
+ {
+ v4sf_t rowA = { AO[l], AO[l] };
+ v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] };
+ v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] };
+ v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] };
+ v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] };
+ t += rowA * rowB;
+ t1 += rowA * rowB1;
+ t2 += rowA * rowB2;
+ t3 += rowA * rowB3;
+ }
+ t = t * valpha;
+ t1 = t1 * valpha;
+ t2 = t2 * valpha;
+ t3 = t3 * valpha;
+#if defined(TRMMKERNEL)
+ CO[0 * ldc] = t[0];
+ CO[1 * ldc] = t[1];
+ CO[2 * ldc] = t1[0];
+ CO[3 * ldc] = t1[1];
+ CO[4 * ldc] = t2[0];
+ CO[5 * ldc] = t2[1];
+ CO[6 * ldc] = t3[0];
+ CO[7 * ldc] = t3[1];
+#else
+ CO[0 * ldc] += t[0];
+ CO[1 * ldc] += t[1];
+ CO[2 * ldc] += t1[0];
+ CO[3 * ldc] += t1[1];
+ CO[4 * ldc] += t2[0];
+ CO[5 * ldc] += t2[1];
+ CO[6 * ldc] += t3[0];
+ CO[7 * ldc] += t3[1];
+#endif
+ CO += 1;
+ AO += temp;
+ BO += temp << 3;
+#if defined(TRMMKERNEL)
+ REFRESH_AFTER_SAVE (1, 8)
+#endif
+ }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 8; // number of values in A
+#endif
+ B += k << 3;
+ }
+ if (n & 4)
+ {
+ BLASLONG j, temp;
+ FLOAT *CO;
+ FLOAT *AO;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+ CO = C;
+ C += ldc << 2;
+ AO = A;
+ PREFETCH1 (A, 128);
+ PREFETCH1 (A, 256);
+ for (j = 0; j < (m >> 3); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 4)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 4)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 4)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
#endif
B += k << 2;
}
- N = (n & 3) >> 1;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 2)
{
- BLASLONG i, j, temp;
+ BLASLONG j, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
CO = C;
C += ldc << 1;
AO = A;
- i = m >> 4;
- for (j = 0; j < i; j++)
- {
- FLOAT *BO;
-#if defined(TRMMKERNEL)
- REFRESH_POINTERS (16, 2);
-#else
- BO = B;
- temp = k;
-#endif
- v4sf_t *rowC;
- v4sf_t result[4];
- __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- BLASLONG l = 0;
- FLOAT t[4] = { 0, 0, 0, 0 };
- t[0] = BO[0], t[1] = BO[1];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- vec_t *rowA = (vec_t *) & AO[0];
- __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
- __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
- __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
- __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
- __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
- __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
- __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
- __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
- for (l = 1; l < temp; l++)
- {
- t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- rowA = (vec_t *) & AO[l << 4];
- __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
- __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
- __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
- __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
- __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
- __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
- __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
- __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
- }
- SAVE2x4_ACC (&acc0, 0);
- SAVE2x4_ACC (&acc1, 2);
- SAVE2x4_ACC (&acc2, 4);
- SAVE2x4_ACC (&acc3, 6);
- SAVE2x4_ACC (&acc4, 8);
- SAVE2x4_ACC (&acc5, 10);
- SAVE2x4_ACC (&acc6, 12);
- SAVE2x4_ACC (&acc7, 14);
- CO += 16;
- AO += temp << 4;
- BO += temp << 1;
-#if defined(TRMMKERNEL)
- REFRESH_AFTER_SAVE (16, 2)
-#endif
- }
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ for (j = 0; j < (m >> 3); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 2)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 2)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 2)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
#endif
B += k << 1;
}
- N = (n & 1) >> 0;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 1)
{
BLASLONG i, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
CO = C;
C += ldc;
AO = A;
- i = m;
- while (i >= 16)
- {
- FLOAT *BO;
-#if defined(TRMMKERNEL)
- REFRESH_POINTERS (16, 1)
-#else
- BO = B;
- temp = k;
-#endif
- BLASLONG l = 0;
- v4sf_t t = { 0, 0 };
- v4sf_t t1 = { 0, 0 };
- v4sf_t t2 = { 0, 0 };
- v4sf_t t3 = { 0, 0 };
- v4sf_t t4 = { 0, 0 };
- v4sf_t t5 = { 0, 0 };
- v4sf_t t6 = { 0, 0 };
- v4sf_t t7 = { 0, 0 };
- for (l = 0; l < temp; l++)
- {
- v4sf_t rowB = { BO[l], BO[l] };
- v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
- v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
- v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
- v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
- v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
- v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
- v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
- v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
- t += rowA * rowB;
- t1 += rowA1 * rowB;
- t2 += rowA2 * rowB;
- t3 += rowA3 * rowB;
- t4 += rowA4 * rowB;
- t5 += rowA5 * rowB;
- t6 += rowA6 * rowB;
- t7 += rowA7 * rowB;
- }
- t = t * valpha;
- t1 = t1 * valpha;
- t2 = t2 * valpha;
- t3 = t3 * valpha;
- t4 = t4 * valpha;
- t5 = t5 * valpha;
- t6 = t6 * valpha;
- t7 = t7 * valpha;
-#if defined(TRMMKERNEL)
- CO[0] = t[0];
- CO[1] = t[1];
- CO[2] = t1[0];
- CO[3] = t1[1];
- CO[4] = t2[0];
- CO[5] = t2[1];
- CO[6] = t3[0];
- CO[7] = t3[1];
- CO[8] = t4[0];
- CO[9] = t4[1];
- CO[10] = t5[0];
- CO[11] = t5[1];
- CO[12] = t6[0];
- CO[13] = t6[1];
- CO[14] = t7[0];
- CO[15] = t7[1];
-#else
- CO[0] += t[0];
- CO[1] += t[1];
- CO[2] += t1[0];
- CO[3] += t1[1];
- CO[4] += t2[0];
- CO[5] += t2[1];
- CO[6] += t3[0];
- CO[7] += t3[1];
- CO[8] += t4[0];
- CO[9] += t4[1];
- CO[10] += t5[0];
- CO[11] += t5[1];
- CO[12] += t6[0];
- CO[13] += t6[1];
- CO[14] += t7[0];
- CO[15] += t7[1];
-#endif
- AO += temp << 4;
- BO += temp;
- CO += 16;
- i -= 16;
-#if defined(TRMMKERNEL)
- REFRESH_AFTER_SAVE (16, 1)
-#endif
- }
- while (i >= 8)
+ for (i = 0; i < (m >> 3); i++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
AO += temp << 3;
BO += temp;
CO += 8;
- i -= 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 1)
#endif
}
- while (i >= 4)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
AO += temp << 2;
BO += temp;
CO += 4;
- i -= 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
}
- while (i >= 2)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
AO += temp << 1;
BO += temp;
CO += 2;
- i -= 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 1)
#endif
}
- while (i >= 1)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
CO[0] += t * alpha;
#endif
CO += 1;
- i -= 1;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 1)
#endif
--- /dev/null
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#include <altivec.h>
+#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ /*
+ * Packing routine: a is read as n streams of m contiguous elements whose
+ * starting addresses are lda elements apart, and b receives them interleaved.
+ * Streams are taken 8 at a time, then one group of 4, 2 and 1 for whatever
+ * remains of n; within a group of g streams, b gets the g values found at
+ * position 0, then the g values at position 1, and so on for all m positions.
+ * Always returns 0.
+ * NOTE(review): the __vector double accesses below presume that IFLOAT is
+ * double and that unaligned vector loads/stores are acceptable on the
+ * target -- confirm for every configuration that builds this file.
+ */
+ BLASLONG i, j;
+ /* aoffset: start of the next stream group; aoffset1..8: cursors of the current group */
+ IFLOAT *aoffset;
+ IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+ IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+
+ IFLOAT *boffset; /* write cursor into the packed buffer */
+ IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ IFLOAT ctemp09, ctemp17, ctemp33;
+ IFLOAT ctemp25, ctemp41;
+ IFLOAT ctemp49, ctemp57;
+
+ aoffset = a;
+ boffset = b;
+ /* ---- full groups of 8 streams ---- */
+ j = (n >> 3);
+ if (j > 0){
+ do{
+ aoffset1 = aoffset;
+ aoffset2 = aoffset1 + lda;
+ aoffset3 = aoffset2 + lda;
+ aoffset4 = aoffset3 + lda;
+ aoffset5 = aoffset4 + lda;
+ aoffset6 = aoffset5 + lda;
+ aoffset7 = aoffset6 + lda;
+ aoffset8 = aoffset7 + lda;
+ aoffset += 8 * lda;
+ /* 8x8 tiles: 8 positions from each of the 8 streams per pass */
+ i = (m >> 3);
+ if (i > 0){
+ do{
+ PREFETCHA (aoffset1, 384); /* cache-touch hint, offset 384 from each cursor (see PREFETCHA) */
+ PREFETCHA (aoffset2, 384);
+ PREFETCHA (aoffset3, 384);
+ PREFETCHA (aoffset4, 384);
+ PREFETCHA (aoffset5, 384);
+ PREFETCHA (aoffset6, 384);
+ PREFETCHA (aoffset7, 384);
+ PREFETCHA (aoffset8, 384);
+ __vector double va0 = *(__vector double*)(aoffset1 + 0); /* va0..va3: positions 0..7 of stream 1 */
+ __vector double va1 = *(__vector double*)(aoffset1 + 2);
+ __vector double va2 = *(__vector double*)(aoffset1 + 4);
+ __vector double va3 = *(__vector double*)(aoffset1 + 6);
+
+ __vector double va4 = *(__vector double*)(aoffset2 + 0); /* va4..va7: stream 2 */
+ __vector double va5 = *(__vector double*)(aoffset2 + 2);
+ __vector double va6 = *(__vector double*)(aoffset2 + 4);
+ __vector double va7 = *(__vector double*)(aoffset2 + 6);
+
+ __vector double va8 = *(__vector double*)(aoffset3 + 0); /* va8..va11: stream 3 */
+ __vector double va9 = *(__vector double*)(aoffset3 + 2);
+ __vector double va10 = *(__vector double*)(aoffset3 + 4);
+ __vector double va11 = *(__vector double*)(aoffset3 + 6);
+
+ __vector double va12 = *(__vector double*)(aoffset4 + 0); /* va12..va15: stream 4 */
+ __vector double va13 = *(__vector double*)(aoffset4 + 2);
+ __vector double va14 = *(__vector double*)(aoffset4 + 4);
+ __vector double va15 = *(__vector double*)(aoffset4 + 6);
+
+ __vector double va16 = *(__vector double*)(aoffset5 + 0); /* va16..va19: stream 5 */
+ __vector double va17 = *(__vector double*)(aoffset5 + 2);
+ __vector double va18 = *(__vector double*)(aoffset5 + 4);
+ __vector double va19 = *(__vector double*)(aoffset5 + 6);
+
+ __vector double va20 = *(__vector double*)(aoffset6 + 0); /* va20..va23: stream 6 */
+ __vector double va21 = *(__vector double*)(aoffset6 + 2);
+ __vector double va22 = *(__vector double*)(aoffset6 + 4);
+ __vector double va23 = *(__vector double*)(aoffset6 + 6);
+
+ __vector double va24 = *(__vector double*)(aoffset7 + 0); /* va24..va27: stream 7 */
+ __vector double va25 = *(__vector double*)(aoffset7 + 2);
+ __vector double va26 = *(__vector double*)(aoffset7 + 4);
+ __vector double va27 = *(__vector double*)(aoffset7 + 6);
+
+ __vector double va28 = *(__vector double*)(aoffset8 + 0); /* va28..va31: stream 8 */
+ __vector double va29 = *(__vector double*)(aoffset8 + 2);
+ __vector double va30 = *(__vector double*)(aoffset8 + 4);
+ __vector double va31 = *(__vector double*)(aoffset8 + 6);
+ /* Transpose while storing: selector 0 presumably pairs the first lanes of two streams and selector 3 the second lanes, so every 8 outputs hold one position of all 8 streams (the layout the scalar tail below produces). */
+ *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va4, 0);
+ *(__vector double*)(boffset + 2) = vec_xxpermdi(va8, va12, 0);
+ *(__vector double*)(boffset + 4) = vec_xxpermdi(va16, va20, 0);
+ *(__vector double*)(boffset + 6) = vec_xxpermdi(va24, va28, 0);
+ *(__vector double*)(boffset + 8) = vec_xxpermdi(va0, va4, 3);
+ *(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3);
+ *(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3);
+ *(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3);
+
+ *(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0);
+ *(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0);
+ *(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0);
+ *(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0);
+ *(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3);
+ *(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3);
+ *(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3);
+ *(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3);
+
+ *(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0);
+ *(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0);
+ *(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0);
+ *(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0);
+ *(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3);
+ *(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3);
+ *(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3);
+ *(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3);
+
+ *(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0);
+ *(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0);
+ *(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0);
+ *(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0);
+ *(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3);
+ *(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3);
+ *(__vector double*)(boffset + 60) = vec_xxpermdi(va19, va23, 3);
+ *(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3);
+ aoffset1 += 8;
+ aoffset2 += 8;
+ aoffset3 += 8;
+ aoffset4 += 8;
+ aoffset5 += 8;
+ aoffset6 += 8;
+ aoffset7 += 8;
+ aoffset8 += 8;
+ boffset += 64;
+ i --;
+ }while(i > 0);
+ }
+ /* leftover positions (m & 7): one position across the 8 streams per pass */
+ i = (m & 7);
+ if (i > 0){
+ do{
+ ctemp01 = *(aoffset1 + 0);
+ ctemp09 = *(aoffset2 + 0);
+ ctemp17 = *(aoffset3 + 0);
+ ctemp25 = *(aoffset4 + 0);
+ ctemp33 = *(aoffset5 + 0);
+ ctemp41 = *(aoffset6 + 0);
+ ctemp49 = *(aoffset7 + 0);
+ ctemp57 = *(aoffset8 + 0);
+
+ *(boffset + 0) = ctemp01;
+ *(boffset + 1) = ctemp09;
+ *(boffset + 2) = ctemp17;
+ *(boffset + 3) = ctemp25;
+ *(boffset + 4) = ctemp33;
+ *(boffset + 5) = ctemp41;
+ *(boffset + 6) = ctemp49;
+ *(boffset + 7) = ctemp57;
+
+ aoffset1 ++;
+ aoffset2 ++;
+ aoffset3 ++;
+ aoffset4 ++;
+ aoffset5 ++;
+ aoffset6 ++;
+ aoffset7 ++;
+ aoffset8 ++;
+
+ boffset += 8;
+ i --;
+ }while(i > 0);
+ }
+ j--;
+ }while(j > 0);
+ } /* end of if(j > 0) */
+ /* ---- one group of 4 streams, if bit 2 of n is set ---- */
+ if (n & 4){
+ aoffset1 = aoffset;
+ aoffset2 = aoffset1 + lda;
+ aoffset3 = aoffset2 + lda;
+ aoffset4 = aoffset3 + lda;
+ aoffset += 4 * lda;
+ /* 4x4 tiles */
+ i = (m >> 2);
+ if (i > 0){
+ do{
+ PREFETCHA (aoffset1, 384);
+ PREFETCHA (aoffset2, 384);
+ PREFETCHA (aoffset3, 384);
+ PREFETCHA (aoffset4, 384);
+ __vector double va0 = *(__vector double*)(aoffset1 + 0);
+ __vector double va1 = *(__vector double*)(aoffset1 + 2);
+ __vector double va2 = *(__vector double*)(aoffset2 + 0);
+ __vector double va3 = *(__vector double*)(aoffset2 + 2);
+ __vector double va4 = *(__vector double*)(aoffset3 + 0);
+ __vector double va5 = *(__vector double*)(aoffset3 + 2);
+ __vector double va6 = *(__vector double*)(aoffset4 + 0);
+ __vector double va7 = *(__vector double*)(aoffset4 + 2);
+ *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0);
+ *(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0);
+ *(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3);
+ *(__vector double*)(boffset + 6) = vec_xxpermdi(va4, va6, 3);
+ *(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0);
+ *(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0);
+ *(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3);
+ *(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3);
+
+ aoffset1 += 4;
+ aoffset2 += 4;
+ aoffset3 += 4;
+ aoffset4 += 4;
+ boffset += 16;
+ i --;
+ }while(i > 0);
+ }
+ /* leftover positions (m & 3), scalar */
+ i = (m & 3);
+ if (i > 0){
+ do{
+ ctemp01 = *(aoffset1 + 0);
+ ctemp02 = *(aoffset2 + 0);
+ ctemp03 = *(aoffset3 + 0);
+ ctemp04 = *(aoffset4 + 0);
+
+ *(boffset + 0) = ctemp01;
+ *(boffset + 1) = ctemp02;
+ *(boffset + 2) = ctemp03;
+ *(boffset + 3) = ctemp04;
+
+ aoffset1 ++;
+ aoffset2 ++;
+ aoffset3 ++;
+ aoffset4 ++;
+
+ boffset += 4;
+ i --;
+ }while(i > 0);
+ }
+ } /* end of if (n & 4) */
+ /* ---- one group of 2 streams, if bit 1 of n is set ---- */
+ if (n & 2){
+ aoffset1 = aoffset;
+ aoffset2 = aoffset1 + lda;
+ aoffset += 2 * lda;
+ /* 2x2 tiles */
+ i = (m >> 1);
+ if (i > 0){
+ do{
+ __vector double va0 = *(__vector double*)(aoffset1 + 0);
+ __vector double va1 = *(__vector double*)(aoffset2 + 0);
+ *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0);
+ *(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3);
+
+ aoffset1 += 2;
+ aoffset2 += 2;
+ boffset += 4;
+ i --;
+ }while(i > 0);
+ }
+ /* odd m: the final position of both streams */
+ if (m & 1){
+ ctemp01 = *(aoffset1 + 0);
+ ctemp02 = *(aoffset2 + 0);
+
+ *(boffset + 0) = ctemp01;
+ *(boffset + 1) = ctemp02;
+
+ aoffset1 ++;
+ aoffset2 ++;
+ boffset += 2;
+ }
+ } /* end of if (n & 2) */
+ /* ---- last single stream, if n is odd: plain element-by-element copy ---- */
+ if (n & 1){
+ aoffset1 = aoffset;
+
+ i = m;
+ if (i > 0){
+ do{
+ ctemp01 = *(aoffset1 + 0);
+
+ *(boffset + 0) = ctemp01;
+
+ aoffset1 ++;
+ boffset ++;
+ i --;
+ }while(i > 0);
+ }
+
+ } /* end of if (n & 1) */
+
+ return 0;
+}
#endif
#ifdef __64BIT__
-#define STACKSIZE 320
#define STACKSIZE 520
#define ALPHA_SP 296+200(SP)
#define FZERO 304+200(SP)
#endif
#ifdef __64BIT__
-#define STACKSIZE 320
#define STACKSIZE 520
#define ALPHA 296+200(SP)
#define FZERO 304+200(SP)
--- /dev/null
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+
+static void saxpy_kernel_64(long n, float *x, float *y, float alpha)
+{ /*
+ * Inline-assembly SAXPY micro-kernel: y[i] += alpha * x[i] for i in [0, n).
+ * Data moves through paired vector loads/stores (lxvp/stxvp, 32 bytes per
+ * pair); one pass covers 64 floats (256 bytes) of x and of y.
+ * The first block is loaded unconditionally and n is stepped down by 64, so
+ * n is expected to be a positive multiple of 64.
+ * The asm declares cr0 and vs32-vs63 as clobbered and updates its copies of
+ * n, x and y in place.
+ * NOTE(review): the memory operands name only *x and *y (a single element
+ * each) -- confirm this is sufficient for the compilers in use.
+ */
+ __vector float t0 = {alpha, alpha,alpha, alpha}; /* alpha in all four lanes */
+
+ __asm__
+ (
+ /* cache-touch hints for the start of x and y */
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+ /* preload the first 64 floats of x into vs32-35, vs40-43 and vs48-55 */
+ "lxvp 32, 0(%2) \n\t"
+ "lxvp 34, 32(%2) \n\t"
+ "lxvp 40, 64(%2) \n\t"
+ "lxvp 42, 96(%2) \n\t"
+ "lxvp 48, 128(%2) \n\t"
+ "lxvp 50, 160(%2) \n\t"
+ "lxvp 52, 192(%2) \n\t"
+ "lxvp 54, 224(%2) \n\t"
+ /* and the matching 64 floats of y into vs36-39, vs44-47 and vs56-63 */
+ "lxvp 36, 0(%3) \n\t"
+ "lxvp 38, 32(%3) \n\t"
+ "lxvp 44, 64(%3) \n\t"
+ "lxvp 46, 96(%3) \n\t"
+ "lxvp 56, 128(%3) \n\t"
+ "lxvp 58, 160(%3) \n\t"
+ "lxvp 60, 192(%3) \n\t"
+ "lxvp 62, 224(%3) \n\t"
+
+ "addi %2, %2, 256 \n\t" /* x moves on to the next block; y stays, results are stored at 0(%3) */
+
+ "addic. %1, %1, -64 \n\t" /* n -= 64, recording the result in cr0 */
+ "ble two%= \n\t" /* nothing beyond the preloaded block: skip the loop */
+
+ ".align 5 \n"
+ "one%=: \n\t"
+ /* steady state: FMA the loaded block, store it, and reload x and y for the next block; y reloads use +256.. offsets because %3 is only advanced at the bottom of the loop */
+ "xvmaddasp 36, 32, %x4 \n\t"
+ "xvmaddasp 37, 33, %x4 \n\t"
+
+ "lxvp 32, 0(%2) \n\t"
+ "stxvp 36, 0(%3) \n\t"
+
+ "xvmaddasp 38, 34, %x4 \n\t"
+ "xvmaddasp 39, 35, %x4 \n\t"
+
+ "lxvp 34, 32(%2) \n\t"
+ "stxvp 38, 32(%3) \n\t"
+
+ "lxvp 36, 256(%3) \n\t"
+ "lxvp 38, 288(%3) \n\t"
+
+ "xvmaddasp 44, 40, %x4 \n\t"
+ "xvmaddasp 45, 41, %x4 \n\t"
+
+ "lxvp 40, 64(%2) \n\t"
+ "stxvp 44, 64(%3) \n\t"
+
+ "xvmaddasp 46, 42, %x4 \n\t"
+ "xvmaddasp 47, 43, %x4 \n\t"
+
+ "lxvp 42, 96(%2) \n\t"
+ "stxvp 46, 96(%3) \n\t"
+
+ "lxvp 44, 320(%3) \n\t"
+ "lxvp 46, 352(%3) \n\t"
+
+ "xvmaddasp 56, 48, %x4 \n\t"
+ "xvmaddasp 57, 49, %x4 \n\t"
+
+ "lxvp 48, 128(%2) \n\t"
+ "stxvp 56, 128(%3) \n\t"
+
+ "xvmaddasp 58, 50, %x4 \n\t"
+ "xvmaddasp 59, 51, %x4 \n\t"
+
+ "lxvp 50, 160(%2) \n\t"
+ "stxvp 58, 160(%3) \n\t"
+
+ "lxvp 56, 384(%3) \n\t"
+ "lxvp 58, 416(%3) \n\t"
+
+ "xvmaddasp 60, 52, %x4 \n\t"
+ "xvmaddasp 61, 53, %x4 \n\t"
+
+ "lxvp 52, 192(%2) \n\t"
+ "stxvp 60, 192(%3) \n\t"
+
+ "xvmaddasp 62, 54, %x4 \n\t"
+ "xvmaddasp 63, 55, %x4 \n\t"
+
+ "lxvp 54, 224(%2) \n\t"
+ "stxvp 62, 224(%3) \n\t"
+
+ "lxvp 60, 448(%3) \n\t"
+ "lxvp 62, 480(%3) \n\t"
+
+ "addi %2, %2, 256 \n\t"
+ "addi %3, %3, 256 \n\t"
+
+ "addic. %1, %1, -64 \n\t"
+ "bgt one%= \n" /* loop while elements remain after the block now in registers */
+
+ "two%=: \n\t"
+ /* tail: finish the block already in registers, no further loads */
+ "xvmaddasp 36, 32, %x4 \n\t"
+ "xvmaddasp 37, 33, %x4 \n\t"
+ "xvmaddasp 38, 34, %x4 \n\t"
+ "xvmaddasp 39, 35, %x4 \n\t"
+
+ "xvmaddasp 44, 40, %x4 \n\t"
+ "xvmaddasp 45, 41, %x4 \n\t"
+ "xvmaddasp 46, 42, %x4 \n\t"
+ "xvmaddasp 47, 43, %x4 \n\t"
+
+ "xvmaddasp 56, 48, %x4 \n\t"
+ "xvmaddasp 57, 49, %x4 \n\t"
+ "xvmaddasp 58, 50, %x4 \n\t"
+ "xvmaddasp 59, 51, %x4 \n\t"
+
+ "xvmaddasp 60, 52, %x4 \n\t"
+ "xvmaddasp 61, 53, %x4 \n\t"
+ "xvmaddasp 62, 54, %x4 \n\t"
+ "xvmaddasp 63, 55, %x4 \n\t"
+ "stxvp 36, 0(%3) \n\t"
+ "stxvp 38, 32(%3) \n\t"
+ "stxvp 44, 64(%3) \n\t"
+ "stxvp 46, 96(%3) \n\t"
+ "stxvp 56, 128(%3) \n\t"
+ "stxvp 58, 160(%3) \n\t"
+ "stxvp 60, 192(%3) \n\t"
+ "stxvp 62, 224(%3) \n\t"
+
+ "#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n" /* shows the operand mapping as a comment in the generated assembly */
+ :
+ "+m" (*y), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "wa" (t0), // 4
+ "m" (*x) // 5
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
+ "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
+ );
+
+}
+
+
--- /dev/null
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "saxpy_microk_power10.c"
+#endif
+
+#ifndef HAVE_KERNEL_8
+/*
+ * Portable stand-in used when no assembly micro-kernel is available:
+ * y[k] += alpha * x[k].  Elements are processed in groups of eight, so the
+ * count is effectively rounded up to the next multiple of 8.
+ */
+static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
+{
+    BLASLONG base, lane;
+
+    for (base = 0; base < n; base += 8)
+    {
+        /* lanes are updated in ascending order, one full group per pass */
+        for (lane = 0; lane < 8; lane++)
+            y[base + lane] += alpha * x[base + lane];
+    }
+}
+#endif
+
+/*
+ * SAXPY driver: y := da * x + y over n elements with strides inc_x / inc_y.
+ * The dummy parameters are not used.  Always returns 0.
+ *
+ * Unit strides: whole 64-element blocks are handed to saxpy_kernel_64 and the
+ * remainder is done element by element.  Any other stride: four elements per
+ * pass, followed by a scalar clean-up loop.
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    BLASLONG done, bulk, lane;
+    BLASLONG xpos = 0, ypos = 0;
+    FLOAT scaled[4];
+
+    if (n <= 0)
+        return 0;
+
+    if (inc_x == 1 && inc_y == 1)
+    {
+        /* contiguous vectors: largest multiple of 64 goes to the kernel */
+        bulk = n & -64;
+        if (bulk != 0)
+            saxpy_kernel_64(bulk, x, y, da);
+
+        for (done = bulk; done < n; done++)
+            y[done] += da * x[done];
+
+        return 0;
+    }
+
+    /* strided vectors: all four products are formed before any store to y */
+    bulk = n & -4;
+    for (done = 0; done < bulk; done += 4)
+    {
+        for (lane = 0; lane < 4; lane++)
+            scaled[lane] = da * x[xpos + lane * inc_x];
+
+        for (lane = 0; lane < 4; lane++)
+            y[ypos + lane * inc_y] += scaled[lane];
+
+        xpos += inc_x * 4;
+        ypos += inc_y * 4;
+    }
+
+    /* at most three elements are left */
+    for (; done < n; done++)
+    {
+        y[ypos] += da * x[xpos];
+        xpos += inc_x;
+        ypos += inc_y;
+    }
+
+    return 0;
+}
+
+
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "copy_microk_power10.c"
+#endif
+
+#ifndef HAVE_KERNEL
+
+/*
+ * Portable stand-in used when no assembly micro-kernel is available: copies
+ * x to y eight elements at a time (the count is effectively rounded up to a
+ * multiple of 8).  Each group is read completely before it is written.
+ */
+static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
+{
+    FLOAT group[8];
+    BLASLONG base;
+    int lane;
+
+    for (base = 0; base < n; base += 8)
+    {
+        for (lane = 0; lane < 8; lane++)
+            group[lane] = x[base + lane];
+
+        for (lane = 0; lane < 8; lane++)
+            y[base + lane] = group[lane];
+    }
+}
+
+
+#endif
+
+
+
+/*
+ * COPY driver: y := x over n elements with strides inc_x / inc_y.
+ * Always returns 0.
+ *
+ * Unit strides: whole 128-element blocks are handed to copy_kernel and the
+ * remainder is copied element by element.  Any other stride is handled by a
+ * plain scalar loop.
+ */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG done = 0;
+    BLASLONG xpos = 0, ypos = 0;
+    BLASLONG bulk;
+
+    if (n <= 0)
+        return 0;
+
+    if (inc_x != 1 || inc_y != 1)
+    {
+        /* strided vectors */
+        for (; done < n; done++)
+        {
+            y[ypos] = x[xpos];
+            xpos += inc_x;
+            ypos += inc_y;
+        }
+        return 0;
+    }
+
+    /* contiguous vectors */
+    bulk = n & -128;
+    if (bulk > 0)
+    {
+        copy_kernel (bulk, x, y);
+        done = bulk;
+    }
+
+    for (; done < n; done++)
+        y[done] = x[done];
+
+    return 0;
+}
+
+
--- /dev/null
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_16 1
+
+static float sdot_kernel_16 (long n, float *x, float *y)
+{ /*
+ * Inline-assembly dot-product micro-kernel: returns the sum of x[i] * y[i]
+ * for i in [0, n), accumulated in single precision.
+ * Despite the _16 in the name, one pass consumes 32 floats (128 bytes) of
+ * each input via paired vector loads (lxvp).  The first block is loaded
+ * unconditionally and n is stepped down by 32, so n is expected to be a
+ * positive multiple of 32.
+ * The asm declares cr0 and vs32-vs55 as clobbered and updates its copies of
+ * n, x and y in place.
+ */
+ float dot;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t" /* cache-touch hints for the start of x and y */
+ "dcbt 0, %3 \n\t"
+ /* vs32-vs39: eight vector accumulators, cleared by xor-ing each with itself */
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+ /* preload 32 floats of x into vs40-47 and 32 floats of y into vs48-55 */
+ "lxvp 40, 0(%2) \n\t"
+ "lxvp 42, 32(%2) \n\t"
+ "lxvp 44, 64(%2) \n\t"
+ "lxvp 46, 96(%2) \n\t"
+ "lxvp 48, 0(%3) \n\t"
+ "lxvp 50, 32(%3) \n\t"
+ "lxvp 52, 64(%3) \n\t"
+ "lxvp 54, 96(%3) \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t" /* n -= 32, recording the result in cr0 */
+ "ble two%= \n\t" /* nothing beyond the preloaded block: skip the loop */
+
+ ".align 5 \n"
+ "one%=: \n\t"
+ /* steady state: accumulate the loaded block while reloading the next one */
+ "xvmaddasp 32, 40, 48 \n\t"
+ "xvmaddasp 33, 41, 49 \n\t"
+ "lxvp 40, 0(%2) \n\t"
+ "lxvp 48, 0(%3) \n\t"
+ "xvmaddasp 34, 42, 50 \n\t"
+ "xvmaddasp 35, 43, 51 \n\t"
+ "lxvp 42, 32(%2) \n\t"
+ "lxvp 50, 32(%3) \n\t"
+ "xvmaddasp 36, 44, 52 \n\t"
+ "xvmaddasp 37, 45, 53 \n\t"
+ "lxvp 44, 64(%2) \n\t"
+ "lxvp 52, 64(%3) \n\t"
+ "xvmaddasp 38, 46, 54 \n\t"
+ "xvmaddasp 39, 47, 55 \n\t"
+ "lxvp 46, 96(%2) \n\t"
+ "lxvp 54, 96(%3) \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt one%= \n" /* loop while elements remain after the block now in registers */
+
+ "two%=: \n\t"
+ /* tail: accumulate the block already in registers, no further loads */
+ "xvmaddasp 32, 40, 48 \n\t"
+ "xvmaddasp 33, 41, 49 \n\t"
+ "xvmaddasp 34, 42, 50 \n\t"
+ "xvmaddasp 35, 43, 51 \n\t"
+ "xvmaddasp 36, 44, 52 \n\t"
+ "xvmaddasp 37, 45, 53 \n\t"
+ "xvmaddasp 38, 46, 54 \n\t"
+ "xvmaddasp 39, 47, 55 \n\t"
+ /* pairwise reduction of the eight accumulators into vs32 */
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+ /* horizontal sum of the four lanes: add a copy shifted by two words, then one shifted by one word */
+ "xxsldwi 33, 32, 32, 2 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xxsldwi 33, 32, 32, 1 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xscvspdp %x0, 32 \n" /* single-to-double scalar convert of vs32 into the register bound to dot */
+
+ "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" /* shows the operand mapping as a comment in the generated assembly */
+ :
+ "=f" (dot), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x), // 4
+ "m" (*y) // 5
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55"
+ );
+
+ return dot;
+}
--- /dev/null
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined(__VEC__) || defined(__ALTIVEC__)
+#include "sdot_microk_power10.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+/*
+ * Portable stand-in used when no assembly micro-kernel is available: returns
+ * the sum of x[k] * y[k], taking eight elements per pass (the count is
+ * effectively rounded up to a multiple of 8).  The eight products of a pass
+ * are summed in one expression and then added to the running total.
+ */
+static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+    FLOAT acc = 0.0;
+    BLASLONG base;
+
+    for (base = 0; base < n; base += 8)
+    {
+        FLOAT *xp = x + base;
+        FLOAT *yp = y + base;
+
+        acc += yp[0] * xp[0]
+             + yp[1] * xp[1]
+             + yp[2] * xp[2]
+             + yp[3] * xp[3]
+             + yp[4] * xp[4]
+             + yp[5] * xp[5]
+             + yp[6] * xp[6]
+             + yp[7] * xp[7] ;
+    }
+
+    return acc;
+}
+
+#endif
+
+/*
+ * Dot-product driver: returns the sum of x[k] * y[k] over n elements with
+ * strides inc_x / inc_y; 0 when n <= 0.  With DSDOT defined the result type
+ * is double and the products are formed in double precision.
+ *
+ * Unit strides: the largest multiple of 32 elements goes to sdot_kernel_16
+ * (for DSDOT in separate 32-element calls whose float results are summed in
+ * double); the remainder is accumulated element by element.  Any other
+ * stride: two elements per pass plus a scalar clean-up loop.
+ */
+#if defined (DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+    BLASLONG pos = 0;
+    BLASLONG xi = 0, yi = 0;
+    BLASLONG bulk;
+    double total = 0.0;
+#if defined (DSDOT)
+    double kernel_sum = 0.0;
+    FLOAT chunk_sum = 0.0;
+#else
+    FLOAT kernel_sum = 0.0;
+#endif
+
+    if (n <= 0)
+        return total;
+
+    if (inc_x == 1 && inc_y == 1)
+    {
+        bulk = n & (BLASLONG)(-32);
+
+        if (bulk)
+        {
+#if defined(DSDOT)
+            /* one kernel call per 32 elements, partial results added in double */
+            for (pos = 0; pos < bulk; pos += 32)
+            {
+                chunk_sum = sdot_kernel_16((BLASLONG)32, x + pos, y + pos);
+                kernel_sum += (double)chunk_sum;
+            }
+#else
+            kernel_sum = sdot_kernel_16(bulk, x, y);
+#endif
+        }
+
+        /* elements past the last full block */
+        for (pos = bulk; pos < n; pos++)
+        {
+#if defined(DSDOT)
+            total += (double)y[pos] * (double)x[pos] ;
+#else
+            total += y[pos] * x[pos] ;
+#endif
+        }
+
+        total += kernel_sum;
+        return total;
+    }
+
+    /* strided vectors: pairs first, then a possible odd element */
+    bulk = n & (BLASLONG)(-2);
+
+    for (pos = 0; pos < bulk; pos += 2)
+    {
+#if defined (DSDOT)
+        total += (double)y[yi] * (double)x[xi] + (double)y[yi+inc_y] * (double)x[xi+inc_x];
+#else
+        total += y[yi] * x[xi] + y[yi+inc_y] * x[xi+inc_x];
+#endif
+        xi += inc_x*2 ;
+        yi += inc_y*2 ;
+    }
+
+    for (; pos < n; pos++)
+    {
+#if defined (DSDOT)
+        total += (double)y[yi] * (double)x[xi] ;
+#else
+        total += y[yi] * x[xi] ;
+#endif
+        xi += inc_x ;
+        yi += inc_y ;
+    }
+
+    return total;
+}
+
+
#endif
)
{
- BLASLONG N = n;
BLASLONG i1;
#if defined(TRMMKERNEL)
BLASLONG off;
#endif
v4sf_t valpha = { alpha, alpha, alpha, alpha };
- N = n >> 3;
- for (i1 = 0; i1 < N; i1++)
+ for (i1 = 0; i1 < (n >> 3); i1++)
{
- BLASLONG i, j, temp;
+ BLASLONG j, temp;
FLOAT *CO;
FLOAT *AO;
#if defined(TRMMKERNEL) && defined(LEFT)
AO = A;
PREFETCH1 (A, 128);
PREFETCH1 (A, 256);
- i = m >> 4;
- for (j = 0; j < i; j++)
+ for (j = 0; j < (m >> 4); j++)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
#endif
CO += 16;
}
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 8)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 8)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 8)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 8)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
B += k << 3;
}
- N = (n & 7) >> 2;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 4)
{
BLASLONG i, j, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
REFRESH_AFTER_SAVE (16, 4)
#endif
}
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 8)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 4)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 4)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 4)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
B += k << 2;
}
- N = (n & 3) >> 1;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 2)
{
BLASLONG i, j, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
REFRESH_AFTER_SAVE (16, 2)
#endif
}
- i = (m & 15) >> 3;
- for (j = 0; j < i; j++)
+ if (m & 8)
{
FLOAT *BO;
v4sf_t *rowC;
REFRESH_AFTER_SAVE (8, 2)
#endif
}
- i = (m & 7) >> 2;
- for (j = 0; j < i; j++)
+ if (m & 4)
{
FLOAT *BO;
v4sf_t *rowC;
REFRESH_AFTER_SAVE (4, 2)
#endif
}
- i = (m & 3) >> 1;
- for (j = 0; j < i; j++)
+ if (m & 2)
{
FLOAT *BO;
BLASLONG l = 0;
REFRESH_AFTER_SAVE (2, 2)
#endif
}
- i = (m & 1) >> 0;
- for (j = 0; j < i; j++)
+ if (m & 1)
{
FLOAT *BO;
BLASLONG l = 0;
B += k << 1;
}
- N = (n & 1) >> 0;
- for (i1 = 0; i1 < N; i1++)
+ if (n & 1)
{
BLASLONG i, temp;
#if defined(TRMMKERNEL) && defined(LEFT)
CO = C;
C += ldc;
AO = A;
- i = m;
- while (i >= 16)
+ for (i = 0; i < (m >> 4); i++)
{
FLOAT *BO;
BLASLONG l = 0;
AO += temp << 4;
BO += temp;
CO += 16;
- i -= 16;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (16, 1)
#endif
}
- while (i >= 8)
+ if (m & 8)
{
FLOAT *BO;
BLASLONG l = 0;
AO += temp << 3;
BO += temp;
CO += 8;
- i -= 8;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (8, 1)
#endif
}
- while (i >= 4)
+ if (m & 4)
{
FLOAT *BO;
BLASLONG l = 0;
AO += temp << 2;
BO += temp;
CO += 4;
- i -= 4;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (4, 1)
#endif
}
- while (i >= 2)
+ if (m & 2)
{
FLOAT *BO;
BLASLONG l = 0;
AO += temp << 1;
BO += temp;
CO += 2;
- i -= 2;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (2, 1)
#endif
}
- while (i >= 1)
+ if (m & 1)
{
FLOAT *BO;
#if defined(TRMMKERNEL)
CO[0] += t * alpha;
#endif
CO += 1;
- i -= 1;
#if defined(TRMMKERNEL)
REFRESH_AFTER_SAVE (1, 1)
#endif
#endif
#ifdef __64BIT__
-#define STACKSIZE 340
#define STACKSIZE 540
#define ALPHA_SP 296+200(SP)
#define FZERO 304+200(SP)
+++ /dev/null
-/***************************************************************************
-Copyright (c) 2020, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#define HAVE_KERNEL_32 1
-
-static void zcopy_kernel_32 (long n, double *x, double *y)
-{
- __asm__
- (
- "lxvp 32, 0(%2) \n\t"
- "lxvp 34, 32(%2) \n\t"
- "lxvp 36, 64(%2) \n\t"
- "lxvp 38, 96(%2) \n\t"
- "lxvp 40, 128(%2) \n\t"
- "lxvp 42, 160(%2) \n\t"
- "lxvp 44, 192(%2) \n\t"
- "lxvp 46, 224(%2) \n\t"
-
- "lxvp 48, 256(%2) \n\t"
- "lxvp 50, 288(%2) \n\t"
- "lxvp 52, 320(%2) \n\t"
- "lxvp 54, 352(%2) \n\t"
- "lxvp 56, 384(%2) \n\t"
- "lxvp 58, 416(%2) \n\t"
- "lxvp 60, 448(%2) \n\t"
- "lxvp 62, 480(%2) \n\t"
- "addi %2, %2, 512 \n\t"
-
- "addic. %1, %1, -32 \n\t"
- "ble two%= \n\t"
-
- ".align 5 \n"
- "one%=: \n\t"
-
- "stxvp 32, 0(%3) \n\t"
- "lxvp 32, 0(%2) \n\t"
- "stxvp 34, 32(%3) \n\t"
- "lxvp 34, 32(%2) \n\t"
- "stxvp 36, 64(%3) \n\t"
- "lxvp 36, 64(%2) \n\t"
- "stxvp 38, 96(%3) \n\t"
- "lxvp 38, 96(%2) \n\t"
-
- "stxvp 40, 128(%3) \n\t"
- "lxvp 40, 128(%2) \n\t"
- "stxvp 42, 160(%3) \n\t"
- "lxvp 42, 160(%2) \n\t"
- "stxvp 44, 192(%3) \n\t"
- "lxvp 44, 192(%2) \n\t"
- "stxvp 46, 224(%3) \n\t"
- "lxvp 46, 224(%2) \n\t"
-
- "stxvp 48, 256(%3) \n\t"
- "lxvp 48, 256(%2) \n\t"
- "stxvp 50, 288(%3) \n\t"
- "lxvp 50, 288(%2) \n\t"
- "stxvp 52, 320(%3) \n\t"
- "lxvp 52, 320(%2) \n\t"
- "stxvp 54, 352(%3) \n\t"
- "lxvp 54, 352(%2) \n\t"
- "stxvp 56, 384(%3) \n\t"
- "lxvp 56, 384(%2) \n\t"
- "stxvp 58, 416(%3) \n\t"
- "lxvp 58, 416(%2) \n\t"
- "stxvp 60, 448(%3) \n\t"
- "lxvp 60, 448(%2) \n\t"
- "stxvp 62, 480(%3) \n\t"
- "lxvp 62, 480(%2) \n\t"
-
- "addi %3, %3, 512 \n\t"
- "addi %2, %2, 512 \n\t"
-
- "addic. %1, %1, -32 \n\t"
- "bgt one%= \n"
-
- "two%=: \n\t"
-
- "stxvp 32, 0(%3) \n\t"
- "stxvp 34, 32(%3) \n\t"
- "stxvp 36, 64(%3) \n\t"
- "stxvp 38, 96(%3) \n\t"
- "stxvp 40, 128(%3) \n\t"
- "stxvp 42, 160(%3) \n\t"
- "stxvp 44, 192(%3) \n\t"
- "stxvp 46, 224(%3) \n\t"
- "stxvp 48, 256(%3) \n\t"
- "stxvp 50, 288(%3) \n\t"
- "stxvp 52, 320(%3) \n\t"
- "stxvp 54, 352(%3) \n\t"
- "stxvp 56, 384(%3) \n\t"
- "stxvp 58, 416(%3) \n\t"
- "stxvp 60, 448(%3) \n\t"
- "stxvp 62, 480(%3) \n\t"
-
- "#n=%1 x=%4=%2 y=%0=%3"
- :
- "=m" (*y),
- "+r" (n), // 1
- "+b" (x), // 2
- "+b" (y) // 3
- :
- "m" (*x)
- :
- "cr0",
- "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
- "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
- "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
- "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
- );
-}
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
-#include "zcopy_microk_power10.c"
+#include "copy_microk_power10.c"
#endif
-#ifndef HAVE_KERNEL_32
+#ifndef HAVE_KERNEL
-static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+static void copy_kernel(BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG i=0;
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
- zcopy_kernel_32(n1, x, y);
+ copy_kernel(n1, x, y);
i=n1;
ix=n1*2;
iy=n1*2;
#endif
-static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
+static __attribute__((always_inline)) inline void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
BLASLONG i;
for (i = 0; i < n; i++) {
*dest = *src;
while(i < n)
{
- if( x[ix] < minf )
+ if( x[ix] > minf )
{
min = i;
minf = x[ix];
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS,
dsdot_kTS,
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
- sgemv_nTS, sgemv_tTS, sger_kTS,
+ sbgemv_nTS, sbgemv_tTS, sger_kTS,
ssymv_LTS, ssymv_UTS,
sbgemm_kernelTS, sbgemm_betaTS,
TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q;
#endif
-#if (CORE_KATMAI) || (CORE_COPPERMINE) || (CORE_BANIAS) || (CORE_YONAH) || (CORE_ATHLON)
+#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
#ifdef DEBUG
fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n");
* Data Type
***************************/
typedef __m256 v_f32;
+typedef __m256d v_f64;
#define v_nlanes_f32 8
+#define v_nlanes_f64 4
/***************************
* Arithmetic
***************************/
#define v_add_f32 _mm256_add_ps
+#define v_add_f64 _mm256_add_pd
#define v_mul_f32 _mm256_mul_ps
+#define v_mul_f64 _mm256_mul_pd
#ifdef HAVE_FMA3
// multiply and add, a*b + c
#define v_muladd_f32 _mm256_fmadd_ps
+ #define v_muladd_f64 _mm256_fmadd_pd
#else
// multiply and add, a*b + c
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{ return v_add_f32(v_mul_f32(a, b), c); }
+ BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c)
+ { return v_add_f64(v_mul_f64(a, b), c); }
#endif // !HAVE_FMA3
// Horizontal add: Calculates the sum of all vector elements.
return _mm_cvtss_f32(sum);
}
+// Horizontal add (f64): returns a[0] + a[1] + a[2] + a[3].
+BLAS_FINLINE double v_sum_f64(__m256d a)
+{
+    // Adjacent-pair sums per 128-bit half: {a0+a1, a0+a1, a2+a3, a2+a3}
+    const __m256d pairwise = _mm256_hadd_pd(a, a);
+    // Fold the upper 128-bit half onto the lower one, then read lane 0.
+    return _mm_cvtsd_f64(_mm_add_pd(_mm256_castpd256_pd128(pairwise),
+                                    _mm256_extractf128_pd(pairwise, 1)));
+}
/***************************
* memory
***************************/
// unaligned load
#define v_loadu_f32 _mm256_loadu_ps
+#define v_loadu_f64 _mm256_loadu_pd
#define v_storeu_f32 _mm256_storeu_ps
+#define v_storeu_f64 _mm256_storeu_pd
#define v_setall_f32(VAL) _mm256_set1_ps(VAL)
-#define v_zero_f32 _mm256_setzero_ps
\ No newline at end of file
+#define v_setall_f64(VAL) _mm256_set1_pd(VAL)
+#define v_zero_f32 _mm256_setzero_ps
+#define v_zero_f64 _mm256_setzero_pd
\ No newline at end of file
* Data Type
***************************/
typedef __m512 v_f32;
+typedef __m512d v_f64;
#define v_nlanes_f32 16
+#define v_nlanes_f64 8
/***************************
* Arithmetic
***************************/
#define v_add_f32 _mm512_add_ps
+#define v_add_f64 _mm512_add_pd
#define v_mul_f32 _mm512_mul_ps
+#define v_mul_f64 _mm512_mul_pd
// multiply and add, a*b + c
#define v_muladd_f32 _mm512_fmadd_ps
-
+#define v_muladd_f64 _mm512_fmadd_pd
BLAS_FINLINE float v_sum_f32(v_f32 a)
{
__m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2));
__m512 sum4 = _mm512_add_ps(sum8, h4);
return _mm_cvtss_f32(_mm512_castps512_ps128(sum4));
}
+
+// Horizontal add (f64): returns the sum of all 8 lanes of a.
+BLAS_FINLINE double v_sum_f64(v_f64 a)
+{
+    // 8 -> 4 partial sums: bring the upper 256 bits down and add.
+    const __m512d upper256 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2));
+    const __m512d acc4 = _mm512_add_pd(a, upper256);
+    // 4 -> 2: swap the 128-bit halves inside each 256-bit block and add.
+    const __m512d acc2 = _mm512_add_pd(acc4, _mm512_permutex_pd(acc4, _MM_SHUFFLE(1, 0, 3, 2)));
+    // 2 -> 1: swap the two doubles of each 128-bit pair and add.
+    const __m512d acc1 = _mm512_add_pd(acc2, _mm512_permute_pd(acc2, _MM_SHUFFLE(2, 3, 0, 1)));
+    // Lane 0 now carries the complete sum.
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(acc1));
+}
/***************************
* memory
***************************/
// unaligned load
#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
+// f64 variant: cast through the double-precision vector type (the float type __m512 belongs to v_loadu_f32 only)
+#define v_loadu_f64(PTR) _mm512_loadu_pd((const __m512d*)(PTR))
#define v_storeu_f32 _mm512_storeu_ps
+#define v_storeu_f64 _mm512_storeu_pd
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)
+#define v_setall_f64(VAL) _mm512_set1_pd(VAL)
#define v_zero_f32 _mm512_setzero_ps
+#define v_zero_f64 _mm512_setzero_pd
* Data Type
***************************/
typedef float32x4_t v_f32;
+#if V_SIMD_F64
+ typedef float64x2_t v_f64;
+#endif
#define v_nlanes_f32 4
+#define v_nlanes_f64 2
/***************************
* Arithmetic
***************************/
#define v_add_f32 vaddq_f32
+#define v_add_f64 vaddq_f64
#define v_mul_f32 vmulq_f32
+#define v_mul_f64 vmulq_f64
// FUSED F32
#ifdef HAVE_VFPV4 // FMA
{ return vmlaq_f32(c, a, b); }
#endif
+// FUSED F64: multiply and add, a*b + c; only defined when V_SIMD_F64 is non-zero (note the addend c is passed first to vfmaq_f64)
+#if V_SIMD_F64
+    BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c)
+    { return vfmaq_f64(c, a, b); }
+#endif
+
// Horizontal add: Calculates the sum of all vector elements.
BLAS_FINLINE float v_sum_f32(float32x4_t a)
{
float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a));
return vget_lane_f32(vpadd_f32(r, r), 0);
}
+
+#if V_SIMD_F64
+    // Horizontal add (f64): returns a[0] + a[1].
+    // The two halves are added with the vadd_f64 intrinsic (mirroring the
+    // vadd_f32 call in v_sum_f32) instead of the GNU vector-extension '+'
+    // operator, so no compiler-specific operator support is required.
+    BLAS_FINLINE double v_sum_f64(float64x2_t a)
+    {
+        return vget_lane_f64(vadd_f64(vget_low_f64(a), vget_high_f64(a)), 0);
+    }
+#endif
+
/***************************
* memory
***************************/
#define v_loadu_f32(a) vld1q_f32((const float*)a)
#define v_storeu_f32 vst1q_f32
#define v_setall_f32(VAL) vdupq_n_f32(VAL)
-#define v_zero_f32() vdupq_n_f32(0.0f)
\ No newline at end of file
+#define v_zero_f32() vdupq_n_f32(0.0f)
+#if V_SIMD_F64
+ #define v_loadu_f64(a) vld1q_f64((const double*)a)
+ #define v_storeu_f64 vst1q_f64
+ #define v_setall_f64 vdupq_n_f64
+ #define v_zero_f64() vdupq_n_f64(0.0)
+#endif
\ No newline at end of file
* Data Type
***************************/
typedef __m128 v_f32;
+typedef __m128d v_f64;
#define v_nlanes_f32 4
+#define v_nlanes_f64 2
/***************************
* Arithmetic
***************************/
#define v_add_f32 _mm_add_ps
+#define v_add_f64 _mm_add_pd
#define v_mul_f32 _mm_mul_ps
+#define v_mul_f64 _mm_mul_pd
#ifdef HAVE_FMA3
// multiply and add, a*b + c
#define v_muladd_f32 _mm_fmadd_ps
+ #define v_muladd_f64 _mm_fmadd_pd
#elif defined(HAVE_FMA4)
// multiply and add, a*b + c
#define v_muladd_f32 _mm_macc_ps
+ #define v_muladd_f64 _mm_macc_pd
#else
// multiply and add, a*b + c
BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c)
{ return v_add_f32(v_mul_f32(a, b), c); }
+ BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c)
+ { return v_add_f64(v_mul_f64(a, b), c); }
#endif // HAVE_FMA3
// Horizontal add: Calculates the sum of all vector elements.
return _mm_cvtss_f32(t4);
#endif
}
+
+// Horizontal add (f64): returns a[0] + a[1].
+BLAS_FINLINE double v_sum_f64(__m128d a)
+{
+#ifndef HAVE_SSE3
+    // Plain SSE2: duplicate the high lane and add it to the low lane.
+    const __m128d high_dup = _mm_unpackhi_pd(a, a);
+    return _mm_cvtsd_f64(_mm_add_pd(a, high_dup));
+#else
+    // SSE3: one horizontal add leaves a[0] + a[1] in lane 0.
+    const __m128d folded = _mm_hadd_pd(a, a);
+    return _mm_cvtsd_f64(folded);
+#endif
+}
/***************************
* memory
***************************/
// unaligned load
#define v_loadu_f32 _mm_loadu_ps
+#define v_loadu_f64 _mm_loadu_pd
#define v_storeu_f32 _mm_storeu_ps
+#define v_storeu_f64 _mm_storeu_pd
#define v_setall_f32(VAL) _mm_set1_ps(VAL)
-#define v_zero_f32 _mm_setzero_ps
\ No newline at end of file
+#define v_setall_f64(VAL) _mm_set1_pd(VAL)
+#define v_zero_f32 _mm_setzero_ps
+#define v_zero_f64 _mm_setzero_pd
\ No newline at end of file
GEMVDEP = ../l2param.h
+ifndef SBGEMVNKERNEL
+SBGEMVNKERNEL = sbgemv_n.c
+endif
+
+ifndef SBGEMVTKERNEL
+SBGEMVTKERNEL = sbgemv_t.c
+endif
+
ifndef SGEMVNKERNEL
SGEMVNKERNEL = sgemv_n.c
endif
SASUMKERNEL = sasum.c
DASUMKERNEL = dasum.c
+
+SROTKERNEL = srot.c
+DROTKERNEL = drot.c
--- /dev/null
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#ifndef __BF16_COMMON_MACROS
+#define __BF16_COMMON_MACROS
+
+#include <immintrin.h>
+
+#define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \
+ reg256##_0 = _mm512_castps512_ps256(reg512##_0); \
+ reg256##_1 = _mm512_castps512_ps256(reg512##_1);
+
+
+#define BF16_MATRIX_LOAD_8x32(regArray, a, lda, idx_m, idx_n) \
+ regArray##_0 = _mm512_loadu_si512(&a[(idx_m+0)*lda + idx_n]); \
+ regArray##_1 = _mm512_loadu_si512(&a[(idx_m+1)*lda + idx_n]); \
+ regArray##_2 = _mm512_loadu_si512(&a[(idx_m+2)*lda + idx_n]); \
+ regArray##_3 = _mm512_loadu_si512(&a[(idx_m+3)*lda + idx_n]); \
+ regArray##_4 = _mm512_loadu_si512(&a[(idx_m+4)*lda + idx_n]); \
+ regArray##_5 = _mm512_loadu_si512(&a[(idx_m+5)*lda + idx_n]); \
+ regArray##_6 = _mm512_loadu_si512(&a[(idx_m+6)*lda + idx_n]); \
+ regArray##_7 = _mm512_loadu_si512(&a[(idx_m+7)*lda + idx_n]);
+
+
+/* Unaligned 256-bit load of 8 consecutive rows (idx_m .. idx_m+7) of matrix a, starting at
+   column idx_n, into regArray_0 .. regArray_7 (256 bits = 16 BF16 elements per row, going by
+   the macro name). _mm256_loadu_si256 is declared with a __m256i pointer parameter, so the
+   element address is cast explicitly to avoid an incompatible-pointer-type diagnostic. */
+#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n)                                \
+    regArray##_0 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+0)*lda + idx_n]));         \
+    regArray##_1 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+1)*lda + idx_n]));         \
+    regArray##_2 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+2)*lda + idx_n]));         \
+    regArray##_3 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+3)*lda + idx_n]));         \
+    regArray##_4 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+4)*lda + idx_n]));         \
+    regArray##_5 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+5)*lda + idx_n]));         \
+    regArray##_6 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+6)*lda + idx_n]));         \
+    regArray##_7 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+7)*lda + idx_n]));
+
+
+/* Unaligned 128-bit load of 8 consecutive rows (idx_m .. idx_m+7) of matrix a, starting at
+   column idx_n, into regArray_0 .. regArray_7 (128 bits = 8 BF16 elements per row, going by
+   the macro name). _mm_loadu_si128 is declared with a __m128i pointer parameter, so the
+   element address is cast explicitly to avoid an incompatible-pointer-type diagnostic. */
+#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n)                              \
+    regArray##_0 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+0)*lda + idx_n]));         \
+    regArray##_1 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+1)*lda + idx_n]));         \
+    regArray##_2 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+2)*lda + idx_n]));         \
+    regArray##_3 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+3)*lda + idx_n]));         \
+    regArray##_4 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+4)*lda + idx_n]));         \
+    regArray##_5 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+5)*lda + idx_n]));         \
+    regArray##_6 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+6)*lda + idx_n]));         \
+    regArray##_7 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+7)*lda + idx_n]));
+
+
+#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \
+ regArray = _mm512_loadu_si512(&a[idx_m*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_8x32(regArray, a, lda, idx_m, idx_n, mask) \
+ regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+ regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
+ regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+ regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
+ regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
+ regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
+ regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
+ regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_8x16(regArray, a, lda, idx_m, idx_n, mask) \
+ regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+ regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
+ regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+ regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
+ regArray##_4 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
+ regArray##_5 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
+ regArray##_6 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
+ regArray##_7 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_8x8(regArray, a, lda, idx_m, idx_n, mask) \
+ regArray##_0 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+ regArray##_1 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
+ regArray##_2 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+ regArray##_3 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
+ regArray##_4 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
+ regArray##_5 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
+ regArray##_6 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
+ regArray##_7 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_4x32(regArray, a, lda, idx_m, idx_n, mask) \
+ regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+ regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
+ regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+ regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_4x16(regArray, a, lda, idx_m, idx_n, mask) \
+ regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+ regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
+ regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+ regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_8x32_2(regArray, a, lda, idx_m, idx_n, mask) \
+ regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+ regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+ regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
+ regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
+ regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+8)*lda + idx_n]); \
+ regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+10)*lda + idx_n]); \
+ regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+12)*lda + idx_n]); \
+ regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+14)*lda + idx_n]);
+
+
+#define BF16_MATRIX_MASKZ_LOAD_4x32_2(regArray, a, lda, idx_m, idx_n, mask) \
+ regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
+ regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
+ regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
+ regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]);
+
+#define BF16_MATRIX_MASKZ_LOAD_1x32(regArray, a, lda, idx_m, idx_n, mask) \
+ regArray = _mm512_maskz_loadu_epi16(mask, &a[idx_m*lda + idx_n]);
+
+#define BF16_VECTOR_LOAD_1x32(reg, x, idx_n) \
+ reg = _mm512_loadu_si512(x + idx_n);
+
+
+/* Unaligned 256-bit load from x + idx_n into reg. The address is cast to the __m256i pointer
+   type that _mm256_loadu_si256 is declared with. */
+#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \
+    reg = _mm256_loadu_si256((const __m256i *)(x + idx_n));
+
+
+/* Unaligned 128-bit load from x + idx_n into reg. The address is cast to the __m128i pointer
+   type that _mm_loadu_si128 is declared with. */
+#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \
+    reg = _mm_loadu_si128((const __m128i *)(x + idx_n));
+
+
+#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \
+ reg = _mm512_maskz_loadu_epi16(mask, x + idx_n);
+
+
+#define BF16_VECTOR_MASKZ_LOAD_1x16(reg, x, idx_n, mask) \
+ reg = _mm256_maskz_loadu_epi16(mask, x + idx_n);
+
+
+#define BF16_VECTOR_MASKZ_LOAD_1x8(reg, x, idx_n, mask) \
+ reg = _mm_maskz_loadu_epi16(mask, x + idx_n);
+
+
+/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row
+   Input - register array of 8 rows of row-major matrix
+ Output - the output of Step 2
+
+ Step 1: 2-element interleave for matrix
+ |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27
+ |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27
+ |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11|e16|e17|f16|f17|e18|e19|f18|f19|e24|e25|f24|f25|e26|e27|f26|f27
+ |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11|g16|g17|h16|h17|g18|g19|h18|h19|g24|g25|h24|h25|g26|g27|h26|h27
+ |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31
+ |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31
+ |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15|e20|e21|f20|f21|e22|e23|f22|f23|e28|e29|f28|f29|e30|e31|f30|f31
+ |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15|g20|g21|h20|h21|g22|g23|h22|h23|g28|g29|h28|h29|g30|g31|h30|h31
+
+ Step 2: 4-element interleave for matrix
+ |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25
+ |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27
+ |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 |e16|e17|f16|f17|g16|g17|h16|h17|e24|e25|f24|f25|g24|g25|h24|h25
+ |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11|e18|e19|f18|f19|g18|g19|h18|h19|e26|e27|f26|f27|g26|g27|h26|h27
+ |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29
+ |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31
+ |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13|e20|e21|f20|f21|g20|g21|h20|h21|e28|e29|f28|f29|g28|g29|h28|h29
+ |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15|e22|e23|f22|f23|g22|g23|h22|h23|e30|e31|f30|f31|g30|g31|h30|h31
+*/
+#define BF16_INTERLEAVE_8x32(regArray) \
+ regArray##_8 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \
+ regArray##_9 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \
+ regArray##_10 = _mm512_unpacklo_epi32(regArray##_4, regArray##_5); \
+ regArray##_11 = _mm512_unpacklo_epi32(regArray##_6, regArray##_7); \
+ regArray##_12 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \
+ regArray##_13 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \
+ regArray##_14 = _mm512_unpackhi_epi32(regArray##_4, regArray##_5); \
+ regArray##_15 = _mm512_unpackhi_epi32(regArray##_6, regArray##_7); \
+ \
+ regArray##_0 = _mm512_unpacklo_epi64(regArray##_8, regArray##_9); \
+ regArray##_1 = _mm512_unpackhi_epi64(regArray##_8, regArray##_9); \
+ regArray##_2 = _mm512_unpacklo_epi64(regArray##_10, regArray##_11); \
+ regArray##_3 = _mm512_unpackhi_epi64(regArray##_10, regArray##_11); \
+ regArray##_4 = _mm512_unpacklo_epi64(regArray##_12, regArray##_13); \
+ regArray##_5 = _mm512_unpackhi_epi64(regArray##_12, regArray##_13); \
+ regArray##_6 = _mm512_unpacklo_epi64(regArray##_14, regArray##_15); \
+ regArray##_7 = _mm512_unpackhi_epi64(regArray##_14, regArray##_15);
+
+
+/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row
+   Input - register array of 8 rows of row-major matrix
+ Output - the output of Step 2
+
+ Step 1: 2-element interleave for matrix
+ |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11
+ |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11
+ |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11
+ |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11
+ |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15
+ |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15
+ |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15
+ |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15
+
+ Step 2: 4-element interleave for matrix
+ |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9
+ |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11
+ |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9
+ |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11
+ |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13
+ |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15
+ |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13
+ |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15
+*/
+#define BF16_INTERLEAVE_8x16(regArray) \
+ regArray##_8 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \
+ regArray##_9 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \
+ regArray##_10 = _mm256_unpacklo_epi32(regArray##_4, regArray##_5); \
+ regArray##_11 = _mm256_unpacklo_epi32(regArray##_6, regArray##_7); \
+ regArray##_12 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \
+ regArray##_13 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \
+ regArray##_14 = _mm256_unpackhi_epi32(regArray##_4, regArray##_5); \
+ regArray##_15 = _mm256_unpackhi_epi32(regArray##_6, regArray##_7); \
+ \
+ regArray##_0 = _mm256_unpacklo_epi64(regArray##_8, regArray##_9); \
+ regArray##_1 = _mm256_unpackhi_epi64(regArray##_8, regArray##_9); \
+ regArray##_2 = _mm256_unpacklo_epi64(regArray##_10, regArray##_11); \
+ regArray##_3 = _mm256_unpackhi_epi64(regArray##_10, regArray##_11); \
+ regArray##_4 = _mm256_unpacklo_epi64(regArray##_12, regArray##_13); \
+ regArray##_5 = _mm256_unpackhi_epi64(regArray##_12, regArray##_13); \
+ regArray##_6 = _mm256_unpacklo_epi64(regArray##_14, regArray##_15); \
+ regArray##_7 = _mm256_unpackhi_epi64(regArray##_14, regArray##_15);
+
+/* 2-step interleave for matrix against 4 rows with 32 BF16 elements per row
+   Input - register array of 4 rows of row-major matrix
+ Output - the output of Step 2
+
+ Step 1: 2-element interleave for matrix
+ |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27
+ |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27
+ |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31
+ |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31
+
+ Step 2: 4-element interleave for matrix
+ |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25
+ |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27
+ |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29
+ |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31
+*/
+#define BF16_INTERLEAVE_4x32(regArray) \
+ regArray##_4 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \
+ regArray##_5 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \
+ regArray##_6 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \
+ regArray##_7 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \
+ \
+ regArray##_0 = _mm512_unpacklo_epi64(regArray##_4, regArray##_5); \
+ regArray##_1 = _mm512_unpackhi_epi64(regArray##_4, regArray##_5); \
+ regArray##_2 = _mm512_unpacklo_epi64(regArray##_6, regArray##_7); \
+ regArray##_3 = _mm512_unpackhi_epi64(regArray##_6, regArray##_7);
+
+
+/* 2-step interleave for matrix against 4 rows with 16 BF16 elements per row
+   Input - register array of 4 rows of row-major matrix
+ Output - the output of Step 2
+
+ Step 1: 2-element interleave for matrix
+ |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11
+ |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11
+ |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15
+ |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15
+
+ Step 2: 4-element interleave for matrix
+ |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9
+ |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11
+ |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13
+ |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15
+*/
+#define BF16_INTERLEAVE_4x16(regArray) \
+ regArray##_4 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \
+ regArray##_5 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \
+ regArray##_6 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \
+ regArray##_7 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \
+ \
+ regArray##_0 = _mm256_unpacklo_epi64(regArray##_4, regArray##_5); \
+ regArray##_1 = _mm256_unpackhi_epi64(regArray##_4, regArray##_5); \
+ regArray##_2 = _mm256_unpacklo_epi64(regArray##_6, regArray##_7); \
+ regArray##_3 = _mm256_unpackhi_epi64(regArray##_6, regArray##_7);
+
+
+/* 2-step interleave for x with 32 BF16 elements
+ Input - original vector
+ Output - the output of Step 2
+
+ Step 1: 2-element interleave for x:
+ |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11|x16|x17|x16|x17|x18|x19|x18|x19|x24|x25|x24|x25|x26|x27|x26|x27
+ |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15|x20|x21|x20|x21|x22|x23|x22|x23|x28|x29|x28|x29|x30|x31|x30|x31
+
+ Step 2: 4-element interleave for x:
+ |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 |x16|x17|x16|x17|x16|x17|x16|x17|x24|x25|x24|x25|x24|x25|x24|x25
+ |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11|x18|x19|x18|x19|x18|x19|x18|x19|x26|x27|x26|x27|x26|x27|x26|x27
+ |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13|x20|x21|x20|x21|x20|x21|x20|x21|x28|x29|x28|x29|x28|x29|x28|x29
+ |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15|x22|x23|x22|x23|x22|x23|x22|x23|x30|x31|x30|x31|x30|x31|x30|x31
+*/
+#define BF16_INTERLEAVE_1x32(regArray) \
+ regArray##_1 = _mm512_unpacklo_epi32(regArray##_0, regArray##_0); \
+ regArray##_3 = _mm512_unpackhi_epi32(regArray##_0, regArray##_0); \
+ \
+ regArray##_0 = _mm512_unpacklo_epi64(regArray##_1, regArray##_1); \
+ regArray##_1 = _mm512_unpackhi_epi64(regArray##_1, regArray##_1); \
+ regArray##_2 = _mm512_unpacklo_epi64(regArray##_3, regArray##_3); \
+ regArray##_3 = _mm512_unpackhi_epi64(regArray##_3, regArray##_3);
+
+
+/* 2-step interleave for x with 16 BF16 elements
+ Input - original vector
+ Output - the output of Step 2
+
+ Step 1: 2-element interleave for x:
+ |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11
+ |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15
+
+ Step 2: 4-element interleave for x:
+ |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9
+ |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11
+ |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13
+ |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15
+*/
+#define BF16_INTERLEAVE_1x16(regArray) \
+ regArray##_1 = _mm256_unpacklo_epi32(regArray##_0, regArray##_0); \
+ regArray##_3 = _mm256_unpackhi_epi32(regArray##_0, regArray##_0); \
+ \
+ regArray##_0 = _mm256_unpacklo_epi64(regArray##_1, regArray##_1); \
+ regArray##_1 = _mm256_unpackhi_epi64(regArray##_1, regArray##_1); \
+ regArray##_2 = _mm256_unpacklo_epi64(regArray##_3, regArray##_3); \
+ regArray##_3 = _mm256_unpackhi_epi64(regArray##_3, regArray##_3);
+
+/* 1-step interleave that regroups the high 256 bits and low 256 bits of 4 pairs of registers
+ |a0|a1|...|a14|a15|i0|i1|...|i14|i15|
+ |b0|b1|...|b14|b15|j0|j1|...|j14|j15|
+ |c0|c1|...|c14|c15|k0|k1|...|k14|k15|
+ |d0|d1|...|d14|d15|l0|l1|...|l14|l15|
+ |e0|e1|...|e14|e15|m0|m1|...|m14|m15|
+ |f0|f1|...|f14|f15|n0|n1|...|n14|n15|
+ |g0|g1|...|g14|g15|o0|o1|...|o14|o15|
+ |h0|h1|...|h14|h15|p0|p1|...|p14|p15|
+
+ Reads regArray##_8 .. regArray##_15 and writes regArray##_0 .. regArray##_7.
+ For k = 0..3 the pair (regArray##_(8+k), regArray##_(12+k)) produces:
+   regArray##_(2k)   - selector 0x44: low 256 bits of the first source
+                       followed by low 256 bits of the second source
+   regArray##_(2k+1) - selector 0xee: high 256 bits of the first source
+                       followed by high 256 bits of the second source
+ NOTE(review): the diagram above appears to describe the resulting
+ registers _0 .. _7 - confirm against the callers.
+*/
+#define BF16_INTERLEAVE256_8x32(regArray) \
+ regArray##_0 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0x44); \
+ regArray##_1 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0xee); \
+ regArray##_2 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0x44); \
+ regArray##_3 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0xee); \
+ regArray##_4 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0x44); \
+ regArray##_5 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0xee); \
+ regArray##_6 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0x44); \
+ regArray##_7 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0xee);
+
+
+/* 1-step interleave that regroups the high 256 bits and low 256 bits of 2 pairs of registers
+ |a0|a1|...|a14|a15|e0|e1|...|e14|e15|
+ |b0|b1|...|b14|b15|f0|f1|...|f14|f15|
+ |c0|c1|...|c14|c15|g0|g1|...|g14|g15|
+ |d0|d1|...|d14|d15|h0|h1|...|h14|h15|
+
+ Reads regArray##_4 .. regArray##_7 and writes regArray##_0 .. regArray##_3.
+ For k = 0..1 the pair (regArray##_(4+k), regArray##_(6+k)) produces:
+   regArray##_(2k)   - selector 0x44: low 256 bits of the first source
+                       followed by low 256 bits of the second source
+   regArray##_(2k+1) - selector 0xee: high 256 bits of the first source
+                       followed by high 256 bits of the second source
+ NOTE(review): the diagram above appears to describe the resulting
+ registers _0 .. _3 - confirm against the callers.
+*/
+#define BF16_INTERLEAVE256_4x32(regArray) \
+ regArray##_0 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0x44); \
+ regArray##_1 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0xee); \
+ regArray##_2 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0x44); \
+ regArray##_3 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0xee);
+
+
+/* Permute 8 registers at 16-bit element granularity.
+   The same index vector idx is applied to each of regArray##_0 .. regArray##_7;
+   the permuted results are written to regArray##_8 .. regArray##_15, so the
+   source registers are left unmodified.
+*/
+#define BF16_PERMUTE_8x32(idx, regArray) \
+ regArray##_8 = _mm512_permutexvar_epi16(idx, regArray##_0); \
+ regArray##_9 = _mm512_permutexvar_epi16(idx, regArray##_1); \
+ regArray##_10 = _mm512_permutexvar_epi16(idx, regArray##_2); \
+ regArray##_11 = _mm512_permutexvar_epi16(idx, regArray##_3); \
+ regArray##_12 = _mm512_permutexvar_epi16(idx, regArray##_4); \
+ regArray##_13 = _mm512_permutexvar_epi16(idx, regArray##_5); \
+ regArray##_14 = _mm512_permutexvar_epi16(idx, regArray##_6); \
+ regArray##_15 = _mm512_permutexvar_epi16(idx, regArray##_7);
+
+
+/* Permute 8 registers at 32-bit element granularity (i.e. in units of two
+   adjacent 16-bit elements).
+   The same index vector idx is applied to each of regArray##_0 .. regArray##_7;
+   the permuted results are written to regArray##_8 .. regArray##_15, so the
+   source registers are left unmodified.
+*/
+#define BF16_PERMUTE_8x32_2(idx, regArray) \
+ regArray##_8 = _mm512_permutexvar_epi32(idx, regArray##_0); \
+ regArray##_9 = _mm512_permutexvar_epi32(idx, regArray##_1); \
+ regArray##_10 = _mm512_permutexvar_epi32(idx, regArray##_2); \
+ regArray##_11 = _mm512_permutexvar_epi32(idx, regArray##_3); \
+ regArray##_12 = _mm512_permutexvar_epi32(idx, regArray##_4); \
+ regArray##_13 = _mm512_permutexvar_epi32(idx, regArray##_5); \
+ regArray##_14 = _mm512_permutexvar_epi32(idx, regArray##_6); \
+ regArray##_15 = _mm512_permutexvar_epi32(idx, regArray##_7);
+
+
+/* Permute 4 registers at 16-bit element granularity.
+   The same index vector idx is applied to each of regArray##_0 .. regArray##_3;
+   the permuted results are written to regArray##_4 .. regArray##_7, so the
+   source registers are left unmodified.
+*/
+#define BF16_PERMUTE_4x32(idx, regArray) \
+ regArray##_4 = _mm512_permutexvar_epi16(idx, regArray##_0); \
+ regArray##_5 = _mm512_permutexvar_epi16(idx, regArray##_1); \
+ regArray##_6 = _mm512_permutexvar_epi16(idx, regArray##_2); \
+ regArray##_7 = _mm512_permutexvar_epi16(idx, regArray##_3);
+
+
+/* Permute 4 registers at 32-bit element granularity (i.e. in units of two
+   adjacent 16-bit elements).
+   The same index vector idx is applied to each of regArray##_0 .. regArray##_3;
+   the permuted results are written to regArray##_4 .. regArray##_7, so the
+   source registers are left unmodified.
+*/
+#define BF16_PERMUTE_4x32_2(idx, regArray) \
+ regArray##_4 = _mm512_permutexvar_epi32(idx, regArray##_0); \
+ regArray##_5 = _mm512_permutexvar_epi32(idx, regArray##_1); \
+ regArray##_6 = _mm512_permutexvar_epi32(idx, regArray##_2); \
+ regArray##_7 = _mm512_permutexvar_epi32(idx, regArray##_3);
+
+
+/* Calculate the dot result for 2-step interleaved matrix and vector
+ (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_2STEP_INTERLEAVED_DOT_8x32(accumArray, matArray, xArray) \
+ accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \
+ accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_2, (__m512bh) xArray##_0); \
+ accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \
+ accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_1); \
+ accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_4, (__m512bh) xArray##_2); \
+ accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_6, (__m512bh) xArray##_2); \
+ accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_5, (__m512bh) xArray##_3); \
+ accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_7, (__m512bh) xArray##_3);
+
+
+/* Calculate the dot result for 2-step interleaved matrix and vector
+ (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_2STEP_INTERLEAVED_DOT_8x16(accumArray, matArray, xArray) \
+ accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \
+ accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_2, (__m256bh) xArray##_0); \
+ accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \
+ accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_1); \
+ accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_4, (__m256bh) xArray##_2); \
+ accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_6, (__m256bh) xArray##_2); \
+ accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_5, (__m256bh) xArray##_3); \
+ accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_7, (__m256bh) xArray##_3);
+
+/* Calculate the dot result for 2-step interleaved matrix and vector
+ (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_2STEP_INTERLEAVED_DOT_4x32(accumArray, matArray, xArray) \
+ accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \
+ accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \
+ accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_2, (__m512bh) xArray##_2); \
+ accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_3);
+
+
+/* Calculate the dot result for 2-step interleaved matrix and vector
+ (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_2STEP_INTERLEAVED_DOT_4x16(accumArray, matArray, xArray) \
+ accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \
+ accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \
+ accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_2, (__m256bh) xArray##_2); \
+ accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_3);
+
+
+/* Calculate the dot result for matrix and vector at 32 elements per row
+ (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_DOT_8x32(accumArray, matArray, xArray) \
+ accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray); \
+ accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray); \
+ accumArray##_2 = _mm512_dpbf16_ps(accumArray##_2, (__m512bh) matArray##_2, (__m512bh) xArray); \
+ accumArray##_3 = _mm512_dpbf16_ps(accumArray##_3, (__m512bh) matArray##_3, (__m512bh) xArray); \
+ accumArray##_4 = _mm512_dpbf16_ps(accumArray##_4, (__m512bh) matArray##_4, (__m512bh) xArray); \
+ accumArray##_5 = _mm512_dpbf16_ps(accumArray##_5, (__m512bh) matArray##_5, (__m512bh) xArray); \
+ accumArray##_6 = _mm512_dpbf16_ps(accumArray##_6, (__m512bh) matArray##_6, (__m512bh) xArray); \
+ accumArray##_7 = _mm512_dpbf16_ps(accumArray##_7, (__m512bh) matArray##_7, (__m512bh) xArray);
+
+/* Calculate the dot result for matrix and vector at 32 elements per row
+ (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_DOT_1x32(accumArray, matArray, xArray) \
+ accumArray = _mm512_dpbf16_ps(accumArray, (__m512bh) matArray, (__m512bh) xArray);
+
+/* Calculate the dot result for matrix and vector at 16 elements per row
+ (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform)
+*/
+#define BF16_DOT_8x16(accumArray, matArray, xArray) \
+ accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray); \
+ accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray); \
+ accumArray##_2 = _mm256_dpbf16_ps(accumArray##_2, (__m256bh) matArray##_2, (__m256bh) xArray); \
+ accumArray##_3 = _mm256_dpbf16_ps(accumArray##_3, (__m256bh) matArray##_3, (__m256bh) xArray); \
+ accumArray##_4 = _mm256_dpbf16_ps(accumArray##_4, (__m256bh) matArray##_4, (__m256bh) xArray); \
+ accumArray##_5 = _mm256_dpbf16_ps(accumArray##_5, (__m256bh) matArray##_5, (__m256bh) xArray); \
+ accumArray##_6 = _mm256_dpbf16_ps(accumArray##_6, (__m256bh) matArray##_6, (__m256bh) xArray); \
+ accumArray##_7 = _mm256_dpbf16_ps(accumArray##_7, (__m256bh) matArray##_7, (__m256bh) xArray);
+
+
+/* 2-step interleave for matrix against 8 rows with 16 fp32 elements per row
+ Input - register array of 8 rows of row-major matrix
+ Output - the output of Step 2
+
+ Step 1: 2-element interleave for matrix
+ |a0|b0|a1|b1|a4|b4|a5|b5|a8 |b8 |a9 |b9 |a12|b12|a13|b13|
+ |c0|d0|c1|d1|c4|d4|c5|d5|c8 |d8 |c9 |d9 |c12|d12|c13|d13|
+ |e0|f0|e1|f1|e4|f4|e5|f5|e8 |f8 |e9 |f9 |e12|f12|e13|f13|
+ |g0|h0|g1|h1|g4|h4|g5|h5|g8 |h8 |g9 |h9 |g12|h12|g13|h13|
+ |a2|b2|a3|b3|a6|b6|a7|b7|a10|b10|a11|b11|a14|b14|a15|b15|
+ |c2|d2|c3|d3|c6|d6|c7|d7|c10|d10|c11|d11|c14|d14|c15|d15|
+ |e2|f2|e3|f3|e6|f6|e7|f7|e10|f10|e11|f11|e14|f14|e15|f15|
+ |g2|h2|g3|h3|g6|h6|g7|h7|g10|h10|g11|h11|g14|h14|g15|h15|
+
+ Step 2: 4-element interleave for matrix
+ |a0|b0|c0|d0|a4|b4|c4|d4|a8 |b8 |c8 |d8 |a12|b12|c12|d12|
+ |a1|b1|c1|d1|a5|b5|c5|d5|a9 |b9 |c9 |d9 |a13|b13|c13|d13|
+ |e0|f0|g0|h0|e4|f4|g4|h4|e8 |f8 |g8 |h8 |e12|f12|g12|h12|
+ |e1|f1|g1|h1|e5|f5|g5|h5|e9 |f9 |g9 |h9 |e13|f13|g13|h13|
+ |a2|b2|c2|d2|a6|b6|c6|d6|a10|b10|c10|d10|a14|b14|c14|d14|
+ |a3|b3|c3|d3|a7|b7|c7|d7|a11|b11|c11|d11|a15|b15|c15|d15|
+ |e2|f2|g2|h2|e6|f6|g6|h6|e10|f10|g10|h10|e14|f14|g14|h14|
+ |e3|f3|g3|h3|e7|f7|g7|h7|e11|f11|g11|h11|e15|f15|g15|h15|
+*/
+#define FP32_INTERLEAVE_8x16(regArray) \
+ regArray##_8 = _mm512_unpacklo_ps(regArray##_0, regArray##_1); \
+ regArray##_9 = _mm512_unpacklo_ps(regArray##_2, regArray##_3); \
+ regArray##_10 = _mm512_unpacklo_ps(regArray##_4, regArray##_5); \
+ regArray##_11 = _mm512_unpacklo_ps(regArray##_6, regArray##_7); \
+ regArray##_12 = _mm512_unpackhi_ps(regArray##_0, regArray##_1); \
+ regArray##_13 = _mm512_unpackhi_ps(regArray##_2, regArray##_3); \
+ regArray##_14 = _mm512_unpackhi_ps(regArray##_4, regArray##_5); \
+ regArray##_15 = _mm512_unpackhi_ps(regArray##_6, regArray##_7); \
+ \
+ regArray##_0 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \
+ regArray##_1 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \
+ regArray##_4 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \
+ regArray##_5 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \
+ regArray##_2 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \
+ regArray##_3 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \
+ regArray##_6 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_14, (__m512d) regArray##_15); \
+ regArray##_7 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_14, (__m512d) regArray##_15);
+
+/* Array-indexed variant of the 2-step fp32 interleave for 8 rows with
+   16 fp32 elements per row: performs the same unpack sequence as
+   FP32_INTERLEAVE_8x16, but addresses the registers as regArray[k] instead
+   of the token-pasted names regArray##_k.
+   Input  - regArray[0] .. regArray[7]
+   Output - regArray[0] .. regArray[7] (result of step 2)
+   regArray[8] .. regArray[15] are overwritten with the step 1 intermediates,
+   so regArray must provide at least 16 elements.
+*/
+#define FP32_INTERLEAVE_8x16_ARRAY(regArray) \
+ regArray[8] = _mm512_unpacklo_ps(regArray[0], regArray[1]); \
+ regArray[9] = _mm512_unpacklo_ps(regArray[2], regArray[3]); \
+ regArray[10] = _mm512_unpacklo_ps(regArray[4], regArray[5]); \
+ regArray[11] = _mm512_unpacklo_ps(regArray[6], regArray[7]); \
+ regArray[12] = _mm512_unpackhi_ps(regArray[0], regArray[1]); \
+ regArray[13] = _mm512_unpackhi_ps(regArray[2], regArray[3]); \
+ regArray[14] = _mm512_unpackhi_ps(regArray[4], regArray[5]); \
+ regArray[15] = _mm512_unpackhi_ps(regArray[6], regArray[7]); \
+ \
+ regArray[0] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[8], (__m512d) regArray[9]); \
+ regArray[1] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[8], (__m512d) regArray[9]); \
+ regArray[4] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[10], (__m512d) regArray[11]); \
+ regArray[5] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[10], (__m512d) regArray[11]); \
+ regArray[2] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[12], (__m512d) regArray[13]); \
+ regArray[3] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[12], (__m512d) regArray[13]); \
+ regArray[6] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[14], (__m512d) regArray[15]); \
+ regArray[7] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[14], (__m512d) regArray[15]);
+
+/* 2-step interleave for matrix against 8 rows with 8 fp32 elements per row
+ Input - register array of 8 rows of row-major matrix
+ Output - the output of Step 2
+
+ Step 1: 2-element interleave for matrix
+ |a0|b0|a1|b1|a4|b4|a5|b5|
+ |c0|d0|c1|d1|c4|d4|c5|d5|
+ |e0|f0|e1|f1|e4|f4|e5|f5|
+ |g0|h0|g1|h1|g4|h4|g5|h5|
+ |a2|b2|a3|b3|a6|b6|a7|b7|
+ |c2|d2|c3|d3|c6|d6|c7|d7|
+ |e2|f2|e3|f3|e6|f6|e7|f7|
+ |g2|h2|g3|h3|g6|h6|g7|h7|
+
+ Step 2: 4-element interleave for matrix
+ |a0|b0|c0|d0|a4|b4|c4|d4|
+ |a1|b1|c1|d1|a5|b5|c5|d5|
+ |e0|f0|g0|h0|e4|f4|g4|h4|
+ |e1|f1|g1|h1|e5|f5|g5|h5|
+ |a2|b2|c2|d2|a6|b6|c6|d6|
+ |a3|b3|c3|d3|a7|b7|c7|d7|
+ |e2|f2|g2|h2|e6|f6|g6|h6|
+ |e3|f3|g3|h3|e7|f7|g7|h7|
+*/
+#define FP32_INTERLEAVE_8x8(regArray) \
+ regArray##_8 = _mm256_unpacklo_ps(regArray##_0, regArray##_1); \
+ regArray##_9 = _mm256_unpacklo_ps(regArray##_2, regArray##_3); \
+ regArray##_10 = _mm256_unpacklo_ps(regArray##_4, regArray##_5); \
+ regArray##_11 = _mm256_unpacklo_ps(regArray##_6, regArray##_7); \
+ regArray##_12 = _mm256_unpackhi_ps(regArray##_0, regArray##_1); \
+ regArray##_13 = _mm256_unpackhi_ps(regArray##_2, regArray##_3); \
+ regArray##_14 = _mm256_unpackhi_ps(regArray##_4, regArray##_5); \
+ regArray##_15 = _mm256_unpackhi_ps(regArray##_6, regArray##_7); \
+ \
+ regArray##_0 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \
+ regArray##_1 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \
+ regArray##_4 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \
+ regArray##_5 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \
+ regArray##_2 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \
+ regArray##_3 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \
+ regArray##_6 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_14, (__m256d) regArray##_15); \
+ regArray##_7 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_14, (__m256d) regArray##_15);
+
+
+/* Reduce 8 registers of 16 fp32 lanes into 2 registers by lane-wise addition,
+   treating them as 2 batches of 4 registers:
+     regArray##_0 = (regArray##_0 + regArray##_1) + (regArray##_2 + regArray##_3)
+     regArray##_4 = (regArray##_4 + regArray##_5) + (regArray##_6 + regArray##_7)
+   regArray##_2 and regArray##_6 are overwritten with partial sums.
+*/
+#define FP32_ACCUM2_8x16(regArray) \
+ regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_1); \
+ regArray##_2 = _mm512_add_ps(regArray##_2, regArray##_3); \
+ regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_5); \
+ regArray##_6 = _mm512_add_ps(regArray##_6, regArray##_7); \
+ regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_2); \
+ regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_6);
+
+/* Array-indexed reduction of 8 registers of 16 fp32 lanes into 2 registers
+   by lane-wise addition, treating them as 2 batches of 4 registers:
+     regArray[0] = (regArray[0] + regArray[1]) + (regArray[2] + regArray[3])
+     regArray[4] = (regArray[4] + regArray[5]) + (regArray[6] + regArray[7])
+   regArray[2] and regArray[6] are overwritten with partial sums.
+*/
+#define FP32_ACCUM2_8x16_ARRAY(regArray) \
+ regArray[0] = _mm512_add_ps(regArray[0], regArray[1]); \
+ regArray[2] = _mm512_add_ps(regArray[2], regArray[3]); \
+ regArray[4] = _mm512_add_ps(regArray[4], regArray[5]); \
+ regArray[6] = _mm512_add_ps(regArray[6], regArray[7]); \
+ regArray[0] = _mm512_add_ps(regArray[0], regArray[2]); \
+ regArray[4] = _mm512_add_ps(regArray[4], regArray[6]);
+
+/* Reduce 8 registers of 8 fp32 lanes into 2 registers by lane-wise addition,
+   treating them as 2 batches of 4 registers:
+     regArray##_0 = (regArray##_0 + regArray##_1) + (regArray##_2 + regArray##_3)
+     regArray##_4 = (regArray##_4 + regArray##_5) + (regArray##_6 + regArray##_7)
+   regArray##_2 and regArray##_6 are overwritten with partial sums.
+*/
+#define FP32_ACCUM2_8x8(regArray) \
+ regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_1); \
+ regArray##_2 = _mm256_add_ps(regArray##_2, regArray##_3); \
+ regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_5); \
+ regArray##_6 = _mm256_add_ps(regArray##_6, regArray##_7); \
+ regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_2); \
+ regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_6);
+
+
+/* Store 16 (alpha * result + beta * y) to y
+*/
+#define STORE16_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \
+ regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_loadu_ps(targetAddr))); \
+ _mm512_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 16 (alpha * result + beta * y) to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \
+ regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_maskz_loadu_ps(mask, targetAddr))); \
+ _mm512_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 8 (alpha * result + beta * y) to y
+*/
+#define STORE8_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \
+ regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_loadu_ps(targetAddr))); \
+ _mm256_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 8 (alpha * result + beta * y) to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \
+ regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_maskz_loadu_ps(mask, targetAddr))); \
+ _mm256_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 4 (alpha * result + beta * y) to y
+*/
+#define STORE4_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \
+ regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_loadu_ps(targetAddr))); \
+ _mm_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 4 (alpha * result + beta * y) to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \
+ regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_maskz_loadu_ps(mask, targetAddr))); \
+ _mm_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 16 (alpha * result + y) to y
+*/
+#define STORE16_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \
+ regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_loadu_ps(targetAddr)); \
+ _mm512_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 16 (alpha * result + y) to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \
+ regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \
+ _mm512_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 8 (alpha * result + y) to y
+*/
+#define STORE8_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \
+ regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_loadu_ps(targetAddr)); \
+ _mm256_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 8 (alpha * result + y) to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \
+ regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \
+ _mm256_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 4 (alpha * result + y) to y
+*/
+#define STORE4_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \
+ regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_loadu_ps(targetAddr)); \
+ _mm_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 4 (alpha * result + y) to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \
+ regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \
+ _mm_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 16 (alpha * result) to y
+*/
+#define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \
+ _mm512_storeu_ps(targetAddr, _mm512_mul_ps(ALPHAVECTOR, regResult));
+
+
+/* Masked store 16 (alpha * result) to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
+ _mm512_mask_storeu_ps(targetAddr, mask, _mm512_mul_ps(ALPHAVECTOR, regResult));
+
+
+/* Store 8 (alpha * result) to y
+*/
+#define STORE8_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \
+ _mm256_storeu_ps(targetAddr, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult));
+
+
+/* Masked store 8 (alpha * result) to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
+ _mm256_mask_storeu_ps(targetAddr, mask, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult));
+
+
+/* Store 4 (alpha * result) to y
+*/
+#define STORE4_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \
+ _mm_storeu_ps(targetAddr, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
+
+
+/* Masked store 4 (alpha * result) to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
+ _mm_mask_storeu_ps(targetAddr, mask, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
+
+
+/* Store 16 result to y
+*/
+#define STORE16_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
+ _mm512_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 16 result to y
+*/
+#define STORE16_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
+ _mm512_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 8 result to y
+*/
+#define STORE8_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
+ _mm256_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 8 result to y
+*/
+#define STORE8_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
+ _mm256_mask_storeu_ps(targetAddr, mask, regResult);
+
+
+/* Store 4 result to y
+*/
+#define STORE4_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
+ _mm_storeu_ps(targetAddr, regResult);
+
+
+/* Masked store 4 result to y
+*/
+#define STORE4_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
+ _mm_mask_storeu_ps(targetAddr, mask, regResult);
+
+#endif
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
BLASLONG register i = 0;
FLOAT a = *alpha;
#if V_SIMD
+#ifdef DOUBLE
+ v_f64 __alpha, tmp;
+ __alpha = v_setall_f64(*alpha);
+ const int vstep = v_nlanes_f64;
+ for (; i < n; i += vstep) {
+ tmp = v_muladd_f64(__alpha, v_loadu_f64( x + i ), v_loadu_f64(y + i));
+ v_storeu_f64(y + i, tmp);
+ }
+#else
v_f32 __alpha, tmp;
__alpha = v_setall_f32(*alpha);
const int vstep = v_nlanes_f32;
tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i));
v_storeu_f32(y + i, tmp);
}
+#endif
#else
while(i < n)
{
"r" (y), // 3
"r" (alpha) // 4
: "cc",
- "%xmm0",
- "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (y), // 3
"r" (dot) // 4
: "cc",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (y), // 3
"r" (dot) // 4
: "cc",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
src1 += src_inc;
}
+ return 0;
}
"r" (alpha) // 8
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
- "%xmm8", "%xmm9",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (ap[1]), // 5
"r" (alpha) // 6
: "cc",
- "%xmm0", "%xmm1",
- "%xmm4", "%xmm5",
- "%xmm6",
- "%xmm8",
- "%xmm12", "%xmm13",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
}
"%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
- "%xmm8", "%xmm9",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
- "%xmm8", "%xmm9",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
--- /dev/null
+#include "common.h"
+
+#if defined(SKYLAKEX)
+#include "drot_microk_skylakex-2.c"
+#elif defined(HASWELL)
+#include "drot_microk_haswell-2.c"
+#endif
+
+#ifndef HAVE_DROT_KERNEL
+
+/*
+ * Portable fallback kernel for the plane rotation on contiguous vectors.
+ *
+ * For every i in [0, n):
+ *     x[i] <- c * x[i] + s * y[i]
+ *     y[i] <- c * y[i] - s * x[i]      (using the original x[i])
+ *
+ * n    - number of elements to rotate
+ * x, y - unit-stride vectors, updated in place
+ * c, s - rotation coefficients
+ *
+ * The bulk of the work is done four elements at a time; the remaining
+ * (n mod 4) elements are handled by the scalar loop at the end.
+ */
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    FLOAT f0, f1, f2, f3;
+    FLOAT x0, x1, x2, x3;
+    FLOAT g0, g1, g2, g3;
+    FLOAT y0, y1, y2, y3;
+
+    FLOAT* xp = x;
+    FLOAT* yp = y;
+
+    /* The unrolled loop consumes 4 elements per pass, so its bound is n
+       rounded down to a multiple of 4 (a multiple-of-8 bound would leave a
+       complete group of 4 to the slower scalar tail). */
+    BLASLONG n1 = n & (~3);
+
+    while (i < n1) {
+        /* load everything first so the stores below cannot disturb the inputs */
+        x0 = xp[0];
+        y0 = yp[0];
+        x1 = xp[1];
+        y1 = yp[1];
+        x2 = xp[2];
+        y2 = yp[2];
+        x3 = xp[3];
+        y3 = yp[3];
+
+        f0 = c*x0 + s*y0;
+        g0 = c*y0 - s*x0;
+        f1 = c*x1 + s*y1;
+        g1 = c*y1 - s*x1;
+        f2 = c*x2 + s*y2;
+        g2 = c*y2 - s*x2;
+        f3 = c*x3 + s*y3;
+        g3 = c*y3 - s*x3;
+
+        xp[0] = f0;
+        yp[0] = g0;
+        xp[1] = f1;
+        yp[1] = g1;
+        xp[2] = f2;
+        yp[2] = g2;
+        xp[3] = f3;
+        yp[3] = g3;
+
+        xp += 4;
+        yp += 4;
+        i += 4;
+    }
+
+    /* scalar tail: at most 3 elements remain */
+    while (i < n) {
+        FLOAT temp = c*x[i] + s*y[i];
+        y[i] = c*y[i] - s*x[i];
+        x[i] = temp;
+
+        i++;
+    }
+}
+
+#endif
+/*
+ * Apply the plane rotation (c, s) to n elements of x and y.
+ *
+ * Unit-stride input is handed to drot_kernel; any other stride combination
+ * is processed by a scalar loop that steps x by inc_x and y by inc_y.
+ * Nothing is done when n <= 0.
+ */
+static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG k, xi, yi;
+
+    if (n <= 0)
+        return;
+
+    if ((inc_x == 1) && (inc_y == 1)) {
+        drot_kernel(n, x, y, c, s);
+        return;
+    }
+
+    for (k = 0, xi = 0, yi = 0; k < n; k++, xi += inc_x, yi += inc_y) {
+        /* the new x value is computed before y is overwritten, and stored after */
+        const FLOAT rotated_x = c * x[xi] + s * y[yi];
+
+        y[yi] = c * y[yi] - s * x[xi];
+        x[xi] = rotated_x;
+    }
+}
+
+
+#if defined(SMP)
+/*
+ * Worker entry point passed to blas_level1_thread: unpacks one blas_arg_t
+ * and runs rot_compute on it. c and s are read from the first two FLOAT
+ * values behind args->alpha. Always returns 0.
+ */
+static int rot_thread_function(blas_arg_t *args)
+{
+    FLOAT *coeff = (FLOAT *)args->alpha;
+
+    rot_compute(args->m, args->a, args->lda, args->b, args->ldb, coeff[0], coeff[1]);
+
+    return 0;
+}
+
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+/*
+ * ROT entry point: rotate the vectors x and y by (c, s).
+ *
+ * In SMP builds the work is split across threads through
+ * blas_level1_thread when both increments are non-zero and n exceeds
+ * 100000; in every other case (and in non-SMP builds) rot_compute is
+ * called directly. Always returns 0.
+ */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+#if defined(SMP)
+    FLOAT alpha[2]={c, s};   /* c and s travel to the workers through the alpha slot */
+    FLOAT dummy_c;           /* placeholder for the unused "c" matrix argument */
+    int nthreads = 1;
+
+    if (inc_x != 0 && inc_y != 0 && n > 100000) {
+        nthreads = num_cpu_avail(1);
+    }
+
+    if (nthreads != 1) {
+#if defined(DOUBLE)
+        int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
+#else
+        int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
+#endif
+        blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
+        return 0;
+    }
+#endif
+
+    /* single-threaded path (also the only path in non-SMP builds) */
+    rot_compute(n, x, inc_x, y, inc_y, c, s);
+    return 0;
+}
--- /dev/null
+/* This kernel uses 256-bit AVX2 and FMA intrinsics only, so it is enabled when
+ * the compiler is recent enough and both instruction sets are available in
+ * the current translation unit. (The previous test for __AVX512CD__ is never
+ * satisfied by a Haswell-targeted build, which silently disabled the kernel.)
+ * When the guard is false HAVE_DROT_KERNEL stays undefined and the including
+ * file falls back to its portable implementation.
+ */
+#if defined(__AVX2__) && defined(__FMA__) && \
+    (( defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_DROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+/*
+ * Plane rotation on contiguous double-precision vectors.
+ *
+ * For every i in [0, n):
+ *     x[i] <- c * x[i] + s * y[i]
+ *     y[i] <- c * y[i] - s * x[i]      (using the original x[i])
+ *
+ * Elements are processed 16 at a time (four 256-bit vectors), then 4 at a
+ * time, and the last (n mod 4) elements with scalar code.
+ */
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+
+    BLASLONG tail_index_4 = n&(~3);
+    BLASLONG tail_index_16 = n&(~15);
+
+    /* Broadcast unconditionally: it is cheap and keeps the vectors
+       definitely initialised for every path below. */
+    __m256d c_256 = _mm256_set1_pd(c);
+    __m256d s_256 = _mm256_set1_pd(s);
+
+    __m256d x0, x1, x2, x3;
+    __m256d y0, y1, y2, y3;
+    __m256d t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_16; i += 16) {
+        x0 = _mm256_loadu_pd(&x[i + 0]);
+        x1 = _mm256_loadu_pd(&x[i + 4]);
+        x2 = _mm256_loadu_pd(&x[i + 8]);
+        x3 = _mm256_loadu_pd(&x[i +12]);
+        y0 = _mm256_loadu_pd(&y[i + 0]);
+        y1 = _mm256_loadu_pd(&y[i + 4]);
+        y2 = _mm256_loadu_pd(&y[i + 8]);
+        y3 = _mm256_loadu_pd(&y[i +12]);
+
+        /* new x = c*x + s*y */
+        t0 = _mm256_mul_pd(s_256, y0);
+        t1 = _mm256_mul_pd(s_256, y1);
+        t2 = _mm256_mul_pd(s_256, y2);
+        t3 = _mm256_mul_pd(s_256, y3);
+
+        t0 = _mm256_fmadd_pd(c_256, x0, t0);
+        t1 = _mm256_fmadd_pd(c_256, x1, t1);
+        t2 = _mm256_fmadd_pd(c_256, x2, t2);
+        t3 = _mm256_fmadd_pd(c_256, x3, t3);
+
+        _mm256_storeu_pd(&x[i + 0], t0);
+        _mm256_storeu_pd(&x[i + 4], t1);
+        _mm256_storeu_pd(&x[i + 8], t2);
+        _mm256_storeu_pd(&x[i +12], t3);
+
+        /* new y = c*y - s*x, computed from the x values loaded above */
+        t0 = _mm256_mul_pd(s_256, x0);
+        t1 = _mm256_mul_pd(s_256, x1);
+        t2 = _mm256_mul_pd(s_256, x2);
+        t3 = _mm256_mul_pd(s_256, x3);
+
+        t0 = _mm256_fmsub_pd(c_256, y0, t0);
+        t1 = _mm256_fmsub_pd(c_256, y1, t1);
+        t2 = _mm256_fmsub_pd(c_256, y2, t2);
+        t3 = _mm256_fmsub_pd(c_256, y3, t3);
+
+        _mm256_storeu_pd(&y[i + 0], t0);
+        _mm256_storeu_pd(&y[i + 4], t1);
+        _mm256_storeu_pd(&y[i + 8], t2);
+        _mm256_storeu_pd(&y[i +12], t3);
+
+    }
+
+    /* remaining full vectors of 4 */
+    for (i = tail_index_16; i < tail_index_4; i += 4) {
+        x0 = _mm256_loadu_pd(&x[i]);
+        y0 = _mm256_loadu_pd(&y[i]);
+
+        t0 = _mm256_mul_pd(s_256, y0);
+        t0 = _mm256_fmadd_pd(c_256, x0, t0);
+        _mm256_storeu_pd(&x[i], t0);
+
+        t0 = _mm256_mul_pd(s_256, x0);
+        t0 = _mm256_fmsub_pd(c_256, y0, t0);
+        _mm256_storeu_pd(&y[i], t0);
+    }
+
+    /* scalar tail: at most 3 elements remain */
+    for (i = tail_index_4; i < n; ++i) {
+        FLOAT temp = c * x[i] + s * y[i];
+        y[i] = c * y[i] - s * x[i];
+        x[i] = temp;
+    }
+}
+#endif
--- /dev/null
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_DROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+/*
+ * Plane rotation on contiguous double-precision vectors using 512-bit
+ * vectors.
+ *
+ * For every i in [0, n):
+ *     x[i] <- c * x[i] + s * y[i]
+ *     y[i] <- c * y[i] - s * x[i]      (using the original x[i])
+ *
+ * Elements are processed 32 at a time (four 512-bit vectors), then 8 at a
+ * time; the last (n mod 8) elements are handled with one masked
+ * load/compute/store.
+ */
+static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+
+    const BLASLONG tail_index_8 = n & (~7);
+    const BLASLONG tail_index_32 = n & (~31);
+    const BLASLONG rem = n & 7;
+
+    const __m512d c_512 = _mm512_set1_pd(c);
+    const __m512d s_512 = _mm512_set1_pd(s);
+
+    __m512d x0, x1, x2, x3;
+    __m512d y0, y1, y2, y3;
+    __m512d rx0, rx1, rx2, rx3;
+    __m512d ry0, ry1, ry2, ry3;
+
+    for (i = 0; i < tail_index_32; i += 32) {
+        x0 = _mm512_loadu_pd(&x[i + 0]);
+        x1 = _mm512_loadu_pd(&x[i + 8]);
+        x2 = _mm512_loadu_pd(&x[i +16]);
+        x3 = _mm512_loadu_pd(&x[i +24]);
+        y0 = _mm512_loadu_pd(&y[i + 0]);
+        y1 = _mm512_loadu_pd(&y[i + 8]);
+        y2 = _mm512_loadu_pd(&y[i +16]);
+        y3 = _mm512_loadu_pd(&y[i +24]);
+
+        /* rotated x = c*x + (s*y) */
+        rx0 = _mm512_fmadd_pd(c_512, x0, _mm512_mul_pd(s_512, y0));
+        rx1 = _mm512_fmadd_pd(c_512, x1, _mm512_mul_pd(s_512, y1));
+        rx2 = _mm512_fmadd_pd(c_512, x2, _mm512_mul_pd(s_512, y2));
+        rx3 = _mm512_fmadd_pd(c_512, x3, _mm512_mul_pd(s_512, y3));
+
+        /* rotated y = c*y - (s*x), from the x values loaded above */
+        ry0 = _mm512_fmsub_pd(c_512, y0, _mm512_mul_pd(s_512, x0));
+        ry1 = _mm512_fmsub_pd(c_512, y1, _mm512_mul_pd(s_512, x1));
+        ry2 = _mm512_fmsub_pd(c_512, y2, _mm512_mul_pd(s_512, x2));
+        ry3 = _mm512_fmsub_pd(c_512, y3, _mm512_mul_pd(s_512, x3));
+
+        /* all x stores are issued before any y store */
+        _mm512_storeu_pd(&x[i + 0], rx0);
+        _mm512_storeu_pd(&x[i + 8], rx1);
+        _mm512_storeu_pd(&x[i +16], rx2);
+        _mm512_storeu_pd(&x[i +24], rx3);
+
+        _mm512_storeu_pd(&y[i + 0], ry0);
+        _mm512_storeu_pd(&y[i + 8], ry1);
+        _mm512_storeu_pd(&y[i +16], ry2);
+        _mm512_storeu_pd(&y[i +24], ry3);
+    }
+
+    /* remaining full vectors of 8 */
+    for (i = tail_index_32; i < tail_index_8; i += 8) {
+        x0 = _mm512_loadu_pd(&x[i]);
+        y0 = _mm512_loadu_pd(&y[i]);
+
+        rx0 = _mm512_fmadd_pd(c_512, x0, _mm512_mul_pd(s_512, y0));
+        ry0 = _mm512_fmsub_pd(c_512, y0, _mm512_mul_pd(s_512, x0));
+
+        _mm512_storeu_pd(&x[i], rx0);
+        _mm512_storeu_pd(&y[i], ry0);
+    }
+
+    if (rem > 0) {
+        /* one mask bit per leftover element: the low `rem` bits are set */
+        const __mmask8 tail_mask = (__mmask8) (((unsigned char) 0xff) >> (8 - rem));
+
+        const __m512d tail_x = _mm512_maskz_loadu_pd(tail_mask, &x[tail_index_8]);
+        const __m512d tail_y = _mm512_maskz_loadu_pd(tail_mask, &y[tail_index_8]);
+
+        const __m512d new_x = _mm512_fmadd_pd(c_512, tail_x, _mm512_mul_pd(s_512, tail_y));
+        const __m512d new_y = _mm512_fmsub_pd(c_512, tail_y, _mm512_mul_pd(s_512, tail_x));
+
+        _mm512_mask_storeu_pd(&x[tail_index_8], tail_mask, new_x);
+        _mm512_mask_storeu_pd(&y[tail_index_8], tail_mask, new_y);
+    }
+}
+#endif
#define MAXPS maxps
#define MAXSS maxss
#ifdef USE_MIN
+#undef MAXPS
+#undef MAXSS
#define MAXPS minps
#define MAXSS minss
#endif
"r" (y), // 3
"r" (alpha) // 4
: "cc",
- "%xmm0",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
"r" (y), // 3
"r" (alpha) // 4
: "cc",
- "%xmm0",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
"r" (y), // 3
"r" (alpha) // 4
: "cc",
- "%xmm0",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
--- /dev/null
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined (COOPERLAKE)
+#include "sbgemv_n_microk_cooperlake.c"
+#endif
+
+#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \
+ ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \
+ ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? (TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr
+
+#define ALIGN64_FREE(ptr) \
+ free(ptr)
+
+#ifndef HAVE_SBGEMV_N_ACCL_KERNEL
+/*
+ * Generic fallback for the non-transposed BF16 GEMV, compiled only when no
+ * accelerated kernel announced itself through HAVE_SBGEMV_N_ACCL_KERNEL.
+ *
+ * Computes, for every i in [0, m):
+ *     y[i] = alpha * sum_j a[lda*j + i] * x[j]  (+ beta * y[i] if beta != 0)
+ *
+ *   m, n   - number of y elements / number of x elements
+ *   a      - bfloat16 matrix, element (i, j) is read from a[lda*j + i]
+ *   x      - n contiguous bfloat16 values
+ *   y      - m contiguous floats; not read when beta == ZERO
+ *
+ * SBF16TOS_K is presumably the bfloat16 -> float conversion kernel
+ * (count, src, src_inc, dst, dst_inc) - TODO confirm against common.h.
+ *
+ * NOTE(review): the three malloc() results are used unchecked; an allocation
+ * failure dereferences NULL. The function returns void, so there is currently
+ * no channel to report such a failure to the caller.
+ */
+static void sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    BLASLONG offset_lda, offset_m;
+    float accum = 0.0;
+
+    bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n);
+    float    * a_fp32 = malloc(sizeof(float)*m*n);
+    float    * x_fp32 = malloc(sizeof(float)*n);
+
+    // Pack the m x n part of A densely (stride m instead of lda) so that it
+    // can be converted to fp32 with a single contiguous call below.
+    for (BLASLONG j=0; j<n; j++) {
+        offset_lda = lda * j;
+        offset_m   = m * j;
+        for (BLASLONG i=0; i<m; i++) {
+            a_bf16[offset_m + i] = a[offset_lda + i];
+        }
+    }
+
+    SBF16TOS_K(n, x, 1, x_fp32, 1);
+    SBF16TOS_K(m*n, a_bf16, 1, a_fp32, 1);
+
+    for (BLASLONG i=0; i<m; i++) {
+        accum = 0.0;
+        for (BLASLONG j=0; j<n; j++) {
+            accum += a_fp32[j*m + i] * x_fp32[j];
+        }
+        // With beta == 0 the previous content of y must not be touched
+        // (it may be uninitialized).
+        if (beta == ZERO) {
+            y[i] = alpha * accum;
+        } else {
+            y[i] = alpha * accum + beta * y[i];
+        }
+    }
+
+    free(a_bf16);
+    free(a_fp32);
+    free(x_fp32);
+}
+#endif
+
+/* Gather n bfloat16 values spaced `inc` elements apart in src into the
+ * contiguous array target (target[k] = src[k*inc]). */
+static void bf16_compress_vector(BLASLONG n, bfloat16 * src, bfloat16 * target, BLASLONG inc)
+{
+    BLASLONG k = 0;
+
+    while (k < n) {
+        target[k] = src[k*inc];
+        k++;
+    }
+}
+
+/* Gather n floats spaced `inc` elements apart in src into the contiguous
+ * array target (target[k] = src[k*inc]). */
+static void fp32_compress_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
+{
+    BLASLONG k = 0;
+
+    while (k < n) {
+        target[k] = src[k*inc];
+        k++;
+    }
+}
+
+/* Scatter n contiguous floats from src into target with a spacing of `inc`
+ * elements (target[k*inc] = src[k]). */
+static void fp32_expand_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
+{
+    BLASLONG k = 0;
+
+    while (k < n) {
+        target[k*inc] = src[k];
+        k++;
+    }
+}
+
+/*
+ * SBGEMV, non-transposed entry point.
+ * Strided x / y vectors are first packed into temporary contiguous buffers
+ * (64-byte aligned via ALIGN64_ALLOC), the unit-stride kernel is run on them
+ * and the result is scattered back into y. Always returns 0; nothing is done
+ * when m < 1 or n < 1.
+ */
+int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float * y, BLASLONG incy)
+{
+    bfloat16 * x_raw    = NULL;   /* raw malloc() pointer for the packed x */
+    float    * y_raw    = NULL;   /* raw malloc() pointer for the packed y */
+    bfloat16 * x_packed = x;      /* what the kernel reads as x            */
+    float    * y_packed = y;      /* what the kernel reads/writes as y     */
+    const int  pack_x   = (incx != 1);
+    const int  pack_y   = (incy != 1);
+
+    if (!(m >= 1 && n >= 1)) {
+        return 0;
+    }
+
+    if (pack_x) {
+        ALIGN64_ALLOC(n, bfloat16, x_packed, x_raw);
+        bf16_compress_vector(n, x, x_packed, incx);
+    }
+
+    if (pack_y) {
+        ALIGN64_ALLOC(m, float, y_packed, y_raw);
+        /* The old y values are only needed when they take part in the result. */
+        if (beta != ZERO) {
+            fp32_compress_vector(m, y, y_packed, incy);
+        }
+    }
+
+    sbgemv_kernel_n(m, n, alpha, a, lda, x_packed, beta, y_packed);
+
+    if (pack_y) {
+        fp32_expand_vector(m, y_packed, y, incy);
+        ALIGN64_FREE(y_raw);
+    }
+
+    if (pack_x) {
+        ALIGN64_FREE(x_raw);
+    }
+
+    return 0;
+}
--- /dev/null
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SBGEMV_N_ACCL_KERNEL 1
+#include "common.h"
+#include <immintrin.h>
+
+// The template below is included four times. Each inclusion emits one kernel
+// whose name and STORE* macros are chosen from ZERO_BETA / ONE_BETA / ONE_ALPHA.
+//
+// 1) Nothing defined: general case, BETA is neither 0 nor 1
+//    -> sbgemv_kernel_32xN_lda_direct_alpha_beta
+#undef ZERO_BETA
+#undef ONE_BETA
+#undef ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// 2) ONE_BETA: BETA == 1, ALPHA arbitrary
+//    -> sbgemv_kernel_32xN_lda_direct_alpha_one
+#undef ZERO_BETA
+#define ONE_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// 3) ZERO_BETA: BETA == 0 (old Y is ignored), ALPHA != 1
+//    -> sbgemv_kernel_32xN_lda_direct_alpha
+//    ONE_BETA is still defined from step 2, but the template does not look
+//    at it once ZERO_BETA is defined.
+#define ZERO_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+// 4) ZERO_BETA + ONE_ALPHA: BETA == 0 and ALPHA == 1
+//    -> sbgemv_kernel_32xN_lda_direct
+#define ZERO_BETA 1
+#define ONE_ALPHA 1
+#include "sbgemv_n_microk_cooperlake_template.c"
+
+/*
+ * Pick the specialised 32xN kernel matching the values of alpha and beta:
+ *   beta == 0, alpha == 1 : plain product, no scaling
+ *   beta == 0, alpha != 1 : product scaled by alpha
+ *   beta == 1             : alpha-scaled product added to the existing y
+ *   otherwise             : alpha-scaled product plus beta-scaled y
+ * Always returns 0.
+ */
+static int sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    const int y_is_overwritten = (beta == ZERO);
+
+    if (y_is_overwritten && alpha == ONE) {
+        sbgemv_kernel_32xN_lda_direct(m, n, alpha, a, lda, x, y);
+    } else if (y_is_overwritten) {
+        sbgemv_kernel_32xN_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+    } else if (beta == ONE) {
+        sbgemv_kernel_32xN_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+    } else {
+        sbgemv_kernel_32xN_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+    }
+
+    return 0;
+}
+
+#endif
--- /dev/null
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include <immintrin.h>
+#include "common.h"
+
+// Include common macros for BF16 based operations with IA intrinsics
+#include "bf16_common_macros.h"
+
+// This template is included several times in one translation unit with
+// different ZERO_BETA / ONE_BETA / ONE_ALPHA settings. Drop the aliases left
+// over from a previous inclusion first, so that each inclusion re-defines them
+// cleanly instead of redefining a macro with a different replacement list.
+#undef STORE16_COMPLETE_RESULT
+#undef STORE16_MASK_COMPLETE_RESULT
+#undef STORE8_COMPLETE_RESULT
+#undef STORE8_MASK_COMPLETE_RESULT
+#undef STORE4_COMPLETE_RESULT
+#undef STORE4_MASK_COMPLETE_RESULT
+
+// Map the generic STORE* names onto the variant that matches this inclusion.
+#ifndef ZERO_BETA  // BETA is non-zero: old Y contributes to the result
+
+#ifndef ONE_BETA   // BETA is not ONE: Y = ALPHA * acc + BETA * Y
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_BETA
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA
+
+#else              // BETA is ONE
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_ONE
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE
+
+#endif
+
+#else              // BETA is zero: old Y is ignored
+
+#ifndef ONE_ALPHA  // ALPHA is not ONE
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA
+
+#else              // ALPHA is ONE
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_DIRECT
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_DIRECT
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_DIRECT
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_DIRECT
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_DIRECT
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_DIRECT
+
+#endif
+
+#endif
+
+
+
+/*
+ * BF16 GEMV kernel for the non-transposed case (one instance per inclusion of
+ * this template; the name and the presence of the `beta` parameter depend on
+ * ZERO_BETA / ONE_BETA / ONE_ALPHA).
+ *
+ * The m results in y are produced in blocks of 128 rows, then blocks of 32
+ * rows, then one masked tail of fewer than 32 rows. For every block all n
+ * columns are visited: x[idx_n] is broadcast to every 16-bit lane, 32 bf16
+ * matrix entries are loaded per 512-bit register, split into their even and
+ * odd elements (the other half is zeroed) and accumulated with BF16_DOT_1x32.
+ * The even/odd accumulators are re-interleaved with _mm512_permutex2var_ps
+ * before being written through the STORE16* macros selected at the top of
+ * this template.
+ *
+ *   m, n   - number of y elements / number of x elements
+ *   alpha  - scale factor (only used when ONE_ALPHA is not defined)
+ *   a, lda - bf16 matrix and its leading dimension, accessed only through
+ *            BF16_MATRIX_LOAD_1x32 / BF16_MATRIX_MASKZ_LOAD_1x32
+ *            (presumably column idx_n, rows idx_m.. - confirm in
+ *            bf16_common_macros.h)
+ *   x      - n contiguous bf16 values
+ *   beta   - scale factor for the old y (only when ZERO_BETA is not defined)
+ *   y      - m contiguous floats
+ *
+ * Always returns 0.
+ */
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32xN_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32xN_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32xN_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x  = m & (~31);    // m rounded down to a multiple of 32
+    BLASLONG tag_m_128x = m & (~127);   // m rounded down to a multiple of 128
+
+    __m512 accum512_0, accum512_1, accum512_2,  accum512_3,  accum512_4,  accum512_5,  accum512_6,  accum512_7,
+           accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15;
+
+    // ALPHAVECTOR / BETAVECTOR are not referenced by name in this function;
+    // presumably the STORE* macros pick them up - confirm in bf16_common_macros.h.
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+    __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3;
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
+    __m512i xArray_0;
+
+    __m512i ZERO512 = _mm512_setzero_si512();
+
+    // 0xaaaa... keeps the odd 16-bit elements, 0x5555... keeps the even ones.
+    unsigned int blend_hi_mask_value = ((unsigned int)0xaaaaaaaa);
+    __mmask32 blend_hi_mask = *((__mmask32*) &blend_hi_mask_value);
+    unsigned int blend_lo_mask_value = ((unsigned int)0x55555555);
+    __mmask32 blend_lo_mask = *((__mmask32*) &blend_lo_mask_value);
+
+    // Permutation indices that interleave two accumulators (even rows / odd
+    // rows): idx_base_0 yields result rows 0..15, idx_base_1 rows 16..31.
+    __m512i M512_EPI32_8 = _mm512_set1_epi32(8);
+    __m512i idx_base_0 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
+    __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_8);
+
+    // Main loop: 128 rows of y per iteration.
+    for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) {
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+        accum512_2 = _mm512_setzero_ps();
+        accum512_3 = _mm512_setzero_ps();
+        accum512_4 = _mm512_setzero_ps();
+        accum512_5 = _mm512_setzero_ps();
+        accum512_6 = _mm512_setzero_ps();
+        accum512_7 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);
+
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m + 0)
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_1, a, lda, idx_n, idx_m + 32)
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_2, a, lda, idx_n, idx_m + 64)
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_3, a, lda, idx_n, idx_m + 96)
+
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_2 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_1);
+            matrixArray_3 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_1);
+            matrixArray_4 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_2);
+            matrixArray_5 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_2);
+            matrixArray_6 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_3);
+            matrixArray_7 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_3);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+            BF16_DOT_1x32(accum512_2, matrixArray_2, xArray_0)
+            BF16_DOT_1x32(accum512_3, matrixArray_3, xArray_0)
+            BF16_DOT_1x32(accum512_4, matrixArray_4, xArray_0)
+            BF16_DOT_1x32(accum512_5, matrixArray_5, xArray_0)
+            BF16_DOT_1x32(accum512_6, matrixArray_6, xArray_0)
+            BF16_DOT_1x32(accum512_7, matrixArray_7, xArray_0)
+        }
+        accum512_8  = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9  = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+        accum512_10 = _mm512_permutex2var_ps(accum512_2, idx_base_0, accum512_3);
+        accum512_11 = _mm512_permutex2var_ps(accum512_2, idx_base_1, accum512_3);
+        accum512_12 = _mm512_permutex2var_ps(accum512_4, idx_base_0, accum512_5);
+        accum512_13 = _mm512_permutex2var_ps(accum512_4, idx_base_1, accum512_5);
+        accum512_14 = _mm512_permutex2var_ps(accum512_6, idx_base_0, accum512_7);
+        accum512_15 = _mm512_permutex2var_ps(accum512_6, idx_base_1, accum512_7);
+
+        STORE16_COMPLETE_RESULT(accum512_8,  y+idx_m+0)
+        STORE16_COMPLETE_RESULT(accum512_9,  y+idx_m+16)
+        STORE16_COMPLETE_RESULT(accum512_10, y+idx_m+32)
+        STORE16_COMPLETE_RESULT(accum512_11, y+idx_m+48)
+        STORE16_COMPLETE_RESULT(accum512_12, y+idx_m+64)
+        STORE16_COMPLETE_RESULT(accum512_13, y+idx_m+80)
+        STORE16_COMPLETE_RESULT(accum512_14, y+idx_m+96)
+        STORE16_COMPLETE_RESULT(accum512_15, y+idx_m+112)
+    }
+
+    // Remaining full blocks of 32 rows.
+    for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_32x; idx_m+=32) {
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);
+
+            BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m)
+
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+        }
+        accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+
+        STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0)
+        STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16)
+    }
+
+    // Tail: 1..31 remaining rows, handled with masked loads and stores.
+    if (tag_m_32x != m) {
+        unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31)));
+        __mmask32 tail_mask = *((__mmask32*) &tail_mask_value);
+
+        // Low (m & 15) bits set; 0 when the tail is exactly 16 rows.
+        // Computed directly as a 32-bit value: the former code read an
+        // `unsigned short` through a `__mmask32 *`, i.e. past the object.
+        __mmask32 store_tail_mask = (__mmask32)(((unsigned int)0xffff) >> (16-(m&15)));
+
+        accum512_0 = _mm512_setzero_ps();
+        accum512_1 = _mm512_setzero_ps();
+
+        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
+            xArray_0 = _mm512_set1_epi16(x[idx_n]);
+
+            BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, tag_m_32x, tail_mask)
+
+            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
+            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
+
+            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
+        }
+        accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+        accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+
+        // `>=` (not `>`): a tail of exactly 16 rows has (m & 15) == 0 and thus
+        // an empty store_tail_mask, so its 16 results must go through the
+        // unmasked store; the masked store that follows then writes nothing.
+        if ((m-tag_m_32x) >= 16) {
+            STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0)
+            STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask)
+        } else {
+            STORE16_MASK_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0, store_tail_mask)
+        }
+    }
+
+    return 0;
+}
--- /dev/null
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if defined (COOPERLAKE)
+#include "sbgemv_t_microk_cooperlake.c"
+#endif
+
+#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \
+ ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \
+ ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? (TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr
+
+#define ALIGN64_FREE(ptr) \
+ free(ptr)
+
+#ifndef HAVE_SBGEMV_T_ACCL_KERNEL
+/*
+ * Generic fallback for the transposed BF16 GEMV, compiled only when no
+ * accelerated kernel announced itself through HAVE_SBGEMV_T_ACCL_KERNEL.
+ *
+ * For every i in [0, m):
+ *     y[i] = alpha * sum_j a[lda*i + j] * x[j]  (+ beta * y[i] if beta != 0)
+ * with j running over [0, n). x holds n contiguous bfloat16 values and y holds
+ * m contiguous floats; y is not read when beta == ZERO.
+ * The malloc() results are used without a NULL check.
+ */
+static void sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    bfloat16 * packed_bf16 = malloc(sizeof(bfloat16)*m*n);
+    float    * packed_fp32 = malloc(sizeof(float)*m*n);
+    float    * x_as_fp32   = malloc(sizeof(float)*n);
+    BLASLONG   row, col;
+
+    /* Copy the m rows of length n next to each other (stride n, not lda). */
+    for (row = 0; row < m; row++) {
+        const BLASLONG src_base = lda * row;
+        const BLASLONG dst_base = n * row;
+
+        for (col = 0; col < n; col++) {
+            packed_bf16[dst_base + col] = a[src_base + col];
+        }
+    }
+
+    /* Widen both operands to fp32. */
+    SBF16TOS_K(n, x, 1, x_as_fp32, 1);
+    SBF16TOS_K(m*n, packed_bf16, 1, packed_fp32, 1);
+
+    /* One dot product per output element. */
+    for (row = 0; row < m; row++) {
+        const BLASLONG base = n * row;
+        float dot = 0.0;
+
+        for (col = 0; col < n; col++) {
+            dot += packed_fp32[base + col] * x_as_fp32[col];
+        }
+
+        if (beta == ZERO) {
+            y[row] = alpha * dot;
+        } else {
+            y[row] = alpha * dot + beta * y[row];
+        }
+    }
+
+    free(packed_bf16);
+    free(packed_fp32);
+    free(x_as_fp32);
+}
+#endif
+
+/* Gather n bfloat16 values spaced `inc` elements apart in src into the
+ * contiguous array target (target[k] = src[k*inc]). */
+static void bf16_compress_vector(BLASLONG n, bfloat16 * src, bfloat16 * target, BLASLONG inc)
+{
+    BLASLONG k = 0;
+
+    while (k < n) {
+        target[k] = src[k*inc];
+        k++;
+    }
+}
+
+/* Gather n floats spaced `inc` elements apart in src into the contiguous
+ * array target (target[k] = src[k*inc]). */
+static void fp32_compress_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
+{
+    BLASLONG k = 0;
+
+    while (k < n) {
+        target[k] = src[k*inc];
+        k++;
+    }
+}
+
+/* Scatter n contiguous floats from src into target with a spacing of `inc`
+ * elements (target[k*inc] = src[k]). */
+static void fp32_expand_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
+{
+    BLASLONG k = 0;
+
+    while (k < n) {
+        target[k*inc] = src[k];
+        k++;
+    }
+}
+
+/*
+ * SBGEMV, transposed entry point.
+ * m and n are exchanged up front, so below `n` is the length of x and `m` the
+ * length of y. Strided x / y vectors are packed into temporary contiguous
+ * buffers (64-byte aligned via ALIGN64_ALLOC), the unit-stride kernel is run
+ * on them and the result is scattered back into y. Always returns 0; nothing
+ * is done when m < 1 or n < 1.
+ */
+int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float * y, BLASLONG incy)
+{
+    bfloat16 * x_raw    = NULL;   /* raw malloc() pointer for the packed x */
+    float    * y_raw    = NULL;   /* raw malloc() pointer for the packed y */
+    bfloat16 * x_packed = x;      /* what the kernel reads as x            */
+    float    * y_packed = y;      /* what the kernel reads/writes as y     */
+    const int  pack_x   = (incx != 1);
+    const int  pack_y   = (incy != 1);
+    BLASLONG   swap_tmp;
+
+    if (!(m >= 1 && n >= 1)) {
+        return 0;
+    }
+
+    /* Exchange the two dimensions for the transposed product. */
+    swap_tmp = m;
+    m = n;
+    n = swap_tmp;
+
+    if (pack_x) {
+        ALIGN64_ALLOC(n, bfloat16, x_packed, x_raw);
+        bf16_compress_vector(n, x, x_packed, incx);
+    }
+
+    if (pack_y) {
+        ALIGN64_ALLOC(m, float, y_packed, y_raw);
+        /* The old y values are only needed when they take part in the result. */
+        if (beta != ZERO) {
+            fp32_compress_vector(m, y, y_packed, incy);
+        }
+    }
+
+    sbgemv_kernel_t(m, n, alpha, a, lda, x_packed, beta, y_packed);
+
+    if (pack_y) {
+        fp32_expand_vector(m, y_packed, y, incy);
+        ALIGN64_FREE(y_raw);
+    }
+
+    if (pack_x) {
+        ALIGN64_FREE(x_raw);
+    }
+
+    return 0;
+}
--- /dev/null
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SBGEMV_T_ACCL_KERNEL 1
+
+// The template below is included four times with different settings of
+// ZERO_BETA / ONE_BETA / ONE_ALPHA. Judging by the names called from
+// sbgemv_kernel_t(), each inclusion provides one family of kernels; the
+// template itself is not part of this file - confirm there.
+//
+// 1) Nothing defined: general case, BETA is neither 0 nor 1
+//    (the *_alpha_beta kernels)
+#undef ZERO_BETA
+#undef ONE_BETA
+#undef ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// 2) ONE_BETA: BETA == 1, ALPHA arbitrary (the *_alpha_one kernels)
+#undef ZERO_BETA
+#define ONE_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// 3) ZERO_BETA: BETA == 0 (old Y is ignored), ALPHA != 1 (the *_alpha kernels)
+//    Note that ONE_BETA is still defined from step 2 at this point.
+#define ZERO_BETA 1
+#undef ONE_ALPHA
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+// 4) ZERO_BETA + ONE_ALPHA: BETA == 0 and ALPHA == 1 (the unsuffixed kernels)
+#define ZERO_BETA 1
+#define ONE_ALPHA 1
+#include "sbgemv_t_microk_cooperlake_template.c"
+
+/*
+ * Select the specialised transposed kernel.
+ *
+ * The outer choice depends on alpha / beta:
+ *   beta == 0, alpha == 1 : unsuffixed kernels
+ *   beta == 0, alpha != 1 : *_alpha kernels
+ *   beta == 1             : *_alpha_one kernels
+ *   otherwise             : *_alpha_beta kernels
+ *
+ * The inner choice depends on n (and lda), identically for all four families:
+ *   n > 127            : 1x128 "lda direct" kernel
+ *   32 < n <= 127      : 8x32 "lda direct" kernel
+ *   16 < n <= 32       : 8x16p kernel
+ *   n <= 16, lda == n  : dedicated kernel for that exact n (1..16)
+ *   n <= 16, lda != n  : 8x16m kernel
+ *
+ * Always returns 0.
+ */
+static int sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+{
+    const int y_is_overwritten = (beta == ZERO);
+
+    if (y_is_overwritten && alpha == ONE) {
+        /* y = A * x */
+        if (n > 127) {
+            sbgemv_kernel_1x128_lda_direct(m, n, alpha, a, lda, x, y);
+        } else if (n > 32) {
+            sbgemv_kernel_8x32_lda_direct(m, n, alpha, a, lda, x, y);
+        } else if (n > 16) {
+            sbgemv_kernel_8x16p_lda(m, n, alpha, a, lda, x, y);
+        } else if (lda == n) {
+            switch (n) {
+                case 1:  sbgemv_kernel_32x1 (m, alpha, a, x, y); break;
+                case 2:  sbgemv_kernel_32x2 (m, alpha, a, x, y); break;
+                case 3:  sbgemv_kernel_32x3 (m, alpha, a, x, y); break;
+                case 4:  sbgemv_kernel_16x4 (m, alpha, a, x, y); break;
+                case 5:  sbgemv_kernel_30x5 (m, alpha, a, x, y); break;
+                case 6:  sbgemv_kernel_16x6 (m, alpha, a, x, y); break;
+                case 7:  sbgemv_kernel_16x7 (m, alpha, a, x, y); break;
+                case 8:  sbgemv_kernel_16x8 (m, alpha, a, x, y); break;
+                case 9:  sbgemv_kernel_14x9 (m, alpha, a, x, y); break;
+                case 10: sbgemv_kernel_12x10(m, alpha, a, x, y); break;
+                case 11: sbgemv_kernel_15x11(m, alpha, a, x, y); break;
+                case 12: sbgemv_kernel_15x12(m, alpha, a, x, y); break;
+                case 13: sbgemv_kernel_16x13(m, alpha, a, x, y); break;
+                case 14: sbgemv_kernel_16x14(m, alpha, a, x, y); break;
+                case 15: sbgemv_kernel_16x15(m, alpha, a, x, y); break;
+                case 16: sbgemv_kernel_16x16(m, alpha, a, x, y); break;
+                default: break;
+            }
+        } else {
+            sbgemv_kernel_8x16m_lda(m, n, alpha, a, lda, x, y);
+        }
+    } else if (y_is_overwritten) {
+        /* y = alpha * A * x */
+        if (n > 127) {
+            sbgemv_kernel_1x128_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+        } else if (n > 32) {
+            sbgemv_kernel_8x32_lda_direct_alpha(m, n, alpha, a, lda, x, y);
+        } else if (n > 16) {
+            sbgemv_kernel_8x16p_lda_alpha(m, n, alpha, a, lda, x, y);
+        } else if (lda == n) {
+            switch (n) {
+                case 1:  sbgemv_kernel_32x1_alpha (m, alpha, a, x, y); break;
+                case 2:  sbgemv_kernel_32x2_alpha (m, alpha, a, x, y); break;
+                case 3:  sbgemv_kernel_32x3_alpha (m, alpha, a, x, y); break;
+                case 4:  sbgemv_kernel_16x4_alpha (m, alpha, a, x, y); break;
+                case 5:  sbgemv_kernel_30x5_alpha (m, alpha, a, x, y); break;
+                case 6:  sbgemv_kernel_16x6_alpha (m, alpha, a, x, y); break;
+                case 7:  sbgemv_kernel_16x7_alpha (m, alpha, a, x, y); break;
+                case 8:  sbgemv_kernel_16x8_alpha (m, alpha, a, x, y); break;
+                case 9:  sbgemv_kernel_14x9_alpha (m, alpha, a, x, y); break;
+                case 10: sbgemv_kernel_12x10_alpha(m, alpha, a, x, y); break;
+                case 11: sbgemv_kernel_15x11_alpha(m, alpha, a, x, y); break;
+                case 12: sbgemv_kernel_15x12_alpha(m, alpha, a, x, y); break;
+                case 13: sbgemv_kernel_16x13_alpha(m, alpha, a, x, y); break;
+                case 14: sbgemv_kernel_16x14_alpha(m, alpha, a, x, y); break;
+                case 15: sbgemv_kernel_16x15_alpha(m, alpha, a, x, y); break;
+                case 16: sbgemv_kernel_16x16_alpha(m, alpha, a, x, y); break;
+                default: break;
+            }
+        } else {
+            sbgemv_kernel_8x16m_lda_alpha(m, n, alpha, a, lda, x, y);
+        }
+    } else if (beta == ONE) {
+        /* y = alpha * A * x + y */
+        if (n > 127) {
+            sbgemv_kernel_1x128_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+        } else if (n > 32) {
+            sbgemv_kernel_8x32_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
+        } else if (n > 16) {
+            sbgemv_kernel_8x16p_lda_alpha_one(m, n, alpha, a, lda, x, beta, y);
+        } else if (lda == n) {
+            switch (n) {
+                case 1:  sbgemv_kernel_32x1_alpha_one (m, alpha, a, x, beta, y); break;
+                case 2:  sbgemv_kernel_32x2_alpha_one (m, alpha, a, x, beta, y); break;
+                case 3:  sbgemv_kernel_32x3_alpha_one (m, alpha, a, x, beta, y); break;
+                case 4:  sbgemv_kernel_16x4_alpha_one (m, alpha, a, x, beta, y); break;
+                case 5:  sbgemv_kernel_30x5_alpha_one (m, alpha, a, x, beta, y); break;
+                case 6:  sbgemv_kernel_16x6_alpha_one (m, alpha, a, x, beta, y); break;
+                case 7:  sbgemv_kernel_16x7_alpha_one (m, alpha, a, x, beta, y); break;
+                case 8:  sbgemv_kernel_16x8_alpha_one (m, alpha, a, x, beta, y); break;
+                case 9:  sbgemv_kernel_14x9_alpha_one (m, alpha, a, x, beta, y); break;
+                case 10: sbgemv_kernel_12x10_alpha_one(m, alpha, a, x, beta, y); break;
+                case 11: sbgemv_kernel_15x11_alpha_one(m, alpha, a, x, beta, y); break;
+                case 12: sbgemv_kernel_15x12_alpha_one(m, alpha, a, x, beta, y); break;
+                case 13: sbgemv_kernel_16x13_alpha_one(m, alpha, a, x, beta, y); break;
+                case 14: sbgemv_kernel_16x14_alpha_one(m, alpha, a, x, beta, y); break;
+                case 15: sbgemv_kernel_16x15_alpha_one(m, alpha, a, x, beta, y); break;
+                case 16: sbgemv_kernel_16x16_alpha_one(m, alpha, a, x, beta, y); break;
+                default: break;
+            }
+        } else {
+            sbgemv_kernel_8x16m_lda_alpha_one(m, n, alpha, a, lda, x, beta, y);
+        }
+    } else {
+        /* y = alpha * A * x + beta * y */
+        if (n > 127) {
+            sbgemv_kernel_1x128_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+        } else if (n > 32) {
+            sbgemv_kernel_8x32_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+        } else if (n > 16) {
+            sbgemv_kernel_8x16p_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+        } else if (lda == n) {
+            switch (n) {
+                case 1:  sbgemv_kernel_32x1_alpha_beta (m, alpha, a, x, beta, y); break;
+                case 2:  sbgemv_kernel_32x2_alpha_beta (m, alpha, a, x, beta, y); break;
+                case 3:  sbgemv_kernel_32x3_alpha_beta (m, alpha, a, x, beta, y); break;
+                case 4:  sbgemv_kernel_16x4_alpha_beta (m, alpha, a, x, beta, y); break;
+                case 5:  sbgemv_kernel_30x5_alpha_beta (m, alpha, a, x, beta, y); break;
+                case 6:  sbgemv_kernel_16x6_alpha_beta (m, alpha, a, x, beta, y); break;
+                case 7:  sbgemv_kernel_16x7_alpha_beta (m, alpha, a, x, beta, y); break;
+                case 8:  sbgemv_kernel_16x8_alpha_beta (m, alpha, a, x, beta, y); break;
+                case 9:  sbgemv_kernel_14x9_alpha_beta (m, alpha, a, x, beta, y); break;
+                case 10: sbgemv_kernel_12x10_alpha_beta(m, alpha, a, x, beta, y); break;
+                case 11: sbgemv_kernel_15x11_alpha_beta(m, alpha, a, x, beta, y); break;
+                case 12: sbgemv_kernel_15x12_alpha_beta(m, alpha, a, x, beta, y); break;
+                case 13: sbgemv_kernel_16x13_alpha_beta(m, alpha, a, x, beta, y); break;
+                case 14: sbgemv_kernel_16x14_alpha_beta(m, alpha, a, x, beta, y); break;
+                case 15: sbgemv_kernel_16x15_alpha_beta(m, alpha, a, x, beta, y); break;
+                case 16: sbgemv_kernel_16x16_alpha_beta(m, alpha, a, x, beta, y); break;
+                default: break;
+            }
+        } else {
+            sbgemv_kernel_8x16m_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y);
+        }
+    }
+
+    return 0;
+}
+
+#endif
--- /dev/null
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#include <immintrin.h>
+#include "common.h"
+// Include common macros for BF16 based operations with IA intrinsics
+#include "bf16_common_macros.h"
+
+// Bind the generic STOREn[_MASK]_COMPLETE_RESULT names used by the kernels in
+// this file to the variant that matches the ALPHA/BETA configuration this
+// translation unit is being compiled for.
+#if !defined(ZERO_BETA) && !defined(ONE_BETA)   // BETA is neither ZERO nor ONE
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_BETA
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA_BETA
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA_BETA
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA
+
+#elif !defined(ZERO_BETA)                       // BETA is ONE
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_ONE
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA_ONE
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA_ONE
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE
+
+#elif !defined(ONE_ALPHA)                       // BETA is ZERO, ALPHA is not ONE
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_ALPHA
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_ALPHA
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_ALPHA
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_ALPHA
+
+#else                                           // BETA is ZERO, ALPHA is ONE
+
+#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_DIRECT
+#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_DIRECT
+#define STORE8_COMPLETE_RESULT        STORE8_COMPLETE_RESULT_DIRECT
+#define STORE8_MASK_COMPLETE_RESULT   STORE8_MASK_COMPLETE_RESULT_DIRECT
+#define STORE4_COMPLETE_RESULT        STORE4_COMPLETE_RESULT_DIRECT
+#define STORE4_MASK_COMPLETE_RESULT   STORE4_MASK_COMPLETE_RESULT_DIRECT
+
+#endif
+
+
+// 32 rows parallel processing BF16 GEMV kernel for n=1 && lda ineffective scenario
+//
+// Every row of `a` is a single BF16 value and rows are contiguous, so row i
+// contributes a[i] * x[0]. 32 rows are processed per main-loop iteration; the
+// remaining (m % 32) rows go through a 512-, 256- or 128-bit masked tail path.
+// The accumulators are handed to the STORE*_COMPLETE_RESULT macros (bound at
+// the top of this file, defined in bf16_common_macros.h) which write into y.
+//
+// The BF16 dot-product instruction consumes element *pairs*, so each row value
+// is widened to an (a[i], 0) pair. The padding element is forced to zero on
+// the matrix side (zero-masked permute) as well as on the x side: if an
+// arbitrary matrix value were left in the padding slot, a non-finite entry
+// there would produce Inf * 0 = NaN in rows it does not belong to.
+//
+// Always returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32x1_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32x1_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32x1_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32x1(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x = m & (~31);   // Number of rows covered by full 32-row iterations
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2;
+    __m512i xArray;
+    __m512  result_0, result_1;
+    // ALPHAVECTOR / BETAVECTOR are not referenced directly below; presumably
+    // they are picked up by name inside the STORE* macros.
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+#endif
+
+    // Even 16-bit slot 2k selects source element k (lo) or k+16 (hi).
+    // Odd slots are zeroed by the masked permutes, their index is irrelevant.
+    __m512i load_idx_lo = _mm512_set_epi16(0, 15, 0, 14, 0, 13, 0, 12, 0, 11, 0, 10, 0, 9, 0, 8,\
+                                           0,  7, 0,  6, 0,  5, 0,  4, 0,  3, 0,  2, 0, 1, 0, 0);
+    __m512i M512_EPI16_16 = _mm512_set1_epi16(16);
+    __m512i load_idx_hi = _mm512_add_epi16(load_idx_lo, M512_EPI16_16);
+
+    // Selects the even 16-bit slots (the "real" element of every pair)
+    __mmask32 interleave_mask = (__mmask32) 0x55555555U;
+
+    // x0| 0|x0| 0|...|x0| 0|
+    xArray = _mm512_set1_epi16((short) x[0]);
+    xArray = _mm512_mask_blend_epi16(interleave_mask, _mm512_setzero_si512(), xArray);
+
+    for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m += 32) {
+        result_0 = _mm512_setzero_ps();
+        result_1 = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)]);                                              // Load 32 rows with n=1
+        matrixArray_1 = _mm512_maskz_permutexvar_epi16(interleave_mask, load_idx_lo, matrixArray_0);  // Expand the low  16 elements to (a, 0) pairs
+        matrixArray_2 = _mm512_maskz_permutexvar_epi16(interleave_mask, load_idx_hi, matrixArray_0);  // Expand the high 16 elements to (a, 0) pairs
+
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray);
+        result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray);
+
+        STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+        STORE16_COMPLETE_RESULT(result_1, y+idx_m+16)
+    }
+
+    BLASLONG tail_num = m - tag_m_32x;
+    if (tail_num > 16) {
+        // 17..31 rows left: full low half, masked high half
+        result_0 = _mm512_setzero_ps();
+        result_1 = _mm512_setzero_ps();
+
+        __mmask32 tail_mask = (__mmask32) (((unsigned int)0xffffffff) >> (32-tail_num));
+        matrixArray_0 = _mm512_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]);                         // Load the remaining rows with n=1
+        matrixArray_1 = _mm512_maskz_permutexvar_epi16(interleave_mask, load_idx_lo, matrixArray_0);  // Expand the low  16 elements
+        matrixArray_2 = _mm512_maskz_permutexvar_epi16(interleave_mask, load_idx_hi, matrixArray_0);  // Expand the high 16 elements
+
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray);
+        result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray);
+
+        // (tail_num - 16) valid lanes in the second accumulator
+        __mmask16 store_mask = (__mmask16) (((unsigned short)0xffff) >> (32-tail_num));
+        STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x)
+        STORE16_MASK_COMPLETE_RESULT(result_1, y+tag_m_32x+16, store_mask)
+    } else if (tail_num > 8) {
+        // 9..16 rows left: 256-bit path, full low half, masked high half
+        __m256 result256_0 = _mm256_setzero_ps();
+        __m256 result256_1 = _mm256_setzero_ps();
+
+        __mmask16 interleave_mask256 = (__mmask16) 0x5555;
+        __m256i load_idx_lo256 = _mm512_castsi512_si256(load_idx_lo);           // even slots select elements 0..7
+        __m256i load_idx_hi256 = _mm512_extracti32x8_epi32(load_idx_lo, 0x1);   // even slots select elements 8..15
+        __m256i xArray256 = _mm512_castsi512_si256(xArray);
+
+        __mmask16 tail_mask = (__mmask16) (((unsigned short)0xffff) >> (16-tail_num));
+        __m256i matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]);                             // Load the remaining rows with n=1
+        __m256i matrixArray256_1 = _mm256_maskz_permutexvar_epi16(interleave_mask256, load_idx_lo256, matrixArray256_0);  // Expand the low  8 elements
+        __m256i matrixArray256_2 = _mm256_maskz_permutexvar_epi16(interleave_mask256, load_idx_hi256, matrixArray256_0);  // Expand the high 8 elements
+
+        result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_1, (__m256bh) xArray256);
+        result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_2, (__m256bh) xArray256);
+
+        // (tail_num - 8) valid lanes in the second accumulator
+        __mmask8 store_mask = (__mmask8) (((unsigned char)0xff) >> (16-tail_num));
+        STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x)
+        STORE8_MASK_COMPLETE_RESULT(result256_1, y+tag_m_32x+8, store_mask)
+    } else if (tail_num > 0) {
+        // 1..8 rows left: 128-bit path. (Nothing to do when m is a multiple of 32.)
+        __m128 result128_0 = _mm_setzero_ps();
+        __m128 result128_1 = _mm_setzero_ps();
+
+        __mmask8 interleave_mask128 = (__mmask8) 0x55;
+        __m128i load_idx_lo128 = _mm_set_epi16(0, 3, 0, 2, 0, 1, 0, 0);
+        __m128i M128_EPI16_4   = _mm_set1_epi16(4);
+        __m128i load_idx_hi128 = _mm_add_epi16(load_idx_lo128, M128_EPI16_4);
+
+        __m128i xArray128 = _mm512_castsi512_si128(xArray);
+
+        __mmask8 tail_mask = (__mmask8) (((unsigned char)0xff) >> (8-tail_num));
+        __m128i matrixArray128_0 = _mm_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]);                             // Load the remaining rows with n=1
+        __m128i matrixArray128_1 = _mm_maskz_permutexvar_epi16(interleave_mask128, load_idx_lo128, matrixArray128_0);  // Expand the low  4 elements
+        __m128i matrixArray128_2 = _mm_maskz_permutexvar_epi16(interleave_mask128, load_idx_hi128, matrixArray128_0);  // Expand the high 4 elements
+
+        result128_0 = _mm_dpbf16_ps(result128_0, (__m128bh) matrixArray128_1, (__m128bh) xArray128);
+        result128_1 = _mm_dpbf16_ps(result128_1, (__m128bh) matrixArray128_2, (__m128bh) xArray128);
+
+        if (tail_num > 4) {
+            // (tail_num - 4) valid lanes in the second accumulator
+            __mmask8 store_mask = (__mmask8) (((unsigned char)0xf) >> (8-tail_num));
+            STORE4_COMPLETE_RESULT(result128_0, y+tag_m_32x)
+            STORE4_MASK_COMPLETE_RESULT(result128_1, y+tag_m_32x+4, store_mask)
+        } else {
+            __mmask8 store_mask = (__mmask8) (((unsigned char)0xf) >> (4-tail_num));
+            STORE4_MASK_COMPLETE_RESULT(result128_0, y+tag_m_32x, store_mask)
+        }
+    }
+
+    return 0;
+}
+
+// 32 rows parallel processing BF16 GEMV kernel for n=2 && lda ineffective scenario
+//
+// Every row of `a` is one (a[2i], a[2i+1]) BF16 pair, i.e. exactly one 32-bit
+// lane, so rows can be fed to the BF16 dot product against a broadcast
+// (x0, x1) pair without any shuffling. 32 rows are processed per main-loop
+// iteration, followed by an optional 16-row step and a masked tail.
+// The accumulators are handed to the STORE*_COMPLETE_RESULT macros (bound at
+// the top of this file, defined in bf16_common_macros.h) which write into y.
+//
+// Always returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32x2_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32x2_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32x2_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x = m & (~31);   // Number of rows covered by full 32-row iterations
+
+    __m512i matrixArray_0, matrixArray_1;
+    __m512i xArray;
+    __m512  result_0, result_1;
+
+    // ALPHAVECTOR / BETAVECTOR are not referenced directly below; presumably
+    // they are picked up by name inside the STORE* macros. BETAVECTOR is only
+    // declared for the general-beta build, matching the n=1 kernel.
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+#endif
+
+    // Load only x0|x1 (mask 0x3) and replicate the pair into every 32-bit lane
+    __mmask8 load_mask = (__mmask8) (((unsigned char)0xff) >> 6);
+    xArray = _mm512_broadcastd_epi32(_mm_maskz_loadu_epi16(load_mask, x));
+
+    for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m += 32) {
+        result_0 = _mm512_setzero_ps();
+        result_1 = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*2]);       // Load 16 rows as n=2
+        matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+16)*2]);    // Load 16 rows as n=2
+
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray);
+        result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray);
+
+        STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+        STORE16_COMPLETE_RESULT(result_1, y+idx_m+16)
+    }
+
+    // One more unmasked 16-row step if at least 16 rows remain
+    if (m - tag_m_32x >= 16) {
+        result_0 = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_32x)*2]);   // Load 16 rows with n=2
+
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray);
+
+        STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x)
+
+        tag_m_32x += 16;
+    }
+
+    // tag_m_32x is a multiple of 16 here, so tail_num == (m & 15)
+    BLASLONG tail_num = m - tag_m_32x;
+    if (tail_num > 8) {
+        result_0 = _mm512_setzero_ps();
+
+        // One mask bit per row: each row is one 32-bit lane on load and one float on store
+        __mmask16 tail_mask = (__mmask16) (((unsigned short)0xffff) >> (16-(m&15)));
+        matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]);    // Load the remaining rows with n=2
+
+        result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray);
+
+        STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_32x, tail_mask)
+    } else if (tail_num == 8) {
+        __m256 result256 = _mm256_setzero_ps();
+
+        // _mm256_loadu_si256 takes a __m256i pointer, so the bfloat16 pointer needs an explicit cast
+        __m256i matrixArray256 = _mm256_loadu_si256((__m256i *) &a[(tag_m_32x)*2]);    // Load 8 rows with n=2
+        __m256i xArray256 = _mm512_castsi512_si256(xArray);
+        result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256);
+
+        STORE8_COMPLETE_RESULT(result256, y+tag_m_32x)
+    } else if (tail_num > 0) {
+        // 1..7 rows left. (Nothing to do when no rows remain.)
+        __m256 result256 = _mm256_setzero_ps();
+
+        __mmask8 tail_mask = (__mmask8) (((unsigned char)0xff) >> (8-(m&7)));
+        __m256i matrixArray256 = _mm256_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]);    // Load the remaining rows with n=2
+        __m256i xArray256 = _mm512_castsi512_si256(xArray);
+        result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256);
+
+        STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_32x, tail_mask)
+    }
+
+    return 0;
+}
+
+// 32 rows parallel processing BF16 GEMV kernel for n=3 && lda ineffective scenario
+//
+// Rows of `a` are 3 contiguous BF16 values. For every row the first two
+// elements are gathered into one 32-bit lane and multiplied with (x0, x1);
+// the third element is gathered into a second lane as (a2, 0) and multiplied
+// with (x2, 0). 32 rows are processed per main-loop iteration (512-bit), then
+// an optional 16-row step and a masked tail of up to 15 rows (256-bit).
+// The accumulators are handed to the STORE*_COMPLETE_RESULT macros (bound at
+// the top of this file, defined in bf16_common_macros.h) which write into y.
+//
+// The padding slot next to the third element is forced to zero on the matrix
+// side with zero-masked permutes. Leaving the neighbouring matrix value there
+// and relying on the zero in x would turn a non-finite value of the *next*
+// row into Inf * 0 = NaN for the current row.
+//
+// Always returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_32x3_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_32x3_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_32x3_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_32x = m & (~31);   // Number of rows covered by full 32-row iterations
+
+    __m512 result_0, result_1;
+
+    // ALPHAVECTOR / BETAVECTOR are not referenced directly below; presumably
+    // they are picked up by name inside the STORE* macros. BETAVECTOR is only
+    // declared for the general-beta build, matching the n=1 kernel.
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+#endif
+
+    __mmask8 x_load_mask = (__mmask8) (((unsigned char)0xff) >> 5);
+    __m128i xTmp = _mm_maskz_loadu_epi16(x_load_mask, x);                         // x0|x1|x2|0|0|0|0|0|
+    __m512i xArray_0 = _mm512_broadcastd_epi32(xTmp);                             // x0|x1|x0|x1|...|x0|x1|
+    __m512i xArray_1 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1));     // x2| 0|x2| 0|...|x2| 0|
+
+    __m512i M512_EPI16_2  = _mm512_set1_epi16(2);
+    __m512i M512_EPI16_8  = _mm512_set1_epi16(8);
+    __m512i M512_EPI16_16 = _mm512_set1_epi16(16);
+    // Slot pair k holds (3k, 3k+1): the offsets of the first two elements of row k
+    __m512i load_idx_base = _mm512_set_epi16(46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24,
+                                             22, 21, 19, 18, 16, 15, 13, 12, 10,  9,  7,  6,  4,  3,  1,  0);
+
+    // Keep only the even 16-bit slots (the third element); odd slots become zero
+    __mmask32 third_elem_mask    = (__mmask32) 0x55555555U;
+    __mmask16 third_elem_mask256 = (__mmask16) 0x5555;
+
+    if (tag_m_32x > 0) {
+        __m512i load_idx01_1st, load_idx01_2nd, load_idx2_1st, load_idx2_2nd;
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6;
+
+        // Rows 16..31 start 48 elements in, i.e. 16 elements into the second load
+        load_idx01_1st = load_idx_base;
+        load_idx01_2nd = _mm512_add_epi16(load_idx01_1st, M512_EPI16_16);
+        // +2 moves the even slot onto the third element of the row. The odd
+        // slot (whose index may run past the source pair) is masked to zero.
+        load_idx2_1st  = _mm512_add_epi16(load_idx01_1st, M512_EPI16_2);
+        load_idx2_2nd  = _mm512_add_epi16(load_idx01_2nd, M512_EPI16_2);
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m += 32) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            // Three back-to-back 32-element loads cover 32 rows * 3 elements
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*3]);             // Load 10 rows with n=3 plus 2 element
+            matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+10)*3 + 2)]);    // Load 10 rows with n=3 plus 2 element
+            matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+21)*3 + 1)]);    // Load 10 rows with n=3 plus 2 element
+
+            matrixArray_3 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_1st, matrixArray_1);                        // Select the first 2 elements for each row
+            matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_1, load_idx01_2nd, matrixArray_2);                        // Select the first 2 elements for each row
+            matrixArray_5 = _mm512_maskz_permutex2var_epi16(third_elem_mask, matrixArray_0, load_idx2_1st, matrixArray_1);  // Select the third element for each row
+            matrixArray_6 = _mm512_maskz_permutex2var_epi16(third_elem_mask, matrixArray_1, load_idx2_2nd, matrixArray_2);  // Select the third element for each row
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_3, (__m512bh) xArray_0);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_5, (__m512bh) xArray_1);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_4, (__m512bh) xArray_0);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_6, (__m512bh) xArray_1);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+            STORE16_COMPLETE_RESULT(result_1, y+idx_m+16)
+        }
+    }
+
+    if (tag_m_32x != m) {
+        __m256i load256_idx01_1st, load256_idx01_2nd, load256_idx2_1st, load256_idx2_2nd;
+        __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6;
+        __m256  result256_0, result256_1;
+
+        // Same index scheme as above, scaled down to 16-element (256-bit) loads:
+        // rows 8..15 start 24 elements in, i.e. 8 elements into the second load
+        load256_idx01_1st = _mm512_castsi512_si256(load_idx_base);
+        load256_idx01_2nd = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_8));
+        load256_idx2_1st  = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_2));
+        load256_idx2_2nd  = _mm256_add_epi16(load256_idx01_2nd, _mm512_castsi512_si256(M512_EPI16_2));
+
+        if (m - tag_m_32x > 15) {
+            result256_0 = _mm256_setzero_ps();
+            result256_1 = _mm256_setzero_ps();
+
+            // _mm256_loadu_si256 takes a __m256i pointer, so the bfloat16 pointer needs an explicit cast
+            matrixArray256_0 = _mm256_loadu_si256((__m256i *) &a[(tag_m_32x)*3]);              // Load 5 rows with n=3 plus 1 element
+            matrixArray256_1 = _mm256_loadu_si256((__m256i *) &a[((tag_m_32x+5)*3 + 1)]);      // Load 5 rows with n=3 plus 1 element
+            matrixArray256_2 = _mm256_loadu_si256((__m256i *) &a[((tag_m_32x+10)*3 + 2)]);     // Load 5 rows with n=3 plus 1 element
+
+            matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);                           // Select the first 2 elements for each row
+            matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2);                           // Select the first 2 elements for each row
+            matrixArray256_5 = _mm256_maskz_permutex2var_epi16(third_elem_mask256, matrixArray256_0, load256_idx2_1st, matrixArray256_1);  // Select the third element for each row
+            matrixArray256_6 = _mm256_maskz_permutex2var_epi16(third_elem_mask256, matrixArray256_1, load256_idx2_2nd, matrixArray256_2);  // Select the third element for each row
+
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0));
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1));
+            result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0));
+            result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1));
+
+            STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x)
+            STORE8_COMPLETE_RESULT(result256_1, y+tag_m_32x+8)
+
+            tag_m_32x += 16;
+        }
+
+        if (tag_m_32x != m) {
+            // 1..15 rows left. Only the last 16-element chunk that still
+            // contains data is loaded with a mask; chunks past it are zero.
+            result256_0 = _mm256_setzero_ps();
+            result256_1 = _mm256_setzero_ps();
+            BLASLONG tail_num = m-tag_m_32x;
+
+            if (tail_num > 10) {
+                // 3*tail_num - 32 elements remain in the third chunk
+                __mmask16 tail_mask = (__mmask16) (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1)));
+                matrixArray256_0 = _mm256_loadu_si256((__m256i *) &a[(tag_m_32x)*3]);                     // Load 5 rows with n=3 plus 1 element
+                matrixArray256_1 = _mm256_loadu_si256((__m256i *) &a[((tag_m_32x+5)*3 + 1)]);             // Load 5 rows with n=3 plus 1 element
+                matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]);       // Load m-tag_m_32x-10 rows
+
+                matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);                           // Select the first 2 elements for each row
+                matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2);                           // Select the first 2 elements for each row
+                matrixArray256_5 = _mm256_maskz_permutex2var_epi16(third_elem_mask256, matrixArray256_0, load256_idx2_1st, matrixArray256_1);  // Select the third element for each row
+                matrixArray256_6 = _mm256_maskz_permutex2var_epi16(third_elem_mask256, matrixArray256_1, load256_idx2_2nd, matrixArray256_2);  // Select the third element for each row
+
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1));
+                result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1));
+            } else if (tail_num > 5) {
+                // 3*tail_num - 16 elements remain in the second chunk
+                __mmask16 tail_mask = (__mmask16) (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2)));
+                matrixArray256_0 = _mm256_loadu_si256((__m256i *) &a[(tag_m_32x)*3]);                     // Load 5 rows with n=3 plus 1 element
+                matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]);          // Load m-tag_m_32x-5 rows
+                matrixArray256_2 = _mm256_setzero_si256();
+
+                matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);                           // Select the first 2 elements for each row
+                matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2);                           // Select the first 2 elements for each row
+                matrixArray256_5 = _mm256_maskz_permutex2var_epi16(third_elem_mask256, matrixArray256_0, load256_idx2_1st, matrixArray256_1);  // Select the third element for each row
+                matrixArray256_6 = _mm256_maskz_permutex2var_epi16(third_elem_mask256, matrixArray256_1, load256_idx2_2nd, matrixArray256_2);  // Select the third element for each row
+
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1));
+                result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1));
+            } else {
+                // At most 5 rows: everything fits into the first chunk
+                __mmask16 tail_mask = (__mmask16) (((unsigned short)0xffff) >> (16-(tail_num*3)));
+                matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)*3]);                // Load m-tag_m_32x rows
+                matrixArray256_1 = _mm256_setzero_si256();
+
+                matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1);                           // Select the first 2 elements for each row
+                matrixArray256_5 = _mm256_maskz_permutex2var_epi16(third_elem_mask256, matrixArray256_0, load256_idx2_1st, matrixArray256_1);  // Select the third element for each row
+
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0));
+                result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1));
+            }
+
+            // Glue both 8-row halves together and store the tail_num valid rows at once
+            __mmask16 store_tail_mask = (__mmask16) (((unsigned short)0xffff) >> (16-(tail_num)));
+            __m512 result512 = _mm512_insertf32x8(_mm512_castps256_ps512(result256_0), result256_1, 0x1);
+            STORE16_MASK_COMPLETE_RESULT(result512, y+tag_m_32x, store_tail_mask)
+        }
+    }
+
+    return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=4 && lda ineffective scenario
+//
+// Rows of `a` are 4 contiguous BF16 values, i.e. two 32-bit lanes per row:
+// (a0, a1) and (a2, a3). The main loop gathers the even lanes of 16 rows and
+// multiplies them with (x0, x1), then the odd lanes with (x2, x3). The 8-row
+// step and the tail instead place the even lanes in the low 256 bits and the
+// odd lanes in the high 256 bits, multiply against a matching "remixed" x
+// vector and add both halves together.
+// The accumulators are handed to the STORE*_COMPLETE_RESULT macros (bound at
+// the top of this file, defined in bf16_common_macros.h) which write into y.
+//
+// Always returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x4_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x4_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x4_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);   // Number of rows covered by full 16-row iterations
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+    __m512i xArray_01, xArray_23, xArray_remix;
+    __m512  result;
+
+    // ALPHAVECTOR / BETAVECTOR are not referenced directly below; presumably
+    // they are picked up by name inside the STORE* macros. BETAVECTOR is only
+    // declared for the general-beta build, matching the n=1 kernel.
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+#endif
+
+    __m512i M512_EPI32_1 = _mm512_set1_epi32(1);
+    __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);   // even lanes: (a0, a1) of each row
+    __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1);                                     // odd lanes:  (a2, a3) of each row
+    // Low half: even lanes of 8 rows, high half: odd lanes of the same 8 rows
+    __m512i idx_base_remix = _mm512_inserti32x8(idx_base_0, _mm512_castsi512_si256(idx_base_1), 0x1);
+
+    // Load two 32-bit lanes (mask 0x3), i.e. the four BF16 values of x
+    __mmask8 x_load_mask = (__mmask8) (((unsigned char)0xf) >> 2);
+    __m128i xTmp = _mm_maskz_loadu_epi32(x_load_mask, x);                   // |x0|x1|x2|x3|0|0|0|0|
+    xArray_01 = _mm512_broadcastd_epi32(xTmp);                              // |x0|x1|x0|x1|...|x0|x1|
+    xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1));      // |x2|x3|x2|x3|...|x2|x3|
+    __mmask16 blend_mask = (__mmask16) 0xff00;
+    xArray_remix = _mm512_mask_blend_epi32(blend_mask, xArray_01, xArray_23);   // |x0|x1|x0|x1|x0|x1|x0|x1|...|x2|x3|x2|x3|x2|x3|x2|x3|
+
+    for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m += 16) {
+        result = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*4]);      // Load 8 rows with n=4
+        matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+8)*4]);    // Load 8 rows with n=4
+
+        matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_0, matrixArray_1);   // |a0|a1|...|h0|h1|i0|i1|...|p0|p1|
+        matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_1, matrixArray_1);   // |a2|a3|...|h2|h3|i2|i3|...|p2|p3|
+
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_01);
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_3, (__m512bh) xArray_23);
+
+        STORE16_COMPLETE_RESULT(result, y+idx_m)
+    }
+
+    // One unmasked 8-row step if at least 8 rows remain
+    if (m - tag_m_16x > 7) {
+        result = _mm512_setzero_ps();
+
+        matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*4]);                      // Load 8 rows with n=4
+        matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0);    // a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix);
+        // Low half holds the x0/x1 partial sums, high half the x2/x3 partial sums
+        __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1));
+
+        STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+        tag_m_16x += 8;
+    }
+
+    BLASLONG tail_num = m-tag_m_16x;
+    if (tail_num != 0) {
+        // 1..7 rows left
+        result = _mm512_setzero_ps();
+
+        // Two 32-bit lanes per row on load, one float per row on store
+        __mmask16 tail_mask = (__mmask16) (((unsigned short)0xffff) >> (16-tail_num*2));
+        matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_16x)*4]);     // Load the remaining rows with n=4
+        matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0);    // a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+
+        result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix);
+        __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1));
+
+        __mmask8 store_tail_mask = (__mmask8) (((unsigned char)0xff) >> (8-tail_num));
+        STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, store_tail_mask)
+    }
+
+    return 0;
+}
+
+// 30 rows parallel processing BF16 GEMV kernel for n=5 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_30x5_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_30x5_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_30x5_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ BLASLONG tag_m_30x = m - (m%30);
+
+ unsigned char x_load_mask_value = (((unsigned char)0xff) >> 3);
+ __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+ __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|0|0|0|
+
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ __m512 result_0, result_1;
+ __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1|
+ __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3|
+ __m512i xArray_4 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4| 0|x4| 0|...|x4| 0|
+
+ __m512i M512_EPI16_2 = _mm512_set1_epi16(2);
+ __m512i load_idx01_stage1_1st = _mm512_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0, 58, 57, 53, 52, 48, 47, 43, 42,
+ 38, 37, 33, 32, 26, 25, 21, 20, 16, 15, 11, 10, 6, 5, 1, 0);
+ __m512i load_idx01_stage1_2nd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x39);
+ __m512i load_idx01_stage1_3rd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x4f);
+
+ __m512i load_idx23_stage1_1st = _mm512_add_epi16(load_idx01_stage1_1st, M512_EPI16_2);
+ __m512i load_idx23_stage1_2nd = _mm512_add_epi16(load_idx01_stage1_2nd, M512_EPI16_2);
+ __m512i load_idx23_stage1_3rd = _mm512_add_epi16(load_idx01_stage1_3rd, M512_EPI16_2);
+
+ __m512i load_idx4_stage1_1st = _mm512_add_epi16(load_idx23_stage1_1st, M512_EPI16_2);
+ __m512i load_idx4_stage1_2nd = _mm512_add_epi16(load_idx23_stage1_2nd, M512_EPI16_2);
+ __m512i load_idx4_stage1_3rd = _mm512_add_epi16(load_idx23_stage1_3rd, M512_EPI16_2);
+
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4;
+ __m512i matrixArray_stage1_0, matrixArray_stage1_1, matrixArray_stage1_2;
+ __m512i matrixArray_stage2_0, matrixArray_stage2_1;
+
+ unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2);
+ __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+ unsigned short store_mask_value = (((unsigned short)0xffff) >> 2);
+ __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+
+ if (tag_m_30x > 0) {
+ unsigned short blend_mask_value_0 = ((unsigned short)0xf000);
+ __mmask16 blend_mask_0 = *((__mmask16*) &blend_mask_value_0);
+ unsigned short blend_mask_value_1 = ((unsigned short)0x3f00);
+ __mmask16 blend_mask_1 = *((__mmask16*) &blend_mask_value_1);
+ for (BLASLONG idx_m = 0; idx_m < tag_m_30x; idx_m+=30) {
+ result_0 = _mm512_setzero_ps();
+ result_1 = _mm512_setzero_ps();
+
+ matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5
+ matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5
+ matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+12)*5)]); // Load 6 rows with n=5
+ matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+18)*5)]); // Load 6 rows with n=5
+ matrixArray_4 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+24)*5)]); // Load 6 rows with n=5
+
+ // Process the 0|1 elements
+ // Stage 1: Select the 0|1 elements for each row
+ matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+ matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx01_stage1_2nd, matrixArray_3);
+ matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx01_stage1_3rd, matrixArray_4);
+ // Stage 2: Reorder and compress all the 0|1 elements
+ matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+ matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+ // Calculate the result of the 0|1 elements
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_01);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_01);
+
+ // Process the 2|3 elements
+ // Stage 1: Select the 2|3 elements for each row
+ matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+ matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx23_stage1_2nd, matrixArray_3);
+ matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx23_stage1_3rd, matrixArray_4);
+ // Stage 2: Reorder and compress all the 2|3 elements
+ matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+ matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+ // Calculate the result of the 2|3 elements and accumulate the result of 0|1 elements
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_23);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_23);
+
+ // Process the for 4 elements
+ // Stage 1: Select the 4 elements for each row
+ matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+ matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx4_stage1_2nd, matrixArray_3);
+ matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx4_stage1_3rd, matrixArray_4);
+ // Stage 2: Reorder and compress all the 4 elements
+ matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1);
+ matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2);
+ // Calculate the result of the 4 element and accumulate the result of 0|1 and 2|3 elements
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_4);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_4);
+
+ STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+ STORE16_MASK_COMPLETE_RESULT(result_1, y+idx_m+16, store_mask)
+ }
+ }
+
+ if (m - tag_m_30x > 11) {
+ BLASLONG tag_m_12x = m - ((m-tag_m_30x)%12);
+ for (BLASLONG idx_m = tag_m_30x; idx_m < tag_m_12x; idx_m+=12) {
+ unsigned short store_less_mask_value = (((unsigned short)0xffff) >> 4);
+ __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value);
+ result_0 = _mm512_setzero_ps();
+
+ matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5
+ matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5
+
+ // Interleave the elements
+ matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+ matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+ matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+ // Calculate and accumulate the result
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4);
+
+ STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_less_mask)
+ tag_m_30x += 12;
+ }
+ }
+
+ BLASLONG tail_num = m - tag_m_30x;
+ if (tail_num > 6) {
+ unsigned short store_less_mask_value = (((unsigned short)0xffff) >> (4+(12-tail_num)));
+ __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value);
+ unsigned int load_less_mask_value = (((unsigned int)0xffffffff) >> (2+(12-tail_num)*5));
+ __mmask32 load_less_mask = *((__mmask32*) &load_less_mask_value);
+ result_0 = _mm512_setzero_ps();
+
+ matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_30x)*5]); // Load 6 rows with n=5
+ matrixArray_1 = _mm512_maskz_loadu_epi16(load_less_mask, &a[((tag_m_30x+6)*5)]); // Load x rows with n=5
+
+ // Interleave the elements
+ matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1);
+ matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1);
+ matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1);
+ // Calculate and accumulate the result
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4);
+
+ STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_30x, store_less_mask)
+ } else {
+ __m128i matrixArray128;
+ __m128 result128, tmp128;
+ for (BLASLONG i = tag_m_30x; i < m; i++) {
+ result128 = _mm_setzero_ps();
+ matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*5]); // Load 1 rows with n=5
+ result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+ tmp128 = _mm_shuffle_ps(result128, result128, 14);
+ result128 = _mm_add_ps(result128, tmp128);
+ tmp128 = _mm_shuffle_ps(result128, result128, 1);
+ result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * result128[0] + beta * y[i];
+#else
+ y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = result128[0] * alpha;
+#else
+ y[i] = result128[0];
+#endif
+#endif
+
+ }
+ }
+
+ return 0;
+}
+
+// BF16 GEMV kernel for n=6 && lda ineffective scenario (rows of A are stored
+// back to back, 6 bfloat16 values each).
+//
+// Rows are consumed 16 at a time with 512-bit registers; if at least 8 rows
+// remain after that, one group of 8 rows is handled with 256-bit registers;
+// whatever is left is computed one row at a time with 128-bit registers.
+// Note that the 8-row step is nested inside the "at least 16 rows" branch, so
+// for m < 16 every row goes through the one-row-at-a-time loop.
+//
+//   m     - number of rows of A / entries of y
+//   alpha - scale for A*x (the row-by-row loop skips it only in the
+//           ZERO_BETA + ONE_ALPHA build)
+//   a     - matrix data, 6 contiguous bfloat16 values per row
+//   x     - 6 bfloat16 input values
+//   beta  - scale for the previous contents of y (variants that take it)
+//   y     - m float results
+// Returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x6_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x6_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x6_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);
+
+    // Low 6 of 8 bf16 lanes: used both for x and for a single row of A.
+    __mmask8 x_load_mask = (__mmask8) (0xff >> 2);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x);                        // |x0|x1|x2|x3|x4|x5|0|0|
+
+    if (tag_m_16x > 0) {
+        __m512 result_0;
+
+        // ALPHAVECTOR / BETAVECTOR are not referenced directly in this function;
+        // presumably the STORE*_COMPLETE_RESULT macros use them by name, so the
+        // identifiers must stay as they are.
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        // One row is 3 dwords, so row r of a 16-row tile starts at dword 3*r.
+        // The "_1st" vectors gather rows 0..10 out of the first two loads
+        // (dwords 0..31); the "_2nd" vectors gather rows 11..15 out of the
+        // third load (dwords 32..47, i.e. local dwords 0..15).
+        __m512i M512_EPI32_1 = _mm512_set1_epi32(1);
+        __m512i load_idx01_1st = _mm512_set_epi32( 0,  0,  0,  0,  0, 30, 27, 24, 21, 18, 15, 12,  9,  6,  3,  0);
+        __m512i load_idx01_2nd = _mm512_set_epi32(13, 10,  7,  4,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0);
+
+        __m512i load_idx23_1st = _mm512_add_epi32(load_idx01_1st, M512_EPI32_1);
+        __m512i load_idx23_2nd = _mm512_add_epi32(load_idx01_2nd, M512_EPI32_1);
+
+        __m512i load_idx45_1st = _mm512_add_epi32(load_idx23_1st, M512_EPI32_1);
+        __m512i load_idx45_2nd = _mm512_add_epi32(load_idx23_2nd, M512_EPI32_1);
+
+        // The 4|5 pair of row 10 sits at dword 32, which is dword 0 of the third
+        // load. Lane 10 of both 4|5 index vectors is therefore overwritten with
+        // the 0 held in lane 10 of load_idx01_2nd: in the "_1st" vector this
+        // replaces the out-of-range index 32, in the "_2nd" vector it is the
+        // index actually wanted.
+        __mmask16 lane10_mask = (__mmask16) 0x0400;
+        load_idx45_1st = _mm512_mask_blend_epi32(lane10_mask, load_idx45_1st, load_idx01_2nd);
+        load_idx45_2nd = _mm512_mask_blend_epi32(lane10_mask, load_idx45_2nd, load_idx01_2nd);
+
+        __m512i xArray_01 = _mm512_broadcastd_epi32(x128);                           // x0|x1|x0|x1|...|x0|x1|
+        __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1));   // x2|x3|x2|x3|...|x2|x3|
+        __m512i xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2));   // x4|x5|x4|x5|...|x4|x5|
+
+        // Lanes that have to be filled from the third load.
+        __mmask16 rows11to15_mask = (__mmask16) 0xf800;   // 0|1 and 2|3 pairs
+        __mmask16 rows10to15_mask = (__mmask16) 0xfc00;   // 4|5 pairs (row 10 included, see above)
+
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2;
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+
+            // Three back-to-back 32-element loads cover exactly 16 rows * 6 elements.
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*6]);                       // Load 5 rows with n=6 plus 2 elements
+            matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+5)*6 + 2)]);               // Load 5 rows with n=6 plus 2 elements
+            matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+10)*6 + 4)]);              // Load 5 rows with n=6 plus 2 elements
+
+            // Stage 1: gather the element pairs of rows a..k from the first two loads
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1);
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1);
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1);
+
+            // Stage 2: merge in the element pairs of rows l..p from the third load
+            matrixArray_stage_0 = _mm512_mask_permutexvar_epi32(matrixArray_stage_0, rows11to15_mask, load_idx01_2nd, matrixArray_2);
+            matrixArray_stage_1 = _mm512_mask_permutexvar_epi32(matrixArray_stage_1, rows11to15_mask, load_idx23_2nd, matrixArray_2);
+            matrixArray_stage_2 = _mm512_mask_permutexvar_epi32(matrixArray_stage_2, rows10to15_mask, load_idx45_2nd, matrixArray_2);
+
+            // Accumulate the 0|1, 2|3 and 4|5 partial dot products of all 16 rows
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_01);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_23);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_45);
+
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {
+            // Same scheme on 256-bit registers for one group of 8 rows:
+            // rows 0..5 come from the first two loads (dwords 0..15),
+            // rows 6..7 from the third load (dwords 16..23).
+            __m256i M256_EPI32_1 = _mm512_castsi512_si256(M512_EPI32_1);
+            __m256i idx256_01_1st = _mm256_set_epi32( 0,  0, 15, 12,  9,  6,  3,  0);
+            __m256i idx256_01_2nd = _mm256_set_epi32( 5,  2,  0,  0,  0,  0,  0,  0);
+
+            __m256i idx256_23_1st = _mm256_add_epi32(idx256_01_1st, M256_EPI32_1);
+            __m256i idx256_23_2nd = _mm256_add_epi32(idx256_01_2nd, M256_EPI32_1);
+            // The 2|3 pair of row 5 sits at dword 16 = dword 0 of the third load:
+            // force lane 5 of both 2|3 index vectors to 0 (out-of-range index
+            // removed in "_1st", wanted index installed in "_2nd").
+            __mmask8 lane5_mask = (__mmask8) 0x20;
+            idx256_23_1st = _mm256_mask_blend_epi32(lane5_mask, idx256_23_1st, idx256_01_2nd);
+            idx256_23_2nd = _mm256_mask_blend_epi32(lane5_mask, idx256_23_2nd, idx256_01_2nd);
+
+            // Derived after the fix-up above, so lane 5 of the 4|5 vectors becomes 1
+            // (dword 17 = 4|5 pair of row 5 in the third load).
+            __m256i idx256_45_1st = _mm256_add_epi32(idx256_23_1st, M256_EPI32_1);
+            __m256i idx256_45_2nd = _mm256_add_epi32(idx256_23_2nd, M256_EPI32_1);
+
+            // Lanes that have to be filled from the third load.
+            __mmask8 rows67_mask  = (__mmask8) 0xc0;   // 0|1 pairs
+            __mmask8 rows567_mask = (__mmask8) 0xe0;   // 2|3 and 4|5 pairs
+
+            __m256i matrix256_0, matrix256_1, matrix256_2;
+            __m256i matrix256_stage;
+            __m256 result256_0;
+
+            result256_0 = _mm256_setzero_ps();
+
+            // _mm256_loadu_si256 takes a pointer to __m256i, so the bfloat16
+            // addresses need an explicit cast (passing them as-is is an
+            // incompatible-pointer-type diagnostic, an error on recent compilers).
+            matrix256_0 = _mm256_loadu_si256((const __m256i *) &a[(tag_m_16x)*6]);           // Load 2 rows with n=6 plus 4 elements
+            matrix256_1 = _mm256_loadu_si256((const __m256i *) &a[((tag_m_16x+2)*6 + 4)]);   // Load 2 rows with n=6 plus 4 elements
+            matrix256_2 = _mm256_loadu_si256((const __m256i *) &a[((tag_m_16x+5)*6 + 2)]);   // Load 2 rows with n=6 plus 4 elements
+
+            // 0|1 elements: gather per row, then accumulate
+            matrix256_stage = _mm256_permutex2var_epi32(matrix256_0, idx256_01_1st, matrix256_1);
+            matrix256_stage = _mm256_mask_permutexvar_epi32(matrix256_stage, rows67_mask, idx256_01_2nd, matrix256_2);
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrix256_stage, (__m256bh) _mm512_castsi512_si256(xArray_01));
+
+            // 2|3 elements: gather per row, then accumulate
+            matrix256_stage = _mm256_permutex2var_epi32(matrix256_0, idx256_23_1st, matrix256_1);
+            matrix256_stage = _mm256_mask_permutexvar_epi32(matrix256_stage, rows567_mask, idx256_23_2nd, matrix256_2);
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrix256_stage, (__m256bh) _mm512_castsi512_si256(xArray_23));
+
+            // 4|5 elements: gather per row, then accumulate
+            matrix256_stage = _mm256_permutex2var_epi32(matrix256_0, idx256_45_1st, matrix256_1);
+            matrix256_stage = _mm256_mask_permutexvar_epi32(matrix256_stage, rows567_mask, idx256_45_2nd, matrix256_2);
+            result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrix256_stage, (__m256bh) _mm512_castsi512_si256(xArray_45));
+
+            STORE8_COMPLETE_RESULT(result256_0, y+tag_m_16x)
+            tag_m_16x += 8;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        // Leftover rows: one masked 128-bit load per row, one dot-product
+        // instruction, then a horizontal add of the four partial sums.
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*6]);          // Load 1 row with n=6
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);                       // bring lanes 2,3 down to 0,1
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);                        // bring lane 1 down to 0
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=7 && lda ineffective scenario (rows of A are stored
+// back to back, 7 bfloat16 values each).
+//
+// When m >= 16, rows are consumed 16 at a time, then (if at least 8 rows
+// remain) one group of 8, then (if at least 4 rows remain) one masked group of
+// 4..7 rows. Rows not handled by those steps -- including every row when
+// m < 16 -- are computed one at a time with 128-bit registers.
+//
+//   m     - number of rows of A / entries of y
+//   alpha - scale for A*x (the row-by-row loop skips it only in the
+//           ZERO_BETA + ONE_ALPHA build)
+//   a     - matrix data, 7 contiguous bfloat16 values per row
+//   x     - 7 bfloat16 input values
+//   beta  - scale for the previous contents of y (variants that take it)
+//   y     - m float results
+// Returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x7_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x7_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x7_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);
+
+    // Low 7 of 8 bf16 lanes: used both for x and for a single row of A.
+    __mmask8 x_load_mask = (__mmask8) (0xff >> 1);
+    __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x);                        // |x0|x1|x2|x3|x4|x5|x6|0|
+
+    if (tag_m_16x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_0123, xArray_4567;
+        __m512 result_0, result_1, result_2, result_3;
+
+        // ALPHAVECTOR / BETAVECTOR are not referenced directly in this function;
+        // presumably the STORE*_COMPLETE_RESULT macros use them by name, so the
+        // identifiers must stay as they are.
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        // Stage-1 index: spreads 4 packed 7-element rows out to 8 elements per
+        // row. The 8th slot of every row reads lane 31, which the masked loads
+        // below always leave zero.
+        __m512i M512_EPI32_2 = _mm512_set1_epi32(2);
+        __m512i load_idx_stage1_0 = _mm512_set_epi16(31, 27, 26, 25, 24, 23, 22, 21, 31, 20, 19, 18, 17, 16, 15, 14,
+                                                     31, 13, 12, 11, 10,  9,  8,  7, 31,  6,  5,  4,  3,  2,  1,  0);
+        // Stage-2 indices: with 4 dwords per padded row, pick dword 0 of 8 rows
+        // into the low half and dword 1 into the high half (dwords 2 and 3 for
+        // the "+2" variant).
+        __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13,  9,  5,  1, 28, 24, 20, 16, 12,  8,  4,  0);
+        __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2);
+
+        // x pairs laid out to match stage 2: low half / high half.
+        __mmask16 x_blend_mask = (__mmask16) 0xff00;
+        xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask,
+                                              _mm512_broadcastd_epi32(x128),                            // low:  |x0|x1| x8
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)));   // high: |x2|x3| x8
+        xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask,
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)),    // low:  |x4|x5| x8
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3)));   // high: |x6| 0| x8
+
+        // 28 of 32 lanes = 4 rows * 7 elements; the top 4 lanes load as zero.
+        __mmask32 load_mask = (__mmask32) (0xffffffffu >> 4);
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*7]);      // Load 4 rows with n=7
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+4)*7]);    // Load 4 rows with n=7
+            matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+8)*7]);    // Load 4 rows with n=7
+            matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+12)*7]);   // Load 4 rows with n=7
+
+            // Stage 1: padding (element 7 of every row is the zero pad)
+            matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0);   // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1);   // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+            matrixArray_2 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_2);   // |i0|i1|i2|i3|...|j6|j7|k0|k1|k2|k3|...|l6|l7|
+            matrixArray_3 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_3);   // |m0|m1|m2|m3|...|n6|n7|o0|o1|o2|o3|...|p6|p7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);   // |a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);   // |a4|a5|...|h4|h5|a6|a7|...|h6|h7|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3);   // |i0|i1|...|p0|p1|i2|i3|...|p2|p3|
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3);   // |i4|i5|...|p4|p5|i6|i7|...|p6|p7|
+
+            // Each accumulator ends up with two partial sums per row:
+            // low half = pairs 0|1 + 4|5, high half = pairs 2|3 + 6|7.
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567);
+
+            // Stage 3: line up the low halves and the high halves of both
+            // accumulators, then add them to get the 16 complete row sums
+            result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44);
+            result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee);
+
+            result_2 = _mm512_add_ps(result_2, result_3);
+
+            STORE16_COMPLETE_RESULT(result_2, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {
+            // One full group of 8 rows.
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]);      // Load 4 rows with n=7
+            matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x+4)*7]);    // Load 4 rows with n=7
+
+            // Stage 1: padding
+            matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0);   // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1);   // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);   // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);   // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            // Fold the high half onto the low half: 8 complete row sums
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+
+            tag_m_16x += 8;
+        }
+
+        BLASLONG tail_num = m - tag_m_16x;
+        if (tail_num > 3) {
+            // 4..7 rows left: the first 4 are loaded in full, the remaining
+            // (tail_num - 4) through a shortened mask.
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]);      // Load 4 rows with n=7
+            // The shift count is 4 + (8 - tail_num) * 7, which reaches 32 when
+            // tail_num == 4. Shifting a 32-bit value by 32 is undefined, so the
+            // shift is done on a 64-bit value; the mask then becomes 0 (nothing
+            // loaded) instead of whatever the hardware shift happens to produce.
+            __mmask32 tail_load_mask = (__mmask32) (0xffffffffULL >> (4+(8-tail_num)*7));
+            matrixArray_1 = _mm512_maskz_loadu_epi16(tail_load_mask, &a[(tag_m_16x+4)*7]);   // Load (tail_num-4) rows with n=7
+
+            // Stage 1: padding
+            matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0);   // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7|
+            matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1);   // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7|
+
+            // Stage 2: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);   // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);   // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            // Only the first tail_num lanes are handed to the store macro.
+            __mmask8 tail_mask = (__mmask8) (0xff >> (8-tail_num));
+            STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask)
+            tag_m_16x = m;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        // Leftover rows: one masked 128-bit load per row, one dot-product
+        // instruction, then a horizontal add of the four partial sums.
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*7]);          // Load 1 row with n=7
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);                       // bring lanes 2,3 down to 0,1
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);                        // bring lane 1 down to 0
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// BF16 GEMV kernel for n=8 && lda ineffective scenario (rows of A are stored
+// back to back, 8 bfloat16 values each, i.e. exactly 128 bits per row).
+//
+// When m >= 16, rows are consumed 16 at a time, then (if at least 8 rows
+// remain) one group of 8, then (if at least 4 rows remain) one masked group of
+// 4..7 rows. Rows not handled by those steps -- including every row when
+// m < 16 -- are computed one at a time with 128-bit registers.
+//
+//   m     - number of rows of A / entries of y
+//   alpha - scale for A*x (the row-by-row loop skips it only in the
+//           ZERO_BETA + ONE_ALPHA build)
+//   a     - matrix data, 8 contiguous bfloat16 values per row
+//   x     - 8 bfloat16 input values
+//   beta  - scale for the previous contents of y (variants that take it)
+//   y     - m float results
+// Returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x8_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x8_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x8_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);
+
+    // _mm_loadu_si128 takes a pointer to __m128i, so the bfloat16 address needs
+    // an explicit cast (passing it as-is is an incompatible-pointer-type
+    // diagnostic, an error on recent compilers).
+    __m128i x128 = _mm_loadu_si128((const __m128i *) x);                         // |x0|x1|x2|x3|x4|x5|x6|x7|
+
+    if (tag_m_16x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+        __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+        __m512i xArray_0123, xArray_4567;
+        __m512 result_0, result_1, result_2, result_3;
+
+        // ALPHAVECTOR / BETAVECTOR are not referenced directly in this function;
+        // presumably the STORE*_COMPLETE_RESULT macros use them by name, so the
+        // identifiers must stay as they are.
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        // With 4 dwords per row, pick dword 0 of 8 rows into the low half and
+        // dword 1 into the high half (dwords 2 and 3 for the "+2" variant).
+        __m512i M512_EPI32_2 = _mm512_set1_epi32(2);
+        __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13,  9,  5,  1, 28, 24, 20, 16, 12,  8,  4,  0);
+        __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2);
+
+        // x pairs laid out to match the interleave: low half / high half.
+        __mmask16 x_blend_mask = (__mmask16) 0xff00;
+        xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask,
+                                              _mm512_broadcastd_epi32(x128),                            // low:  |x0|x1| x8
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)));   // high: |x2|x3| x8
+        xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask,
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)),    // low:  |x4|x5| x8
+                                              _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3)));   // high: |x6|x7| x8
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            result_0 = _mm512_setzero_ps();
+            result_1 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*8]);                       // Load 4 rows with n=8
+            matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+4)*8]);                     // Load 4 rows with n=8
+            matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+8)*8]);                     // Load 4 rows with n=8
+            matrixArray_3 = _mm512_loadu_si512(&a[(idx_m+12)*8]);                    // Load 4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);   // |a0|a1|...|h0|h1|a2|a3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);   // |a4|a5|...|h4|h5|a6|a7|...|h6|h7|
+            matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3);   // |i0|i1|...|p0|p1|i2|i3|...|p2|p3|
+            matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3);   // |i4|i5|...|p4|p5|i6|i7|...|p6|p7|
+
+            // Each accumulator ends up with two partial sums per row:
+            // low half = pairs 0|1 + 4|5, high half = pairs 2|3 + 6|7.
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+            result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567);
+
+            // Stage 2: line up the low halves and the high halves of both
+            // accumulators, then add them to get the 16 complete row sums
+            result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44);
+            result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee);
+
+            result_2 = _mm512_add_ps(result_2, result_3);
+
+            STORE16_COMPLETE_RESULT(result_2, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {
+            // One full group of 8 rows.
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]);                   // Load 4 rows with n=8
+            matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+4)*8]);                 // Load 4 rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);   // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);   // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            // Fold the high half onto the low half: 8 complete row sums
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+            tag_m_16x += 8;
+        }
+
+        BLASLONG tail_num = m - tag_m_16x;
+        if (tail_num > 3) {
+            // 4..7 rows left: the first 4 are loaded in full, the remaining
+            // (tail_num - 4) through a dword mask with 4 bits per row.
+            result_0 = _mm512_setzero_ps();
+
+            matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]);                   // Load 4 rows with n=8
+            // Shift count is at most 16 here (tail_num == 4), which yields an empty mask.
+            __mmask16 tail_load_mask = (__mmask16) (((unsigned int)0xffff) >> ((8-tail_num)*4));
+            matrixArray_1 = _mm512_maskz_loadu_epi32(tail_load_mask, &a[(tag_m_16x+4)*8]);   // Load (tail_num-4) rows with n=8
+
+            // Stage 1: interleave per 32 bits
+            matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1);   // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3|
+            matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1);   // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7|
+
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123);
+            result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567);
+
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1));
+
+            // Only the first tail_num lanes are handed to the store macro.
+            __mmask8 tail_mask = (__mmask8) (0xff >> (8-tail_num));
+            STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask)
+            tag_m_16x = m;
+        }
+    }
+
+    if (tag_m_16x != m) {
+        // Leftover rows: one 128-bit load per row, one dot-product instruction,
+        // then a horizontal add of the four partial sums.
+        __m128i matrixArray128;
+        __m128 result128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            result128 = _mm_setzero_ps();
+            matrixArray128 = _mm_loadu_si128((const __m128i *) &a[(i)*8]);           // Load 1 row with n=8
+            result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 14);                       // bring lanes 2,3 down to 0,1
+            result128 = _mm_add_ps(result128, tmp128);
+            tmp128 = _mm_shuffle_ps(result128, result128, 1);                        // bring lane 1 down to 0
+            result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * result128[0] + beta * y[i];
+#else
+            y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = result128[0] * alpha;
+#else
+            y[i] = result128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 14 rows parallel processing BF16 GEMV kernel for n=9 && lda ineffective scenario
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_14x9_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_14x9_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_14x9_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ BLASLONG tag_m_14x = m - (m%14);
+
+ unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7);
+ __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+ __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7|
+ __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0|
+
+ if (tag_m_14x > 0) {
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+ __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3;
+ __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89;
+ __m512 result_0, result_1;
+
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ __m256i M256_EPI16_2 = _mm256_set1_epi16(2);
+ __m256i idx_base_0 = _mm256_set_epi16( 0, 0, 55, 54, 46, 45, 37, 36, 28, 27, 19, 18, 10, 9, 1, 0);
+ __m256i idx_base_1 = _mm256_add_epi16(idx_base_0, M256_EPI16_2);
+ __m256i idx_base_2 = _mm256_add_epi16(idx_base_1, M256_EPI16_2);
+ __m256i idx_base_3 = _mm256_add_epi16(idx_base_2, M256_EPI16_2);
+ __m256i idx_base_4 = _mm256_add_epi16(idx_base_3, M256_EPI16_2);
+ __m512i idx_idx = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0);
+
+ __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1));
+ __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3));
+ __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0));
+ __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2));
+ __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4));
+ __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 13, 12, 11, 10, 9, 8, 7);
+
+ xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... |x0|x1|
+ xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3|
+ xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5|
+ xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7|
+ xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|0 |x8| 0| ... |x8| 0|
+
+ unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 1);
+ __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+ unsigned short blend_mask_value = ((unsigned short)0x3f80);
+ __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+ unsigned short store_mask_value = (((unsigned short)0xffff) >> 2);
+ __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+ for (BLASLONG idx_m = 0; idx_m < tag_m_14x; idx_m+=14) {
+ result_0 = _mm512_setzero_ps();
+ result_1 = _mm512_setzero_ps();
+
+ matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*9]); // Load 3 rows with n=9 plus 5 elements
+ matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+3)*9 + 5]); // Load 3 rows with n=9 plus 4 elements
+ matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+7)*9]); // Load 3 rows with n=9 plus 5 elements
+ matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*9 + 5]); // Load 3 rows with n=9 plus 4 elements
+
+ // Stage 1: interleave per 16 bits
+ matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|g0|g1|a2|a3|...|g2|g3|x|x|x|x|
+ matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|g4|g5|a6|a7|...|g6|g7|x|x|x|x|
+ matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |h2|h3|...|n2|n3|h0|h1|...|n0|n1|x|x|x|x|
+ matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |h6|h7|...|n6|n7|h4|h5|...|n4|n5|x|x|x|x|
+ matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8| x|...|g8| x| x| x|...| x| x|x|x|x|x|
+ matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|h8| x|...|n8| x|x|x|x|x|
+
+ // Stage 2: interleave per 32 bits
+ matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|b0|b1|...|h0|h1|i0|i1|j0|j1|...|n0|n1|x|x|x|x|
+ matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|b2|b3|...|h2|h3|i2|i3|j2|j3|...|n2|n3|x|x|x|x|
+ matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|b4|b5|...|h4|h5|i4|i5|j4|j5|...|n4|n5|x|x|x|x|
+ matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|b6|b7|...|h6|h7|i6|i7|j6|j7|...|n6|n7|x|x|x|x|
+ matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_4, matrixArray_5); // |a8| x|b8| x|...|h8| x|i8| x|j8| x|...|n8| x|x|x|x|x|
+
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);
+ result_0 = _mm512_add_ps(result_0, result_1);
+
+ STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask)
+ }
+ }
+
+ if (tag_m_14x != m) {
+ __m256i matrixArray256;
+ __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1);
+ __m256 result256;
+ __m128 result128, tmp128;
+ unsigned short load256_mask_value = (((unsigned short)0xffff) >> 7);
+ __mmask16 load256_mask = *((__mmask16*) &load256_mask_value);
+ for (BLASLONG i = tag_m_14x; i < m; i++) {
+ result256 = _mm256_setzero_ps();
+ matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*9]);
+ result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256);
+ result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1));
+ tmp128 = _mm_shuffle_ps(result128, result128, 14);
+ result128 = _mm_add_ps(result128, tmp128);
+ tmp128 = _mm_shuffle_ps(result128, result128, 1);
+ result128 = _mm_add_ps(result128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * result128[0] + beta * y[i];
+#else
+ y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = result128[0] * alpha;
+#else
+ y[i] = result128[0];
+#endif
+#endif
+ }
+ }
+
+ return 0;
+}
+
+// 12 rows parallel processing BF16 GEMV kernel for n=10 with A packed row after row (row i starts at a[i*10], no lda is used); the m%12 leftover rows are handled one at a time
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_12x10_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_12x10_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_12x10_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ BLASLONG tag_m_12x = m - (m%12); // rows covered by complete 12-row tiles
+
+ unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); // 0x01: one 32-bit lane, i.e. x8 and x9
+ __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+ __m128i x128_0 = _mm_loadu_si128((const __m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| (explicit cast: the intrinsic takes a __m128i pointer, not bfloat16*)
+ __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0|
+
+ if (tag_m_12x > 0) {
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4;
+ __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5;
+ __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89;
+ __m512 result_0, result_1;
+
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ __m256i M256_EPI32_1 = _mm256_set1_epi32(1);
+ __m256i idx_base_0 = _mm256_set_epi32( 0, 0, 26, 21, 16, 10, 5, 0); // 32-bit lane of columns 0,1 for 6 rows: 0/5/10 in the first source, 16/21/26 in the second
+ __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_1); // same rows, columns 2,3
+ __m256i idx_base_2 = _mm256_add_epi32(idx_base_1, M256_EPI32_1); // columns 4,5
+ __m256i idx_base_3 = _mm256_add_epi32(idx_base_2, M256_EPI32_1); // columns 6,7
+ __m256i idx_base_4 = _mm256_add_epi32(idx_base_3, M256_EPI32_1); // columns 8,9
+ __m512i idx_idx = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 5, 4, 3, 2, 1, 0); // concatenates the 6 useful lanes of two idx_base vectors
+
+ __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1));
+ __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3));
+ __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0));
+ __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2));
+ __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4));
+ __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 11, 10, 9, 8, 7, 6);
+
+ xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... |x0|x1|
+ xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3|
+ xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5|
+ xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7|
+ xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|x9|x8|x9| ... |x8|x9|
+
+ unsigned short blend_mask_value = ((unsigned short)0x0fc0); // 32-bit lanes 6..11 are taken from the second blend operand
+ __mmask16 blend_mask = *((__mmask16*) &blend_mask_value);
+ unsigned short load_mask_value = (((unsigned short)0xffff) >> 1); // 0x7fff: 15 32-bit lanes = 30 bf16 = 3 rows of 10
+ __mmask16 load_mask = *((__mmask16*) &load_mask_value);
+ unsigned short store_mask_value = (((unsigned short)0xffff) >> 4); // 0x0fff: one bit per row of the 12-row tile
+ __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+ for (BLASLONG idx_m = 0; idx_m < tag_m_12x; idx_m+=12) {
+ result_0 = _mm512_setzero_ps();
+ result_1 = _mm512_setzero_ps();
+
+ matrixArray_0 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m)*10]); // Load 3 rows with n=10
+ matrixArray_1 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+3)*10]); // Load 3 rows with n=10
+ matrixArray_2 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+6)*10]); // Load 3 rows with n=10
+ matrixArray_3 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+9)*10]); // Load 3 rows with n=10
+
+ // Stage 1: interleave per 32 bits
+ matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|f0|f1|a2|a3|...|f2|f3|x|x|x|x|x|x|x|x|
+ matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|f4|f5|a6|a7|...|f6|f7|x|x|x|x|x|x|x|x|
+ matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |g2|g3|...|l2|l3|g0|g1|...|l0|l1|x|x|x|x|x|x|x|x|
+ matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |g6|g7|...|l6|l7|g4|g5|...|l4|l5|x|x|x|x|x|x|x|x|
+ matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8|a9|...|f8|f9| x| x|...| x| x|x|x|x|x|x|x|x|x|
+ matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|g8|g9|...|l8|l9|x|x|x|x|x|x|x|x|
+
+ // Stage 2: merge rows a..f with rows g..l per 32 bits (blend / permute)
+ matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|l0|l1|x|x|x|x|x|x|x|x|
+ matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|...|l2|l3|x|x|x|x|x|x|x|x|
+ matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|...|l4|l5|x|x|x|x|x|x|x|x|
+ matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|...|l6|l7|x|x|x|x|x|x|x|x|
+ matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_4, matrixArray_stage_5); // |a8|a9|...|l8|l9|x|x|x|x|x|x|x|x|
+
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);
+ result_0 = _mm512_add_ps(result_0, result_1); // float lane r now holds the dot product of tile row r with x
+
+ STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) // macro defined outside this block; presumably applies alpha/beta and stores the masked lanes - confirm against its definition
+ }
+ }
+
+ if (tag_m_12x != m) {
+ __m256i matrixArray256;
+ __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); // x0..x9 followed by zeros
+ __m256 result256;
+ __m128 result128, tmp128;
+ unsigned char load256_mask_value = (((unsigned char)0xff) >> 3); // 0x1f: 5 32-bit lanes = one row of 10 bf16
+ __mmask8 load256_mask = *((__mmask8*) &load256_mask_value);
+ for (BLASLONG i = tag_m_12x; i < m; i++) {
+ result256 = _mm256_setzero_ps();
+ matrixArray256 = _mm256_maskz_loadu_epi32(load256_mask, &a[(i)*10]);
+ result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); // 8 float lanes, each the sum of two bf16 products
+ result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); // fold the upper 128 bits onto the lower
+ tmp128 = _mm_shuffle_ps(result128, result128, 14); // bring lanes 2,3 down to lanes 0,1
+ result128 = _mm_add_ps(result128, tmp128);
+ tmp128 = _mm_shuffle_ps(result128, result128, 1); // bring lane 1 down to lane 0
+ result128 = _mm_add_ps(result128, tmp128); // lane 0 = dot product of row i with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * result128[0] + beta * y[i];
+#else
+ y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = result128[0] * alpha;
+#else
+ y[i] = result128[0];
+#endif
+#endif
+ }
+ }
+
+ return 0;
+}
+
+// 15 rows parallel processing BF16 GEMV kernel for n=11 with A packed row after row (row i starts at a[i*11], no lda is used); the m%15 leftover rows are handled one at a time
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_15x11_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_15x11_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_15x11_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ BLASLONG tag_m_15x = m - (m%15); // rows covered by complete 15-row tiles
+
+ unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); // 0x07: three 16-bit lanes, i.e. x8, x9, x10
+ __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+ __m128i x128_0 = _mm_loadu_si128((const __m128i *)x); // |x0|x1| x2|x3|x4|x5|x6|x7| (explicit cast: the intrinsic takes a __m128i pointer, not bfloat16*)
+ __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0|
+
+ if (tag_m_15x > 0) {
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+ __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5;
+ __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10;
+ __m512 result_0, result_1;
+
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;
+ __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3;
+
+ __m512i M512_EPI16_2, M512_EPI16_4, M512_EPI16_6, M512_EPI32_5;
+ M512_EPI16_2 = _mm512_set1_epi16(2);
+ M512_EPI16_4 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2);
+ M512_EPI16_6 = _mm512_add_epi16(M512_EPI16_4, M512_EPI16_2);
+ M512_EPI32_5 = _mm512_set1_epi32(5);
+
+ unsigned int BASE_MASK_10_value = ((unsigned int)0x000003ff); // 16-bit lanes 0..9
+ __mmask32 BASE_MASK_10 = *((__mmask32*) &BASE_MASK_10_value);
+ unsigned int BASE_MASK_20_value = ((unsigned int)0x000ffc00); // 16-bit lanes 10..19
+ __mmask32 BASE_MASK_20 = *((__mmask32*) &BASE_MASK_20_value);
+ unsigned int BASE_MASK_30_value = ((unsigned int)0x3ff00000); // 16-bit lanes 20..29
+ __mmask32 BASE_MASK_30 = *((__mmask32*) &BASE_MASK_30_value);
+
+ idx_stage1_base_0 = _mm512_set_epi16( 0, 0, 49, 48, 38, 37, 27, 26, 16, 15, 5, 4, 47, 46, 36, 35,
+ 25, 24, 14, 13, 3, 2, 45, 44, 34, 33, 23, 22, 12, 11, 1, 0);
+ idx_stage1_base_1 = _mm512_add_epi16(idx_stage1_base_0, M512_EPI16_6); // same pattern shifted by 6 columns
+
+ idx_stage1_base_2 = _mm512_mask_add_epi16(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI16_2);
+ idx_stage1_base_2 = _mm512_mask_sub_epi16(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI16_2);
+ idx_stage1_base_3 = _mm512_add_epi16(idx_stage1_base_2, M512_EPI16_6);
+
+ idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI16_2);
+ idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI16_2);
+ idx_stage1_base_4 = _mm512_mask_sub_epi16(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI16_4);
+ idx_stage1_base_5 = _mm512_add_epi16(idx_stage1_base_4, M512_EPI16_6);
+
+ unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); // 32-bit lanes 5..9
+ __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value);
+ unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); // 32-bit lanes 10..14
+ __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value);
+ idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5);
+ idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5);
+ idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5);
+ idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5);
+
+ xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 |
+ xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 |
+ xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 |
+ xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 |
+ xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 |
+ xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|0 |x10|0 | ... |x10|0 |
+
+ unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 9); // 0x007fffff: 23 bf16, completing 5 rows (32 + 23 = 55)
+ __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+
+ unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); // 0x7fff: one bit per row of the 15-row tile
+ __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+
+ for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) {
+ result_0 = _mm512_setzero_ps();
+ result_1 = _mm512_setzero_ps();
+
+ matrixArray_0 = _mm512_loadu_si512(&a[idx_m*11]); // Load 2 rows with n=11 plus 10 elements
+ matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*11 + 32]); // Load 2 rows with n=11 plus 1 element
+ matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*11]); // Load 2 rows with n=11 plus 10 elements
+ matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*11 + 32]); // Load 2 rows with n=11 plus 1 element
+ matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*11]); // Load 2 rows with n=11 plus 10 elements
+ matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*11 + 32]); // Load 2 rows with n=11 plus 1 element
+
+ // Stage 1: interleave per 16 bits
+ matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0|a1|...|e0|e1|a2|a3|...|e2|e3|a4 |a5|...|e4 |e5|
+ matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6|a7|...|e6|e7|a8|a9|...|e8|e9|a10|x |...|e10|x |
+ matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2|f3|...|j2|j3|f0|f1|...|j0|j1|f4 |f5|...|j4 |j5|
+ matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8|f9|...|j8|j9|f6|f7|...|j6|j7|f10|x |...|j10|x |
+ matrixArray_stage_4 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4|k5|...|o4|o5|k2|k3|...|o2|o3|k0 |k1|...|o0 |o1|
+ matrixArray_stage_5 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|x|...|o10|x|k8|k9|...|o8|o9|k6 |k7|...|o6 |o7|
+
+ // Stage 2: interleave per 32 bits
+ matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|j0|j1|x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6|a7|...|j6|j7|x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2|a3|...|j2|j3|x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4|a5|...|j4|j5|x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8|a9|...|j8|j9|x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|x|...|j10|x|x|x|x|x|x|x|x|x|x|x|x|x|
+
+ matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x|
+ matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x|
+ matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x|
+ matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x|
+ matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x|
+ matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x|
+
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10);
+ result_0 = _mm512_add_ps(result_0, result_1); // float lane r now holds the dot product of tile row r with x
+
+ STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) // macro defined outside this block; presumably applies alpha/beta and stores the masked lanes - confirm against its definition
+ }
+ }
+
+ if (tag_m_15x != m) {
+ __m256i matrixArray256;
+ __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); // x0..x10 followed by zeros
+ __m256 result256;
+ __m128 result128, tmp128;
+ unsigned short load256_mask_value = (((unsigned short)0xffff) >> 5); // 0x07ff: 11 bf16 = one row
+ __mmask16 load256_mask = *((__mmask16*) &load256_mask_value);
+ for (BLASLONG i = tag_m_15x; i < m; i++) {
+ result256 = _mm256_setzero_ps();
+ matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*11]);
+ result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); // 8 float lanes, each the sum of two bf16 products
+ result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); // fold the upper 128 bits onto the lower
+ tmp128 = _mm_shuffle_ps(result128, result128, 14); // bring lanes 2,3 down to lanes 0,1
+ result128 = _mm_add_ps(result128, tmp128);
+ tmp128 = _mm_shuffle_ps(result128, result128, 1); // bring lane 1 down to lane 0
+ result128 = _mm_add_ps(result128, tmp128); // lane 0 = dot product of row i with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * result128[0] + beta * y[i];
+#else
+ y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = result128[0] * alpha;
+#else
+ y[i] = result128[0];
+#endif
+#endif
+ }
+ }
+
+ return 0;
+}
+
+// 15 rows parallel processing BF16 GEMV kernel for n=12 with A packed row after row (row i starts at a[i*12], no lda is used); the m%15 leftover rows are handled one at a time
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_15x12_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_15x12_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_15x12_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ BLASLONG tag_m_15x = m - (m%15); // rows covered by complete 15-row tiles
+
+ unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); // 0x0f: four 16-bit lanes, i.e. x8..x11
+ __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value);
+ __m128i x128_0 = _mm_loadu_si128((const __m128i *)x); // |x0|x1| x2| x3|x4|x5|x6|x7| (explicit cast: the intrinsic takes a __m128i pointer, not bfloat16*)
+ __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0|
+
+ if (tag_m_15x > 0) {
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5;
+ __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5;
+ __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10;
+ __m512 result_0, result_1;
+
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5;
+ __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3;
+
+ __m512i M512_EPI32_1, M512_EPI32_2, M512_EPI32_3, M512_EPI32_5;
+ M512_EPI32_1 = _mm512_set1_epi32(1);
+ M512_EPI32_2 = _mm512_add_epi32(M512_EPI32_1, M512_EPI32_1);
+ M512_EPI32_3 = _mm512_add_epi32(M512_EPI32_2, M512_EPI32_1);
+ M512_EPI32_5 = _mm512_add_epi32(M512_EPI32_3, M512_EPI32_2);
+
+ unsigned short BASE_MASK_10_value = ((unsigned short)0x001f); // 32-bit lanes 0..4
+ __mmask16 BASE_MASK_10 = *((__mmask16*) &BASE_MASK_10_value);
+ unsigned short BASE_MASK_20_value = ((unsigned short)0x03e0); // 32-bit lanes 5..9
+ __mmask16 BASE_MASK_20 = *((__mmask16*) &BASE_MASK_20_value);
+ unsigned short BASE_MASK_30_value = ((unsigned short)0xfc00); // 32-bit lanes 10..15
+ __mmask16 BASE_MASK_30 = *((__mmask16*) &BASE_MASK_30_value);
+
+ idx_stage1_base_0 = _mm512_set_epi32( 0, 26, 20, 14, 8, 2, 25, 19, 13, 7, 1, 24, 18, 12, 6, 0); // a row spans 6 32-bit lanes, so consecutive rows are 6 lanes apart
+ idx_stage1_base_1 = _mm512_add_epi32(idx_stage1_base_0, M512_EPI32_3); // same pattern shifted by 3 lanes = 6 columns
+
+ idx_stage1_base_2 = _mm512_mask_add_epi32(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI32_1);
+ idx_stage1_base_2 = _mm512_mask_sub_epi32(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI32_1);
+ idx_stage1_base_3 = _mm512_add_epi32(idx_stage1_base_2, M512_EPI32_3);
+
+ idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI32_1);
+ idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI32_1);
+ idx_stage1_base_4 = _mm512_mask_sub_epi32(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI32_2);
+ idx_stage1_base_5 = _mm512_add_epi32(idx_stage1_base_4, M512_EPI32_3);
+
+ unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); // 32-bit lanes 5..9
+ __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value);
+ unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); // 32-bit lanes 10..14
+ __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value);
+ idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5);
+ idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5);
+ idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5);
+ idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5);
+
+ xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 |
+ xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 |
+ xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 |
+ xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 |
+ xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 |
+ xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|x11|x10|x11| ... |x10|x11|
+
+ unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); // 0x0fffffff: 28 bf16, completing 5 rows (32 + 28 = 60)
+ __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+
+ unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); // 0x7fff: one bit per row of the 15-row tile
+ __mmask16 store_mask = *((__mmask16*) &store_mask_value);
+
+ for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) {
+ result_0 = _mm512_setzero_ps();
+ result_1 = _mm512_setzero_ps();
+
+ matrixArray_0 = _mm512_loadu_si512(&a[idx_m*12]); // Load 2 rows with n=12 plus 8 elements
+ matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*12 + 32]); // Load 4 elements plus 2 rows with n=12
+ matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*12]); // Load 2 rows with n=12 plus 8 elements
+ matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*12 + 32]); // Load 4 elements plus 2 rows with n=12
+ matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*12]); // Load 2 rows with n=12 plus 8 elements
+ matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*12 + 32]); // Load 4 elements plus 2 rows with n=12
+
+ // Stage 1: interleave per 32 bits (n=12 is even, so column pairs stay inside one 32-bit lane)
+ matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0 |a1 |...|e0 |e1 |a2|a3|...|e2|e3|a4 |a5 |...|e4 |e5 |
+ matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6 |a7 |...|e6 |e7 |a8|a9|...|e8|e9|a10|a11|...|e10|e11|
+ matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2 |f3 |...|j2 |j3 |f0|f1|...|j0|j1|f4 |f5 |...|j4 |j5 |
+ matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8 |f9 |...|j8 |j9 |f6|f7|...|j6|j7|f10|f11|...|j10|j11|
+ matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4 |k5 |...|o4 |o5 |k2|k3|...|o2|o3|k0 |k1 |...|o0 |o1 |
+ matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|k11|...|o10|o11|k8|k9|...|o8|o9|k6 |k7 |...|o6 |o7 |
+
+ // Stage 2: interleave per 32 bits
+ matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0 |a1 |...|j0 |j1 |x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6 |a7 |...|j6 |j7 |x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2 |a3 |...|j2 |j3 |x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4 |a5 |...|j4 |j5 |x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8 |a9 |...|j8 |j9 |x|x|x|x|x|x|x|x|x|x|x|x|
+ matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|a11|...|j10|j11|x|x|x|x|x|x|x|x|x|x|x|x|
+
+ matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x|
+ matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x|
+ matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x|
+ matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x|
+ matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x|
+ matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|a11|.....................|o10|o11|x|x|
+
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67);
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89);
+ result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10);
+ result_0 = _mm512_add_ps(result_0, result_1); // float lane r now holds the dot product of tile row r with x
+
+ STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) // macro defined outside this block; presumably applies alpha/beta and stores the masked lanes - confirm against its definition
+ }
+ }
+
+ if (tag_m_15x != m) {
+ __m256i matrixArray256;
+ __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); // x0..x11 followed by zeros
+ __m256 result256;
+ __m128 result128, tmp128;
+ unsigned short load256_mask_value = (((unsigned short)0xffff) >> 4); // 0x0fff: 12 bf16 = one row
+ __mmask16 load256_mask = *((__mmask16*) &load256_mask_value);
+ for (BLASLONG i = tag_m_15x; i < m; i++) {
+ result256 = _mm256_setzero_ps();
+ matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*12]);
+ result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); // 8 float lanes, each the sum of two bf16 products
+ result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); // fold the upper 128 bits onto the lower
+ tmp128 = _mm_shuffle_ps(result128, result128, 14); // bring lanes 2,3 down to lanes 0,1
+ result128 = _mm_add_ps(result128, tmp128);
+ tmp128 = _mm_shuffle_ps(result128, result128, 1); // bring lane 1 down to lane 0
+ result128 = _mm_add_ps(result128, tmp128); // lane 0 = dot product of row i with x
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * result128[0] + beta * y[i];
+#else
+ y[i] = alpha * result128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = result128[0] * alpha;
+#else
+ y[i] = result128[0];
+#endif
+#endif
+ }
+ }
+
+ return 0;
+}
+
+
+// 16 rows parallel processing BF16 GEMV kernel for n=13 && lda ineffective scenario
+//
+// For every row i in [0, m) the 13 bfloat16 values a[i*13 .. i*13+12] are
+// multiplied with x[0..12], accumulated in fp32, and combined with y[i].
+// The scalar tail at the end of the function spells the update out:
+//   default  (_alpha_beta)        : y[i] = alpha * dot + beta * y[i]
+//   ONE_BETA (_alpha_one)         : y[i] = alpha * dot + y[i]
+//   ZERO_BETA (_alpha)            : y[i] = alpha * dot
+//   ZERO_BETA && ONE_ALPHA        : y[i] = dot
+// The vector paths delegate that step to the STORE*_COMPLETE_RESULT macros,
+// which are defined outside this function and are expected to apply the same
+// formula through ALPHAVECTOR / BETAVECTOR.
+//
+// Parameters:
+//   m     - number of rows of the matrix / elements of y
+//   alpha - scale of the dot product (not used by the ONE_ALPHA variant)
+//   a     - matrix data, rows stored back to back with a stride of 13 elements
+//   x     - input vector, 13 elements are read
+//   beta  - scale of the previous y (only present when ZERO_BETA is not defined)
+//   y     - result vector, m elements
+// Returns 0 unconditionally.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x13_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x13_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x13_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ // m rounded down to a multiple of 16; advanced by 8 and/or 4 below once the
+ // corresponding remainder blocks have been processed
+ BLASLONG tag_m_16x = m & (~15);
+
+ // 0xffff >> 3 == 0x1fff: the 13 low mask bits keep x0..x12, the remaining lanes are zeroed
+ unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 3);
+ __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value);
+ __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|0|0|0|
+
+ // Note: the 8-row and 4-row remainder blocks live inside this branch, so for
+ // m < 16 every row is handled by the scalar tail loop at the bottom.
+ if (tag_m_16x > 0) {
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \
+ matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+ __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+ __m512 accum512_0, accum512_1;
+ __m512 result_0, result_1;
+
+ __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7;
+
+ // Broadcast scalars; not referenced by name below, presumably consumed by the STORE* macros
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ // Indices for _mm512_permutex2var_ps: values 0..15 pick a lane of the first
+ // source, 16..31 pick lane (value-16) of the second source.
+ // idx_base_0 gathers lanes 0-3 / 8-11 of both accumulators, idx_base_1 (= +4)
+ // gathers lanes 4-7 / 12-15, so adding the two gathers finishes the reduction.
+ __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+ __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
+ __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
+
+ // Prepare X with 2-step interleave way
+ // (x256 is duplicated into both 256-bit halves before the interleave macro runs)
+ xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
+ BF16_INTERLEAVE_1x32(xArray)
+
+ // Main loop: 16 rows per iteration
+ for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ // Load matrix
+ // (masked 256-bit loads starting at row idx_m, row stride 13; the registers
+ // are then paired into 512-bit vectors)
+ BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m, 0, x_load_mask)
+
+ matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+ matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+ matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+ matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+ // Same again for the second group of rows, starting at idx_m+8
+ BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m+8, 0, x_load_mask)
+
+ matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+ matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+ matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+ matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+ // interleave per 256 bits
+ BF16_INTERLEAVE256_8x32(matrixArray)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_8x32(matrixArray)
+
+ // Calculate the temp result for a..p[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+ // Reorder and add up the final result
+ result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+ result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+ result_0 = _mm512_add_ps(result_0, result_1);
+ STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+ }
+
+ // Remainder of at least 8 rows: handle 8 rows at once
+ if (m - tag_m_16x > 7) {
+ // Swaps the two middle 128-bit groups (lanes 4-7 <-> 8-11) so that the
+ // low-half + high-half addition below pairs the matching partial sums
+ __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0);
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ // Load matrix
+ BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask)
+
+ matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+ matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+ matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+ matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+ // interleave per 256 bits
+ // (0x44 selects the two low 128-bit lanes of each source, 0xee the two high ones)
+ matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44);
+ matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee);
+ matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44);
+ matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee);
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_4x32(matrixArray)
+
+ // Calculate the temp result for a..h[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+ accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+ accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0);
+ __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+ STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+ tag_m_16x += 8;
+ }
+
+ // Remainder of at least 4 rows: handle 4 rows at once with 256-bit vectors
+ if (m - tag_m_16x > 3) {
+ __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3;
+ __m256 accum256_0, accum256_1;
+
+ // Reuse the low 256 bits of the already interleaved x vectors
+ xArray256_0 = _mm512_castsi512_si256(xArray_0);
+ xArray256_1 = _mm512_castsi512_si256(xArray_1);
+ xArray256_2 = _mm512_castsi512_si256(xArray_2);
+ xArray256_3 = _mm512_castsi512_si256(xArray_3);
+
+ accum256_0 = _mm256_setzero_ps();
+ accum256_1 = _mm256_setzero_ps();
+
+ BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_4x16(matrixArray256)
+
+ // Calculate the temp result for a..d[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256)
+
+ accum256_0 = _mm256_add_ps(accum256_0, accum256_1);
+ __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+ STORE4_COMPLETE_RESULT(result128, y+tag_m_16x)
+ tag_m_16x += 4;
+ }
+ }
+
+ // Scalar tail: one row per iteration for everything not covered above
+ if (tag_m_16x != m) {
+ __m256i matrixArray256;
+ __m256 accum256;
+ __m128 accum128, tmp128;
+ for (BLASLONG i = tag_m_16x; i < m; i++) {
+ accum256 = _mm256_setzero_ps();
+ matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*13]); // Load 1 rows with n=13
+ // Each fp32 lane receives the sum of two adjacent bf16 products -> 8 partial sums
+ accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);
+ // Horizontal reduction 8 -> 4 -> 2 -> 1; the total ends up in lane 0
+ accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+ accum128 = _mm_add_ps(accum128, tmp128);
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+ accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * accum128[0] + beta * y[i];
+#else
+ y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = accum128[0] * alpha;
+#else
+ y[i] = accum128[0];
+#endif
+#endif
+ }
+ }
+
+ return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=14 && lda ineffective scenario
+//
+// For every row i in [0, m) the 14 bfloat16 values a[i*14 .. i*14+13] are
+// multiplied with x[0..13], accumulated in fp32, and combined with y[i].
+// The scalar tail at the end of the function spells the update out:
+//   default  (_alpha_beta)        : y[i] = alpha * dot + beta * y[i]
+//   ONE_BETA (_alpha_one)         : y[i] = alpha * dot + y[i]
+//   ZERO_BETA (_alpha)            : y[i] = alpha * dot
+//   ZERO_BETA && ONE_ALPHA        : y[i] = dot
+// The vector paths delegate that step to the STORE*_COMPLETE_RESULT macros,
+// which are defined outside this function and are expected to apply the same
+// formula through ALPHAVECTOR / BETAVECTOR.
+//
+// Parameters:
+//   m     - number of rows of the matrix / elements of y
+//   alpha - scale of the dot product (not used by the ONE_ALPHA variant)
+//   a     - matrix data, rows stored back to back with a stride of 14 elements
+//   x     - input vector, 14 elements are read
+//   beta  - scale of the previous y (only present when ZERO_BETA is not defined)
+//   y     - result vector, m elements
+// Returns 0 unconditionally.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x14_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x14_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x14_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ // m rounded down to a multiple of 16; advanced by 8 and/or 4 below once the
+ // corresponding remainder blocks have been processed
+ BLASLONG tag_m_16x = m & (~15);
+
+ // 0xffff >> 2 == 0x3fff: the 14 low mask bits keep x0..x13, the remaining lanes are zeroed
+ unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 2);
+ __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value);
+ __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|0|0|
+
+ // Note: the 8-row and 4-row remainder blocks live inside this branch, so for
+ // m < 16 every row is handled by the scalar tail loop at the bottom.
+ if (tag_m_16x > 0) {
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \
+ matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+ __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+ __m512 accum512_0, accum512_1;
+ __m512 result_0, result_1;
+
+ // Broadcast scalars; not referenced by name below, presumably consumed by the STORE* macros
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ // Indices for _mm512_permutex2var_ps: values 0..15 pick a lane of the first
+ // source, 16..31 pick lane (value-16) of the second source.
+ // idx_base_0 gathers lanes 0-3 / 8-11 of both accumulators, idx_base_1 (= +4)
+ // gathers lanes 4-7 / 12-15, so adding the two gathers finishes the reduction.
+ __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+ __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
+ __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
+ // 32-bit element indices: the low 256 bits receive elements 0..6, the high
+ // 256 bits receive elements 7..13; the top slot of each half repeats element 0.
+ // NOTE(review): the repeated element presumably lines up with the zeroed x
+ // lanes 14/15 - confirm against BF16_PERMUTE_*_2, which is not visible here.
+ __m512i shift_idx = _mm512_set_epi32(0, 13, 12, 11, 10, 9, 8, 7, 0, 6, 5, 4, 3, 2, 1, 0);
+
+ // 0xffffffff >> 4: the 28 low mask bits, i.e. 2 * 14 bf16 elements per masked 512-bit load
+ unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4);
+ __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+
+ // Prepare X with 2-step interleave way
+ // (x256 is duplicated into both 256-bit halves before the interleave macro runs)
+ xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
+ BF16_INTERLEAVE_1x32(xArray)
+
+ // Main loop: 16 rows per iteration
+ for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ // Load matrix
+ BF16_MATRIX_MASKZ_LOAD_8x32_2(matrixArray, a, 14, idx_m, 0, load_mask)
+
+ // Pre-stage: shift the 2nd vector 1 position right for each register
+ BF16_PERMUTE_8x32_2(shift_idx, matrixArray)
+
+ // interleave per 256 bits
+ BF16_INTERLEAVE256_8x32(matrixArray)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_8x32(matrixArray)
+
+ // Calculate the temp result for a..p[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+ // Reorder and add up the final result
+ result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+ result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+ result_0 = _mm512_add_ps(result_0, result_1);
+ STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+ }
+
+ // Remainder of at least 8 rows: handle 8 rows at once
+ if (m - tag_m_16x > 7) {
+ // Swaps the two middle 128-bit groups (lanes 4-7 <-> 8-11) so that the
+ // low-half + high-half addition below pairs the matching partial sums
+ __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0);
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ // Load matrix
+ BF16_MATRIX_MASKZ_LOAD_4x32_2(matrixArray, a, 14, tag_m_16x, 0, load_mask)
+
+ // Pre-stage: shift the 2nd vector 1 position right for each register
+ BF16_PERMUTE_4x32_2(shift_idx, matrixArray)
+
+ // interleave per 256 bits
+ BF16_INTERLEAVE256_4x32(matrixArray)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_4x32(matrixArray)
+
+ // Calculate the temp result for a..h[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+ accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+ accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0);
+ __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+ STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+ tag_m_16x += 8;
+ }
+
+ // Remainder of at least 4 rows: handle 4 rows at once with 256-bit vectors
+ if (m - tag_m_16x > 3) {
+ __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7;
+ __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3;
+ __m256 accum256_0, accum256_1;
+
+ // Reuse the low 256 bits of the already interleaved x vectors
+ xArray256_0 = _mm512_castsi512_si256(xArray_0);
+ xArray256_1 = _mm512_castsi512_si256(xArray_1);
+ xArray256_2 = _mm512_castsi512_si256(xArray_2);
+ xArray256_3 = _mm512_castsi512_si256(xArray_3);
+
+ accum256_0 = _mm256_setzero_ps();
+ accum256_1 = _mm256_setzero_ps();
+
+ // One masked 256-bit load per row here, hence the 16-bit x_load_mask
+ BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 14, tag_m_16x, 0, x_load_mask)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_4x16(matrixArray256)
+
+ // Calculate the temp result for a..d[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256)
+
+ accum256_0 = _mm256_add_ps(accum256_0, accum256_1);
+ __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+ STORE4_COMPLETE_RESULT(result128, y+tag_m_16x)
+ tag_m_16x += 4;
+ }
+ }
+
+ // Scalar tail: one row per iteration for everything not covered above
+ if (tag_m_16x != m) {
+ __m256i matrixArray256;
+ __m256 accum256;
+ __m128 accum128, tmp128;
+ for (BLASLONG i = tag_m_16x; i < m; i++) {
+ accum256 = _mm256_setzero_ps();
+ matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*14]); // Load 1 rows with n=14
+ // Each fp32 lane receives the sum of two adjacent bf16 products -> 8 partial sums
+ accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);
+ // Horizontal reduction 8 -> 4 -> 2 -> 1; the total ends up in lane 0
+ accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+ accum128 = _mm_add_ps(accum128, tmp128);
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+ accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * accum128[0] + beta * y[i];
+#else
+ y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = accum128[0] * alpha;
+#else
+ y[i] = accum128[0];
+#endif
+#endif
+ }
+ }
+
+ return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=15 && lda ineffective scenario
+//
+// For every row i in [0, m) the 15 bfloat16 values a[i*15 .. i*15+14] are
+// multiplied with x[0..14], accumulated in fp32, and combined with y[i].
+// The scalar tail at the end of the function spells the update out:
+//   default  (_alpha_beta)        : y[i] = alpha * dot + beta * y[i]
+//   ONE_BETA (_alpha_one)         : y[i] = alpha * dot + y[i]
+//   ZERO_BETA (_alpha)            : y[i] = alpha * dot
+//   ZERO_BETA && ONE_ALPHA        : y[i] = dot
+// The vector paths delegate that step to the STORE*_COMPLETE_RESULT macros,
+// which are defined outside this function and are expected to apply the same
+// formula through ALPHAVECTOR / BETAVECTOR.
+//
+// Parameters:
+//   m     - number of rows of the matrix / elements of y
+//   alpha - scale of the dot product (not used by the ONE_ALPHA variant)
+//   a     - matrix data, rows stored back to back with a stride of 15 elements
+//   x     - input vector, 15 elements are read
+//   beta  - scale of the previous y (only present when ZERO_BETA is not defined)
+//   y     - result vector, m elements
+// Returns 0 unconditionally.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x15_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x15_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x15_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ // m rounded down to a multiple of 16; advanced by 8 and/or 4 below once the
+ // corresponding remainder blocks have been processed
+ BLASLONG tag_m_16x = m & (~15);
+
+ // 0xffff >> 1 == 0x7fff: the 15 low mask bits keep x0..x14, the last lane is zeroed
+ unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 1);
+ __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value);
+ __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|0|
+
+ // Note: the 8-row and 4-row remainder blocks live inside this branch, so for
+ // m < 16 every row is handled by the scalar tail loop at the bottom.
+ if (tag_m_16x > 0) {
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \
+ matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+ __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+ __m512 accum512_0, accum512_1;
+ __m512 result_0, result_1;
+
+ __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7;
+
+ // Broadcast scalars; not referenced by name below, presumably consumed by the STORE* macros
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ // Indices for _mm512_permutex2var_ps: values 0..15 pick a lane of the first
+ // source, 16..31 pick lane (value-16) of the second source.
+ // idx_base_0 gathers lanes 0-3 / 8-11 of both accumulators, idx_base_1 (= +4)
+ // gathers lanes 4-7 / 12-15, so adding the two gathers finishes the reduction.
+ __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+ __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
+ __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
+
+ // Prepare X with 2-step interleave way
+ // (x256 is duplicated into both 256-bit halves before the interleave macro runs)
+ xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
+ BF16_INTERLEAVE_1x32(xArray)
+
+ // Main loop: 16 rows per iteration
+ for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ // Load matrix
+ // (masked 256-bit loads starting at row idx_m, row stride 15; the registers
+ // are then paired into 512-bit vectors)
+ BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m, 0, x_load_mask)
+
+ matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+ matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+ matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+ matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+ // Same again for the second group of rows, starting at idx_m+8
+ BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m+8, 0, x_load_mask)
+
+ matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+ matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+ matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+ matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+ // interleave per 256 bits
+ BF16_INTERLEAVE256_8x32(matrixArray)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_8x32(matrixArray)
+
+ // Calculate the temp result for a..p[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+ // Reorder and add up the final result
+ result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+ result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+ result_0 = _mm512_add_ps(result_0, result_1);
+ STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+ }
+
+ // Remainder of at least 8 rows: handle 8 rows at once
+ if (m - tag_m_16x > 7) {
+ // Swaps the two middle 128-bit groups (lanes 4-7 <-> 8-11) so that the
+ // low-half + high-half addition below pairs the matching partial sums
+ __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0);
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ // Load matrix
+ BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask)
+
+ matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+ matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+ matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+ matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+ // interleave per 256 bits
+ // (0x44 selects the two low 128-bit lanes of each source, 0xee the two high ones)
+ matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44);
+ matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee);
+ matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44);
+ matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee);
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_4x32(matrixArray)
+
+ // Calculate the temp result for a..h[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+ accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+ accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0);
+ __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+ STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+ tag_m_16x += 8;
+ }
+
+ // Remainder of at least 4 rows: handle 4 rows at once with 256-bit vectors
+ if (m - tag_m_16x > 3) {
+ __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3;
+ __m256 accum256_0, accum256_1;
+
+ // Reuse the low 256 bits of the already interleaved x vectors
+ xArray256_0 = _mm512_castsi512_si256(xArray_0);
+ xArray256_1 = _mm512_castsi512_si256(xArray_1);
+ xArray256_2 = _mm512_castsi512_si256(xArray_2);
+ xArray256_3 = _mm512_castsi512_si256(xArray_3);
+
+ accum256_0 = _mm256_setzero_ps();
+ accum256_1 = _mm256_setzero_ps();
+
+ BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_4x16(matrixArray256)
+
+ // Calculate the temp result for a..d[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256)
+
+ accum256_0 = _mm256_add_ps(accum256_0, accum256_1);
+ __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+ STORE4_COMPLETE_RESULT(result128, y+tag_m_16x)
+ tag_m_16x += 4;
+ }
+ }
+
+ // Scalar tail: one row per iteration for everything not covered above
+ if (tag_m_16x != m) {
+ __m256i matrixArray256;
+ __m256 accum256;
+ __m128 accum128, tmp128;
+ for (BLASLONG i = tag_m_16x; i < m; i++) {
+ accum256 = _mm256_setzero_ps();
+ matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*15]); // Load 1 rows with n=15
+ // Each fp32 lane receives the sum of two adjacent bf16 products -> 8 partial sums
+ accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);
+ // Horizontal reduction 8 -> 4 -> 2 -> 1; the total ends up in lane 0
+ accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+ accum128 = _mm_add_ps(accum128, tmp128);
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+ accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * accum128[0] + beta * y[i];
+#else
+ y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = accum128[0] * alpha;
+#else
+ y[i] = accum128[0];
+#endif
+#endif
+ }
+ }
+
+ return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=16 && lda ineffective scenario
+//
+// For every row i in [0, m) the 16 bfloat16 values a[i*16 .. i*16+15] are
+// multiplied with x[0..15], accumulated in fp32, and combined with y[i].
+// The scalar tail at the end of the function spells the update out:
+//   default  (_alpha_beta)        : y[i] = alpha * dot + beta * y[i]
+//   ONE_BETA (_alpha_one)         : y[i] = alpha * dot + y[i]
+//   ZERO_BETA (_alpha)            : y[i] = alpha * dot
+//   ZERO_BETA && ONE_ALPHA        : y[i] = dot
+// The vector paths delegate that step to the STORE*_COMPLETE_RESULT macros,
+// which are defined outside this function and are expected to apply the same
+// formula through ALPHAVECTOR / BETAVECTOR.
+//
+// Parameters:
+//   m     - number of rows of the matrix / elements of y
+//   alpha - scale of the dot product (not used by the ONE_ALPHA variant)
+//   a     - matrix data, rows stored back to back with a stride of 16 elements
+//   x     - input vector, 16 elements are read
+//   beta  - scale of the previous y (only present when ZERO_BETA is not defined)
+//   y     - result vector, m elements
+// Returns 0 unconditionally.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x16_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x16_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x16_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ // m rounded down to a multiple of 16; advanced by 8 and/or 4 below once the
+ // corresponding remainder blocks have been processed
+ BLASLONG tag_m_16x = m & (~15);
+
+ // _mm256_loadu_si256 is declared with a __m256i pointer parameter, so the
+ // bfloat16 pointer needs an explicit cast (an implicit conversion between
+ // these pointer types is diagnosed, and rejected by recent compilers).
+ __m256i x256 = _mm256_loadu_si256((const __m256i *) x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|
+
+ // Note: the 8-row and 4-row remainder blocks live inside this branch, so for
+ // m < 16 every row is handled by the scalar tail loop at the bottom.
+ if (tag_m_16x > 0) {
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \
+ matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+ __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+ __m512 accum512_0, accum512_1;
+ __m512 result_0, result_1;
+
+ // Broadcast scalars; not referenced by name below, presumably consumed by the STORE* macros
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ // Indices for _mm512_permutex2var_ps: values 0..15 pick a lane of the first
+ // source, 16..31 pick lane (value-16) of the second source.
+ // idx_base_0 gathers lanes 0-3 / 8-11 of both accumulators, idx_base_1 (= +4)
+ // gathers lanes 4-7 / 12-15, so adding the two gathers finishes the reduction.
+ __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+ __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
+ __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
+
+ // Prepare X with 2-step interleave way
+ // (x256 is duplicated into both 256-bit halves before the interleave macro runs)
+ xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
+ BF16_INTERLEAVE_1x32(xArray)
+
+ // Main loop: 16 rows per iteration, two packed rows per 512-bit load
+ for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ matrixArray_8 = _mm512_loadu_si512(&a[(idx_m )*16]); // Load 2 rows with n=16
+ matrixArray_9 = _mm512_loadu_si512(&a[(idx_m+2 )*16]); // Load 2 rows with n=16
+ matrixArray_10 = _mm512_loadu_si512(&a[(idx_m+4 )*16]); // Load 2 rows with n=16
+ matrixArray_11 = _mm512_loadu_si512(&a[(idx_m+6 )*16]); // Load 2 rows with n=16
+ matrixArray_12 = _mm512_loadu_si512(&a[(idx_m+8 )*16]); // Load 2 rows with n=16
+ matrixArray_13 = _mm512_loadu_si512(&a[(idx_m+10)*16]); // Load 2 rows with n=16
+ matrixArray_14 = _mm512_loadu_si512(&a[(idx_m+12)*16]); // Load 2 rows with n=16
+ matrixArray_15 = _mm512_loadu_si512(&a[(idx_m+14)*16]); // Load 2 rows with n=16
+
+ // interleave per 256 bits
+ BF16_INTERLEAVE256_8x32(matrixArray)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_8x32(matrixArray)
+
+ // Calculate the temp result for a..p[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+ // Reorder and add up the final result
+ result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+ result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+ result_0 = _mm512_add_ps(result_0, result_1);
+ STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+ }
+
+ // Remainder of at least 8 rows: handle 8 rows at once
+ if (m - tag_m_16x > 7) {
+ // Swaps the two middle 128-bit groups (lanes 4-7 <-> 8-11) so that the
+ // low-half + high-half addition below pairs the matching partial sums
+ __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0);
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ matrixArray_4 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16
+ matrixArray_5 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16
+ matrixArray_6 = _mm512_loadu_si512(&a[(tag_m_16x+4 )*16]); // Load 2 rows with n=16
+ matrixArray_7 = _mm512_loadu_si512(&a[(tag_m_16x+6 )*16]); // Load 2 rows with n=16
+
+ // interleave per 256 bits
+ BF16_INTERLEAVE256_4x32(matrixArray)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_4x32(matrixArray)
+
+ // Calculate the temp result for a..h[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+ accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+ accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0);
+ __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+ STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+ tag_m_16x += 8;
+ }
+
+ // Remainder of at least 4 rows: handle 4 rows at once with 256-bit vectors
+ if (m - tag_m_16x > 3) {
+ __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, \
+ matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7;
+ __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3;
+ __m256 accum256_0, accum256_1;
+
+ // Reuse the low 256 bits of the already interleaved x vectors
+ xArray256_0 = _mm512_castsi512_si256(xArray_0);
+ xArray256_1 = _mm512_castsi512_si256(xArray_1);
+ xArray256_2 = _mm512_castsi512_si256(xArray_2);
+ xArray256_3 = _mm512_castsi512_si256(xArray_3);
+
+ accum256_0 = _mm256_setzero_ps();
+ accum256_1 = _mm256_setzero_ps();
+
+ matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16
+ matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16
+
+ // Split each 512-bit pair back into one 256-bit register per row
+ matrixArray256_0 = _mm512_castsi512_si256(matrixArray_0);
+ matrixArray256_1 = _mm512_extracti32x8_epi32(matrixArray_0, 0x1);
+ matrixArray256_2 = _mm512_castsi512_si256(matrixArray_1);
+ matrixArray256_3 = _mm512_extracti32x8_epi32(matrixArray_1, 0x1);
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_4x16(matrixArray256)
+
+ // Calculate the temp result for a..d[0:15]
+ BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256)
+
+ accum256_0 = _mm256_add_ps(accum256_0, accum256_1);
+ __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+ STORE4_COMPLETE_RESULT(result128, y+tag_m_16x)
+ tag_m_16x += 4;
+ }
+ }
+
+ // Scalar tail: one row per iteration for everything not covered above
+ if (tag_m_16x != m) {
+ __m256i matrixArray256;
+ __m256 accum256;
+ __m128 accum128, tmp128;
+ for (BLASLONG i = tag_m_16x; i < m; i++) {
+ accum256 = _mm256_setzero_ps();
+ // Explicit cast for the same reason as the x load at the top of the function
+ matrixArray256 = _mm256_loadu_si256((const __m256i *) &a[(i)*16]); // Load 1 rows with n=16
+ // Each fp32 lane receives the sum of two adjacent bf16 products -> 8 partial sums
+ accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);
+ // Horizontal reduction 8 -> 4 -> 2 -> 1; the total ends up in lane 0
+ accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+ accum128 = _mm_add_ps(accum128, tmp128);
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+ accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * accum128[0] + beta * y[i];
+#else
+ y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = accum128[0] * alpha;
+#else
+ y[i] = accum128[0];
+#endif
+#endif
+ }
+ }
+
+ return 0;
+}
+
+// 8 rows parallel processing BF16 GEMV kernel for n>16 && lda effective scenario
+//
+// For every row i in [0, m) the first n bfloat16 values of the row starting at
+// a[i*lda] are multiplied with x[0..n-1], accumulated in fp32, and combined
+// with y[i]. The scalar tail at the end of the function spells the update out:
+//   default  (_alpha_beta)        : y[i] = alpha * dot + beta * y[i]
+//   ONE_BETA (_alpha_one)         : y[i] = alpha * dot + y[i]
+//   ZERO_BETA (_alpha)            : y[i] = alpha * dot
+//   ZERO_BETA && ONE_ALPHA        : y[i] = dot
+// The vector paths delegate that step to the STORE*_COMPLETE_RESULT macros,
+// which are defined outside this function and are expected to apply the same
+// formula through ALPHAVECTOR / BETAVECTOR.
+//
+// Parameters:
+//   m     - number of rows of the matrix / elements of y
+//   n     - number of columns; a single 32-lane masked load covers a whole row
+//   alpha - scale of the dot product (not used by the ONE_ALPHA variant)
+//   a     - matrix data, row i starts at a[i*lda]
+//   lda   - row stride of a, in elements
+//   x     - input vector, n elements are read
+//   beta  - scale of the previous y (only present when ZERO_BETA is not defined)
+//   y     - result vector, m elements
+// Returns 0 unconditionally.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_8x16p_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_8x16p_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_8x16p_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+ // m rounded down to a multiple of 8; advanced by 4 below once the 4-row
+ // remainder block has been processed
+ BLASLONG tag_m_8x = m & (~7);
+
+ // Mask with the n low bits set, so lanes n..31 are zeroed on load.
+ // NOTE(review): the shift count (32-n) is only well defined for 1 <= n <= 32;
+ // this relies on the caller dispatching here only for such n - confirm.
+ unsigned int load_mask_value = (((unsigned int)0xffffffff) >> (32-n));
+ __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+ __m512i x512 = _mm512_maskz_loadu_epi16(load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|...
+
+ // Broadcast scalars; not referenced by name below, presumably consumed by the STORE* macros
+#ifndef ONE_ALPHA
+ __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+ __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+ __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \
+ matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+ __m512 accum512_0, accum512_1, accum512_2, accum512_3;
+ __m256 accum256;
+ __m128 accum128;
+
+ // Note: the 4-row remainder block lives inside this branch, so for m < 8
+ // every row is handled by the scalar tail loop at the bottom.
+ if (tag_m_8x > 0) {
+ __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+
+ // Indices for _mm512_permutex2var_ps: values 0..15 pick a lane of the first
+ // source, 16..31 pick lane (value-16) of the second source.
+ // idx_base_0 gathers lanes 0-3 / 8-11 of both accumulators, idx_base_1 (= +4)
+ // gathers lanes 4-7 / 12-15.
+ __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+ __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
+ __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
+
+ // Prepare X with 2-step interleave way
+ xArray_0 = x512;
+ BF16_INTERLEAVE_1x32(xArray)
+
+ // Main loop: 8 rows per iteration
+ for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ // Load 8 rows from matrix
+ BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, 0, load_mask)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_8x32(matrixArray)
+
+ // Calculate the temp result for a..h[0:31]
+ BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+ // Reorder and add up the final result
+ // (two gathers added together, then low 256 bits + high 256 bits -> 8 row results)
+ accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+ accum512_3 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+ accum512_2 = _mm512_add_ps(accum512_2, accum512_3);
+ accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_2), _mm512_extractf32x8_ps(accum512_2, 1));
+ STORE8_COMPLETE_RESULT(accum256, y+idx_m)
+ }
+
+ // Remainder of at least 4 rows: handle 4 rows at once
+ if (m - tag_m_8x > 3) {
+ accum512_0 = _mm512_setzero_ps();
+ accum512_1 = _mm512_setzero_ps();
+
+ // Load 4 rows from matrix
+ BF16_MATRIX_MASKZ_LOAD_4x32(matrixArray, a, lda, tag_m_8x, 0, load_mask)
+
+ // 2-step interleave for matrix
+ BF16_INTERLEAVE_4x32(matrixArray)
+
+ // Calculate the temp result for a..d[0:31]
+ BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+ // Fold 16 -> 8 -> 4 lanes, leaving one value per row
+ accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+ accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+ accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));
+ STORE4_COMPLETE_RESULT(accum128, y+tag_m_8x)
+ tag_m_8x += 4;
+ }
+ }
+
+ // Scalar tail: one row per iteration for everything not covered above
+ if (tag_m_8x != m) {
+ __m128 tmp128;
+ for (BLASLONG i = tag_m_8x; i < m; i++) {
+ accum512_0 = _mm512_setzero_ps();
+ matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 row: n elements selected by load_mask, row stride lda
+ // Each fp32 lane receives the sum of two adjacent bf16 products -> 16 partial sums
+ accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) matrixArray_0, (__m512bh) x512);
+ // Horizontal reduction 16 -> 8 -> 4 -> 2 -> 1; the total ends up in lane 0
+ accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+ accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+ accum128 = _mm_add_ps(accum128, tmp128);
+ tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+ accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+ y[i] = alpha * accum128[0] + beta * y[i];
+#else
+ y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+ y[i] = accum128[0] * alpha;
+#else
+ y[i] = accum128[0];
+#endif
+#endif
+ }
+ }
+
+ return 0;
+}
+
+// BF16 GEMV kernel for big N && lda effective scenario (every row is reduced on its own, before the 8-row interleave)
+//
+// The function name and signature are selected by the preprocessor:
+//   - neither ZERO_BETA nor ONE_BETA : ..._alpha_beta (takes beta)
+//   - ONE_BETA only                  : ..._alpha_one  (takes beta)
+//   - ZERO_BETA, no ONE_ALPHA        : ..._alpha      (no beta parameter)
+//   - ZERO_BETA and ONE_ALPHA        : plain name     (no beta parameter)
+//
+// For every row j in [0, m) a dot product over n bfloat16 elements of that row
+// and of x is accumulated in FP32, 128 elements per round, then 32 per round,
+// then one masked round for the last n % 32 elements. The leftover-row path at
+// the bottom spells out the update of y:
+//   y[j] = alpha * dot + beta * y[j]   (alpha_beta)
+//   y[j] = alpha * dot + y[j]          (alpha_one)
+//   y[j] = alpha * dot                 (alpha)
+//   y[j] = dot                         (plain)
+// Groups of 8 rows are stored through STORE8_COMPLETE_RESULT instead.
+// NOTE(review): the BF16_*/FP32_*/STORE8_* macros are defined outside this
+// block; presumably the row loads address a[j*lda + idx_n] and the store macro
+// applies the same alpha/beta update through ALPHAVECTOR/BETAVECTOR -- confirm
+// against the macro definitions.
+//
+// Always returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_1x128_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_1x128_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_1x128_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    // m rounded down to a multiple of 8, n rounded down to multiples of 32 and 128
+    BLASLONG tag_m_8x = m & (~7);
+    BLASLONG tag_n_32x = n & (~31);
+    BLASLONG tag_n_128x = n & (~127);
+
+    // The numbered accumulators are not referenced directly in this function;
+    // they are kept because the helper macros may expect them (TODO confirm).
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \
+           accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15;
+    __m512 accum512_bridge[8];
+    __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3;
+    __m256 accum256_0;
+    __m128 accum128;
+
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3;
+    __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+
+    // Mask with the low (n % 32) bits set. Shifting a 32-bit value by 32 is
+    // undefined in C, so n % 32 == 0 is handled explicitly; the mask is only
+    // consumed when n % 32 != 0.
+    unsigned int tail_mask_value = ((n&31) != 0) ? (((unsigned int)0xffffffff) >> (32-(n&31))) : 0;
+    __mmask32 tail_mask = (__mmask32) tail_mask_value;
+
+    __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+    __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
+    __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
+
+    if (tag_m_8x > 0) {
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            // Row index uses BLASLONG like its bound, so a large m cannot truncate
+            for (BLASLONG j = idx_m; j < idx_m + 8; j++) {
+                accum512_t_0 = _mm512_setzero_ps();
+                accum512_t_1 = _mm512_setzero_ps();
+                accum512_t_2 = _mm512_setzero_ps();
+                accum512_t_3 = _mm512_setzero_ps();
+                /* Processing the main chunk with 128-elements per round */
+                for (BLASLONG idx_n = 0; idx_n < tag_n_128x; idx_n += 128) {
+                    BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0)
+                    BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32)
+                    BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64)
+                    BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96)
+
+                    BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0)
+                    BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32)
+                    BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64)
+                    BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96)
+
+                    BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0)
+                    BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1)
+                    BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2)
+                    BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3)
+                }
+
+                /* Processing the remaining <128 chunk with 32-elements per round */
+                for (BLASLONG idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) {
+                    BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n)
+                    BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n)
+                    BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0)
+                }
+
+                /* Processing the remaining <32 chunk with masked 32-elements processing */
+                if ((n&31) != 0) {
+                    BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask)
+                    BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask)
+                    // Lands in accumulator 2; all four are summed right below
+                    BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0)
+                }
+
+                /* Accumulate the 4 registers into 1 register */
+                accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1);
+                accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3);
+                accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2);
+
+                // Temporarily save this row's 16 partial sums in the bridge array
+                accum512_bridge[j-idx_m] = accum512_t_0;
+            }
+
+            // Reduce the 8 saved registers to one 8-float result and store it
+            FP32_INTERLEAVE_8x16_ARRAY(accum512_bridge)
+            FP32_ACCUM2_8x16_ARRAY(accum512_bridge)
+            accum512_bridge[1] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_0, accum512_bridge[4]);
+            accum512_bridge[2] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_1, accum512_bridge[4]);
+            accum512_bridge[1] = _mm512_add_ps(accum512_bridge[1], accum512_bridge[2]);
+            accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_bridge[1]), _mm512_extractf32x8_ps(accum512_bridge[1], 1));
+            STORE8_COMPLETE_RESULT(accum256_0, y+idx_m)
+        }
+    }
+
+    // Leftover rows (m % 8), one at a time with a scalar update of y
+    if (tag_m_8x != m) {
+        __m128 tmp128;
+        for (BLASLONG j = tag_m_8x; j < m; j++) {
+            accum512_t_0 = _mm512_setzero_ps();
+            accum512_t_1 = _mm512_setzero_ps();
+            accum512_t_2 = _mm512_setzero_ps();
+            accum512_t_3 = _mm512_setzero_ps();
+            /* Processing the main chunk with 128-elements per round */
+            for (BLASLONG idx_n = 0; idx_n < tag_n_128x; idx_n += 128) {
+                BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0)
+                BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32)
+                BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64)
+                BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96)
+
+                BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0)
+                BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32)
+                BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64)
+                BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96)
+
+                BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0)
+                BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1)
+                BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2)
+                BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3)
+            }
+
+            /* Processing the remaining <128 chunk with 32-elements per round */
+            for (BLASLONG idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) {
+                BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n)
+                BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n)
+                BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0)
+            }
+
+            /* Processing the remaining <32 chunk with masked 32-elements processing */
+            if ((n&31) != 0) {
+                BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask)
+                BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask)
+                BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0)
+            }
+
+            /* Accumulate the 4 registers into 1 register */
+            accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1);
+            accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3);
+            accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2);
+
+            // Horizontal sum 16 -> 8 -> 4 -> 2 -> 1 floats; the total ends up in lane 0
+            accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_t_0), _mm512_extractf32x8_ps(accum512_t_0, 1));
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+            accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[j] = alpha * accum128[0] + beta * y[j];
+#else
+            y[j] = alpha * accum128[0] + y[j];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[j] = accum128[0] * alpha;
+#else
+            y[j] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 8 rows parallel processing BF16 GEMV kernel for n=32 && lda effective scenario (process before interleave)
+//
+// The function name and signature are selected by the preprocessor:
+//   - neither ZERO_BETA nor ONE_BETA : ..._alpha_beta (takes beta)
+//   - ONE_BETA only                  : ..._alpha_one  (takes beta)
+//   - ZERO_BETA, no ONE_ALPHA        : ..._alpha      (no beta parameter)
+//   - ZERO_BETA and ONE_ALPHA        : plain name     (no beta parameter)
+//
+// Rows are processed 8 at a time, 32 columns per round plus one masked round
+// for the last n % 32 columns; the 8 accumulators are then reduced to one
+// 8-float vector and stored through STORE8_COMPLETE_RESULT. Leftover rows
+// (m % 8) are reduced one by one and y[i] is updated in scalar code:
+//   y[i] = alpha * dot + beta * y[i]   (alpha_beta)
+//   y[i] = alpha * dot + y[i]          (alpha_one)
+//   y[i] = alpha * dot                 (alpha)
+//   y[i] = dot                         (plain)
+// NOTE(review): the BF16_*/FP32_*/STORE8_* macros are defined outside this
+// block; presumably they paste the matrixArray_N / accum512_N names and read
+// ALPHAVECTOR/BETAVECTOR -- confirm against the macro definitions.
+//
+// Always returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_8x32_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_8x32_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_8x32_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    // m rounded down to a multiple of 8, n rounded down to a multiple of 32
+    BLASLONG tag_m_8x = m & (~7);
+    BLASLONG tag_n_32x = n & (~31);
+
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \
+           accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15;
+    __m256 accum256_0;
+    __m128 accum128;
+
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
+    __m512i xArray_0;
+
+    // Mask with the low (n % 32) bits set. Shifting a 32-bit value by 32 is
+    // undefined in C, so n % 32 == 0 is handled explicitly; the mask is only
+    // consumed when tag_n_32x != n.
+    unsigned int tail_mask_value = ((n&31) != 0) ? (((unsigned int)0xffffffff) >> (32-(n&31))) : 0;
+    __mmask32 tail_mask = (__mmask32) tail_mask_value;
+
+    if (tag_m_8x > 0) {
+        __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+        __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);
+        __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+            accum512_2 = _mm512_setzero_ps();
+            accum512_3 = _mm512_setzero_ps();
+            accum512_4 = _mm512_setzero_ps();
+            accum512_5 = _mm512_setzero_ps();
+            accum512_6 = _mm512_setzero_ps();
+            accum512_7 = _mm512_setzero_ps();
+
+            for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) {
+                // Load 8 rows from matrix
+                BF16_MATRIX_LOAD_8x32(matrixArray, a, lda, idx_m, idx_n)
+
+                // Load x
+                BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n)
+
+                // Calculate the temp result for a..h[0:31]
+                BF16_DOT_8x32(accum512, matrixArray, xArray_0)
+            }
+
+            if (tag_n_32x != n) { // Go with masked 512
+                // Load 8 rows from matrix
+                BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, tag_n_32x, tail_mask)
+
+                // Load x
+                BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask)
+
+                // Calculate the temp result for a..h[0:31]
+                BF16_DOT_8x32(accum512, matrixArray, xArray_0)
+            }
+
+            // 2-step interleave for FP32 register array
+            FP32_INTERLEAVE_8x16(accum512)
+
+            // Accumulate the 2 batch of registers into 2 register (0 and 4)
+            FP32_ACCUM2_8x16(accum512)
+
+            accum512_1 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_4);
+            accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_4);
+            accum512_1 = _mm512_add_ps(accum512_1, accum512_2);
+            accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_1), _mm512_extractf32x8_ps(accum512_1, 1));
+            STORE8_COMPLETE_RESULT(accum256_0, y+idx_m)
+        }
+    }
+
+    // Leftover rows (m % 8), one at a time with a scalar update of y
+    if (tag_m_8x != m) {
+        __m128 tmp128;
+        for (BLASLONG i = tag_m_8x; i < m; i++) {
+            accum512_0 = _mm512_setzero_ps();
+            for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) {
+                // Load 32 elements from matrix
+                BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, i, idx_n)
+
+                // Load 32 elements from x
+                BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n)
+
+                // Calculate and accumulate the temp result
+                BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            }
+
+            if (tag_n_32x != n) {
+                // Load tail elements from matrix
+                BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, i, tag_n_32x, tail_mask)
+
+                // Load tail elements from x
+                BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask)
+
+                // Calculate and accumulate the temp result
+                BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            }
+
+            // Horizontal sum 16 -> 8 -> 4 -> 2 -> 1 floats; the total ends up in lane 0
+            accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+            accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 8 rows parallel processing BF16 GEMV kernel for n<=16 && lda effective scenario
+//
+// The function name and signature are selected by the preprocessor:
+//   - neither ZERO_BETA nor ONE_BETA : ..._alpha_beta (takes beta)
+//   - ONE_BETA only                  : ..._alpha_one  (takes beta)
+//   - ZERO_BETA, no ONE_ALPHA        : ..._alpha      (no beta parameter)
+//   - ZERO_BETA and ONE_ALPHA        : plain name     (no beta parameter)
+//
+// x (at most 16 bfloat16 values) is loaded once. n == 16 uses full 256-bit
+// loads; any other n uses loads masked to the low n elements (the mask is
+// built with a shift by 16-n, so callers must keep n within 0..16).
+// Rows are processed 8 at a time and stored through STORE8_COMPLETE_RESULT;
+// leftover rows (m % 8) are reduced one by one and y[i] is updated in scalar
+// code with the same alpha/beta rule in both branches:
+//   y[i] = alpha * dot + beta * y[i]   (alpha_beta)
+//   y[i] = alpha * dot + y[i]          (alpha_one)
+//   y[i] = alpha * dot                 (alpha)
+//   y[i] = dot                         (plain)
+// NOTE(review): the BF16_*/FP32_*/STORE8_* macros are defined outside this
+// block; presumably they paste the matrixArray_N / accum256_N names and read
+// ALPHAVECTOR/BETAVECTOR -- confirm against the macro definitions.
+//
+// Always returns 0.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_8x16m_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_8x16m_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_8x16m_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    // m rounded down to a multiple of 8
+    BLASLONG tag_m_8x = m & (~7);
+
+    __m256i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
+    __m256i xArray256;
+
+    // Keep align with other kernels and macro definition, the high 256bit is never used
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha));
+#endif
+#ifndef ZERO_BETA
+    __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta));
+#endif
+
+    __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \
+           accum256_8, accum256_9, accum256_10, accum256_11, accum256_12, accum256_13, accum256_14, accum256_15;
+
+    __m256i M256_EPI32_4 = _mm256_set1_epi32(4);
+    __m256i idx_base_0 = _mm256_set_epi32(11, 10, 9, 8, 3, 2, 1, 0);
+    __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_4);
+
+    // Mask with the low n bits set (all 16 bits when n == 16)
+    unsigned short load_mask_value = (((unsigned short)0xffff) >> (16-n));
+    __mmask16 load_mask = *((__mmask16*) &load_mask_value);
+
+    if (n == 16) {
+        BF16_VECTOR_LOAD_1x16(xArray256, x, 0)
+    } else {
+        BF16_VECTOR_MASKZ_LOAD_1x16(xArray256, x, 0, load_mask)
+    }
+
+    if (n == 16) {
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+            accum256_2 = _mm256_setzero_ps();
+            accum256_3 = _mm256_setzero_ps();
+            accum256_4 = _mm256_setzero_ps();
+            accum256_5 = _mm256_setzero_ps();
+            accum256_6 = _mm256_setzero_ps();
+            accum256_7 = _mm256_setzero_ps();
+
+            BF16_MATRIX_LOAD_8x16(matrixArray, a, lda, idx_m, 0)
+
+            BF16_DOT_8x16(accum256, matrixArray, xArray256)
+
+            // 2-step interleave for FP32 register array
+            FP32_INTERLEAVE_8x8(accum256)
+
+            // Accumulate the 2 batch of registers into 2 register (0 and 4)
+            FP32_ACCUM2_8x8(accum256)
+
+            accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4);
+            accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4);
+            accum256_1 = _mm256_add_ps(accum256_1, accum256_2);
+
+            STORE8_COMPLETE_RESULT(accum256_1, y+idx_m)
+        }
+
+        if (tag_m_8x != m) {
+            __m128 accum128, tmp128;
+            for (BLASLONG i = tag_m_8x; i < m; i++) {
+                accum256_0 = _mm256_setzero_ps();
+                // Load 1 row with n=16; the intrinsic takes a __m256i pointer, hence the cast
+                matrixArray_0 = _mm256_loadu_si256((const __m256i *) &a[(i)*lda]);
+                accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256);
+                // Horizontal sum 8 -> 4 -> 2 -> 1 floats; the total ends up in lane 0
+                accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+                tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+                accum128 = _mm_add_ps(accum128, tmp128);
+                tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+                accum128 = _mm_add_ps(accum128, tmp128);
+                // Same alpha/beta handling as the masked branch below: an
+                // unconditional "y[i] += dot * alpha" would ignore beta and
+                // would accumulate into y even in the ZERO_BETA variants.
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+                y[i] = alpha * accum128[0] + beta * y[i];
+#else
+                y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+                y[i] = accum128[0] * alpha;
+#else
+                y[i] = accum128[0];
+#endif
+#endif
+            }
+        }
+    } else {
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+            accum256_2 = _mm256_setzero_ps();
+            accum256_3 = _mm256_setzero_ps();
+            accum256_4 = _mm256_setzero_ps();
+            accum256_5 = _mm256_setzero_ps();
+            accum256_6 = _mm256_setzero_ps();
+            accum256_7 = _mm256_setzero_ps();
+
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray, a, lda, idx_m, 0, load_mask)
+
+            BF16_DOT_8x16(accum256, matrixArray, xArray256)
+
+            // 2-step interleave for FP32 register array
+            FP32_INTERLEAVE_8x8(accum256)
+
+            // Accumulate the 2 batch of registers into 2 register (0 and 4)
+            FP32_ACCUM2_8x8(accum256)
+
+            accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4);
+            accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4);
+            accum256_1 = _mm256_add_ps(accum256_1, accum256_2);
+
+            STORE8_COMPLETE_RESULT(accum256_1, y+idx_m)
+        }
+
+        if (tag_m_8x != m) {
+            __m128 accum128, tmp128;
+            for (BLASLONG i = tag_m_8x; i < m; i++) {
+                accum256_0 = _mm256_setzero_ps();
+                matrixArray_0 = _mm256_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 row, low n elements only
+                accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256);
+                // Horizontal sum 8 -> 4 -> 2 -> 1 floats; the total ends up in lane 0
+                accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+                tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+                accum128 = _mm_add_ps(accum128, tmp128);
+                tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+                accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+                y[i] = alpha * accum128[0] + beta * y[i];
+#else
+                y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+                y[i] = accum128[0] * alpha;
+#else
+                y[i] = accum128[0];
+#endif
+#endif
+            }
+        }
+    }
+
+    return 0;
+}
"r" (y), // 3
"r" (dot) // 4
: "cc",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (y), // 3
"r" (dot) // 4
: "cc",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
-#if defined(SKYLAKEX) || defined (COOPERLAKE)
/* the direct sgemm code written by Arjan van der Ven */
#include <immintrin.h>
#include "common.h"
+
+#if defined(SKYLAKEX) || defined (COOPERLAKE)
/*
* "Direct sgemm" code. This code operates directly on the inputs and outputs
* of the sgemm call, avoiding the copies, memory realignments and threading,
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
- "%xmm0", "%xmm1",
- "%xmm2", "%xmm3",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
- "%xmm8", "%xmm9",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
- "%xmm4", "%xmm5",
- "%xmm6", "%xmm7",
- "%xmm8", "%xmm9",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (ap[2]), // 6
"r" (ap[3]) // 7
: "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
--- /dev/null
+#include "common.h"
+
+#if defined(SKYLAKEX)
+#include "srot_microk_skylakex-2.c"
+#elif defined(HASWELL)
+#include "srot_microk_haswell-2.c"
+#endif
+
+#ifndef HAVE_SROT_KERNEL
+
+/*
+ * Portable fallback for the unit-stride plane rotation, compiled only when no
+ * architecture-specific file defined HAVE_SROT_KERNEL. For every i in [0, n):
+ *     x[i] <- c*x[i] + s*y[i]
+ *     y[i] <- c*y[i] - s*x[i]      (using the x[i] value from before the update)
+ * The main loop is unrolled by 4; a scalar loop finishes the last n % 4 elements.
+ */
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    FLOAT f0, f1, f2, f3;
+    FLOAT x0, x1, x2, x3;
+    FLOAT g0, g1, g2, g3;
+    FLOAT y0, y1, y2, y3;
+
+    FLOAT* xp = x;
+    FLOAT* yp = y;
+
+    /* The loop below consumes 4 elements per pass, so round n down to a
+     * multiple of 4 (rounding to a multiple of 8 left up to 7 elements to the
+     * scalar tail for no benefit). */
+    BLASLONG n1 = n & (~3);
+
+    while (i < n1) {
+        /* read all eight inputs before writing anything back */
+        x0 = xp[0];
+        y0 = yp[0];
+        x1 = xp[1];
+        y1 = yp[1];
+        x2 = xp[2];
+        y2 = yp[2];
+        x3 = xp[3];
+        y3 = yp[3];
+
+        f0 = c*x0 + s*y0;
+        g0 = c*y0 - s*x0;
+        f1 = c*x1 + s*y1;
+        g1 = c*y1 - s*x1;
+        f2 = c*x2 + s*y2;
+        g2 = c*y2 - s*x2;
+        f3 = c*x3 + s*y3;
+        g3 = c*y3 - s*x3;
+
+        xp[0] = f0;
+        yp[0] = g0;
+        xp[1] = f1;
+        yp[1] = g1;
+        xp[2] = f2;
+        yp[2] = g2;
+        xp[3] = f3;
+        yp[3] = g3;
+
+        xp += 4;
+        yp += 4;
+        i += 4;
+    }
+
+    /* scalar tail: at most 3 elements */
+    while (i < n) {
+        FLOAT temp = c*x[i] + s*y[i];
+        y[i] = c*y[i] - s*x[i];
+        x[i] = temp;
+
+        i++;
+    }
+}
+
+#endif
+/*
+ * Apply the plane rotation (c, s) to n element pairs of x and y:
+ *     x <- c*x + s*y,   y <- c*y - s*x (old x)
+ * Does nothing when n <= 0. Unit strides on both vectors are delegated to
+ * srot_kernel; any other stride combination is walked element by element.
+ */
+static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG done;
+    BLASLONG pos_x = 0, pos_y = 0;
+
+    if (n <= 0)
+        return;
+
+    if (inc_x == 1 && inc_y == 1) {
+        srot_kernel(n, x, y, c, s);
+        return;
+    }
+
+    for (done = 0; done < n; done++) {
+        /* new x is held back until y has been computed from the old x */
+        FLOAT rotated_x = c * x[pos_x] + s * y[pos_y];
+
+        y[pos_y] = c * y[pos_y] - s * x[pos_x];
+        x[pos_x] = rotated_x;
+
+        pos_x += inc_x;
+        pos_y += inc_y;
+    }
+}
+
+
+#if defined(SMP)
+/*
+ * Per-thread worker handed to blas_level1_thread: unpacks the argument block
+ * and runs rot_compute on it (m elements, a/lda as x and its stride, b/ldb as
+ * y and its stride). Always returns 0.
+ */
+static int rot_thread_function(blas_arg_t *args)
+{
+    /* The caller in this file packs c and s as a FLOAT[2], so read them back
+     * through FLOAT rather than a hard-coded float; the two only coincide in
+     * single-precision builds.
+     * NOTE(review): assumes blas_level1_thread forwards the alpha pointer
+     * unchanged -- confirm. */
+    FLOAT *cs = (FLOAT *)args->alpha;
+
+    rot_compute(args->m,
+                args->a, args->lda,
+                args->b, args->ldb,
+                cs[0],
+                cs[1]);
+    return 0;
+}
+
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+/*
+ * Rotation entry point. Without SMP the work is done inline by rot_compute.
+ * With SMP it is split across num_cpu_avail(1) threads only when both strides
+ * are non-zero and n exceeds 100000; otherwise it also runs inline.
+ * Always returns 0.
+ */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+#if defined(SMP)
+    FLOAT alpha[2]={c, s};   /* c and s travel to the workers as one array */
+    FLOAT dummy_c;           /* only its address is passed, as the unused "c" operand */
+    int nthreads = 1;
+
+    if (inc_x != 0 && inc_y != 0 && n > 100000) {
+        nthreads = num_cpu_avail(1);
+    }
+
+    if (nthreads != 1) {
+#if defined(DOUBLE)
+        int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD;
+#else
+        int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD;
+#endif
+        blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads);
+    }
+    else {
+        rot_compute(n, x, inc_x, y, inc_y, c, s);
+    }
+#else
+    rot_compute(n, x, inc_x, y, inc_y, c, s);
+#endif
+    return 0;
+}
--- /dev/null
+/* need a new enough compiler, and a build that enables AVX2 and FMA: the
+ * kernel below uses only 256-bit AVX/FMA intrinsics, so it must not be gated
+ * on an AVX512 feature macro (that kept it disabled on GCC Haswell builds) */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__) && defined(__FMA__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+/*
+ * Unit-stride plane rotation with 256-bit vectors. For every i in [0, n):
+ *     x[i] <- c*x[i] + s*y[i]
+ *     y[i] <- c*y[i] - s*x[i]      (using the x[i] value from before the update)
+ * 32 floats per pass, then 8 floats per pass, then a scalar loop for the
+ * last n % 8 elements.
+ */
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+
+    /* n rounded down to multiples of 8 and 32 */
+    BLASLONG tail_index_8 = n&(~7);
+    BLASLONG tail_index_32 = n&(~31);
+
+    /* broadcast unconditionally so the vectors are never left uninitialized */
+    __m256 c_256 = _mm256_set1_ps(c);
+    __m256 s_256 = _mm256_set1_ps(s);
+
+    __m256 x0, x1, x2, x3;
+    __m256 y0, y1, y2, y3;
+    __m256 t0, t1, t2, t3;
+
+    for (i = 0; i < tail_index_32; i += 32) {
+        x0 = _mm256_loadu_ps(&x[i + 0]);
+        x1 = _mm256_loadu_ps(&x[i + 8]);
+        x2 = _mm256_loadu_ps(&x[i +16]);
+        x3 = _mm256_loadu_ps(&x[i +24]);
+        y0 = _mm256_loadu_ps(&y[i + 0]);
+        y1 = _mm256_loadu_ps(&y[i + 8]);
+        y2 = _mm256_loadu_ps(&y[i +16]);
+        y3 = _mm256_loadu_ps(&y[i +24]);
+
+        /* new x = c*x + s*y */
+        t0 = _mm256_mul_ps(s_256, y0);
+        t1 = _mm256_mul_ps(s_256, y1);
+        t2 = _mm256_mul_ps(s_256, y2);
+        t3 = _mm256_mul_ps(s_256, y3);
+
+        t0 = _mm256_fmadd_ps(c_256, x0, t0);
+        t1 = _mm256_fmadd_ps(c_256, x1, t1);
+        t2 = _mm256_fmadd_ps(c_256, x2, t2);
+        t3 = _mm256_fmadd_ps(c_256, x3, t3);
+
+        _mm256_storeu_ps(&x[i + 0], t0);
+        _mm256_storeu_ps(&x[i + 8], t1);
+        _mm256_storeu_ps(&x[i +16], t2);
+        _mm256_storeu_ps(&x[i +24], t3);
+
+        /* new y = c*y - s*x, from the x values loaded above */
+        t0 = _mm256_mul_ps(s_256, x0);
+        t1 = _mm256_mul_ps(s_256, x1);
+        t2 = _mm256_mul_ps(s_256, x2);
+        t3 = _mm256_mul_ps(s_256, x3);
+
+        t0 = _mm256_fmsub_ps(c_256, y0, t0);
+        t1 = _mm256_fmsub_ps(c_256, y1, t1);
+        t2 = _mm256_fmsub_ps(c_256, y2, t2);
+        t3 = _mm256_fmsub_ps(c_256, y3, t3);
+
+        _mm256_storeu_ps(&y[i + 0], t0);
+        _mm256_storeu_ps(&y[i + 8], t1);
+        _mm256_storeu_ps(&y[i +16], t2);
+        _mm256_storeu_ps(&y[i +24], t3);
+
+    }
+
+    for (i = tail_index_32; i < tail_index_8; i += 8) {
+        x0 = _mm256_loadu_ps(&x[i]);
+        y0 = _mm256_loadu_ps(&y[i]);
+
+        t0 = _mm256_mul_ps(s_256, y0);
+        t0 = _mm256_fmadd_ps(c_256, x0, t0);
+        _mm256_storeu_ps(&x[i], t0);
+
+        t0 = _mm256_mul_ps(s_256, x0);
+        t0 = _mm256_fmsub_ps(c_256, y0, t0);
+        _mm256_storeu_ps(&y[i], t0);
+    }
+
+    /* scalar tail: at most 7 elements */
+    for (i = tail_index_8; i < n; ++i) {
+        FLOAT temp = c * x[i] + s * y[i];
+        y[i] = c * y[i] - s * x[i];
+        x[i] = temp;
+    }
+}
+#endif
--- /dev/null
+/* need a new enough GCC for avx512 support */
+#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+
+#define HAVE_SROT_KERNEL 1
+
+#include <immintrin.h>
+#include <stdint.h>
+
+/*
+ * Unit-stride plane rotation with 512-bit vectors. For every i in [0, n):
+ *     x[i] <- c*x[i] + s*y[i]
+ *     y[i] <- c*y[i] - s*x[i]      (using the x[i] value from before the update)
+ * 64 floats per pass, then 16 floats per pass, then one masked pass that
+ * covers the last n % 16 elements.
+ */
+static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    const __m512 cos_v = _mm512_set1_ps(c);
+    const __m512 sin_v = _mm512_set1_ps(s);
+
+    /* n rounded down to multiples of 64 and 16 */
+    const BLASLONG end_64 = n&(~63);
+    const BLASLONG end_16 = n&(~15);
+
+    BLASLONG pos;
+    int lane;
+    __m512 xv[4], yv[4];
+
+    for (pos = 0; pos < end_64; pos += 64) {
+        /* all loads happen before any store of this pass */
+        for (lane = 0; lane < 4; lane++) {
+            xv[lane] = _mm512_loadu_ps(&x[pos + 16 * lane]);
+        }
+        for (lane = 0; lane < 4; lane++) {
+            yv[lane] = _mm512_loadu_ps(&y[pos + 16 * lane]);
+        }
+
+        /* new x = c*x + s*y */
+        for (lane = 0; lane < 4; lane++) {
+            _mm512_storeu_ps(&x[pos + 16 * lane],
+                             _mm512_fmadd_ps(cos_v, xv[lane], _mm512_mul_ps(sin_v, yv[lane])));
+        }
+
+        /* new y = c*y - s*x, from the x values loaded above */
+        for (lane = 0; lane < 4; lane++) {
+            _mm512_storeu_ps(&y[pos + 16 * lane],
+                             _mm512_fmsub_ps(cos_v, yv[lane], _mm512_mul_ps(sin_v, xv[lane])));
+        }
+    }
+
+    for (pos = end_64; pos < end_16; pos += 16) {
+        const __m512 old_x = _mm512_loadu_ps(&x[pos]);
+        const __m512 old_y = _mm512_loadu_ps(&y[pos]);
+
+        _mm512_storeu_ps(&x[pos], _mm512_fmadd_ps(cos_v, old_x, _mm512_mul_ps(sin_v, old_y)));
+        _mm512_storeu_ps(&y[pos], _mm512_fmsub_ps(cos_v, old_y, _mm512_mul_ps(sin_v, old_x)));
+    }
+
+    if ((n & 15) > 0) {
+        /* low (n % 16) bits set: only the remaining elements are loaded and stored */
+        const __mmask16 rest = (__mmask16)(((uint16_t) 0xffff) >> (16-(n&15)));
+        const __m512 old_x = _mm512_maskz_loadu_ps(rest, &x[end_16]);
+        const __m512 old_y = _mm512_maskz_loadu_ps(rest, &y[end_16]);
+
+        _mm512_mask_storeu_ps(&x[end_16], rest,
+                              _mm512_fmadd_ps(cos_v, old_x, _mm512_mul_ps(sin_v, old_y)));
+        _mm512_mask_storeu_ps(&y[end_16], rest,
+                              _mm512_fmsub_ps(cos_v, old_y, _mm512_mul_ps(sin_v, old_x)));
+    }
+}
+#endif
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
return;
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"r" (alpha), // 4
"r" (mvec) // 5
: "cc",
- "%xmm0", "%xmm1",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
goto exit_level_0;
}
liwork = iwork_query;
- lcwork = LAPACK_C2INT(cwork_query);
+ lcwork = LAPACK_Z2INT(cwork_query);
lrwork = (lapack_int)rwork_query;
/* Allocate memory for work arrays */
iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
#define SBGEMM_DEFAULT_P 832
#define SBGEMM_DEFAULT_Q 1026
#define SBGEMM_DEFAULT_R 4096
+#undef DGEMM_DEFAULT_UNROLL_M
+#undef DGEMM_DEFAULT_UNROLL_N
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 8
#endif
#if defined(SPARC) && defined(V7)