OpenBLAS ChangeLog
====================================================================
+Version 0.2.6
+2-Mar-2013
+common:
+ * Improved OpenMP performance slightly. (d744c9)
+ * Improved cblas.h compatibility with Intel MKL. (#185)
+ * Fixed an overflow bug in single-threaded Cholesky factorization.
+ * Fixed a buffer overflow bug in multithreaded hbmv and sbmv. (#174)
+
+x86/x86-64:
+ * Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
+ We will tune the performance in the future.
+ * Auto-detect Intel Xeon E7540.
+ * Fixed a buffer overflow bug in gemv. (#173)
+ * Fixed a bug in s/cdot that performed an invalid read of NaN values on x86_64. (#189)
+
+MIPS64:
+
+====================================================================
Version 0.2.5
26-Nov-2012
common:
#endif
@$(MAKE) -C reference clean
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
- @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
+ @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d $(NETLIB_LAPACK_DIR); then \
echo deleting $(NETLIB_LAPACK_DIR); \
rm -rf $(NETLIB_LAPACK_DIR) ;\
+# This is triggered by Makefile.system and runs before any of the code is built.
+
export BINARY
export USE_OPENMP
EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif
-all: getarch_2nd
+all: getarch_2nd cblas_noconst.h
./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF)
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
endif
+cblas_noconst.h : cblas.h
+ perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h
+
dummy:
#
# This library's version
-VERSION = 0.2.5
+VERSION = 0.2.6
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
export GOTOBLAS_MAKEFILE = 1
# Generating Makefile.conf and config.h
-DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
+DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
endif
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
endif
endif
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
-- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes.
+- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thanks to Werner Saar)
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
SHANGHAI
ISTANBUL
BOBCAT
+BULLDOZER
c)VIA CPU:
SSE_GENERIC
#ifndef CBLAS_H
#define CBLAS_H
+#include <stddef.h>
+#include "common.h"
+
#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */
-#include <stddef.h>
-#include "common.h"
-
/*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
+/*Get the build configuration at runtime.*/
+char* openblas_get_config(void);
+
#define CBLAS_INDEX size_t
-enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
-enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114};
-enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
-enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
-enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
-
-float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy);
-double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
-float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
-double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
-
-openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy);
-openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy);
-openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
-openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
-
-void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
-void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
-void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
-void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
-
-float cblas_sasum (blasint n, float *x, blasint incx);
-double cblas_dasum (blasint n, double *x, blasint incx);
-float cblas_scasum(blasint n, float *x, blasint incx);
-double cblas_dzasum(blasint n, double *x, blasint incx);
-
-float cblas_snrm2 (blasint N, float *X, blasint incX);
-double cblas_dnrm2 (blasint N, double *X, blasint incX);
-float cblas_scnrm2(blasint N, float *X, blasint incX);
-double cblas_dznrm2(blasint N, double *X, blasint incX);
-
-CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx);
-CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
-CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx);
-CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
-
-void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy);
-void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy);
-void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy);
-void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy);
-
-void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
-void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
-void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
-void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
-
-void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
-void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
-void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
-void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
-
-void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
-void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s);
+typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
+typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
+typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
+typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
+typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
+
+float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy);
+double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
+float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
+double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
+
+openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
+openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
+openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
+openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
+
+void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
+void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
+void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
+void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
+
+float cblas_sasum (const blasint n, const float *x, const blasint incx);
+double cblas_dasum (const blasint n, const double *x, const blasint incx);
+float cblas_scasum(const blasint n, const float *x, const blasint incx);
+double cblas_dzasum(const blasint n, const double *x, const blasint incx);
+
+float cblas_snrm2 (const blasint N, const float *X, const blasint incX);
+double cblas_dnrm2 (const blasint N, const double *X, const blasint incX);
+float cblas_scnrm2(const blasint N, const float *X, const blasint incX);
+double cblas_dznrm2(const blasint N, const double *X, const blasint incX);
+
+CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx);
+CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx);
+CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx);
+CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx);
+
+void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy);
+void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy);
+void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy);
+void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy);
+
+void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
+void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
+void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
+void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
+
+void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
+void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
+void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
+void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
+
+void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s);
+void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s);
void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s);
-void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
-void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
-
-void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
-void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
-
-void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
-void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
-void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
-void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
-void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
-void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
-
-void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
- float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy);
-void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
- double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy);
-void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
- float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy);
-void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
- double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy);
-
-void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
-void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
-void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
-void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
-void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
-void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
-
-void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
-void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
-void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
-void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
-
-void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
-void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
-void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
-void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
-
-void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
-void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
-void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
-void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
-
-void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
- blasint incX, float *Y, blasint incY, float *A, blasint lda);
-void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
- blasint incX, double *Y, blasint incY, double *A, blasint lda);
-void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
- float *Y, blasint incY, float *A, blasint lda);
-void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
- double *Y, blasint incY, double *A, blasint lda);
-
-void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
- blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
-void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
- blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
-void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
- blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
-void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
- blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
-
-void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
- blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
-void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
- blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
-
-
-void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
-void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
-void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
-void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
-
-void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
-void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
-void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
-void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
-
-void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, float *Ap, float *X, blasint incX);
-void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, double *Ap, double *X, blasint incX);
-void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, float *Ap, float *X, blasint incX);
-void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, double *Ap, double *X, blasint incX);
-
-void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, float *Ap, float *X, blasint incX);
-void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, double *Ap, double *X, blasint incX);
-void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, float *Ap, float *X, blasint incX);
-void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
- blasint N, double *Ap, double *X, blasint incX);
-
-void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
- blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
-void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
- blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
-void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
- blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
-void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
- blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
-
-
-void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
- float *X, blasint incX, float beta, float *Y, blasint incY);
-void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
- double *X, blasint incX, double beta, double *Y, blasint incY);
-
-void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
-void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
-
-void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
-void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
-
-void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
-void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
-void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
-void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
-
-void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
- float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
-void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
- double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
-
-void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
- float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
-void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
- double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
-
-void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
- float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
-void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
- double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
-void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
- float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
-void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
- double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
-
-void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
- float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
-void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
- double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
-void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
- float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
-void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
- double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
-
-void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
- blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
-void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
- blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
-void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
- blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
-void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
- blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
-
-void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
- blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
-void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
- blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
-void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
- blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
-void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
- blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
-
-void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
- enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
-void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
- enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
-void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
- enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
-void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
- enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
-
-void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
- enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
-void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
- enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
-void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
- enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
-void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
- enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
-
-void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
- float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
-void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
- double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
-
-void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
- float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
-void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
- double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
-
-void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
- float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
-void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
- double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
+void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P);
+void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P);
+
+void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
+void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
+
+void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX);
+void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX);
+void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX);
+void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX);
+void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX);
+void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX);
+
+void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
+ const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy);
+void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
+ const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy);
+void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
+ const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy);
+void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
+ const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy);
+
+void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
+void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
+void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
+void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
+void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
+void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
+
+void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
+void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
+void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
+void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
+
+void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
+void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
+void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
+void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
+
+void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
+void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
+void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
+void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
+
+void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X,
+ const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
+void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,
+ const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
+void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX,
+ const float *Y, const blasint incY, float *A, const blasint lda);
+void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX,
+ const double *Y, const blasint incY, double *A, const blasint lda);
+
+void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
+ const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
+void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
+ const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
+void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
+ const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
+void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
+ const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
+
+void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A,
+ const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
+void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A,
+ const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
+
+
+void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
+void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
+void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
+void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
+
+void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
+void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
+void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
+void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
+
+void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const float *Ap, float *X, const blasint incX);
+void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const double *Ap, double *X, const blasint incX);
+void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const float *Ap, float *X, const blasint incX);
+void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const double *Ap, double *X, const blasint incX);
+
+void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const float *Ap, float *X, const blasint incX);
+void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const double *Ap, double *X, const blasint incX);
+void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const float *Ap, float *X, const blasint incX);
+void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
+ const blasint N, const double *Ap, double *X, const blasint incX);
+
+void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A,
+ const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
+void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A,
+ const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
+void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A,
+ const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
+void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A,
+ const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
+
+
+void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap,
+ const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
+void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap,
+ const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
+
+void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap);
+void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap);
+
+void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A);
+void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A);
+
+void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A);
+void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A);
+void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap);
+void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap);
+
+void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
+ const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
+void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
+ const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
+
+void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
+ const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
+void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
+ const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
+
+void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
+ const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
+void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
+ const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
+void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
+ const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
+void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
+ const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
+
+void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
+ const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
+void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
+ const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
+void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
+ const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
+void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
+ const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
+
+void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
+ const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
+void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
+ const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
+void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
+ const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc);
+void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
+ const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc);
+
+void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
+ const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
+void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
+ const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
+void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
+ const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
+void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
+ const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
+
+void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
+void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
+void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
+void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
+
+void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
+void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
+void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
+void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
+ const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
+
+void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
+ const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
+void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
+ const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
+
+void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
+ const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
+void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
+ const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
+
+void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
+ const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
+void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
+ const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...);
#ifdef __cplusplus
}
-
#endif /* __cplusplus */
#endif
/* C99 supports complex floating numbers natively, which GCC also offers as an
extension since version 3.0. If neither are available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
-#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3
+#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
+ (__GNUC__ >= 3 && !defined(__cplusplus)))
#define OPENBLAS_COMPLEX_C99
typedef float _Complex openblas_complex_float;
typedef double _Complex openblas_complex_double;
#include "common_level3.h"
#include "common_lapack.h"
#ifdef CBLAS
-#include "cblas.h"
+/* This header file is generated from "cblas.h" (see Makefile.prebuild). */
+#include "cblas_noconst.h"
#endif
#ifndef ASSEMBLER
#define HAVE_MISALIGNSSE (1 << 15)
#define HAVE_128BITFPU (1 << 16)
#define HAVE_FASTMOVU (1 << 17)
-#define HAVE_AVX (1 << 18)
+#define HAVE_AVX (1 << 18)
+#define HAVE_FMA4 (1 << 19)
#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
#ifdef NO_AVX
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM
+#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
+#define CORE_BULLDOZER CORE_BARCELONA
#endif
#ifndef CPUIDEMU
#ifndef NO_AVX
static inline void xgetbv(int op, int * eax, int * edx){
+  //Emit the raw opcode bytes (0F 01 D0) for xgetbv so this builds with assemblers that lack the mnemonic
__asm__ __volatile__
- ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
+ (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A;
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE;
+#ifndef NO_AVX
+ if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4;
+#endif
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
}
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
+ case 14:
+ // Xeon E7540
case 15:
//Xeon Processor E7 (Westmere-EX)
return CPUTYPE_NEHALEM;
return CPUTYPE_OPTERON;
case 1:
case 10:
- case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
return CPUTYPE_BARCELONA;
+ case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+ if(support_avx())
+ return CPUTYPE_BULLDOZER;
+ else
+	      return CPUTYPE_BARCELONA; //OS doesn't support AVX.
case 5:
return CPUTYPE_BOBCAT;
}
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM; //OS doesn't support AVX
+ case 14:
+ //Xeon E7540
case 15:
//Xeon Processor E7 (Westmere-EX)
return CORE_NEHALEM;
if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;
- else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
- else return CORE_BARCELONA;
+ else if (exfamily == 6) {
+ //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+ if(support_avx())
+ return CORE_BULLDOZER;
+ else
+	  return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels.
+ }else return CORE_BARCELONA;
}
}
printf("#define DTB_SIZE %d\n", info.size * 1024);
printf("#define DTB_ASSOCIATIVE %d\n", info.associative);
printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize);
+ } else {
+    //Fall back to a sane default when no cache/TLB info is reported (e.g. in some virtual machines).
+ printf("#define DTB_DEFAULT_ENTRIES 32\n");
}
features = get_cputype(GET_FEATURE);
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
+ if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
+ if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
}
a = (FLOAT *)args -> a;
x = (FLOAT *)args -> b;
- y = (FLOAT *)args -> c;
lda = args -> lda;
incx = args -> ldb;
n_from = 0;
n_to = n;
+  //Carve this thread's y out of the shared sb buffer: n*COMPSIZE elements, rounded up to a 1024-byte boundary
+ y = buffer;
+ buffer += ((COMPSIZE * n + 1023) & ~1023);
+
if (range_m) {
n_from = *(range_m + 0);
n_to = *(range_m + 1);
a += n_from * lda * COMPSIZE;
}
- if (range_n) y += *range_n * COMPSIZE;
if (incx != 1) {
COPY_K(n, x, incx, buffer, 1);
if (num_cpu) {
queue[0].sa = NULL;
- queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
+ queue[0].sb = buffer;
queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue);
#else
ONE, ZERO,
#endif
- buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
+ (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
}
AXPYU_K(n, 0, 0,
TOPDIR = ../..
include ../../Makefile.system
-COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
+COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX)
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
$(CC) $(CFLAGS) -c $< -o $(@F)
+openblas_get_config.$(SUFFIX) : openblas_get_config.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
+ queue->sb=sb;
}
#ifdef MONITOR
int blas_server_avail = 0;
+static void * blas_thread_buffer[MAX_CPU_NUMBER];
+
void goto_set_num_threads(int num_threads) {
+ int i=0;
+
if (num_threads < 1) num_threads = blas_num_threads;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
blas_cpu_number = num_threads;
omp_set_num_threads(blas_cpu_number);
-
+
+  //Resize the per-thread buffer pool: allocate buffers for the active threads, free the rest
+ for(i=0; i<blas_cpu_number; i++){
+ if(blas_thread_buffer[i]==NULL){
+ blas_thread_buffer[i]=blas_memory_alloc(2);
+ }
+ }
+ for(; i<MAX_CPU_NUMBER; i++){
+ if(blas_thread_buffer[i]!=NULL){
+ blas_memory_free(blas_thread_buffer[i]);
+ blas_thread_buffer[i]=NULL;
+ }
+ }
#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
blas_set_parameter();
int blas_thread_init(void){
+ int i=0;
+
blas_get_cpu_number();
blas_server_avail = 1;
+ for(i=0; i<blas_num_threads; i++){
+ blas_thread_buffer[i]=blas_memory_alloc(2);
+ }
+ for(; i<MAX_CPU_NUMBER; i++){
+ blas_thread_buffer[i]=NULL;
+ }
+
return 0;
}
int BLASFUNC(blas_thread_shutdown)(void){
-
+ int i=0;
blas_server_avail = 0;
+ for(i=0; i<MAX_CPU_NUMBER; i++){
+ if(blas_thread_buffer[i]!=NULL){
+ blas_memory_free(blas_thread_buffer[i]);
+ blas_thread_buffer[i]=NULL;
+ }
+ }
+
return 0;
}
static void exec_threads(blas_queue_t *queue){
void *buffer, *sa, *sb;
-
+ int pos=0, release_flag=0;
+
buffer = NULL;
sa = queue -> sa;
sb = queue -> sb;
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
- buffer = blas_memory_alloc(2);
+ pos = omp_get_thread_num();
+ buffer = blas_thread_buffer[pos];
+
+    //Fallback: allocate a fresh buffer if this thread's pool entry was never initialized
+ if(buffer==NULL) {
+ buffer = blas_memory_alloc(2);
+ release_flag=1;
+ }
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
+ queue->sb=sb;
}
}
}
- if (buffer != NULL) blas_memory_free(buffer);
+ if (release_flag) blas_memory_free(buffer);
}
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
+ queue->sb=sb;
}
#ifdef MONITOR
void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
-}
\ No newline at end of file
+}
extern gotoblas_t gotoblas_BOBCAT;
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
+extern gotoblas_t gotoblas_BULLDOZER;
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
+#define gotoblas_BULLDOZER gotoblas_BARCELONA
#endif
#ifndef NO_AVX
static inline void xgetbv(int op, int * eax, int * edx){
+  //Emit the raw opcode bytes (0F 01 D0) for xgetbv so this builds with assemblers that lack the mnemonic
__asm__ __volatile__
- ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
+ (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
//Intel Xeon Processor 5600 (Westmere-EP)
//Xeon Processor E7 (Westmere-EX)
- if (model == 12 || model == 15) return &gotoblas_NEHALEM;
+ //Xeon E7540
+ if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
//Intel Core i7-3000 / Xeon E5
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
- fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
+ fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
- fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n");
+ fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
else return &gotoblas_OPTERON;
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
+ } else if (exfamily == 6) {
+ //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+ if(support_avx())
+ return &gotoblas_BULLDOZER;
+ else{
+ fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
+ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+ }
} else {
return &gotoblas_BARCELONA;
}
"Nano",
"Sandybridge",
"Bobcat",
+ "Bulldozer",
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_NANO) return corename[15];
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
+ if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
return corename[0];
}
--- /dev/null
+/*****************************************************************************
+Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the ISCAS nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common.h"
+
+static char* openblas_config_str=""
+#ifdef USE64BITINT
+ "USE64BITINT "
+#endif
+#ifdef NO_CBLAS
+ "NO_CBLAS "
+#endif
+#ifdef NO_LAPACK
+ "NO_LAPACK "
+#endif
+#ifdef NO_LAPACKE
+ "NO_LAPACKE "
+#endif
+#ifdef DYNAMIC_ARCH
+ "DYNAMIC_ARCH "
+#endif
+#ifdef NO_AFFINITY
+ "NO_AFFINITY "
+#endif
+ ;
+
+char* CNAME() {
+ return openblas_config_str;
+}
+
int eax, ebx, ecx, edx;
-#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
+#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
ifeq ($(F_COMPILER), GFORTRAN)
EXTRALIB += -lgfortran
endif
+ifeq ($(USE_OPENMP), 1)
+ifeq ($(C_COMPILER), GCC)
+EXTRALIB += -lgomp
+endif
+endif
endif
ifeq ($(OSNAME), CYGWIN_NT)
@misc_no_underscore_objs = (
openblas_set_num_threads, goto_set_num_threads,
+ openblas_get_config,
);
@misc_underscore_objs = (
#define CORENAME "OPTERON"
#endif
-#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER)
+#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define CORENAME "BOBCAT"
#endif
+#if defined (FORCE_BULLDOZER)
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE "X86"
+#define SUBARCHITECTURE "BULLDOZER"
+#define ARCHCONFIG "-DBULLDOZER " \
+ "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
+ "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
+ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
+ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
+ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
+ "-DHAVE_AVX -DHAVE_FMA4"
+#define LIBNAME "bulldozer"
+#define CORENAME "BULLDOZER"
+#endif
+
#ifdef FORCE_SSE_GENERIC
#define FORCE
#define FORCE_INTEL
#ifdef USE64BITINT
printf("#define USE64BITINT\n");
#endif
- printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD);
+ printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD);
}
return 0;
#endif
#endif
+#ifdef BULLDOZER
+
+#ifdef DEBUG
+ fprintf(stderr, "Bulldozer\n");
+#endif
+
+ TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
+ TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
+ TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
+ TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
+#ifdef EXPRECISION
+ TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
+ TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
+#endif
+#endif
+
#ifdef NANO
#ifdef DEBUG
--- /dev/null
+SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
+SGEMMINCOPY =
+SGEMMITCOPY =
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
+DGEMMINCOPY = ../generic/gemm_ncopy_2.c
+DGEMMITCOPY = ../generic/gemm_tcopy_2.c
+DGEMMONCOPY = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
+CGEMMINCOPY =
+CGEMMITCOPY =
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMINCOPYOBJ =
+CGEMMITCOPYOBJ =
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
+ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
+STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
+STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
+STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
+
+DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
+DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
+DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
+DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
+
+CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
+CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
+CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
+CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
+
+ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
+
+CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
+ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 4 * SIZE(BB), %xmm2
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
.L52:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
ALIGN_4
.L62:
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
.L72:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
.L92:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
.L102:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
.L112:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
#endif
#define STACKSIZE 16
-
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA 16 + STACKSIZE(%esp)
-#define A 20 + STACKSIZE(%esp)
-#define STACK_LDA 24 + STACKSIZE(%esp)
-#define STACK_X 28 + STACKSIZE(%esp)
-#define STACK_INCX 32 + STACKSIZE(%esp)
-#define Y 36 + STACKSIZE(%esp)
-#define STACK_INCY 40 + STACKSIZE(%esp)
-#define BUFFER 44 + STACKSIZE(%esp)
+#define ARGS 16
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA 16 + STACKSIZE+ARGS(%esp)
+#define A 20 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
+#define STACK_X 28 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
+#define Y 36 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
+#define BUFFER 44 + STACKSIZE+ARGS(%esp)
+#define MMM 0+ARGS(%esp)
+#define YY 4+ARGS(%esp)
+#define AA 8+ARGS(%esp)
+#define LDAX 12+ARGS(%esp)
#define I %eax
#define J %ebx
PROLOGUE
+ subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
+ movl Y,J
+ movl J,YY # backup Y
+ movl A,J
+ movl J,AA # backup A
+ movl M,J
+ movl J,MMM # backup MM
+.L0t:
+ xorl J,J
+ addl $1,J
+ sall $21,J
+ subl J,MMM
+ movl J,M
+ jge .L00t
+ ALIGN_4
+
+ movl MMM,%eax
+ addl J,%eax
+ jle .L999x
+ movl %eax,M
+
+.L00t:
+ movl AA,%eax
+ movl %eax,A
+
+ movl YY,J
+ movl J,Y
movl STACK_LDA, LDA
+
movl STACK_X, X
movl STACK_INCX, INCX
addss 0 * SIZE(X), %xmm0
movss %xmm0, (Y1)
ALIGN_3
-
.L999:
+ movl M,J
+ leal (,J,SIZE),%eax
+ addl %eax,AA
+ movl YY,J
+ addl %eax,J
+ movl J,YY
+ jmp .L0t
+ ALIGN_4
+
+.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
+ addl $ARGS,%esp
ret
EPILOGUE
#endif
#define STACKSIZE 16
-
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA 16 + STACKSIZE(%esp)
-#define A 24 + STACKSIZE(%esp)
-#define STACK_LDA 28 + STACKSIZE(%esp)
-#define STACK_X 32 + STACKSIZE(%esp)
-#define STACK_INCX 36 + STACKSIZE(%esp)
-#define Y 40 + STACKSIZE(%esp)
-#define STACK_INCY 44 + STACKSIZE(%esp)
-#define BUFFER 48 + STACKSIZE(%esp)
+#define ARGS 16
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA 16 + STACKSIZE+ARGS(%esp)
+#define A 24 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
+#define STACK_X 32 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
+#define Y 40 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
+#define BUFFER 48 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0+ARGS(%esp)
+#define YY 4+ARGS(%esp)
+#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
PROLOGUE
+
+ subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
+ movl Y,J
+ movl J,YY # backup Y
+ movl A,J
+ movl J,AA # backup A
+ movl M,J
+ movl J,MMM # backup MM
+.L0t:
+ xorl J,J
+ addl $1,J
+ sall $20,J
+ subl J,MMM
+ movl J,M
+ jge .L00t
+ ALIGN_4
+
+ movl MMM,%eax
+ addl J,%eax
+ jle .L999x
+ movl %eax,M
+
+.L00t:
+ movl AA,%eax
+ movl %eax,A
+
+ movl YY,J
+ movl J,Y
+
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
ALIGN_3
.L999:
+ movl M,J
+ leal (,J,SIZE),%eax
+ addl %eax,AA
+ movl YY,J
+ addl %eax,J
+ movl J,YY
+ jmp .L0t
+ ALIGN_4
+
+.L999x:
+
popl %ebx
popl %esi
popl %edi
popl %ebp
+ addl $ARGS,%esp
ret
EPILOGUE
#endif
#define STACKSIZE 16
-
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA 16 + STACKSIZE(%esp)
-#define A 20 + STACKSIZE(%esp)
-#define STACK_LDA 24 + STACKSIZE(%esp)
-#define STACK_X 28 + STACKSIZE(%esp)
-#define STACK_INCX 32 + STACKSIZE(%esp)
-#define Y 36 + STACKSIZE(%esp)
-#define STACK_INCY 40 + STACKSIZE(%esp)
-#define BUFFER 44 + STACKSIZE(%esp)
+#define ARGS 20
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA 16 + STACKSIZE+ARGS(%esp)
+#define A 20 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
+#define STACK_X 28 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
+#define Y 36 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
+#define BUFFER 44 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0+STACKSIZE(%esp)
+#define NN 4+STACKSIZE(%esp)
+#define AA 8+STACKSIZE(%esp)
+#define LDAX 12+STACKSIZE(%esp)
+#define XX 16+STACKSIZE(%esp)
#define I %eax
#define J %ebx
PROLOGUE
+ subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
movl STACK_LDA, LDA
+ movl LDA,LDAX # backup LDA
movl STACK_X, X
+ movl X,XX
+ movl N,J
+ movl J,NN # backup N
+ movl A,J
+ movl J,AA # backup A
+ movl M,J
+ movl J,MMM # mov M to MMM
+.L0t:
+ xorl J,J
+ addl $1,J
+ sall $22,J # J=2^22*sizeof(float)=buffer size(16MB)
+ subl $8, J # Don't use the last 8 floats in the buffer.
+ # Now, split M by block J
+ subl J,MMM # MMM=MMM-J
+ movl J,M
+ jge .L00t
+ ALIGN_4
+
+ movl MMM,%eax
+ addl J,%eax
+ jle .L999x
+ movl %eax,M
+
+.L00t:
+ movl AA,%eax
+ movl %eax,A # mov AA to A
+
+ movl NN,%eax
+ movl %eax,N # reset N
+
+
+ movl LDAX, LDA # reset LDA
+ movl XX,X
+
movl STACK_INCX, INCX
movl STACK_INCY, INCY
jg .L06
ALIGN_4
+//Zero-pad the tail so we never read stale (uninitialized) values from the buffer.
+ movl M, I
+ movl $8, J
+ andl $7, I
+ xorps %xmm0, %xmm0
+ subl I, J
+ ALIGN_2
+.L07:
+ movss %xmm0, 0 * SIZE(Y1)
+ addl $SIZE, Y1
+ decl J
+ jg .L07
+ ALIGN_4
+
.L10:
movl Y, Y1
ALIGN_4
.L999:
+ movl M,J
+ leal (,J,SIZE),%eax
+ addl %eax,AA
+ movl XX,J
+ addl %eax,J
+ movl J,XX
+ jmp .L0t
+ ALIGN_4
+
+.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
+
+ addl $ARGS,%esp
ret
EPILOGUE
#endif
#define STACKSIZE 16
+#define ARGS 16
+
+#define M 4 + STACKSIZE+ARGS(%esp)
+#define N 8 + STACKSIZE+ARGS(%esp)
+#define ALPHA 16 + STACKSIZE+ARGS(%esp)
+#define A 24 + STACKSIZE+ARGS(%esp)
+#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
+#define STACK_X 32 + STACKSIZE+ARGS(%esp)
+#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
+#define Y 40 + STACKSIZE+ARGS(%esp)
+#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
+#define BUFFER 48 + STACKSIZE+ARGS(%esp)
+
+#define MMM 0+STACKSIZE(%esp)
+#define AA 4+STACKSIZE(%esp)
+#define LDAX 8+STACKSIZE(%esp)
+#define NN 12+STACKSIZE(%esp)
-#define M 4 + STACKSIZE(%esp)
-#define N 8 + STACKSIZE(%esp)
-#define ALPHA 16 + STACKSIZE(%esp)
-#define A 24 + STACKSIZE(%esp)
-#define STACK_LDA 28 + STACKSIZE(%esp)
-#define STACK_X 32 + STACKSIZE(%esp)
-#define STACK_INCX 36 + STACKSIZE(%esp)
-#define Y 40 + STACKSIZE(%esp)
-#define STACK_INCY 44 + STACKSIZE(%esp)
-#define BUFFER 48 + STACKSIZE(%esp)
-
#define I %eax
#define J %ebx
PROLOGUE
+ subl $ARGS,%esp
+
pushl %ebp
pushl %edi
pushl %esi
PROFCODE
+
movl STACK_LDA, LDA
+ movl LDA,LDAX # backup LDA
+ movl N,J
+ movl J,NN # backup N
+ movl A,J
+ movl J,AA # backup A
+ movl M,J
+ movl J,MMM # mov M to MMM
+.L0t:
+ xorl J,J
+ addl $1,J
+ sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
+ subl $4, J # Don't use the last 4 doubles in the buffer.
+ # Now, split M by block J
+ subl J,MMM # MMM=MMM-J
+ movl J,M
+ jge .L00t
+ ALIGN_4
+
+ movl MMM,%eax
+ addl J,%eax
+ jle .L999x
+ movl %eax,M
+
+.L00t:
+ movl AA,%eax
+ movl %eax,A # mov AA to A
+
+ movl NN,%eax
+ movl %eax,N # reset N
+
+
+ movl LDAX, LDA # reset LDA
movl STACK_X, X
movl STACK_INCX, INCX
movl STACK_INCY, INCY
leal (,INCY, SIZE), INCY
leal (,LDA, SIZE), LDA
+
subl $-16 * SIZE, A
cmpl $0, N
ALIGN_4
.L999:
+ movl M,J
+ leal (,J,SIZE),%eax
+ addl %eax,AA
+ jmp .L0t
+ ALIGN_4
+
+.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
+
+ addl $ARGS,%esp
ret
EPILOGUE
sarl $5, I
jle .L113
-#if defined(BARCELONA)
+#if defined(BARCELONA) || defined(BULLDOZER)
movaps %xmm0, %xmm1
mulps -32 * SIZE(X), %xmm1
sarl $4, I
jle .L113
-#if defined(BARCELONA)
+#if defined(BARCELONA) || defined(BULLDOZER)
movaps %xmm0, %xmm1
mulpd -16 * SIZE(X), %xmm1
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
.L42:
mulpd %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
.L72:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
ALIGN_4
.L62:
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
.L52:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
.L112:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
.L102:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
.L92:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
.L42:
mulpd %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
.L52:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
ALIGN_4
.L62:
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
.L72:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
.L92:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
.L102:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
.L112:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
.L42:
mulpd %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
.L92:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
.L102:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
.L112:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
.L52:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
ALIGN_4
.L62:
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
.L72:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
#define BB %ecx
#define LDC %ebp
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define movsd movlps
#endif
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 4 * SIZE(BB), %xmm2
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
.L52:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
ALIGN_4
.L62:
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
.L72:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
.L92:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
.L102:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
.L112:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
#define movsd movlps
#endif
-#ifdef BARCELONA
+#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5)
#define movsd movlps
#endif
-#ifdef BARCELONA
+#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta
#define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5)
#define movsd movlps
#endif
-#ifdef BARCELONA
+#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5)
#define movsd movlps
#endif
-#ifdef BARCELONA
+#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta
#define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5)
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4
--- /dev/null
+ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVTKERNEL = zgemv_t_dup.S
+
+SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
+SGEMMINCOPY = ../generic/gemm_ncopy_8.c
+SGEMMITCOPY = ../generic/gemm_tcopy_8.c
+SGEMMONCOPY = gemm_ncopy_4_opteron.S
+SGEMMOTCOPY = gemm_tcopy_4_opteron.S
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S
+DGEMMINCOPY =
+DGEMMITCOPY =
+DGEMMONCOPY = gemm_ncopy_4_opteron.S
+DGEMMOTCOPY = gemm_tcopy_4_opteron.S
+DGEMMINCOPYOBJ =
+DGEMMITCOPYOBJ =
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMONCOPY = zgemm_ncopy_2.S
+CGEMMOTCOPY = zgemm_tcopy_2.S
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
+ZGEMMINCOPY =
+ZGEMMITCOPY =
+ZGEMMONCOPY = zgemm_ncopy_2.S
+ZGEMMOTCOPY = zgemm_tcopy_2.S
+ZGEMMINCOPYOBJ =
+ZGEMMITCOPYOBJ =
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
+STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
+STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
+STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
+
+DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
+DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
+DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
+DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
+
+CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
+CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
+CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
+CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
+
+ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
+ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
+ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
+ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
+
+CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
+ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
--- /dev/null
+/*********************************************************************/\r
+/* Copyright 2009, 2010 The University of Texas at Austin. */\r
+/* All rights reserved. */\r
+/* */\r
+/* Redistribution and use in source and binary forms, with or */\r
+/* without modification, are permitted provided that the following */\r
+/* conditions are met: */\r
+/* */\r
+/* 1. Redistributions of source code must retain the above */\r
+/* copyright notice, this list of conditions and the following */\r
+/* disclaimer. */\r
+/* */\r
+/* 2. Redistributions in binary form must reproduce the above */\r
+/* copyright notice, this list of conditions and the following */\r
+/* disclaimer in the documentation and/or other materials */\r
+/* provided with the distribution. */\r
+/* */\r
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */\r
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */\r
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */\r
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */\r
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */\r
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */\r
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */\r
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */\r
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */\r
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */\r
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */\r
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */\r
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */\r
+/* POSSIBILITY OF SUCH DAMAGE. */\r
+/* */\r
+/* The views and conclusions contained in the software and */\r
+/* documentation are those of the authors and should not be */\r
+/* interpreted as representing official policies, either expressed */\r
+/* or implied, of The University of Texas at Austin. */\r
+/*********************************************************************/\r
+\r
+#define ASSEMBLER\r
+#include "common.h"\r
+ \r
+#define OLD_M %rdi\r
+#define OLD_N %rsi\r
+#define M %r13\r
+#define N %r14\r
+#define K %rdx\r
+\r
+#define A %rcx\r
+#define B %r8\r
+#define C %r9\r
+#define LDC %r10\r
+ \r
+#define I %r11\r
+#define AO %rdi\r
+#define BO %rsi\r
+#define CO1 %r15\r
+#define CO2 %r12\r
+#define BB %rbp\r
+#define J %rbx\r
+\r
+#ifndef WINDOWS_ABI\r
+\r
+#define STACKSIZE 96\r
+\r
+#define ALPHA 48(%rsp)\r
+#define OFFSET 56(%rsp)\r
+#define KK 64(%rsp)\r
+#define KKK 72(%rsp)\r
+\r
+#else\r
+\r
+#define STACKSIZE 256\r
+\r
+#define OLD_A 40 + STACKSIZE(%rsp)\r
+#define OLD_B 48 + STACKSIZE(%rsp)\r
+#define OLD_C 56 + STACKSIZE(%rsp)\r
+#define OLD_LDC 64 + STACKSIZE(%rsp)\r
+#define OLD_OFFSET 72 + STACKSIZE(%rsp)\r
+\r
+#define ALPHA 224(%rsp)\r
+#define OFFSET 232(%rsp)\r
+#define KK 240(%rsp)\r
+#define KKK 248(%rsp)\r
+\r
+#endif\r
+\r
+#define movapd movaps\r
+#define movupd movups\r
+\r
+#define KERNEL1(xx) \\r
+ vfmaddpd %xmm8,%xmm1,%xmm0,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm0 ;\\r
+ vmovups -14 * SIZE(AO, %rax, 4),%xmm2 ;\\r
+ vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\\r
+ vfmaddpd %xmm12,%xmm2,%xmm1,%xmm12 ;\\r
+ vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\\r
+ vfmaddpd %xmm13,%xmm2,%xmm3,%xmm13 ;\\r
+ vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\\r
+ vfmaddpd %xmm11,%xmm3,%xmm0,%xmm11 ;\\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\\r
+ vmovups -12 * SIZE(AO, %rax, 4), %xmm0 ;\\r
+ vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\\r
+ vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vmovaps %xmm0, %xmm2\r
+\r
+#define KERNEL2(xx) \\r
+ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm0 ;\\r
+ vmovups -10 * SIZE(AO, %rax, 4),%xmm2 ;\\r
+/*A*/ vmovups (AO, %rax, 4), %xmm6 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\\r
+ vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\\r
+/**/ vmovddup (BO, %rax, 4), %xmm1 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\\r
+ vmovddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vmovaps %xmm4, %xmm2\r
+\r
+#define KERNEL3(xx) \\r
+ vfmaddpd %xmm8,%xmm5, %xmm4, %xmm8 ;\\r
+ vmovaps %xmm2, %xmm4 ;\\r
+ vmovups -6 * SIZE(AO, %rax, 4),%xmm2 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\\r
+ vmovddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm4, %xmm11 ;\\r
+ vmovups -4 * SIZE(AO, %rax, 4), %xmm4 ;\\r
+ vmovddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\\r
+ vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\\r
+ vmovddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vmovaps %xmm4, %xmm2\r
+\r
+#define KERNEL4(xx) \\r
+ vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm4 ;\\r
+ vmovups -2 * SIZE(AO, %rax, 4),%xmm2 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm5 ,%xmm12;\\r
+/*A*/ vmovups 8 * SIZE(AO, %rax, 4), %xmm7 ;\\r
+ vmovddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\\r
+/**/ vmovddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\\r
+ vmovddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vmovaps %xmm6, %xmm2\r
+\r
+#define KERNEL5(xx) \\r
+ vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm6 ;\\r
+ vmovups 2 * SIZE(AO, %rax, 4),%xmm2 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\\r
+ vmovddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm6,%xmm11 ;\\r
+ vmovups 4 * SIZE(AO, %rax, 4), %xmm6 ;\\r
+ vmovddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\\r
+ vmovddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vmovaps %xmm6, %xmm2\r
+\r
+#define KERNEL6(xx) \\r
+ vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm6 ;\\r
+ vmovups 6 * SIZE(AO, %rax, 4),%xmm2 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\\r
+/*A*/ vmovups 16 * SIZE(AO, %rax, 4), %xmm0 ;\\r
+ vmovddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\\r
+/**/ vmovddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm6,%xmm11 ;\\r
+ vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\\r
+ vmovddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vmovaps %xmm7, %xmm2\r
+\r
+#define KERNEL7(xx) \\r
+ vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm7 ;\\r
+ vmovups 10 * SIZE(AO, %rax, 4),%xmm2 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\\r
+ vmovddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\\r
+ vmovups 12 * SIZE(AO, %rax, 4), %xmm7 ;\\r
+ vmovddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\\r
+ vmovddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vmovaps %xmm7, %xmm2\r
+\r
+#define KERNEL8(xx) \\r
+ vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm7 ;\\r
+ vmovups 14 * SIZE(AO, %rax, 4),%xmm2 ;\\r
+/*A*/ vmovups 24 * SIZE(AO, %rax, 4), %xmm4 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\\r
+ vmovddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\\r
+ vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\\r
+/**/ vmovddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\\r
+ vmovddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\\r
+ vmovaps %xmm0, %xmm2 ;\\r
+ addq $8 * SIZE, %rax ;\\r
+\r
+#define KERNEL_SUB1(xx) \\r
+ vfmaddpd %xmm8, %xmm1, %xmm0,%xmm8 ;\\r
+ vmovapd %xmm2, %xmm0 ;\\r
+ vmovups -14 * SIZE(AO),%xmm2 ;\\r
+ vfmaddpd %xmm12, %xmm2, %xmm1,%xmm12 ;\\r
+ vmovddup -14 * SIZE(BO), %xmm1 ;\\r
+ vfmaddpd %xmm9, %xmm3, %xmm0,%xmm9 ;\\r
+ vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup -13 * SIZE(BO), %xmm3 ;\\r
+ vfmaddpd %xmm10, %xmm1, %xmm0,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm1 ,%xmm14 ;\\r
+ vfmaddpd %xmm11, %xmm3, %xmm0,%xmm11 ;\\r
+ vfmaddpd %xmm15, %xmm2, %xmm3,%xmm15 ;\\r
+ vmovups -12 * SIZE(AO), %xmm0 ;\\r
+ vmovddup -12 * SIZE(BO), %xmm1 ;\\r
+ vmovddup -11 * SIZE(BO), %xmm3 ;\\r
+ vmovapd %xmm0, %xmm2\r
+\r
+\r
+#define KERNEL_SUB2(xx) \\r
+ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm0 ;\\r
+ vmovups -10 * SIZE(AO),%xmm2 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup -10 * SIZE(BO), %xmm1 ;\\r
+ vmovddup -9 * SIZE(BO), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\\r
+ vmovups (AO), %xmm0 ;\\r
+ vmovddup (BO), %xmm1 ;\\r
+ vmovddup -7 * SIZE(BO), %xmm3 ;\\r
+ vmovaps %xmm4, %xmm2\r
+\r
+#define KERNEL_SUB3(xx) \\r
+ vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm4 ;\\r
+ vmovups -6 * SIZE(AO),%xmm2 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\\r
+ vmovddup -6 * SIZE(BO), %xmm5 ;\\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup -5 * SIZE(BO), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\\r
+ vmovups -4 * SIZE(AO), %xmm4 ;\\r
+ vmovddup -4 * SIZE(BO), %xmm5 ;\\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\\r
+ vmovddup -3 * SIZE(BO), %xmm3 ;\\r
+ vmovaps %xmm4, %xmm2\r
+\r
+#define KERNEL_SUB4(xx) \\r
+ vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\\r
+ vmovaps %xmm2, %xmm4 ;\\r
+ vmovups -2 * SIZE(AO),%xmm2 ;\\r
+ vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\\r
+ vmovddup -2 * SIZE(BO), %xmm5 ;\\r
+ vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\\r
+ vmovddup -1 * SIZE(BO), %xmm3 ;\\r
+ vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\\r
+ vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\\r
+ vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\\r
+ vmovddup 1 * SIZE(BO), %xmm3 ;\\r
+ vmovaps %xmm0, %xmm2\r
+\r
+ PROLOGUE\r
+ PROFCODE\r
+ \r
+ subq $STACKSIZE, %rsp\r
+ movq %rbx, (%rsp)\r
+ movq %rbp, 8(%rsp)\r
+ movq %r12, 16(%rsp)\r
+ movq %r13, 24(%rsp)\r
+ movq %r14, 32(%rsp)\r
+ movq %r15, 40(%rsp)\r
+\r
+ vzeroupper\r
+\r
+#ifdef WINDOWS_ABI\r
+ movq %rdi, 48(%rsp)\r
+ movq %rsi, 56(%rsp)\r
+ movups %xmm6, 64(%rsp)\r
+ movups %xmm7, 80(%rsp)\r
+ movups %xmm8, 96(%rsp)\r
+ movups %xmm9, 112(%rsp)\r
+ movups %xmm10, 128(%rsp)\r
+ movups %xmm11, 144(%rsp)\r
+ movups %xmm12, 160(%rsp)\r
+ movups %xmm13, 176(%rsp)\r
+ movups %xmm14, 192(%rsp)\r
+ movups %xmm15, 208(%rsp)\r
+\r
+ movq ARG1, OLD_M\r
+ movq ARG2, OLD_N\r
+ movq ARG3, K\r
+ movq OLD_A, A\r
+ movq OLD_B, B\r
+ movq OLD_C, C\r
+ movq OLD_LDC, LDC\r
+#ifdef TRMMKERNEL\r
+ movsd OLD_OFFSET, %xmm12\r
+#endif\r
+ vmovaps %xmm3, %xmm0\r
+\r
+#else\r
+ movq STACKSIZE + 8(%rsp), LDC\r
+#ifdef TRMMKERNEL\r
+ movsd STACKSIZE + 16(%rsp), %xmm12\r
+#endif\r
+\r
+#endif\r
+\r
+ movq OLD_M, M\r
+ movq OLD_N, N\r
+\r
+ subq $-16 * SIZE, A\r
+ subq $-16 * SIZE, B\r
+\r
+ vmovsd %xmm0, ALPHA\r
+\r
+ salq $BASE_SHIFT, LDC\r
+\r
+#ifdef TRMMKERNEL\r
+ vmovsd %xmm12, OFFSET\r
+ vmovsd %xmm12, KK\r
+#ifndef LEFT\r
+ negq KK\r
+#endif \r
+#endif\r
+ movq N, J\r
+ sarq $2, J # j = (n >> 2)\r
+ jle .L40\r
+ ALIGN_4\r
+\r
+.L01:\r
+ movq C, CO1 # coffset1 = c\r
+ leaq (C, LDC, 2), CO2 # coffset2 = c + ldc\r
+ \r
+ leaq (C, LDC, 4), C # c += 4 * ldc\r
+\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ movq OFFSET, %rax\r
+ movq %rax, KK\r
+#endif \r
+\r
+ movq A, AO # aoffset = a\r
+\r
+ movq K, %rax\r
+ salq $BASE_SHIFT + 2, %rax\r
+ leaq (B, %rax), BB\r
+\r
+ movq M, I\r
+ sarq $2, I # i = (m >> 2)\r
+ jle .L20\r
+ ALIGN_4\r
+\r
+ .align 16\r
+.L11:\r
+#if !defined(TRMMKERNEL) || \\r
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq B, BO\r
+#else\r
+ movq KK, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (B, %rax, 4), BO\r
+#endif \r
+\r
+ vzeroall\r
+ prefetcht0 256(CO1)\r
+ prefetcht0 320(CO1)\r
+ prefetcht0 256(CO2)\r
+ prefetcht0 320(CO2)\r
+ vmovups -16 * SIZE(AO), %xmm0\r
+ vmovddup -16 * SIZE(BO), %xmm1\r
+ vmovddup -15 * SIZE(BO), %xmm3\r
+ vmovups -8 * SIZE(AO), %xmm4\r
+ vmovddup -8 * SIZE(BO), %xmm5\r
+\r
+ vmovaps %xmm0, %xmm2\r
+\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ movq K, %rax\r
+ subq KK, %rax\r
+ movq %rax, KKK \r
+#else\r
+ movq KK, %rax\r
+#ifdef LEFT\r
+ addq $4, %rax\r
+#else\r
+ addq $4, %rax\r
+#endif\r
+ movq %rax, KKK\r
+#endif\r
+\r
+ andq $-8, %rax\r
+ salq $BASE_SHIFT, %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (BO, %rax, 4), BO\r
+ negq %rax\r
+ NOBRANCH\r
+ je .L15\r
+ ALIGN_4\r
+\r
+ .align 16\r
+.L12:\r
+ prefetcht0 (AO,%rax,4)\r
+ prefetcht0 (BO,%rax,4)\r
+ KERNEL1(16 * 0)\r
+ KERNEL2(16 * 0)\r
+ KERNEL3(16 * 0)\r
+ KERNEL4(16 * 0)\r
+ KERNEL5(16 * 0)\r
+ KERNEL6(16 * 0)\r
+ KERNEL7(16 * 0)\r
+ KERNEL8(16 * 0)\r
+ NOBRANCH\r
+ je .L15\r
+ KERNEL1(16 * 0)\r
+ KERNEL2(16 * 0)\r
+ KERNEL3(16 * 0)\r
+ KERNEL4(16 * 0)\r
+ KERNEL5(16 * 0)\r
+ KERNEL6(16 * 0)\r
+ KERNEL7(16 * 0)\r
+ KERNEL8(16 * 0)\r
+ NOBRANCH\r
+ je .L15\r
+ KERNEL1(16 * 0)\r
+ KERNEL2(16 * 0)\r
+ KERNEL3(16 * 0)\r
+ KERNEL4(16 * 0)\r
+ KERNEL5(16 * 0)\r
+ KERNEL6(16 * 0)\r
+ KERNEL7(16 * 0)\r
+ KERNEL8(16 * 0)\r
+ NOBRANCH\r
+ je .L15\r
+ KERNEL1(16 * 0)\r
+ KERNEL2(16 * 0)\r
+ KERNEL3(16 * 0)\r
+ KERNEL4(16 * 0)\r
+ KERNEL5(16 * 0)\r
+ KERNEL6(16 * 0)\r
+ KERNEL7(16 * 0)\r
+ KERNEL8(16 * 0)\r
+ NOBRANCH\r
+ je .L15\r
+ KERNEL1(16 * 0)\r
+ KERNEL2(16 * 0)\r
+ KERNEL3(16 * 0)\r
+ KERNEL4(16 * 0)\r
+ KERNEL5(16 * 0)\r
+ KERNEL6(16 * 0)\r
+ KERNEL7(16 * 0)\r
+ KERNEL8(16 * 0)\r
+ NOBRANCH\r
+ je .L15\r
+ KERNEL1(16 * 0)\r
+ KERNEL2(16 * 0)\r
+ KERNEL3(16 * 0)\r
+ KERNEL4(16 * 0)\r
+ KERNEL5(16 * 0)\r
+ KERNEL6(16 * 0)\r
+ KERNEL7(16 * 0)\r
+ KERNEL8(16 * 0)\r
+ NOBRANCH\r
+ je .L15\r
+ KERNEL1(16 * 0)\r
+ KERNEL2(16 * 0)\r
+ KERNEL3(16 * 0)\r
+ KERNEL4(16 * 0)\r
+ KERNEL5(16 * 0)\r
+ KERNEL6(16 * 0)\r
+ KERNEL7(16 * 0)\r
+ KERNEL8(16 * 0)\r
+ NOBRANCH\r
+ je .L15\r
+ KERNEL1(16 * 0)\r
+ KERNEL2(16 * 0)\r
+ KERNEL3(16 * 0)\r
+ KERNEL4(16 * 0)\r
+ KERNEL5(16 * 0)\r
+ KERNEL6(16 * 0)\r
+ KERNEL7(16 * 0)\r
+ KERNEL8(16 * 0)\r
+ jl .L12\r
+ ALIGN_4\r
+\r
+.L15:\r
+ vmovddup ALPHA, %xmm7\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ testq $4, %rax\r
+ je .L16\r
+ ALIGN_4\r
+\r
+ KERNEL_SUB1(16 * 0)\r
+ KERNEL_SUB2(16 * 0)\r
+ KERNEL_SUB3(16 * 0)\r
+ KERNEL_SUB4(16 * 0)\r
+\r
+ subq $-16 * SIZE, BO\r
+ subq $-16 * SIZE, AO\r
+ ALIGN_4\r
+\r
+.L16:\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ andq $3, %rax # if (k & 1)\r
+ je .L19\r
+\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (BO, %rax, 4), BO\r
+ negq %rax\r
+ ALIGN_4\r
+\r
+.L17:\r
+ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8\r
+ vmovaps %xmm2, %xmm0\r
+ vmovups -14 * SIZE(AO, %rax, 4),%xmm2\r
+ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12\r
+ vmovddup -14 * SIZE(BO, %rax, 4), %xmm1\r
+ vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9\r
+ vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13\r
+ vmovddup -13 * SIZE(BO, %rax, 4), %xmm3\r
+ vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10\r
+ vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14\r
+ vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11\r
+ vmovups -12 * SIZE(AO, %rax, 4), %xmm0\r
+ vmovddup -12 * SIZE(BO, %rax, 4), %xmm1\r
+ vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15\r
+ vmovddup -11 * SIZE(BO, %rax, 4), %xmm3\r
+ vmovaps %xmm0, %xmm2\r
+\r
+ addq $SIZE, %rax\r
+ jl .L17\r
+ ALIGN_4\r
+\r
+.L19:\r
+ // prefetch -8 * SIZE(BB)\r
+ subq $-16 * SIZE, BB\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ vfmaddpd (CO1),%xmm7, %xmm8,%xmm8\r
+ vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12,%xmm12\r
+ .align 2\r
+ vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9\r
+ vfmaddpd 2 * SIZE(CO1, LDC),%xmm7, %xmm13,%xmm13\r
+ .align 2\r
+ vfmaddpd (CO2),%xmm7, %xmm10,%xmm10\r
+ vfmaddpd 2 * SIZE(CO2),%xmm7, %xmm14,%xmm14\r
+ .align 2\r
+ vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11\r
+ vfmaddpd 2 * SIZE(CO2, LDC),%xmm7, %xmm15,%xmm15\r
+\r
+#else\r
+ vmulpd %xmm7, %xmm8,%xmm8\r
+ vmulpd %xmm7, %xmm12,%xmm12\r
+ vmulpd %xmm7, %xmm9,%xmm9\r
+ vmulpd %xmm7, %xmm13,%xmm13\r
+ vmulpd %xmm7, %xmm10,%xmm10\r
+ vmulpd %xmm7, %xmm14,%xmm14\r
+ vmulpd %xmm7, %xmm11,%xmm11\r
+ vmulpd %xmm7, %xmm15,%xmm15\r
+\r
+#endif\r
+\r
+ .align 2\r
+ vmovups %xmm8, (CO1)\r
+ vmovups %xmm12, 2 * SIZE(CO1)\r
+ .align 2\r
+ vmovups %xmm9, (CO1, LDC)\r
+ vmovups %xmm13, 2 * SIZE(CO1, LDC)\r
+ .align 2\r
+ vmovups %xmm10, (CO2)\r
+ vmovups %xmm14, 2 * SIZE(CO2)\r
+ .align 2\r
+ vmovups %xmm11, (CO2, LDC)\r
+ vmovups %xmm15, 2 * SIZE(CO2, LDC)\r
+\r
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq K, %rax\r
+ subq KKK, %rax\r
+ leaq (,%rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (BO, %rax, 4), BO\r
+#endif\r
+\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ addq $4, KK\r
+#endif\r
+\r
+ addq $4 * SIZE, CO1 # coffset += 4\r
+ addq $4 * SIZE, CO2 # coffset += 4\r
+ decq I # i --\r
+ BRANCH\r
+ jg .L11\r
+ ALIGN_4 \r
+\r
+.L20:\r
+ testq $3, M\r
+ je .L39\r
+\r
+ testq $2, M\r
+ je .L30\r
+ ALIGN_4\r
+\r
+.L21:\r
+#if !defined(TRMMKERNEL) || \\r
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq B, BO\r
+#else\r
+ movq KK, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (B, %rax, 4), BO\r
+#endif \r
+\r
+ vmovups -16 * SIZE(AO), %xmm0\r
+ vxorps %xmm8, %xmm8, %xmm8\r
+ vmovups -12 * SIZE(AO), %xmm2\r
+ vxorps %xmm9, %xmm9 ,%xmm9\r
+ vmovddup -16 * SIZE(BO), %xmm1\r
+ vxorps %xmm10, %xmm10, %xmm10\r
+ vmovddup -15 * SIZE(BO), %xmm5\r
+ vxorps %xmm11, %xmm11, %xmm11\r
+ vmovddup -8 * SIZE(BO), %xmm3\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ movq K, %rax\r
+ subq KK, %rax\r
+ movq %rax, KKK \r
+#else\r
+ movq KK, %rax\r
+#ifdef LEFT\r
+ addq $2, %rax\r
+#else\r
+ addq $4, %rax\r
+#endif\r
+ movq %rax, KKK\r
+#endif\r
+\r
+ andq $-4, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (BO, %rax, 4), BO\r
+ negq %rax\r
+ NOBRANCH\r
+ je .L26\r
+ ALIGN_4\r
+\r
+.L22:\r
+ vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8\r
+ vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9\r
+ vmovddup -14 * SIZE(BO, %rax, 4), %xmm1\r
+ vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10\r
+ vmovddup -13 * SIZE(BO, %rax, 4), %xmm5\r
+ vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11\r
+ vmovups -14 * SIZE(AO, %rax, 2), %xmm0\r
+ vmovddup -12 * SIZE(BO, %rax, 4), %xmm1\r
+ vmovddup -11 * SIZE(BO, %rax, 4), %xmm5\r
+ vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8\r
+ vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9\r
+ vmovddup -10 * SIZE(BO, %rax, 4), %xmm1\r
+ vmovddup -9 * SIZE(BO, %rax, 4), %xmm5\r
+ vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10\r
+ vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11\r
+ vmovddup (BO, %rax, 4), %xmm1\r
+ vmovddup -7 * SIZE(BO, %rax, 4), %xmm5\r
+ vmovups -8 * SIZE(AO, %rax, 2), %xmm0\r
+ vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8\r
+ vfmaddpd %xmm9,%xmm2, %xmm5,%xmm9\r
+ vmovddup -6 * SIZE(BO, %rax, 4), %xmm3\r
+ vmovddup -5 * SIZE(BO, %rax, 4), %xmm5\r
+ vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10\r
+ vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11\r
+ vmovups -10 * SIZE(AO, %rax, 2), %xmm2\r
+ vmovddup -4 * SIZE(BO, %rax, 4), %xmm3\r
+ vmovddup -3 * SIZE(BO, %rax, 4), %xmm5\r
+ vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8\r
+ vfmaddpd %xmm9,%xmm2, %xmm5,%xmm9\r
+ vmovddup -2 * SIZE(BO, %rax, 4), %xmm3\r
+ vmovddup -1 * SIZE(BO, %rax, 4), %xmm5\r
+ vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10\r
+ vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11\r
+ vmovddup 8 * SIZE(BO, %rax, 4), %xmm3\r
+ vmovups -4 * SIZE(AO, %rax, 2), %xmm2\r
+ vmovddup 1 * SIZE(BO, %rax, 4), %xmm5\r
+\r
+ addq $4 * SIZE, %rax\r
+ BRANCH\r
+ jl .L22\r
+ ALIGN_4\r
+\r
+.L26:\r
+ vmovddup ALPHA, %xmm7\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ andq $3, %rax # if (k & 1)\r
+ je .L29\r
+\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (BO, %rax, 4), BO\r
+ negq %rax\r
+ ALIGN_4\r
+\r
+.L27:\r
+ vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8\r
+ vmovddup -14 * SIZE(BO, %rax, 4), %xmm1\r
+ vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9\r
+ vmovddup -13 * SIZE(BO, %rax, 4), %xmm5\r
+ vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10\r
+ vmovddup -12 * SIZE(BO, %rax, 4), %xmm1\r
+ vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11\r
+ vmovups -14 * SIZE(AO, %rax, 2), %xmm0\r
+ vmovddup -11 * SIZE(BO, %rax, 4), %xmm5\r
+\r
+ addq $SIZE, %rax\r
+ jl .L27\r
+ ALIGN_4\r
+\r
+.L29:\r
+#ifndef TRMMKERNEL\r
+\r
+ vfmaddpd (CO1),%xmm7, %xmm8,%xmm8\r
+ vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9\r
+ vfmaddpd (CO2),%xmm7, %xmm10,%xmm10\r
+ vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11\r
+\r
+#else\r
+ vmulpd %xmm7, %xmm8,%xmm8\r
+ vmulpd %xmm7, %xmm9,%xmm9\r
+ vmulpd %xmm7, %xmm10,%xmm10\r
+ vmulpd %xmm7, %xmm11,%xmm11\r
+\r
+#endif\r
+\r
+ vmovups %xmm8, (CO1)\r
+ vmovups %xmm9, (CO1, LDC)\r
+\r
+ vmovups %xmm10, (CO2)\r
+ vmovups %xmm11, (CO2, LDC)\r
+\r
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq K, %rax\r
+ subq KKK, %rax\r
+ leaq (,%rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (BO, %rax, 4), BO\r
+#endif\r
+\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ addq $2, KK\r
+#endif\r
+\r
+ addq $2 * SIZE, CO1\r
+ addq $2 * SIZE, CO2\r
+ ALIGN_4 \r
+\r
+.L30:\r
+ testq $1, M\r
+ je .L39\r
+\r
+#if !defined(TRMMKERNEL) || \\r
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq B, BO\r
+#else\r
+ movq KK, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (B, %rax, 4), BO\r
+#endif \r
+\r
+ vmovddup -16 * SIZE(AO), %xmm0\r
+ vxorps %xmm8, %xmm8, %xmm8\r
+ vmovddup -14 * SIZE(AO), %xmm2\r
+ vxorps %xmm9, %xmm9, %xmm9\r
+ vmovddup -15 * SIZE(AO), %xmm4\r
+ vxorps %xmm10, %xmm10,%xmm10\r
+ vmovups -16 * SIZE(BO), %xmm1\r
+ vxorps %xmm11, %xmm11,%xmm11\r
+ vmovups -8 * SIZE(BO), %xmm3\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ movq K, %rax\r
+ subq KK, %rax\r
+ movq %rax, KKK \r
+#else\r
+ movq KK, %rax\r
+#ifdef LEFT\r
+ addq $1, %rax\r
+#else\r
+ addq $4, %rax\r
+#endif\r
+ movq %rax, KKK\r
+#endif\r
+\r
+ andq $-4, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (BO, %rax, 4), BO\r
+ negq %rax\r
+ NOBRANCH\r
+ je .L36\r
+ ALIGN_4\r
+\r
+.L32:\r
+ vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8\r
+ vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9\r
+ vmovups -12 * SIZE(BO, %rax, 4), %xmm1\r
+ vmovddup -12 * SIZE(AO, %rax, 1), %xmm0\r
+ vfmaddpd %xmm10,%xmm4, %xmm1,%xmm10\r
+ vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 4), %xmm4,%xmm11\r
+ vmovups (BO, %rax, 4), %xmm1\r
+ vmovddup -11 * SIZE(AO, %rax, 1), %xmm4\r
+ vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8\r
+ vfmaddpd %xmm9,-6 * SIZE(BO, %rax, 4), %xmm2,%xmm9\r
+ vmovups -4 * SIZE(BO, %rax, 4), %xmm3\r
+ vmovddup -13 * SIZE(AO, %rax, 1), %xmm2\r
+ vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10\r
+ vfmaddpd %xmm11,-2 * SIZE(BO, %rax, 4), %xmm2,%xmm11\r
+ vmovups 8 * SIZE(BO, %rax, 4), %xmm3\r
+ vmovddup -10 * SIZE(AO, %rax, 1), %xmm2\r
+\r
+ addq $4 * SIZE, %rax\r
+ BRANCH\r
+ jl .L32\r
+ ALIGN_4\r
+\r
+.L36:\r
+ vmovddup ALPHA, %xmm7\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ andq $3, %rax # if (k & 1)\r
+ je .L38\r
+\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (BO, %rax, 4), BO\r
+ negq %rax\r
+ ALIGN_4\r
+\r
+.L37:\r
+ vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8\r
+ vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9\r
+ vmovups -12 * SIZE(BO, %rax, 4), %xmm1\r
+ vmovddup -15 * SIZE(AO, %rax, 1), %xmm0\r
+\r
+ addq $SIZE, %rax\r
+ jl .L37\r
+ ALIGN_4\r
+\r
+.L38:\r
+ vaddpd %xmm10, %xmm8,%xmm8\r
+ vaddpd %xmm11, %xmm9,%xmm9\r
+\r
+#ifndef TRMMKERNEL\r
+ vmovsd (CO1), %xmm0\r
+ vmovhpd (CO1, LDC), %xmm0,%xmm0\r
+ vmovsd (CO2), %xmm1\r
+ vmovhpd (CO2, LDC), %xmm1,%xmm1\r
+\r
+\r
+ vfmaddpd %xmm0, %xmm7,%xmm8,%xmm8\r
+ vfmaddpd %xmm1, %xmm7,%xmm9,%xmm9\r
+#else\r
+\r
+ vmulpd %xmm7, %xmm8,%xmm8\r
+ vmulpd %xmm7, %xmm9,%xmm9\r
+\r
+#endif\r
+\r
+ vmovsd %xmm8, (CO1)\r
+ vmovhpd %xmm8, (CO1, LDC)\r
+ vmovsd %xmm9, (CO2)\r
+ vmovhpd %xmm9, (CO2, LDC)\r
+\r
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq K, %rax\r
+ subq KKK, %rax\r
+ leaq (,%rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (BO, %rax, 4), BO\r
+#endif\r
+\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ addq $1, KK\r
+#endif\r
+ ALIGN_4 \r
+ \r
+.L39:\r
+#if defined(TRMMKERNEL) && !defined(LEFT)\r
+ addq $4, KK\r
+#endif\r
+ \r
+ movq BO, B\r
+\r
+ decq J # j --\r
+ jg .L01\r
+ ALIGN_4\r
+\r
+.L40:\r
+ testq $3, N\r
+ je .L999\r
+\r
+ testq $2, N\r
+ je .L80\r
+ ALIGN_4\r
+\r
+.L41:\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ movq OFFSET, %rax\r
+ movq %rax, KK\r
+#endif \r
+\r
+ movq C, CO1 # coffset1 = c\r
+ leaq (C, LDC, 1), CO2 # coffset2 = c + ldc\r
+ movq A, AO # aoffset = a\r
+\r
+ movq K, %rax\r
+ salq $BASE_SHIFT + 1, %rax\r
+ leaq (B, %rax), BB\r
+\r
+ movq M, I\r
+ sarq $2, I # i = (m >> 2)\r
+ jle .L60\r
+ ALIGN_4\r
+\r
+.L51:\r
+#if !defined(TRMMKERNEL) || \\r
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq B, BO\r
+#else\r
+ movq KK, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (B, %rax, 2), BO\r
+#endif \r
+\r
+ vmovddup -16 * SIZE(BO), %xmm1\r
+ vmovddup -15 * SIZE(BO), %xmm5\r
+ vmovddup -12 * SIZE(BO), %xmm3\r
+ vxorps %xmm8, %xmm8,%xmm8\r
+ vxorps %xmm9, %xmm9,%xmm9\r
+ vxorps %xmm12, %xmm12,%xmm12\r
+ vxorps %xmm13, %xmm13,%xmm13\r
+ vmovups -16 * SIZE(AO), %xmm0\r
+ vmovups -8 * SIZE(AO), %xmm4\r
+ // prefetcht0 256(CO1)\r
+ // prefetcht0 320(CO1)\r
+ // prefetcht0 256(CO2)\r
+ // prefetcht0 320(CO2)\r
+ // prefetchnta 24 * SIZE(CO1)\r
+ // prefetchnta 32 * SIZE(CO1)\r
+ // prefetchw 3 * SIZE(CO1)\r
+ vmovups %xmm0, %xmm2\r
+ // prefetchw 3 * SIZE(CO2)\r
+ // prefetchnta -16 * SIZE(BB)\r
+ // prefetch -16 * SIZE(BB)\r
+ subq $-8 * SIZE, BB\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ movq K, %rax\r
+ subq KK, %rax\r
+ movq %rax, KKK \r
+#else\r
+ movq KK, %rax\r
+#ifdef LEFT\r
+ addq $4, %rax\r
+#else\r
+ addq $2, %rax\r
+#endif\r
+ movq %rax, KKK\r
+#endif\r
+\r
+\r
+ andq $-4, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (BO, %rax, 2), BO\r
+ negq %rax\r
+ NOBRANCH\r
+ je .L56\r
+ ALIGN_4\r
+\r
+.L52:\r
+ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8\r
+ vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9\r
+ vmovups -14 * SIZE(AO, %rax, 4),%xmm2\r
+ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12\r
+ vmovups -12 * SIZE(AO, %rax, 4), %xmm0\r
+ vmovddup -14 * SIZE(BO, %rax, 2), %xmm1\r
+ vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13\r
+ vmovddup -13 * SIZE(BO, %rax, 2), %xmm5\r
+ vmovups -10 * SIZE(AO, %rax, 4), %xmm2\r
+ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8\r
+ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12\r
+ vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9\r
+ vmovups (AO, %rax, 4), %xmm0\r
+ vmovddup -8 * SIZE(BO, %rax, 2), %xmm1\r
+ vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13\r
+ vmovddup -11 * SIZE(BO, %rax, 2), %xmm5\r
+ vmovups -6 * SIZE(AO, %rax, 4), %xmm2\r
+ vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8\r
+ vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12\r
+ vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9\r
+ vmovups -4 * SIZE(AO, %rax, 4), %xmm4\r
+ vmovddup -10 * SIZE(BO, %rax, 2), %xmm3\r
+ vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13\r
+ vmovddup -9 * SIZE(BO, %rax, 2), %xmm5\r
+ vmovups -2 * SIZE(AO, %rax, 4), %xmm2\r
+ vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8\r
+ vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12\r
+ vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9\r
+ vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13\r
+ vmovups 8 * SIZE(AO, %rax, 4), %xmm4\r
+ vmovddup -4 * SIZE(BO, %rax, 2), %xmm3\r
+ vmovddup -7 * SIZE(BO, %rax, 2), %xmm5\r
+ vmovaps %xmm0, %xmm2\r
+\r
+ addq $4 * SIZE, %rax\r
+ BRANCH\r
+ jl .L52\r
+ ALIGN_4\r
+\r
+.L56:\r
+ vmovddup ALPHA, %xmm7\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ andq $3, %rax # if (k & 1)\r
+ je .L59\r
+\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (BO, %rax, 2), BO\r
+ negq %rax\r
+ ALIGN_4\r
+\r
+.L57:\r
+ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8\r
+ vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9\r
+ vmovups -14 * SIZE(AO, %rax, 4),%xmm2\r
+ vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12\r
+ vmovups -12 * SIZE(AO, %rax, 4), %xmm0\r
+ vmovddup -14 * SIZE(BO, %rax, 2), %xmm1\r
+ vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13\r
+ vmovddup -13 * SIZE(BO, %rax, 2), %xmm5\r
+ vmovaps %xmm0, %xmm2\r
+\r
+ addq $SIZE, %rax\r
+ jl .L57\r
+ ALIGN_4\r
+\r
+.L59:\r
+#ifndef TRMMKERNEL\r
+ vfmaddpd (CO1),%xmm7, %xmm8, %xmm8\r
+ vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12, %xmm12\r
+ vfmaddpd (CO2),%xmm7, %xmm9, %xmm9\r
+ vfmaddpd 2 * SIZE(CO2),%xmm7, %xmm13, %xmm13\r
+\r
+#else\r
+ vmulpd %xmm7, %xmm8,%xmm8\r
+ vmulpd %xmm7, %xmm9,%xmm9\r
+ vmulpd %xmm7, %xmm12,%xmm12\r
+ vmulpd %xmm7, %xmm13,%xmm13\r
+\r
+#endif\r
+\r
+ vmovups %xmm8, (CO1)\r
+ vmovups %xmm12, 2 * SIZE(CO1)\r
+\r
+ vmovups %xmm9, (CO2)\r
+ vmovups %xmm13, 2 * SIZE(CO2)\r
+\r
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq K, %rax\r
+ subq KKK, %rax\r
+ leaq (,%rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (BO, %rax, 2), BO\r
+#endif\r
+\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ addq $4, KK\r
+#endif\r
+\r
+ addq $4 * SIZE, CO1 # coffset += 4\r
+ addq $4 * SIZE, CO2 # coffset += 4\r
+ decq I # i --\r
+ jg .L51\r
+ ALIGN_4 \r
+\r
+.L60:\r
+ testq $2, M\r
+ je .L70\r
+ ALIGN_4\r
+\r
+.L61:\r
+#if !defined(TRMMKERNEL) || \\r
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq B, BO\r
+#else\r
+ movq KK, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (B, %rax, 2), BO\r
+#endif \r
+\r
+ vmovups -16 * SIZE(AO), %xmm0\r
+ vxorps %xmm8, %xmm8,%xmm8\r
+ vmovups -12 * SIZE(AO), %xmm2\r
+ vxorps %xmm9, %xmm9,%xmm9\r
+ vmovddup -16 * SIZE(BO), %xmm1\r
+ vxorps %xmm10, %xmm10,%xmm10\r
+ vmovddup -15 * SIZE(BO), %xmm3\r
+ vxorps %xmm11, %xmm11,%xmm11\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ movq K, %rax\r
+ subq KK, %rax\r
+ movq %rax, KKK \r
+#else\r
+ movq KK, %rax\r
+#ifdef LEFT\r
+ addq $2, %rax\r
+#else\r
+ addq $2, %rax\r
+#endif\r
+ movq %rax, KKK\r
+#endif\r
+\r
+ andq $-4, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (BO, %rax, 2), BO\r
+ negq %rax\r
+ NOBRANCH\r
+ je .L66\r
+ ALIGN_4\r
+\r
+.L62:\r
+ vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8\r
+ vmovddup -14 * SIZE(BO, %rax, 2), %xmm1\r
+ vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9\r
+ vmovups -14 * SIZE(AO, %rax, 2), %xmm0\r
+ vmovddup -13 * SIZE(BO, %rax, 2), %xmm3\r
+ vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10\r
+ vmovddup -12 * SIZE(BO, %rax, 2), %xmm1\r
+ vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11\r
+ vmovups -8 * SIZE(AO, %rax, 2), %xmm0\r
+ vmovddup -11 * SIZE(BO, %rax, 2), %xmm3\r
+ vfmaddpd %xmm8,%xmm2, %xmm1,%xmm8\r
+ vmovddup -10 * SIZE(BO, %rax, 2), %xmm1\r
+ vfmaddpd %xmm9,%xmm2, %xmm3,%xmm9\r
+ vmovups -10 * SIZE(AO, %rax, 2), %xmm2\r
+ vmovddup -9 * SIZE(BO, %rax, 2), %xmm3\r
+ vfmaddpd %xmm10,%xmm2, %xmm1,%xmm10\r
+ vmovddup -8 * SIZE(BO, %rax, 2), %xmm1\r
+ vfmaddpd %xmm11,%xmm2, %xmm3,%xmm11\r
+ vmovups -4 * SIZE(AO, %rax, 2), %xmm2\r
+ vmovddup -7 * SIZE(BO, %rax, 2), %xmm3\r
+\r
+ addq $4 * SIZE, %rax\r
+ BRANCH\r
+ jl .L62\r
+ ALIGN_4\r
+\r
+.L66:\r
+ vmovddup ALPHA, %xmm7\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ andq $3, %rax # if (k & 1)\r
+ je .L69\r
+\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (BO, %rax, 2), BO\r
+ negq %rax\r
+ ALIGN_4\r
+\r
+.L67:\r
+ vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8\r
+ vmovddup -14 * SIZE(BO, %rax, 2), %xmm1\r
+ vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9\r
+ vmovups -14 * SIZE(AO, %rax, 2), %xmm0\r
+ vmovddup -13 * SIZE(BO, %rax, 2), %xmm3\r
+\r
+ addq $SIZE, %rax\r
+ jl .L67\r
+ ALIGN_4\r
+\r
+.L69:\r
+ vaddpd %xmm10, %xmm8,%xmm8\r
+ vaddpd %xmm11, %xmm9,%xmm9\r
+ \r
+#ifndef TRMMKERNEL\r
+\r
+ vfmaddpd (CO1),%xmm7, %xmm8,%xmm8\r
+ vfmaddpd (CO2),%xmm7, %xmm9,%xmm9\r
+\r
+#else\r
+\r
+ vmulpd %xmm7, %xmm8,%xmm8\r
+ vmulpd %xmm7, %xmm9,%xmm9\r
+\r
+#endif\r
+\r
+ vmovups %xmm8, (CO1)\r
+ vmovups %xmm9, (CO2)\r
+\r
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq K, %rax\r
+ subq KKK, %rax\r
+ leaq (,%rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (BO, %rax, 2), BO\r
+#endif\r
+\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ addq $2, KK\r
+#endif\r
+\r
+	addq	$2 * SIZE, CO1		# coffset += 2\r
+	addq	$2 * SIZE, CO2		# coffset += 2\r
+ ALIGN_4 \r
+\r
+.L70:\r
+ testq $1, M\r
+ je .L79\r
+ ALIGN_4\r
+\r
+.L71:\r
+#if !defined(TRMMKERNEL) || \\r
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq B, BO\r
+#else\r
+ movq KK, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (B, %rax, 2), BO\r
+#endif \r
+\r
+ vmovddup -16 * SIZE(AO), %xmm0\r
+ vxorps %xmm8, %xmm8,%xmm8\r
+ vmovddup -15 * SIZE(AO), %xmm1\r
+ vxorps %xmm9, %xmm9,%xmm9\r
+ vmovddup -14 * SIZE(AO), %xmm2\r
+ vxorps %xmm10, %xmm10,%xmm10\r
+ vmovddup -13 * SIZE(AO), %xmm3\r
+ vxorps %xmm11, %xmm11,%xmm11\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ movq K, %rax\r
+ subq KK, %rax\r
+ movq %rax, KKK \r
+#else\r
+ movq KK, %rax\r
+#ifdef LEFT\r
+ addq $1, %rax\r
+#else\r
+ addq $2, %rax\r
+#endif\r
+ movq %rax, KKK\r
+#endif\r
+\r
+ andq $-4, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (BO, %rax, 2), BO\r
+ negq %rax\r
+ NOBRANCH\r
+ je .L76\r
+ ALIGN_4\r
+\r
+.L72:\r
+ vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8\r
+ vmovddup -12 * SIZE(AO, %rax, 1), %xmm0\r
+\r
+ vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 2), %xmm1,%xmm9\r
+ vmovddup -11 * SIZE(AO, %rax, 1), %xmm1\r
+\r
+ vfmaddpd %xmm10,-12 * SIZE(BO, %rax, 2), %xmm2,%xmm10\r
+ vmovddup -10 * SIZE(AO, %rax, 1), %xmm2\r
+\r
+ vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 2), %xmm3,%xmm11\r
+ vmovddup -9 * SIZE(AO, %rax, 1), %xmm3\r
+\r
+ addq $4 * SIZE, %rax\r
+ BRANCH\r
+ jl .L72\r
+ ALIGN_4\r
+\r
+.L76:\r
+ vmovddup ALPHA, %xmm7\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ andq $3, %rax # if (k & 1)\r
+ je .L78\r
+\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (BO, %rax, 2), BO\r
+ negq %rax\r
+ ALIGN_4\r
+\r
+.L77:\r
+ vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8\r
+ vmovddup -15 * SIZE(AO, %rax, 1), %xmm0\r
+\r
+ addq $SIZE, %rax\r
+ jl .L77\r
+ ALIGN_4\r
+\r
+.L78:\r
+ vaddpd %xmm9, %xmm8,%xmm8\r
+ vaddpd %xmm11, %xmm10,%xmm10\r
+ vaddpd %xmm10, %xmm8,%xmm8\r
+\r
+#ifndef TRMMKERNEL\r
+ vmovsd (CO1), %xmm0\r
+ vmovhpd (CO2), %xmm0,%xmm0\r
+#endif\r
+\r
+ vmulpd %xmm7, %xmm8,%xmm8\r
+\r
+#ifndef TRMMKERNEL\r
+ vaddpd %xmm0, %xmm8,%xmm8\r
+#endif\r
+\r
+ vmovsd %xmm8, (CO1)\r
+ vmovhpd %xmm8, (CO2)\r
+\r
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq K, %rax\r
+ subq KKK, %rax\r
+ leaq (,%rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (BO, %rax, 2), BO\r
+#endif\r
+\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ addq $1, KK\r
+#endif\r
+ ALIGN_4 \r
+ \r
+.L79:\r
+#if defined(TRMMKERNEL) && !defined(LEFT)\r
+ addq $2, KK\r
+#endif\r
+\r
+ movq BO, B\r
+\r
+ leaq (C, LDC, 2), C\r
+ ALIGN_4\r
+\r
+.L80:\r
+ testq $1, N\r
+ je .L999\r
+ ALIGN_4\r
+\r
+.L81:\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ movq OFFSET, %rax\r
+ movq %rax, KK\r
+#endif \r
+\r
+ movq C, CO1 # coffset1 = c\r
+ movq A, AO # aoffset = a\r
+\r
+ movq M, I\r
+ sarq $2, I # i = (m >> 2)\r
+ jle .L100\r
+ ALIGN_4\r
+\r
+.L91:\r
+#if !defined(TRMMKERNEL) || \\r
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq B, BO\r
+#else\r
+ movq KK, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (B, %rax, 1), BO\r
+#endif \r
+\r
+ vmovups -8 * SIZE(AO), %xmm2\r
+ vxorps %xmm8, %xmm8,%xmm8\r
+ vmovups -16 * SIZE(AO), %xmm0\r
+ vxorps %xmm9, %xmm9,%xmm9\r
+ vmovddup -16 * SIZE(BO), %xmm1\r
+ vxorps %xmm12, %xmm12,%xmm12\r
+ vmovddup -14 * SIZE(BO), %xmm3\r
+ vxorps %xmm13, %xmm13,%xmm13\r
+ vmovddup -15 * SIZE(BO), %xmm5\r
+\r
+ // prefetchw 3 * SIZE(CO1)\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ movq K, %rax\r
+ subq KK, %rax\r
+ movq %rax, KKK \r
+#else\r
+ movq KK, %rax\r
+#ifdef LEFT\r
+ addq $4, %rax\r
+#else\r
+ addq $1, %rax\r
+#endif\r
+ movq %rax, KKK\r
+#endif\r
+\r
+ andq $-4, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (BO, %rax, 1), BO\r
+ negq %rax\r
+ NOBRANCH\r
+ je .L96\r
+ ALIGN_4\r
+\r
+.L92:\r
+ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8\r
+ vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12\r
+ vmovapd -12 * SIZE(AO, %rax, 4), %xmm0\r
+ vmovddup -12 * SIZE(BO, %rax, 1), %xmm1\r
+ vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9\r
+ vfmaddpd %xmm13,-10 * SIZE(AO, %rax, 4), %xmm5,%xmm13\r
+ vmovapd (AO, %rax, 4), %xmm0\r
+ vmovddup -13 * SIZE(BO, %rax, 1), %xmm5\r
+ vfmaddpd %xmm8,%xmm3, %xmm2,%xmm8\r
+ vfmaddpd %xmm12,-6 * SIZE(AO, %rax, 4), %xmm3,%xmm12\r
+ vmovapd -4 * SIZE(AO, %rax, 4), %xmm2\r
+ vmovddup -10 * SIZE(BO, %rax, 1), %xmm3\r
+ vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9\r
+ vfmaddpd %xmm13,-2 * SIZE(AO, %rax, 4), %xmm5,%xmm13\r
+ vmovapd 8 * SIZE(AO, %rax, 4), %xmm2\r
+ vmovddup -11 * SIZE(BO, %rax, 1), %xmm5\r
+\r
+ addq $4 * SIZE, %rax\r
+ BRANCH\r
+ jl .L92\r
+ ALIGN_4\r
+\r
+.L96:\r
+ vmovddup ALPHA, %xmm7\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ andq $3, %rax # if (k & 1)\r
+ je .L99\r
+\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (BO, %rax, 1), BO\r
+ negq %rax\r
+ ALIGN_4\r
+\r
+.L97:\r
+ vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8\r
+ vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12\r
+ vmovups -12 * SIZE(AO, %rax, 4), %xmm0\r
+ vmovddup -15 * SIZE(BO, %rax, 1), %xmm1\r
+\r
+ addq $SIZE, %rax\r
+ jl .L97\r
+ ALIGN_4\r
+\r
+.L99:\r
+ vaddpd %xmm9, %xmm8,%xmm8\r
+ vaddpd %xmm13, %xmm12,%xmm12\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ vfmaddpd (CO1),%xmm7, %xmm8,%xmm8\r
+ vfmaddpd 2 * SIZE(CO1),%xmm7,%xmm12,%xmm12\r
+\r
+#else\r
+ vmulpd %xmm7, %xmm8,%xmm8\r
+ vmulpd %xmm7, %xmm12,%xmm12\r
+\r
+#endif\r
+\r
+ vmovups %xmm8, (CO1)\r
+ vmovups %xmm12, 2 * SIZE(CO1)\r
+\r
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq K, %rax\r
+ subq KKK, %rax\r
+ leaq (,%rax, SIZE), %rax\r
+ leaq (AO, %rax, 4), AO\r
+ leaq (BO, %rax, 1), BO\r
+#endif\r
+\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ addq $4, KK\r
+#endif\r
+\r
+ addq $4 * SIZE, CO1 # coffset += 4\r
+ decq I # i --\r
+ jg .L91\r
+ ALIGN_4 \r
+\r
+.L100:\r
+ testq $2, M\r
+ je .L110\r
+ ALIGN_4\r
+\r
+.L101:\r
+#if !defined(TRMMKERNEL) || \\r
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq B, BO\r
+#else\r
+ movq KK, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (B, %rax, 1), BO\r
+#endif \r
+\r
+ vmovddup -16 * SIZE(BO), %xmm0\r
+ vxorps %xmm8, %xmm8,%xmm8\r
+ vmovddup -15 * SIZE(BO), %xmm1\r
+ vxorps %xmm9, %xmm9,%xmm9\r
+ vmovddup -14 * SIZE(BO), %xmm2\r
+ vxorps %xmm10, %xmm10,%xmm10\r
+ vmovddup -13 * SIZE(BO), %xmm3\r
+ vxorps %xmm11, %xmm11,%xmm11\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ movq K, %rax\r
+ subq KK, %rax\r
+ movq %rax, KKK \r
+#else\r
+ movq KK, %rax\r
+#ifdef LEFT\r
+ addq $2, %rax\r
+#else\r
+ addq $1, %rax\r
+#endif\r
+ movq %rax, KKK\r
+#endif\r
+\r
+ andq $-4, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (BO, %rax, 1), BO\r
+ negq %rax\r
+ NOBRANCH\r
+ je .L106\r
+ ALIGN_4\r
+\r
+.L102:\r
+ vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8\r
+ vmovddup -12 * SIZE(BO, %rax, 1), %xmm0\r
+\r
+ vfmaddpd %xmm9,-14 * SIZE(AO, %rax, 2), %xmm1,%xmm9\r
+ vmovddup -11 * SIZE(BO, %rax, 1), %xmm1\r
+\r
+ vfmaddpd %xmm10,-12 * SIZE(AO, %rax, 2), %xmm2,%xmm10\r
+ vmovddup -10 * SIZE(BO, %rax, 1), %xmm2\r
+\r
+ vfmaddpd %xmm11,-10 * SIZE(AO, %rax, 2), %xmm3,%xmm11\r
+ vmovddup -9 * SIZE(BO, %rax, 1), %xmm3\r
+\r
+ addq $4 * SIZE, %rax\r
+ BRANCH\r
+ jl .L102\r
+ ALIGN_4\r
+\r
+.L106:\r
+ vmovddup ALPHA, %xmm7\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ andq $3, %rax # if (k & 1)\r
+ je .L109\r
+\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (BO, %rax, 1), BO\r
+ negq %rax\r
+ ALIGN_4\r
+\r
+.L107:\r
+ vmovddup -16 * SIZE(BO, %rax, 1), %xmm0\r
+ vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8\r
+\r
+ addq $SIZE, %rax\r
+ jl .L107\r
+ ALIGN_4\r
+\r
+.L109:\r
+ vaddpd %xmm9, %xmm8,%xmm8\r
+ vaddpd %xmm11, %xmm10,%xmm10\r
+ vaddpd %xmm10, %xmm8,%xmm8\r
+\r
+#ifndef TRMMKERNEL\r
+\r
+ vfmaddpd (CO1),%xmm7, %xmm8,%xmm8\r
+#else\r
+ vmulpd %xmm7, %xmm8,%xmm8\r
+\r
+#endif\r
+\r
+ vmovups %xmm8, (CO1)\r
+\r
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq K, %rax\r
+ subq KKK, %rax\r
+ leaq (,%rax, SIZE), %rax\r
+ leaq (AO, %rax, 2), AO\r
+ leaq (BO, %rax, 1), BO\r
+#endif\r
+\r
+#if defined(TRMMKERNEL) && defined(LEFT)\r
+ addq $2, KK\r
+#endif\r
+\r
+	addq	$2 * SIZE, CO1		# coffset += 2\r
+\r
+ ALIGN_4 \r
+\r
+.L110:\r
+ testq $1, M\r
+ je .L999\r
+ ALIGN_4\r
+\r
+.L111:\r
+#if !defined(TRMMKERNEL) || \\r
+ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \\r
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))\r
+ movq B, BO\r
+#else\r
+ movq KK, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (B, %rax, 1), BO\r
+#endif \r
+\r
+ vmovups -16 * SIZE(AO), %xmm0\r
+ vxorps %xmm8, %xmm8,%xmm8\r
+ movups -14 * SIZE(AO), %xmm1\r
+ vxorps %xmm9, %xmm9,%xmm9\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))\r
+ movq K, %rax\r
+ subq KK, %rax\r
+ movq %rax, KKK \r
+#else\r
+ movq KK, %rax\r
+#ifdef LEFT\r
+ addq $1, %rax\r
+#else\r
+ addq $1, %rax\r
+#endif\r
+ movq %rax, KKK\r
+#endif\r
+\r
+ andq $-4, %rax\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (BO, %rax, 1), BO\r
+ negq %rax\r
+ NOBRANCH\r
+ je .L116\r
+ ALIGN_4\r
+\r
+.L112:\r
+ vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 1), %xmm0,%xmm8\r
+ vmovups -12 * SIZE(AO, %rax, 1), %xmm0\r
+\r
+ vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 1), %xmm1,%xmm9\r
+ vmovups -10 * SIZE(AO, %rax, 1), %xmm1\r
+\r
+ addq $4 * SIZE, %rax\r
+ BRANCH\r
+ jl .L112\r
+ ALIGN_4\r
+\r
+.L116:\r
+ vmovddup ALPHA, %xmm7\r
+\r
+#ifndef TRMMKERNEL\r
+ movq K, %rax\r
+#else\r
+ movq KKK, %rax\r
+#endif\r
+ andq $3, %rax # if (k & 1)\r
+ je .L118\r
+\r
+ leaq (, %rax, SIZE), %rax\r
+ leaq (AO, %rax, 1), AO\r
+ leaq (BO, %rax, 1), BO\r
+ negq %rax\r
+ ALIGN_4\r
+\r
+.L117:\r
+ vmulsd -16 * SIZE(BO, %rax, 1), %xmm0,%xmm0\r
+ vaddsd %xmm0, %xmm8,%xmm8\r
+ vmovsd -15 * SIZE(AO, %rax, 1), %xmm0\r
+\r
+ addq $SIZE, %rax\r
+ jl .L117\r
+ ALIGN_4\r
+\r
+.L118:\r
+ vaddpd %xmm9, %xmm8,%xmm8\r
+ vhaddpd %xmm8, %xmm8,%xmm8\r
+\r
+#ifndef TRMMKERNEL\r
+ vmovsd (CO1), %xmm0\r
+#endif\r
+\r
+ vmulsd %xmm7, %xmm8,%xmm8\r
+\r
+#ifndef TRMMKERNEL\r
+ vaddsd %xmm0, %xmm8,%xmm8\r
+#endif\r
+\r
+ vmovsd %xmm8, (CO1)\r
+ ALIGN_4\r
+ \r
+.L999:\r
+ movq (%rsp), %rbx\r
+ movq 8(%rsp), %rbp\r
+ movq 16(%rsp), %r12\r
+ movq 24(%rsp), %r13\r
+ movq 32(%rsp), %r14\r
+ movq 40(%rsp), %r15\r
+\r
+#ifdef WINDOWS_ABI\r
+ movq 48(%rsp), %rdi\r
+ movq 56(%rsp), %rsi\r
+ movups 64(%rsp), %xmm6\r
+ movups 80(%rsp), %xmm7\r
+ movups 96(%rsp), %xmm8\r
+ movups 112(%rsp), %xmm9\r
+ movups 128(%rsp), %xmm10\r
+ movups 144(%rsp), %xmm11\r
+ movups 160(%rsp), %xmm12\r
+ movups 176(%rsp), %xmm13\r
+ movups 192(%rsp), %xmm14\r
+ movups 208(%rsp), %xmm15\r
+#endif\r
+\r
+ addq $STACKSIZE, %rsp\r
+ ret\r
+\r
+ EPILOGUE\r
#endif
movsd -32 * SIZE(Y), %xmm8
- pshufd $0x39, %xmm4, %xmm5
+ pshufd $0x29, %xmm4, %xmm5
mulps %xmm8, %xmm5
addps %xmm5, %xmm3
xorps %xmm5, %xmm5
movhlps %xmm4, %xmm5
- mulps -32 * SIZE(Y), %xmm5
+ movlps -32 * SIZE(Y), %xmm4
+ mulps %xmm4, %xmm5
addps %xmm5, %xmm0
addq $2 * SIZE, X
movsd -32 * SIZE(Y), %xmm8
movss %xmm5, %xmm4
- shufps $0x93, %xmm5, %xmm4
+ shufps $0x93, %xmm4, %xmm4
mulps %xmm8, %xmm4
addps %xmm4, %xmm3
.L22:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
movaps 4 * SIZE(BO), %xmm9
addps %xmm8, %xmm3
movaps 0 * SIZE(AO), %xmm8
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif
mulps %xmm10, %xmm9
.L32:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
movsd 4 * SIZE(BO), %xmm9
.L42:
mulss %xmm8, %xmm9
addss %xmm9, %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
movss 4 * SIZE(BO), %xmm9
.L62:
mulps %xmm8, %xmm9
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
mulps 4 * SIZE(BO), %xmm8
addps %xmm8, %xmm5
movaps 32 * SIZE(AO), %xmm8
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif
mulps %xmm10, %xmm11
addps %xmm10, %xmm5
movaps 48 * SIZE(AO), %xmm10
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
#endif
mulps %xmm12, %xmm13
addps %xmm12, %xmm5
movaps 64 * SIZE(AO), %xmm12
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
#endif
mulps %xmm14, %xmm15
.L72:
mulps %xmm8, %xmm9
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
.L82:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
movsd 4 * SIZE(BO), %xmm9
.L92:
mulps %xmm8, %xmm9
addps %xmm9, %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
movss 4 * SIZE(BO), %xmm9
.L112:
mulps %xmm9, %xmm8
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
addps %xmm9, %xmm4
movaps 8 * SIZE(BO), %xmm9
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif
mulps %xmm9, %xmm10
addps %xmm9, %xmm4
movaps 32 * SIZE(BO), %xmm9
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
#endif
mulps %xmm11, %xmm12
addps %xmm11, %xmm4
movaps 24 * SIZE(BO), %xmm11
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
#endif
mulps %xmm11, %xmm14
.L122:
mulps %xmm8, %xmm9
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
movaps -28 * SIZE(AO), %xmm8
addps %xmm8, %xmm3
movaps 0 * SIZE(AO), %xmm8
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
#endif
mulps %xmm10, %xmm11
.L132:
mulps %xmm8, %xmm9
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
movsd -30 * SIZE(AO), %xmm8
.L142:
mulss %xmm8, %xmm9
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
#endif
movss -31 * SIZE(AO), %xmm8
#define ASSEMBLER
#include "common.h"
-#if defined(BARCELONA) || defined(SHANGHAI)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCHSIZE (12 + 4)
#define WPREFETCHSIZE (48 + 4)
#define MOVNTQ MOVQ
#define AO3 %r13
#define AO4 %rax
-#if defined(BARCELONA) || defined(SHANGHAI)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCH prefetch
#else
#define RPREFETCH prefetch
#define ASSEMBLER
#include "common.h"
-#if defined(BARCELONA) || defined(SHANGHAI)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCHSIZE (12 + 4)
#define WPREFETCHSIZE (12 + 4)
#define MOVNTQ MOVQ
#endif
-#if defined(BARCELONA) || defined(SHANGHAI)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCH prefetch
#else
#define RPREFETCH prefetch
ALIGN_4
.L71:
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
prefetch PREFETCHSIZE * SIZE(X)
#endif
sarq $5, I
jle .L113
-#if defined(BARCELONA) || defined(SHANGHAI)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
movaps %xmm0, %xmm1
mulps -32 * SIZE(X), %xmm1
sarq $4, I
jle .L113
-#if defined(BARCELONA) || defined(SHANGHAI)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
movaps %xmm0, %xmm1
mulpd -16 * SIZE(X), %xmm1
-/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
#ifndef WINDOWS_ABI
-#define STACKSIZE 64
+#define STACKSIZE 128
#define OLD_M %rdi
#define OLD_N %rsi
#define STACK_Y 16 + STACKSIZE(%rsp)
#define STACK_INCY 24 + STACKSIZE(%rsp)
#define STACK_BUFFER 32 + STACKSIZE(%rsp)
+#define MMM 56(%rsp)
+#define NN 64(%rsp)
+#define AA 72(%rsp)
+#define LDAX 80(%rsp)
#else
#define STACK_Y 72 + STACKSIZE(%rsp)
#define STACK_INCY 80 + STACKSIZE(%rsp)
#define STACK_BUFFER 88 + STACKSIZE(%rsp)
+#define MMM 216(%rsp)
+#define NN 224(%rsp)
+#define AA 232(%rsp)
+#define LDAX 240(%rsp)
#endif
movups %xmm14, 192(%rsp)
movups %xmm15, 208(%rsp)
- movq OLD_M, M
- movq OLD_N, N
- movq OLD_A, A
- movq OLD_LDA, LDA
+ movq OLD_M, MMM
+ movq OLD_N, NN
+ movq OLD_A, X
+ movq X, AA
+ movq OLD_LDA, X
+ movq X, LDAX
movq OLD_X, X
#else
- movq OLD_M, M
- movq OLD_N, N
- movq OLD_A, A
- movq OLD_LDA, LDA
+ movq OLD_M, MMM
+ movq OLD_N, NN
+ movq OLD_A, AA
+ movq OLD_LDA, LDAX
#endif
-
- movq STACK_INCX, INCX
- movq STACK_Y, Y
- movq STACK_INCY, INCY
- movq STACK_BUFFER, BUFFER
-
#ifndef WINDOWS_ABI
pshufd $0, %xmm0, ALPHA
#else
pshufd $0, %xmm3, ALPHA
#endif
+
+.L0t:
+ xorq M,M
+ addq $1,M
+ salq $22,M
+ subq M,MMM
+ jge .L00t
+ ALIGN_4
+
+ movq MMM,%rax
+ addq M,%rax
+ jle .L999x
+ movq %rax,M
+
+.L00t:
+ movq LDAX,LDA
+ movq NN,N
+ movq AA,A
+ movq STACK_INCX, INCX
+ movq STACK_Y, Y
+ movq STACK_INCY, INCY
+ movq STACK_BUFFER, BUFFER
+
leaq (,INCX, SIZE), INCX
leaq (,INCY, SIZE), INCY
leaq (,LDA, SIZE), LDA
ALIGN_4
.L999:
+ leaq (,M,SIZE),%rax
+ addq %rax,AA
+ jmp .L0t
+ ALIGN_4
+
+.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12
#define movsd movlps
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define movsd movlps
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define PREFETCHW prefetcht0
#endif
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define movsd movlps
#define PREFETCHW prefetcht0
#endif
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define movsd movlps
#define PREFETCHW prefetcht0
#endif
-#if defined(OPTERON) || defined(BARCELONA)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define movsd movlps
movsd -32 * SIZE(X), %xmm4
pshufd $0xb1, %xmm4, %xmm12
- shufps $0x39, %xmm8, %xmm8
+ shufps $0x59, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
mulps %xmm8, %xmm12
movss %xmm9, %xmm8
pshufd $0xb1, %xmm4, %xmm12
- shufps $0x93, %xmm8, %xmm8
+ shufps $0x03, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
mulps %xmm8, %xmm12
movsd -32 * SIZE(Y), %xmm4
pshufd $0xb1, %xmm4, %xmm12
- shufps $0x39, %xmm8, %xmm8
+ shufps $0xa9, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
mulps %xmm8, %xmm12
movss %xmm9, %xmm8
pshufd $0xb1, %xmm4, %xmm12
- shufps $0x93, %xmm8, %xmm8
+ shufps $0x03, %xmm8, %xmm8
mulps %xmm8, %xmm4
addps %xmm4, %xmm0
mulps %xmm8, %xmm12
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define RPREFETCHSIZE 32
#define WPREFETCHSIZE 48
#endif
#define a3 %xmm14
#define xt1 %xmm15
-#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define a3 %xmm14
#define xt1 %xmm15
-#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
+#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define xt1 %xmm14
#define xt2 %xmm15
-#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
+#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define a3 %xmm14
#define xt1 %xmm15
-#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
+#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else
#define BORIG 72(%rsp)
#define BUFFER 128(%rsp)
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta
#define PREFETCHSIZE (8 * 6 + 4)
#endif
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta
#define BORIG 72(%rsp)
#define BUFFER 128(%rsp)
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta
#define PREFETCHSIZE (8 * 6 + 4)
#endif
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta
#define BORIG 72(%rsp)
#define BUFFER 128(%rsp)
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta
#define PREFETCHSIZE (8 * 6 + 4)
#endif
-#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta
#define ALIGNED_ACCESS
#endif
+#ifdef BULLDOZER
+#define PREFETCH prefetch
+#define PREFETCHW prefetchw
+#define PREFETCHSIZE (128 * 5)
+#define ALIGNED_ACCESS
+#endif
+
#ifdef NANO
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define movsd movlps
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define ALIGNED_ACCESS
#define MOVUPS_A movaps
#define MOVUPS_XL movaps
#endif
#define GEMM_PQ MAX(GEMM_P, GEMM_Q)
-#define REAL_GEMM_R (GEMM_R - GEMM_PQ)
+
+// Leave some headroom for GEMM_ALIGN adjustment of the sb2 buffer.
+#define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ)
#if 0
#define SHARED_ARRAY
sa,
sb2,
a + (is + js * lda) * COMPSIZE, lda,
- - is + js);
+ is - js);
#endif
}
LOADER = $(FORTRAN)
TIMER = NONE
ARCHFLAGS= -ru
-RANLIB = ranlib
+#RANLIB = ranlib
BLASLIB =
TMGLIB = tmglib.a
EIGSRCLIB = eigsrc.a
/* C99 supports complex floating numbers natively, which GCC also offers as an
extension since version 3.0. If neither are available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
-#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3
+#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
+ (__GNUC__ >= 3 && !defined(__cplusplus)))
#define OPENBLAS_COMPLEX_C99
#include <complex.h>
typedef float _Complex openblas_complex_float;
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define SNUMOPT 8
#define DNUMOPT 4