# TARGET = PENRYN
# If you want to support multiple architectures in one binary
-# DYNAMIC_ARCH = 1
+DYNAMIC_ARCH = 1
# C compiler including binary type(32bit / 64bit). Default is gcc.
# Don't use the Intel Compiler or PGI; they won't generate the code I expect.
ifeq ($(TARGET), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
+ifeq ($(TARGET), STEAMROLLER)
+GETARCH_FLAGS := -DFORCE_BARCELONA
+endif
endif
ifeq ($(TARGET_CORE), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
+ifeq ($(TARGET_CORE), STEAMROLLER)
+GETARCH_FLAGS := -DFORCE_BARCELONA
+endif
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER
endif
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL
- **AMD Bobcat**: Uses GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
+- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
BOBCAT
BULLDOZER
PILEDRIVER
+STEAMROLLER
c)VIA CPU:
SSE_GENERIC
#define MMXSTORE movd
#endif
-#if defined(PILEDRIVER) || defined(BULLDOZER)
+#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
//Enable some optimizations for Barcelona.
#define BARCELONA_OPTIMIZATION
#endif
#ifdef ASSEMBLER
-#if defined(PILEDRIVER) || defined(BULLDOZER)
+#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER)
//Enable some optimizations for Barcelona.
#define BARCELONA_OPTIMIZATION
#endif
#define CORE_ATOM 18
#define CORE_NANO 19
#define CORE_SANDYBRIDGE 20
-#define CORE_BOBCAT 21
-#define CORE_BULLDOZER 22
+#define CORE_BOBCAT 21
+#define CORE_BULLDOZER 22
#define CORE_PILEDRIVER 23
-#define CORE_HASWELL 24
+#define CORE_HASWELL 24
+#define CORE_STEAMROLLER 25
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46
#define CPUTYPE_PILEDRIVER 47
-#define CPUTYPE_HASWELL 48
+#define CPUTYPE_HASWELL 48
+#define CPUTYPE_STEAMROLLER 49
#endif
return CPUTYPE_PILEDRIVER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
+ case 0:
+ if(support_avx())
+ return CPUTYPE_STEAMROLLER;
+ else
+ return CPUTYPE_BARCELONA; //OS don't support AVX.
+
}
break;
case 5:
"BULLDOZER",
"PILEDRIVER",
"HASWELL",
+ "STEAMROLLER",
};
static char *lowercpuname[] = {
"bulldozer",
"piledriver",
"haswell",
+ "steamroller",
};
static char *corename[] = {
"BULLDOZER",
"PILEDRIVER",
"HASWELL",
+ "STEAMROLLER",
};
static char *corename_lower[] = {
"bulldozer",
"piledriver",
"haswell",
+ "steamroller",
};
return CORE_PILEDRIVER;
else
return CORE_BARCELONA; //OS don't support AVX.
+
+ case 0:
+ if(support_avx())
+ return CORE_STEAMROLLER;
+ else
+ return CORE_BARCELONA; //OS don't support AVX.
}
+
+
}else return CORE_BARCELONA;
}
}
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
+extern gotoblas_t gotoblas_STEAMROLLER;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#else
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
+#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#endif
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
+ }else if(model == 0){
+ //AMD STEAMROLLER
+ if(support_avx())
+ return &gotoblas_STEAMROLLER;
+ else{
+ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
+ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+ }
}
+
+
} else {
return &gotoblas_BARCELONA;
}
"Bulldozer",
"Piledriver",
"Haswell",
+ "Steamroller",
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_HASWELL) return corename[20];
+ if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
return corename[0];
}
switch (found)
{
+ case 21: return (&gotoblas_STEAMROLLER);
case 20: return (&gotoblas_HASWELL);
case 19: return (&gotoblas_PILEDRIVER);
case 18: return (&gotoblas_BULLDOZER);
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
- defined(PILEDRIVER) || defined(HASWELL)
+ defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
env_var_t p;
int factor;
-#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
int size = 16;
#else
int size = get_L2_size();
#define CORENAME "PILEDRIVER"
#endif
+#if defined (FORCE_STEAMROLLER) /* Build settings selected when FORCE_STEAMROLLER is defined: architecture/core names, library name and the ARCHCONFIG flag string (core macro, cache and DTB sizes, HAVE_* feature macros). */
+#define FORCE
+#define FORCE_INTEL /* NOTE(review): defined although the target is an AMD core; presumably mirrors the other FORCE_* blocks - confirm */
+#define ARCHITECTURE "X86"
+#define SUBARCHITECTURE "STEAMROLLER"
+#define ARCHCONFIG "-DSTEAMROLLER " \
+ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
+ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
+ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
+ "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+#define LIBNAME "steamroller"
+#define CORENAME "STEAMROLLER"
+#endif /* FORCE_STEAMROLLER */
+
+
#ifdef FORCE_SSE_GENERIC
#define FORCE
#define FORCE_INTEL
#endif
#endif
+#ifdef STEAMROLLER /* Steamroller: store the compile-time *GEMM_DEFAULT_P values in TABLE_NAME's *gemm_p fields. */
+
+#ifdef DEBUG
+ fprintf(stderr, "Steamroller\n"); /* DEBUG-only marker naming the branch that was compiled in */
+#endif
+
+ TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
+ TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
+ TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
+ TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
+#ifdef EXPRECISION /* the q/x entries are set only when EXPRECISION is defined */
+ TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
+ TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
+#endif /* EXPRECISION */
+#endif /* STEAMROLLER */
+
+
#ifdef NANO
#ifdef DEBUG
--- /dev/null
+SGEMVNKERNEL = sgemv_n_4.c
+SGEMVTKERNEL = sgemv_t_4.c
+# ZGEMV: assembly source for N, C source for T.
+ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVTKERNEL = zgemv_t_4.c
+# DGEMV reuses the Bulldozer assembly sources.
+DGEMVNKERNEL = dgemv_n_bulldozer.S
+DGEMVTKERNEL = dgemv_t_bulldozer.S
+# DDOT and DCOPY also reuse the Bulldozer assembly sources.
+DDOTKERNEL = ddot_bulldozer.S
+DCOPYKERNEL = dcopy_bulldozer.S
+# SGEMM: Piledriver 16x2 kernel; IN/IT copies from ../generic, ON/OT copies from Bulldozer sources.
+SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S
+SGEMMINCOPY = ../generic/gemm_ncopy_16.c
+SGEMMITCOPY = ../generic/gemm_tcopy_16.c
+SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
+SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# DGEMM: Piledriver 8x2 kernel; all four copy routines are Bulldozer assembly.
+DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S
+DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
+DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
+DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
+DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# CGEMM and ZGEMM: Piledriver kernels with generic C copy routines; ZGEMM leaves its IN/IT copy entries empty.
+CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S
+ZGEMMINCOPY =
+ZGEMMITCOPY =
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPYOBJ =
+ZGEMMITCOPYOBJ =
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# GEMM3M kernels come from the Barcelona sources.
+CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
+ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
+# TRSM: generic C kernels throughout, except DTRSM LT/RN which use Bulldozer 8x2 assembly.
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
+DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "ddot_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "ddot_microk_nehalem-2.c"
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "sdot_microk_bulldozer-2.c"
#elif defined(NEHALEM)
#include "sdot_microk_nehalem-2.c"
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "sgemv_n_microk_bulldozer-4.c"
#elif defined(NEHALEM)
#include "sgemv_n_microk_nehalem-4.c"
#if defined(NEHALEM)
#include "sgemv_t_microk_nehalem-4.c"
-#elif defined(BULLDOZER) || defined(PILEDRIVER)
+#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "sgemv_t_microk_bulldozer-4.c"
#elif defined(SANDYBRIDGE)
#include "sgemv_t_microk_sandy-4.c"
#include "common.h"
-#if defined(BULLDOZER) || defined(PILEDRIVER)
+#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
#include "zgemv_t_microk_bulldozer-4.c"
#elif defined(HASWELL)
#include "zgemv_t_microk_haswell-4.c"
#endif
+#ifdef STEAMROLLER /* Default tuning macros used when STEAMROLLER is defined: GEMM buffer offsets/alignment, unroll factors, and P/Q/R values. */
+#define SNUMOPT 8
+#define DNUMOPT 4
+
+#define GEMM_DEFAULT_OFFSET_A 64
+#define GEMM_DEFAULT_OFFSET_B 832
+#define GEMM_DEFAULT_ALIGN 0x0fffUL
+
+
+
+#define QGEMM_DEFAULT_UNROLL_N 2 /* these four N-unrolls are set before the ARCH_X86 split, so they apply to both branches */
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+#ifdef ARCH_X86 /* unroll factors used when ARCH_X86 is defined */
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#define XGEMM_DEFAULT_UNROLL_M 1
+#else /* !ARCH_X86. NOTE(review): the GEMM3M unrolls and GEMV_UNROLL are defined only in this branch - confirm that is intended */
+#define SGEMM_DEFAULT_UNROLL_N 2
+#define DGEMM_DEFAULT_UNROLL_N 2
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define XGEMM_DEFAULT_UNROLL_M 1
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
+#define ZGEMM3M_DEFAULT_UNROLL_N 4
+#define ZGEMM3M_DEFAULT_UNROLL_M 4
+#define GEMV_UNROLL 8
+#endif /* ARCH_X86 */
+
+#if defined(ARCH_X86_64) /* S/D/Z/C P values differ by ARCH_X86_64; the Q/X P values after this conditional are shared */
+#define SGEMM_DEFAULT_P 768
+#define DGEMM_DEFAULT_P 768
+#define ZGEMM_DEFAULT_P 384
+#define CGEMM_DEFAULT_P 768
+#else
+#define SGEMM_DEFAULT_P 448
+#define DGEMM_DEFAULT_P 480
+#define ZGEMM_DEFAULT_P 112
+#define CGEMM_DEFAULT_P 224
+#endif /* ARCH_X86_64 */
+#define QGEMM_DEFAULT_P 112
+#define XGEMM_DEFAULT_P 56
+
+#if defined(ARCH_X86_64) /* S/D/Z/C Q values differ by ARCH_X86_64; the Q/X Q values after this conditional are shared */
+#define SGEMM_DEFAULT_Q 192
+#define DGEMM_DEFAULT_Q 168
+#define ZGEMM_DEFAULT_Q 168
+#define CGEMM_DEFAULT_Q 168
+#else
+#define SGEMM_DEFAULT_Q 224
+#define DGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 224
+#define CGEMM_DEFAULT_Q 224
+#endif /* ARCH_X86_64 */
+#define QGEMM_DEFAULT_Q 224
+#define XGEMM_DEFAULT_Q 224
+
+#define CGEMM3M_DEFAULT_P 448
+#define ZGEMM3M_DEFAULT_P 224
+#define XGEMM3M_DEFAULT_P 112
+#define CGEMM3M_DEFAULT_Q 224
+#define ZGEMM3M_DEFAULT_Q 224
+#define XGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_R 12288
+#define ZGEMM3M_DEFAULT_R 12288
+#define XGEMM3M_DEFAULT_R 12288
+
+#define SGEMM_DEFAULT_R 12288
+#define QGEMM_DEFAULT_R qgemm_r /* not a literal: expands to the identifier qgemm_r, which is declared outside this view */
+#define DGEMM_DEFAULT_R 12288
+#define CGEMM_DEFAULT_R cgemm_r /* likewise expands to an identifier rather than a constant */
+#define ZGEMM_DEFAULT_R zgemm_r
+#define XGEMM_DEFAULT_R xgemm_r
+
+#define SYMV_P 16
+#define HAVE_EXCLUSIVE_CACHE
+
+#define GEMM_THREAD gemm_thread_mn
+
+#endif /* STEAMROLLER */
+
+
#ifdef ATHLON
#define SNUMOPT 4