DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
endif
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
endif
endif
#define HAVE_MISALIGNSSE (1 << 15)
#define HAVE_128BITFPU (1 << 16)
#define HAVE_FASTMOVU (1 << 17)
-#define HAVE_AVX (1 << 18)
+#define HAVE_AVX (1 << 18)
+#define HAVE_FMA4 (1 << 19)
#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
#ifdef NO_AVX
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM
+#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
+#define CORE_BULLDOZER CORE_BARCELONA
#endif
#ifndef CPUIDEMU
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A;
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE;
+#ifndef NO_AVX
+ if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4;
+#endif
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
}
return CPUTYPE_OPTERON;
case 1:
case 10:
- case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
return CPUTYPE_BARCELONA;
+ case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+ //Report Bulldozer only when support_avx() succeeds; otherwise report Barcelona.
+ if(support_avx())
+ return CPUTYPE_BULLDOZER;
+ else
+ return CPUTYPE_BARCELONA; //OS doesn't support AVX.
case 5:
return CPUTYPE_BOBCAT;
}
if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;
- else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
- else return CORE_BARCELONA;
+ else if (exfamily == 6) {
+ //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+ //Report the Bulldozer core only when support_avx() succeeds.
+ if(support_avx())
+ return CORE_BULLDOZER;
+ else
+ return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels.
+ }else return CORE_BARCELONA;
}
}
extern gotoblas_t gotoblas_BOBCAT;
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
+extern gotoblas_t gotoblas_BULLDOZER;
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
+#define gotoblas_BULLDOZER gotoblas_BARCELONA
#endif
else return &gotoblas_OPTERON;
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
+ } else if (exfamily == 6) {
+ //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+ //Select the Bulldozer kernel table only when support_avx() succeeds.
+ if(support_avx())
+ return &gotoblas_BULLDOZER;
+ else{
+ //Warn on stderr, then fall back to the Barcelona kernel table.
+ fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n");
+ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+ }
} else {
return &gotoblas_BARCELONA;
}
"Nano",
"Sandybridge",
"Bobcat",
+ "Bulldozer",
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_NANO) return corename[15];
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
+ if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
return corename[0];
}
#define CORENAME "OPTERON"
#endif
-#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER)
+#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define CORENAME "BOBCAT"
#endif
+#if defined (FORCE_BULLDOZER)
+/* Forced-target configuration used when FORCE_BULLDOZER is defined:
+ * sets the architecture/sub-architecture names, the compiler defines
+ * passed on via ARCHCONFIG, and the library/core names. */
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE "X86"
+#define SUBARCHITECTURE "BULLDOZER"
+/* ARCHCONFIG is built from adjacent string literals, which the compiler
+ * concatenates with no separator: every piece except the last must end
+ * in a space, or two -D options fuse into one bogus token.
+ * NOTE(review): this passes -DBARCELONA rather than -DBULLDOZER;
+ * presumably intentional while Barcelona kernels are reused - confirm. */
+#define ARCHCONFIG "-DBARCELONA " \
+ "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
+ "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
+ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
+ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
+ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
+ "-DHAVE_AVX -DHAVE_FMA4"
+#define LIBNAME "bulldozer"
+#define CORENAME "BULLDOZER"
+#endif
+
+
#ifdef FORCE_SSE_GENERIC
#define FORCE
#define FORCE_INTEL
--- /dev/null
+# Kernel source selection for a newly added kernel list
+# (presumably the BULLDOZER target - confirm against the file name).
+# All GEMM and GEMM3M kernels named below are *_barcelona.S sources and the
+# TRSM kernels are *_sse.S / *_sse2.S sources; nothing here is
+# Bulldozer-specific.
+#
+# Single-precision real GEMM: 4x4 kernel. The inner copy entries are left
+# empty (presumably because the 4x4 blocking lets the outer copies serve
+# both roles - TODO confirm).
+SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
+SGEMMINCOPY =
+SGEMMITCOPY =
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# Double-precision real GEMM: 2x4 kernel with separate inner (2) and
+# outer (4) generic copy routines.
+DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
+DGEMMINCOPY = ../generic/gemm_ncopy_2.c
+DGEMMITCOPY = ../generic/gemm_tcopy_2.c
+DGEMMONCOPY = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# Single-precision complex GEMM: 2x2 kernel, inner copy entries left empty.
+CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
+CGEMMINCOPY =
+CGEMMITCOPY =
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMINCOPYOBJ =
+CGEMMITCOPYOBJ =
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# Double-precision complex GEMM: 1x2 kernel with separate inner (1) and
+# outer (2) generic copy routines.
+ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
+ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# TRSM kernels. In every precision the RN variant is built from the LT source.
+STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
+STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
+STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
+STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
+
+DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
+DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
+DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
+DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
+
+CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
+CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
+CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
+CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
+
+# NOTE(review): unlike the other precisions, LN also uses the LT source
+# here - confirm this is intended.
+ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
+
+# GEMM3M kernels for the complex types.
+CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
+ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S
--- /dev/null
+# Kernel source selection for a newly added kernel list
+# (presumably the BULLDOZER target - confirm against the file name).
+# All GEMM, GEMM3M and double-precision TRSM kernels named below are
+# *_barcelona.S sources; nothing here is Bulldozer-specific.
+#
+# Double-precision complex GEMV kernels.
+ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVTKERNEL = zgemv_t_dup.S
+
+# Single-precision real GEMM: 8x4 kernel with generic inner (8) copies and
+# *_opteron.S outer (4) copies.
+SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
+SGEMMINCOPY = ../generic/gemm_ncopy_8.c
+SGEMMITCOPY = ../generic/gemm_tcopy_8.c
+SGEMMONCOPY = gemm_ncopy_4_opteron.S
+SGEMMOTCOPY = gemm_tcopy_4_opteron.S
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# Double-precision real GEMM: 4x4 kernel. The inner copy entries are left
+# empty (presumably because the 4x4 blocking lets the outer copies serve
+# both roles - TODO confirm).
+DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
+DGEMMINCOPY =
+DGEMMITCOPY =
+DGEMMONCOPY = gemm_ncopy_4_opteron.S
+DGEMMOTCOPY = gemm_tcopy_4_opteron.S
+DGEMMINCOPYOBJ =
+DGEMMITCOPYOBJ =
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# Single-precision complex GEMM: 4x2 kernel with generic inner (4) copies
+# and assembly outer (2) copies.
+CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMONCOPY = zgemm_ncopy_2.S
+CGEMMOTCOPY = zgemm_tcopy_2.S
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# Double-precision complex GEMM: 2x2 kernel, inner copy entries left empty.
+ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
+ZGEMMINCOPY =
+ZGEMMITCOPY =
+ZGEMMONCOPY = zgemm_ncopy_2.S
+ZGEMMOTCOPY = zgemm_tcopy_2.S
+ZGEMMINCOPYOBJ =
+ZGEMMITCOPYOBJ =
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# TRSM kernels. In every precision the RN variant is built from the LT source.
+STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
+STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
+STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
+STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
+
+DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
+DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
+DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
+DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
+
+CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
+CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
+CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
+CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
+
+ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
+ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
+ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
+ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
+
+# GEMM3M kernels for the complex types.
+CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
+ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
#define SNUMOPT 8
#define DNUMOPT 4