DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif
#define MMXSTORE movd
#endif
+#if defined(SANDYBRIDGE) || defined(HASWELL)
+// Enable the Nehalem-tuned code paths (e.g. kernel prefetch settings) on Sandy Bridge and Haswell.
+#define NEHALEM_OPTIMIZATION
+#endif
+
#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimazation for barcelona.
#define BARCELONA_OPTIMIZATION
#ifdef ASSEMBLER
+#if defined(SANDYBRIDGE) || defined(HASWELL)
+// Enable the Nehalem-tuned code paths (e.g. kernel prefetch settings) on Sandy Bridge and Haswell.
+#define NEHALEM_OPTIMIZATION
+#endif
+
+
#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimazation for barcelona.
#define BARCELONA_OPTIMIZATION
#define CORE_BOBCAT 21
#define CORE_BULLDOZER 22
#define CORE_PILEDRIVER 23
-#define CORE_HASWELL CORE_SANDYBRIDGE
+#define CORE_HASWELL 24
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46
#define CPUTYPE_PILEDRIVER 47
-// this define is because BLAS doesn't have haswell specific optimizations yet
-#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
+#define CPUTYPE_HASWELL 48
#endif
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
+ "HASWELL",
};
static char *lowercpuname[] = {
"bobcat",
"bulldozer",
"piledriver",
+ "haswell",
};
static char *corename[] = {
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
+ "HASWELL",
};
static char *corename_lower[] = {
"bobcat",
"bulldozer",
"piledriver",
+ "haswell",
};
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
+extern gotoblas_t gotoblas_HASWELL;
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
+#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#endif
-//Use sandy bridge kernels for haswell.
-#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
+
#define VENDOR_INTEL 1
#define VENDOR_AMD 2
"Bobcat",
"Bulldozer",
"Piledriver",
+ "Haswell",
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
- if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
+ if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
+ if (gotoblas == &gotoblas_HASWELL) return corename[20];
return corename[0];
}
#define CORENAME "SANDYBRIDGE"
#endif
+#ifdef FORCE_HASWELL // definitions applied when FORCE_HASWELL is set
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE "X86"
+#define SUBARCHITECTURE "HASWELL"
+#define ARCHCONFIG "-DHASWELL " \
+ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
+ "-DFMA3" // the adjacent literals concatenate into one string of -D flags: target, L1/L2 sizes, DTB, ISA features
+#define LIBNAME "haswell"
+#define CORENAME "HASWELL"
+#endif // FORCE_HASWELL
+
#ifdef FORCE_ATOM
#define FORCE
#define FORCE_INTEL
--- /dev/null
+include $(KERNELDIR)/KERNEL.PENRYN
#define PREFETCHSIZE (8 * 21 + 4)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#define PREFETCHSIZE (8 * 21 + 4)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#define PREFETCHSIZE (8 * 21 + 4)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#define PREFETCHSIZE (8 * 21 + 4)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#define PREFETCHSIZE (8 * 21 + 4)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#define PREFETCHSIZE (8 * 21 + 4)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#define PREFETCHSIZE 84
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif
#define PREFETCHSIZE 84
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif
#define PREFETCHSIZE 84
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif
#define PREFETCHSIZE 84
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif
#define PREFETCHSIZE 84
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif
--- /dev/null
+SGEMMKERNEL = sgemm_kernel_8x8_sandy.S
+SGEMMINCOPY =
+SGEMMITCOPY =
+SGEMMONCOPY = ../generic/gemm_ncopy_8.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
+DGEMMINCOPY = ../generic/gemm_ncopy_8.c
+DGEMMITCOPY = ../generic/gemm_tcopy_8.c
+#DGEMMONCOPY = gemm_ncopy_4.S
+DGEMMONCOPY = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+#DGEMMOTCOPY = gemm_tcopy_4.S
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S
+CGEMMKERNEL = cgemm_kernel_4x8_sandy.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S
+ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S
+ZGEMMINCOPY =
+ZGEMMITCOPY =
+ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+ZGEMMINCOPYOBJ =
+ZGEMMITCOPYOBJ =
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# TRSM: the Nehalem assembly kernels in the commented-out lines below are disabled; the generic C kernels assigned after them are the ones in effect.
+#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S
+#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
+#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
+#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
+
+#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
+#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
+#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
+#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
+
+#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
+#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
+#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S
+#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S
+
+#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S
+#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S
+#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S
+#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+# GEMM3M: both kernels below are taken from the *_nehalem.S assembly sources.
+
+CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S
+ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
#define PREFETCHSIZE (16 * 12)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
#define PREFETCHSIZE (16 * 12)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
#define PREFETCHSIZE (16 * 12)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 12)
#define PREFETCHSIZE (16 * 12)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
#define PREFETCHSIZE (16 * 24)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
#define PREFETCHSIZE (16 * 24)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
#define PREFETCHSIZE (16 * 24)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
#define PREFETCHSIZE (16 * 24)
#endif
-#if defined(NEHALEM) || defined(SANDYBRIDGE)
+#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE (16 * 24)
#endif
+#ifdef HASWELL // tuning macros used when HASWELL is defined
+
+#define SNUMOPT 8
+#define DNUMOPT 4
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SYMV_P 8
+
+#define SWITCH_RATIO 4
+
+#ifdef ARCH_X86 // UNROLL_M/UNROLL_N values used when ARCH_X86 is defined
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+#else // ARCH_X86 not defined (presumably the x86_64 build -- TODO confirm)
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define XGEMM_DEFAULT_UNROLL_M 1
+
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#define XGEMM_DEFAULT_UNROLL_N 1
+#endif // ARCH_X86
+
+#define SGEMM_DEFAULT_P 512
+#define SGEMM_DEFAULT_R sgemm_r
+//#define SGEMM_DEFAULT_R 1024
+
+#define DGEMM_DEFAULT_P 512
+#define DGEMM_DEFAULT_R dgemm_r
+//#define DGEMM_DEFAULT_R 1024
+
+#define QGEMM_DEFAULT_P 504
+#define QGEMM_DEFAULT_R qgemm_r
+
+#define CGEMM_DEFAULT_P 128
+//#define CGEMM_DEFAULT_R cgemm_r
+#define CGEMM_DEFAULT_R 1024 // fixed constant here; the other types' R macros expand to a <type>gemm_r identifier
+
+#define ZGEMM_DEFAULT_P 512
+#define ZGEMM_DEFAULT_R zgemm_r
+//#define ZGEMM_DEFAULT_R 1024
+
+#define XGEMM_DEFAULT_P 252
+#define XGEMM_DEFAULT_R xgemm_r
+
+#define SGEMM_DEFAULT_Q 256
+#define DGEMM_DEFAULT_Q 256
+#define QGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 256
+#define ZGEMM_DEFAULT_Q 192
+#define XGEMM_DEFAULT_Q 128
+
+#define GETRF_FACTOR 0.72
+
+#endif // HASWELL
+
#ifdef ATOM