DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
endif
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
endif
endif
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
+- **AMD Piledriver**: Used Bulldozer codes.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
#define MMXSTORE movd
#endif
+#if defined(PILEDRIVER) || defined(BULLDOZER)
+//Enable the Barcelona-specific optimizations for Bulldozer and Piledriver:
+//kernels test BARCELONA_OPTIMIZATION instead of listing each of these cores.
+#define BARCELONA_OPTIMIZATION
+#endif
+
#if defined(HAVE_3DNOW)
#define EMMS femms
#elif defined(HAVE_MMX)
#ifdef ASSEMBLER
+#if defined(PILEDRIVER) || defined(BULLDOZER)
+//Enable the Barcelona-specific optimizations for Bulldozer and Piledriver:
+//kernels test BARCELONA_OPTIMIZATION instead of listing each of these cores.
+#define BARCELONA_OPTIMIZATION
+#endif
+
#if defined(HAVE_3DNOW)
#define EMMS femms
#elif defined(HAVE_MMX)
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21
#define CORE_BULLDOZER 22
+#define CORE_PILEDRIVER 23
#define CORE_HASWELL CORE_SANDYBRIDGE
#define HAVE_SSE (1 << 0)
#define HAVE_FASTMOVU (1 << 17)
#define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19)
+#define HAVE_FMA3 (1 << 20)
#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
#define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46
+#define CPUTYPE_PILEDRIVER 47
// this define is because BLAS doesn't have haswell specific optimizations yet
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
#define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
#define CORE_BULLDOZER CORE_BARCELONA
+#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
+#define CORE_PILEDRIVER CORE_BARCELONA
#endif
#ifndef CPUIDEMU
#ifndef NO_AVX
if (support_avx()) feature |= HAVE_AVX;
#endif
+  /* FMA3 is reported in CPUID leaf 1, ECX bit 12; bit 20 of that register is
+     SSE4.2, so testing bit 20 would flag FMA3 on every SSE4.2-capable CPU.
+     NOTE(review): assumes ecx still holds the leaf 1 output at this point
+     (the extended-leaf query only happens below) - confirm against the
+     enclosing function. */
+  if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
if (have_excpuid() >= 0x01) {
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
case 1:
case 10:
return CPUTYPE_BARCELONA;
- case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
- if(support_avx())
- return CPUTYPE_BULLDOZER;
- else
- return CPUTYPE_BARCELONA; //OS don't support AVX.
+      case  6:
+	//AMD family 15h. The AVX kernels need OS support for AVX; without it
+	//every model falls back to the Barcelona kernels.
+	if(!support_avx())
+	  return CPUTYPE_BARCELONA; //OS doesn't support AVX.
+	switch (model) {
+	case  2:
+	  //AMD Piledriver
+	  return CPUTYPE_PILEDRIVER;
+	default:
+	  //Model 1 is AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series.
+	  //Every other model keeps the behavior this case had before the
+	  //per-model split (Bulldozer) instead of leaving the switch unmatched.
+	  return CPUTYPE_BULLDOZER;
+	}
case 5:
return CPUTYPE_BOBCAT;
}
"SANDYBRIDGE",
"BOBCAT",
"BULLDOZER",
+ "PILEDRIVER",
};
static char *lowercpuname[] = {
"sandybridge",
"bobcat",
"bulldozer",
+ "piledriver",
};
static char *corename[] = {
"SANDYBRIDGE",
"BOBCAT",
"BULLDOZER",
+ "PILEDRIVER",
};
static char *corename_lower[] = {
"sandybridge",
"bobcat",
"bulldozer",
+ "piledriver",
};
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;
else if (exfamily == 6) {
- //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
- if(support_avx())
- return CORE_BULLDOZER;
- else
- return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
+	  //AMD family 15h. The AVX kernels need OS support for AVX; without it
+	  //every model falls back to the Barcelona core.
+	  if(!support_avx())
+	    return CORE_BARCELONA; //OS doesn't support AVX.
+	  switch (model) {
+	  case 2:
+	    //AMD Piledriver
+	    return CORE_PILEDRIVER;
+	  default:
+	    //Model 1 is AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series.
+	    //Every other model keeps the behavior this branch had before the
+	    //per-model split (Bulldozer) instead of returning nothing here.
+	    return CORE_BULLDOZER;
+	  }
}else return CORE_BARCELONA;
}
}
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
+ if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
+ if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");
}
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
+extern gotoblas_t gotoblas_PILEDRIVER;
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
+#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#endif
//Use sandy bridge kernels for haswell.
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
} else if (exfamily == 6) {
- //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+ if(model == 1){
+ //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return &gotoblas_BULLDOZER;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
- }
+ }
+	}else if(model == 2){
+	  //AMD Piledriver Opteron 6300 / Opteron 4300 / Opteron 3300
+	  if(support_avx())
+	    return &gotoblas_PILEDRIVER;
+	  else{
+	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
+	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+	  }
+	}else{
+	  //Any other family 15h model: keep the behavior this branch had before
+	  //the per-model split (Bulldozer kernels, or Barcelona without AVX)
+	  //rather than leaving the branch without a kernel table.
+	  if(support_avx())
+	    return &gotoblas_BULLDOZER;
+	  else{
+	    fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
+	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
} else {
return &gotoblas_BARCELONA;
}
"Sandybridge",
"Bobcat",
"Bulldozer",
+ "Piledriver",
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
+ if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
return corename[0];
}
/* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCAT */
/* #define FORCE_BULLDOZER */
+/* #define FORCE_PILEDRIVER */
/* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */
/* #define FORCE_NANO */
#define CORENAME "BULLDOZER"
#endif
+#if defined (FORCE_PILEDRIVER)
+/* Build-time override: describe the target as an AMD Piledriver core instead
+   of relying on detection. */
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE "X86"
+#define SUBARCHITECTURE "PILEDRIVER"
+/* Adjacent string literals are concatenated verbatim, so every fragment except
+   the last must end in a space; otherwise two -D options fuse into one
+   (e.g. "-DHAVE_SSE4_2-DHAVE_SSE4A"). */
+#define ARCHCONFIG "-DPILEDRIVER " \
+ "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
+ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
+ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
+ "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
+#define LIBNAME "piledriver"
+#define CORENAME "PILEDRIVER"
+#endif
+
#ifdef FORCE_SSE_GENERIC
#define FORCE
#define FORCE_INTEL
#endif
#endif
+#ifdef PILEDRIVER
+/* Piledriver build: load the P entries of the parameter table from the
+   compile-time *GEMM_DEFAULT_P values. */
+
+#ifdef DEBUG
+ fprintf(stderr, "Piledriver\n");
+#endif
+
+ TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
+ TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
+ TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
+ TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
+#ifdef EXPRECISION
+ /* The Q and X precision entries are only set when EXPRECISION is defined. */
+ TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
+ TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
+#endif
+#endif
+
#ifdef NANO
#ifdef DEBUG
--- /dev/null
+# Kernel selection list (new file in this patch). The GEMM kernels chosen here
+# are the Barcelona ones; their shapes (4x4, 2x4, 2x2, 1x2) match the ARCH_X86
+# unroll factors of the PILEDRIVER parameter block.
+# NOTE(review): presumably the 32-bit Piledriver kernel list - confirm the path.
+SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
+SGEMMINCOPY =
+SGEMMITCOPY =
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
+DGEMMINCOPY = ../generic/gemm_ncopy_2.c
+DGEMMITCOPY = ../generic/gemm_tcopy_2.c
+DGEMMONCOPY = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
+CGEMMINCOPY =
+CGEMMITCOPY =
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMINCOPYOBJ =
+CGEMMITCOPYOBJ =
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
+ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# TRSM kernels: in each group the RN variant is built from the LT source file.
+STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
+STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
+STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
+STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
+
+DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
+DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
+DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
+DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
+
+CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
+CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
+CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
+CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
+
+# NOTE(review): unlike the three groups above, ZTRSM also takes its LN variant
+# from the LT source - confirm this is intentional.
+ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
+
+CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
+ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
.L42:
mulpd %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
.L72:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
ALIGN_4
.L62:
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
.L52:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
.L112:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
.L102:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
.L92:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
.L42:
mulpd %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
.L52:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
ALIGN_4
.L62:
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
.L72:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
.L92:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
.L102:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
.L112:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
.L42:
mulpd %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
.L92:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
.L102:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
.L112:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
.L52:
mulps %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
ALIGN_4
.L62:
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
.L72:
mulss %xmm0, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4
--- /dev/null
+# Kernel selection list (new file in this patch). The GEMM kernels chosen here
+# are the Bulldozer ones; their shapes (16x2, 8x2, 4x2, 2x2) match the
+# non-ARCH_X86 unroll factors of the PILEDRIVER parameter block.
+# NOTE(review): presumably the x86_64 Piledriver kernel list - confirm the path.
+ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVTKERNEL = zgemv_t_dup.S
+
+# Double-precision level 1/2 routines reuse the Bulldozer sources.
+DGEMVNKERNEL = dgemv_n_bulldozer.S
+DGEMVTKERNEL = dgemv_t_bulldozer.S
+DAXPYKERNEL = daxpy_bulldozer.S
+DDOTKERNEL = ddot_bulldozer.S
+DCOPYKERNEL = dcopy_bulldozer.S
+
+SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
+SGEMMINCOPY = ../generic/gemm_ncopy_16.c
+SGEMMITCOPY = ../generic/gemm_tcopy_16.c
+SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
+SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
+DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
+DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
+DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
+DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
+CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
+CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
+ZGEMMINCOPY =
+ZGEMMITCOPY =
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPYOBJ =
+ZGEMMITCOPYOBJ =
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# The 3M kernels come from the Barcelona sources, not the Bulldozer ones.
+CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
+ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
+
+# All four precisions build their TRSM kernels from the same generic C sources.
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
#define movsd movlps
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define movsd movlps
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define a3 %xmm14
#define xt1 %xmm15
-#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
+#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define a3 %xmm14
#define xt1 %xmm15
-#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
+#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define xt1 %xmm14
#define xt2 %xmm15
-#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
+#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else
#define movsd movlpd
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)
#define a3 %xmm14
#define xt1 %xmm15
-#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
+#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else
#define movsd movlps
#endif
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define ALIGNED_ACCESS
#define MOVUPS_A movaps
#define MOVUPS_XL movaps
#endif
+#ifdef PILEDRIVER
+/* Compile-time tuning parameters used when the build target is PILEDRIVER. */
+
+#define SNUMOPT 8
+#define DNUMOPT 4
+
+#define GEMM_DEFAULT_OFFSET_A 64
+#define GEMM_DEFAULT_OFFSET_B 832
+#define GEMM_DEFAULT_ALIGN 0x0fffUL
+
+
+
+/* Unroll factors in N that are the same for both architectures. */
+#define QGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
+#define XGEMM_DEFAULT_UNROLL_N 1
+
+/* Remaining unroll factors differ between ARCH_X86 and everything else. */
+#ifdef ARCH_X86
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 2
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#define XGEMM_DEFAULT_UNROLL_M 1
+#else
+#define SGEMM_DEFAULT_UNROLL_N 2
+#define DGEMM_DEFAULT_UNROLL_N 2
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define QGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_M 2
+#define XGEMM_DEFAULT_UNROLL_M 1
+/* The 3M unroll factors and GEMV_UNROLL are defined in this branch only.
+   NOTE(review): the ARCH_X86 branch presumably gets them from defaults
+   defined elsewhere - confirm. */
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
+#define ZGEMM3M_DEFAULT_UNROLL_N 4
+#define ZGEMM3M_DEFAULT_UNROLL_M 4
+#define GEMV_UNROLL 8
+#endif
+
+
+/* P values: only the S and D values depend on ARCH_X86_64. */
+#if defined(ARCH_X86_64)
+#define SGEMM_DEFAULT_P 768
+#define DGEMM_DEFAULT_P 384
+#else
+#define SGEMM_DEFAULT_P 448
+#define DGEMM_DEFAULT_P 224
+#endif
+#define QGEMM_DEFAULT_P 112
+#define CGEMM_DEFAULT_P 224
+#define ZGEMM_DEFAULT_P 112
+#define XGEMM_DEFAULT_P 56
+
+/* Q values: only the S and D values depend on ARCH_X86_64. */
+#if defined(ARCH_X86_64)
+#define SGEMM_DEFAULT_Q 168
+#define DGEMM_DEFAULT_Q 168
+#else
+#define SGEMM_DEFAULT_Q 224
+#define DGEMM_DEFAULT_Q 224
+#endif
+#define QGEMM_DEFAULT_Q 224
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 224
+#define XGEMM_DEFAULT_Q 224
+
+/* R values are not constants here: each expands to an identifier
+   (sgemm_r, dgemm_r, ...) that is declared elsewhere. */
+#define SGEMM_DEFAULT_R sgemm_r
+#define QGEMM_DEFAULT_R qgemm_r
+#define DGEMM_DEFAULT_R dgemm_r
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
+#define XGEMM_DEFAULT_R xgemm_r
+
+#define SYMV_P 16
+#define HAVE_EXCLUSIVE_CACHE
+
+#define GEMM_THREAD gemm_thread_mn
+
+#endif
+
#ifdef ATHLON
#define SNUMOPT 4