Init AMD Bulldozer codebase.
author Zhang Xianyi <traits.zhang@gmail.com>
Thu, 6 Dec 2012 12:29:54 +0000 (07:29 -0500)
committer Zhang Xianyi <traits.zhang@gmail.com>
Thu, 6 Dec 2012 12:29:54 +0000 (07:29 -0500)
Makefile.system
cpuid.h
cpuid_x86.c
driver/others/dynamic.c
getarch.c
kernel/x86/KERNEL.BULLDOZER [new file with mode: 0644]
kernel/x86_64/KERNEL.BULLDOZER [new file with mode: 0644]
param.h

index 27f30fa..75c0e0a 100644 (file)
@@ -277,14 +277,14 @@ ifeq ($(ARCH), x86)
 DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
               CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE 
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
 endif
 endif
 
 ifeq ($(ARCH), x86_64)
 DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE 
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
 endif
 endif
 
diff --git a/cpuid.h b/cpuid.h
index bb57ad9..c52d503 100644 (file)
--- a/cpuid.h
+++ b/cpuid.h
 #define HAVE_MISALIGNSSE (1 << 15)
 #define HAVE_128BITFPU   (1 << 16)
 #define HAVE_FASTMOVU    (1 << 17)
-#define HAVE_AVX     (1 <<  18)
+#define HAVE_AVX      (1 <<  18)
+#define HAVE_FMA4     (1 <<  19)
 
 #define CACHE_INFO_L1_I     1
 #define CACHE_INFO_L1_D     2
index 6e4eae2..afc3b17 100644 (file)
@@ -43,6 +43,8 @@
 #ifdef NO_AVX
 #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
 #define CORE_SANDYBRIDGE CORE_NEHALEM
+#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
+#define CORE_BULLDOZER CORE_BARCELONA
 #endif
 
 #ifndef CPUIDEMU
@@ -228,6 +230,9 @@ int get_cputype(int gettype){
       cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
       if ((ecx & (1 <<  6)) != 0) feature |= HAVE_SSE4A;
       if ((ecx & (1 <<  7)) != 0) feature |= HAVE_MISALIGNSSE;
+#ifndef NO_AVX
+      if ((ecx & (1 <<  16)) != 0) feature |= HAVE_FMA4;
+#endif
       if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
       if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
     }
@@ -1075,8 +1080,12 @@ int get_cpuname(void){
        return CPUTYPE_OPTERON;
       case  1:
       case 10:
-      case  6:   //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
        return CPUTYPE_BARCELONA;
+      case  6:   //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+       if(support_avx())
+         return CPUTYPE_BULLDOZER;
+       else
+         return CPUTYPE_BARCELONA; //OS doesn't support AVX.
       case  5:
        return CPUTYPE_BOBCAT;
       }
@@ -1427,8 +1436,13 @@ int get_coretype(void){
     if (family == 0xf){
       if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; 
       else if (exfamily == 5) return CORE_BOBCAT; 
-      else if (exfamily == 6) return CORE_BARCELONA;  //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
-      else return CORE_BARCELONA;
+      else if (exfamily == 6) {
+       //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+       if(support_avx())
+         return CORE_BULLDOZER;
+       else
+         return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels.
+      }else return CORE_BARCELONA;
     }
   }
 
index 5d2bc78..1c0e1d3 100644 (file)
@@ -63,9 +63,11 @@ extern gotoblas_t  gotoblas_BARCELONA;
 extern gotoblas_t  gotoblas_BOBCAT;
 #ifndef NO_AVX
 extern gotoblas_t  gotoblas_SANDYBRIDGE;
+extern gotoblas_t  gotoblas_BULLDOZER;
 #else
 //Use NEHALEM kernels for sandy bridge
 #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
+#define gotoblas_BULLDOZER gotoblas_BARCELONA
 #endif
 
 
@@ -202,6 +204,14 @@ static gotoblas_t *get_coretype(void){
        else return &gotoblas_OPTERON;
       }  else if (exfamily == 5) {
        return &gotoblas_BOBCAT;
+      } else if (exfamily == 6) {
+       //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+         if(support_avx())
+           return &gotoblas_BULLDOZER;
+         else{
+           fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n");
+           return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+         }     
       } else {
        return &gotoblas_BARCELONA;
       }
@@ -238,6 +248,7 @@ static char *corename[] = {
     "Nano",
     "Sandybridge",
     "Bobcat",
+    "Bulldozer",
 };
 
 char *gotoblas_corename(void) {
@@ -259,6 +270,7 @@ char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_NANO)         return corename[15];
   if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16];
   if (gotoblas == &gotoblas_BOBCAT)       return corename[17];
+  if (gotoblas == &gotoblas_BULLDOZER)    return corename[18];
 
   return corename[0];
 }
index 5916a9a..4daf260 100644 (file)
--- a/getarch.c
+++ b/getarch.c
@@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "OPTERON"
 #endif
 
-#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER)
+#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
 #define FORCE
 #define FORCE_INTEL
 #define ARCHITECTURE    "X86"
@@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "BOBCAT"
 #endif
 
+#if defined (FORCE_BULLDOZER) /* settings for a forced AMD Bulldozer build */
+#define FORCE
+#define FORCE_INTEL
+#define ARCHITECTURE    "X86"
+#define SUBARCHITECTURE "BULLDOZER"
+#define ARCHCONFIG   "-DBARCELONA " /* adjacent literals are concatenated: every fragment but the last needs a trailing space */ \
+                    "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
+                    "-DL2_SIZE=1024000 -DL2_LINESIZE=64  -DL3_SIZE=16777216 " \
+                    "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
+                    "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
+                    "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
+                     "-DHAVE_AVX -DHAVE_FMA4"
+#define LIBNAME   "bulldozer"
+#define CORENAME  "BULLDOZER"
+#endif /* FORCE_BULLDOZER */
+
 #ifdef FORCE_SSE_GENERIC
 #define FORCE
 #define FORCE_INTEL
diff --git a/kernel/x86/KERNEL.BULLDOZER b/kernel/x86/KERNEL.BULLDOZER
new file mode 100644 (file)
index 0000000..231350a
--- /dev/null
@@ -0,0 +1,59 @@
+SGEMMKERNEL    =  gemm_kernel_4x4_barcelona.S
+SGEMMINCOPY    =  
+SGEMMITCOPY    =  
+SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+SGEMMINCOPYOBJ =  
+SGEMMITCOPYOBJ =  
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+DGEMMKERNEL    =  gemm_kernel_2x4_barcelona.S
+DGEMMINCOPY    =  ../generic/gemm_ncopy_2.c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_2.c
+DGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+CGEMMKERNEL    =  zgemm_kernel_2x2_barcelona.S
+CGEMMINCOPY    =  
+CGEMMITCOPY    =  
+CGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY    =  ../generic/zgemm_tcopy_2.c
+CGEMMINCOPYOBJ =
+CGEMMITCOPYOBJ =  
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMKERNEL    =  zgemm_kernel_1x2_barcelona.S
+ZGEMMINCOPY    =  ../generic/zgemm_ncopy_1.c
+ZGEMMITCOPY    =  ../generic/zgemm_tcopy_1.c
+ZGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+STRSMKERNEL_LN =  trsm_kernel_LN_4x4_sse.S
+STRSMKERNEL_LT =  trsm_kernel_LT_4x4_sse.S
+STRSMKERNEL_RN =  trsm_kernel_LT_4x4_sse.S
+STRSMKERNEL_RT =  trsm_kernel_RT_4x4_sse.S
+
+DTRSMKERNEL_LN =  trsm_kernel_LN_2x4_sse2.S
+DTRSMKERNEL_LT =  trsm_kernel_LT_2x4_sse2.S
+DTRSMKERNEL_RN =  trsm_kernel_LT_2x4_sse2.S
+DTRSMKERNEL_RT =  trsm_kernel_RT_2x4_sse2.S
+
+CTRSMKERNEL_LN =  ztrsm_kernel_LN_2x2_sse.S
+CTRSMKERNEL_LT =  ztrsm_kernel_LT_2x2_sse.S
+CTRSMKERNEL_RN =  ztrsm_kernel_LT_2x2_sse.S
+CTRSMKERNEL_RT =  ztrsm_kernel_RT_2x2_sse.S
+
+ZTRSMKERNEL_LN =  ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_LT =  ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_RN =  ztrsm_kernel_LT_1x2_sse2.S
+ZTRSMKERNEL_RT =  ztrsm_kernel_RT_1x2_sse2.S
+
+CGEMM3MKERNEL    =  zgemm3m_kernel_4x4_barcelona.S
+ZGEMM3MKERNEL    =  zgemm3m_kernel_2x4_barcelona.S
diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
new file mode 100644 (file)
index 0000000..051a522
--- /dev/null
@@ -0,0 +1,62 @@
+ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVTKERNEL = zgemv_t_dup.S
+
+SGEMMKERNEL    =  gemm_kernel_8x4_barcelona.S
+SGEMMINCOPY    =  ../generic/gemm_ncopy_8.c
+SGEMMITCOPY    =  ../generic/gemm_tcopy_8.c
+SGEMMONCOPY    =  gemm_ncopy_4_opteron.S
+SGEMMOTCOPY    =  gemm_tcopy_4_opteron.S
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX) 
+SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX) 
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+DGEMMKERNEL    =  gemm_kernel_4x4_barcelona.S
+DGEMMINCOPY    =
+DGEMMITCOPY    =
+DGEMMONCOPY    =  gemm_ncopy_4_opteron.S
+DGEMMOTCOPY    =  gemm_tcopy_4_opteron.S
+DGEMMINCOPYOBJ =
+DGEMMITCOPYOBJ =
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+CGEMMKERNEL    =  zgemm_kernel_4x2_barcelona.S
+CGEMMINCOPY    =  ../generic/zgemm_ncopy_4.c
+CGEMMITCOPY    =  ../generic/zgemm_tcopy_4.c
+CGEMMONCOPY    =  zgemm_ncopy_2.S
+CGEMMOTCOPY    =  zgemm_tcopy_2.S
+CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMKERNEL    =  zgemm_kernel_2x2_barcelona.S
+ZGEMMINCOPY    =
+ZGEMMITCOPY    =
+ZGEMMONCOPY    =  zgemm_ncopy_2.S
+ZGEMMOTCOPY    =  zgemm_tcopy_2.S
+ZGEMMINCOPYOBJ =
+ZGEMMITCOPYOBJ =
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+STRSMKERNEL_LN =  trsm_kernel_LN_8x4_sse.S
+STRSMKERNEL_LT =  trsm_kernel_LT_8x4_sse.S
+STRSMKERNEL_RN =  trsm_kernel_LT_8x4_sse.S
+STRSMKERNEL_RT =  trsm_kernel_RT_8x4_sse.S
+
+DTRSMKERNEL_LN =  trsm_kernel_LN_4x4_barcelona.S
+DTRSMKERNEL_LT =  trsm_kernel_LT_4x4_barcelona.S
+DTRSMKERNEL_RN =  trsm_kernel_LT_4x4_barcelona.S
+DTRSMKERNEL_RT =  trsm_kernel_RT_4x4_barcelona.S
+
+CTRSMKERNEL_LN =  ztrsm_kernel_LN_4x2_sse.S
+CTRSMKERNEL_LT =  ztrsm_kernel_LT_4x2_sse.S
+CTRSMKERNEL_RN =  ztrsm_kernel_LT_4x2_sse.S
+CTRSMKERNEL_RT =  ztrsm_kernel_RT_4x2_sse.S
+
+ZTRSMKERNEL_LN =  ztrsm_kernel_LN_2x2_sse2.S
+ZTRSMKERNEL_LT =  ztrsm_kernel_LT_2x2_sse2.S
+ZTRSMKERNEL_RN =  ztrsm_kernel_LT_2x2_sse2.S
+ZTRSMKERNEL_RT =  ztrsm_kernel_RT_2x2_sse2.S
+
+CGEMM3MKERNEL    =  zgemm3m_kernel_8x4_barcelona.S
+ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_barcelona.S
diff --git a/param.h b/param.h
index 11c1a26..5b6a19a 100644 (file)
--- a/param.h
+++ b/param.h
@@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
-#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
+#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
 
 #define SNUMOPT                8
 #define DNUMOPT                4