Simplifying ARMv8 build parameters
author Renato Golin <rengolin@systemcall.eu>
Fri, 16 Nov 2018 15:45:12 +0000 (15:45 +0000)
committer Renato Golin <rengolin@systemcall.eu>
Mon, 19 Nov 2018 16:41:49 +0000 (16:41 +0000)
ARMv8 builds were a bit mixed up, with ThunderX2 code in ARMv8 mode
(which is not right because TX2 is ARMv8.1) as well as requiring a few
redundancies in the defines, making it harder to maintain and understand
what core has what. A few other minor issues were also fixed.

Tests were made on the following cores: A53, A57, A72, Falkor, ThunderX,
ThunderX2, and XGene.

Tests were: OpenBLAS/test, OpenBLAS/benchmark, BLAS-Tester.

A summary:
 * Removed TX2 code from ARMv8 build, to make sure it is compatible with
   all ARMv8 cores, not just v8.1. Also, the TX2 code has actually
   harmed performance on big cores.
 * Consolidated the ARMv8 architectures' defines in param.h, to make sure
   that all cores benefit from ARMv8 settings, in addition to their own.
 * Adding a few more cores, using ARMv8's include strategy, to benefit
   from compiler optimisations using mtune. Also updated cache
   information from the manuals, making sure we set good conservative
   values by default. Removed Vulcan, as it's an alias to TX2.
 * Auto-detecting most of those cores, but also updating the forced
   compilation in getarch.c, to make sure the parameters are the same
   whether compiled natively or forced arch.

Benefits:
 * ARMv8 build is now guaranteed to work on all ARMv8 cores
 * Improved performance for ARMv8 builds on some cores (A72, Falkor,
   ThunderX1 and 2: up to 11%) over current develop
 * Improved performance for *all* cores compared to the develop branch
   before TX2's patch (9% ~ 36%)
 * ThunderX1 builds are 14% faster than ARMv8 on TX1, 9% faster than
   current develop's branch and 8% faster than develop before tx2 patches

Issues:
 * Regression from current develop branch for A53 (-12%) and A57 (-3%)
   with ARMv8 builds, but still faster than before TX2's commit (+15%
   and +24% respectively). This can be improved with a simplification of
   TX2's code, to be done in future patches. At least the code is
   guaranteed to be ARMv8.0 now.

Comments:
 * CortexA57 builds are unchanged on A57 hardware from develop's branch,
   which makes sense, as it's untouched.
 * CortexA72 builds improve over A57 on A72 hardware, even if they're
   using the same includes, due to new compiler tuning in the makefile.

Makefile.arm64
TargetList.txt
cpuid_arm64.c
getarch.c
kernel/arm64/KERNEL.ARMV8
kernel/arm64/KERNEL.CORTEXA53 [new file with mode: 0644]
kernel/arm64/KERNEL.CORTEXA72 [new file with mode: 0644]
kernel/arm64/KERNEL.CORTEXA73 [new file with mode: 0644]
kernel/arm64/KERNEL.FALKOR [new file with mode: 0644]
kernel/arm64/KERNEL.VULCAN [deleted file]
param.h

index d19e796..a529fab 100644 (file)
@@ -4,22 +4,37 @@ CCOMMON_OPT += -march=armv8-a
 FCOMMON_OPT += -march=armv8-a
 endif
 
+ifeq ($(CORE), CORTEXA53)
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
+endif
+
 ifeq ($(CORE), CORTEXA57)
-CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
-FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
+endif
+
+ifeq ($(CORE), CORTEXA72)
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
 endif
 
-ifeq ($(CORE), VULCAN)
-CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
-FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
+ifeq ($(CORE), CORTEXA73)
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
 endif
 
 ifeq ($(CORE), THUNDERX)
-CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
-FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
+CCOMMON_OPT += -march=armv8-a -mtune=thunderx
+FCOMMON_OPT += -march=armv8-a -mtune=thunderx
+endif
+
+ifeq ($(CORE), FALKOR)
+CCOMMON_OPT += -march=armv8.1-a -mtune=falkor
+FCOMMON_OPT += -march=armv8.1-a -mtune=falkor
 endif
 
 ifeq ($(CORE), THUNDERX2T99)
-CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
-FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
+CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
+FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
 endif
index 31e4881..3d04a57 100644 (file)
@@ -83,8 +83,11 @@ ARMV5
 
 8.ARM 64-bit CPU:
 ARMV8
+CORTEXA53
 CORTEXA57
-VULCAN
+CORTEXA72
+CORTEXA73
+FALKOR
 THUNDERX
 THUNDERX2T99
 
index 3acb395..c914fbc 100644 (file)
 
 #define CPU_UNKNOWN            0
 #define CPU_ARMV8              1
-#define CPU_CORTEXA57          2
-#define CPU_VULCAN             3
-#define CPU_THUNDERX           4
-#define CPU_THUNDERX2T99       5
+// Arm
+#define CPU_CORTEXA53     2
+#define CPU_CORTEXA57     3
+#define CPU_CORTEXA72     4
+#define CPU_CORTEXA73     5
+// Qualcomm
+#define CPU_FALKOR        6
+// Cavium
+#define CPU_THUNDERX      7
+#define CPU_THUNDERX2T99  8
 
 static char *cpuname[] = {
   "UNKNOWN",
   "ARMV8" ,
+  "CORTEXA53",
   "CORTEXA57",
-  "VULCAN",
+  "CORTEXA72",
+  "CORTEXA73",
+  "FALKOR",
   "THUNDERX",
   "THUNDERX2T99"
 };
 
 static char *cpuname_lower[] = {
   "unknown",
-  "armv8" ,
+  "armv8",
+  "cortexa53",
   "cortexa57",
-  "vulcan",
+  "cortexa72",
+  "cortexa73",
+  "falkor",
   "thunderx",
   "thunderx2t99"
 };
@@ -114,14 +126,24 @@ int detect(void)
 
        fclose(infile);
        if(cpu_part != NULL && cpu_implementer != NULL) {
-               if (strstr(cpu_implementer, "0x41") && 
-               (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08")))
-                       return CPU_CORTEXA57; //or compatible, ex. A72
-               else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
-                       return CPU_VULCAN;
-               else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
+    // Arm
+    if (strstr(cpu_implementer, "0x41")) {
+      if (strstr(cpu_part, "0xd03"))
+        return CPU_CORTEXA53;
+      else if (strstr(cpu_part, "0xd07"))
+        return CPU_CORTEXA57;
+      else if (strstr(cpu_part, "0xd08"))
+        return CPU_CORTEXA72;
+      else if (strstr(cpu_part, "0xd09"))
+        return CPU_CORTEXA73;
+    }
+    // Qualcomm
+    else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
+      return CPU_FALKOR;
+    // Cavium
+    else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
                        return CPU_THUNDERX;
-               else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
+    else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
                        return CPU_THUNDERX2T99;
        }
 
@@ -180,62 +202,62 @@ void get_subdirname(void)
 void get_cpuconfig(void)
 {
 
+  // All arches should define ARMv8
+  printf("#define ARMV8\n");
+  printf("#define HAVE_NEON\n"); // This shouldn't be necessary
+  printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
+
        int d = detect();
        switch (d)
        {
 
+    case CPU_CORTEXA53:
+      printf("#define %s\n", cpuname[d]);
+      // Fall-through
                case CPU_ARMV8:
-                       printf("#define ARMV8\n");
-                       printf("#define L1_DATA_SIZE 32768\n");
-                       printf("#define L1_DATA_LINESIZE 64\n");
-                       printf("#define L2_SIZE 262144\n");
-                       printf("#define L2_LINESIZE 64\n");
-                       printf("#define DTB_DEFAULT_ENTRIES 64\n");
-                       printf("#define DTB_SIZE 4096\n");
-                       printf("#define L2_ASSOCIATIVE 4\n");
-                       break;
-
-               case CPU_VULCAN:
-                       printf("#define VULCAN                        \n");
-                       printf("#define HAVE_VFP                      \n");
-                       printf("#define HAVE_VFPV3                    \n");
-                       printf("#define HAVE_NEON                     \n");
-                       printf("#define HAVE_VFPV4                    \n");
-                       printf("#define L1_CODE_SIZE         32768    \n");
-                       printf("#define L1_CODE_LINESIZE     64       \n");
-                       printf("#define L1_CODE_ASSOCIATIVE  8        \n");
-                       printf("#define L1_DATA_SIZE         32768    \n");
-                       printf("#define L1_DATA_LINESIZE     64       \n");
-                       printf("#define L1_DATA_ASSOCIATIVE  8        \n");
-                       printf("#define L2_SIZE              262144   \n");
-                       printf("#define L2_LINESIZE          64       \n");
-                       printf("#define L2_ASSOCIATIVE       8        \n");
-                       printf("#define L3_SIZE              33554432 \n");
-                       printf("#define L3_LINESIZE          64       \n");
-                       printf("#define L3_ASSOCIATIVE       32       \n");
-                       printf("#define DTB_DEFAULT_ENTRIES  64       \n");
-                       printf("#define DTB_SIZE             4096     \n");
+      // Minimum parameters for ARMv8 (based on A53)
+       printf("#define L1_DATA_SIZE 32768\n");
+       printf("#define L1_DATA_LINESIZE 64\n");
+       printf("#define L2_SIZE 262144\n");
+       printf("#define L2_LINESIZE 64\n");
+       printf("#define DTB_DEFAULT_ENTRIES 64\n");
+       printf("#define DTB_SIZE 4096\n");
+       printf("#define L2_ASSOCIATIVE 4\n");
                        break;
 
                case CPU_CORTEXA57:
-                       printf("#define CORTEXA57\n");
-                       printf("#define HAVE_VFP\n");
-                       printf("#define HAVE_VFPV3\n");
-                       printf("#define HAVE_NEON\n");
-                       printf("#define HAVE_VFPV4\n");
+               case CPU_CORTEXA72:
+               case CPU_CORTEXA73:
+      // Common minimum settings for these Arm cores
+      // Can change a lot, but we need to be conservative
+      // TODO: detect info from /sys if possible
+      printf("#define %s\n", cpuname[d]);
                        printf("#define L1_CODE_SIZE 49152\n");
                        printf("#define L1_CODE_LINESIZE 64\n");
                        printf("#define L1_CODE_ASSOCIATIVE 3\n");
                        printf("#define L1_DATA_SIZE 32768\n");
                        printf("#define L1_DATA_LINESIZE 64\n");
                        printf("#define L1_DATA_ASSOCIATIVE 2\n");
-                       printf("#define L2_SIZE 2097152\n");
+      printf("#define L2_SIZE 524288\n");
                        printf("#define L2_LINESIZE 64\n");
                        printf("#define L2_ASSOCIATIVE 16\n");
                        printf("#define DTB_DEFAULT_ENTRIES 64\n");
                        printf("#define DTB_SIZE 4096\n");
                        break;
 
+    case CPU_FALKOR:
+      printf("#define FALKOR\n");
+      printf("#define L1_CODE_SIZE 65536\n");
+      printf("#define L1_CODE_LINESIZE 64\n");
+      printf("#define L1_DATA_SIZE 32768\n");
+      printf("#define L1_DATA_LINESIZE 128\n");
+      printf("#define L2_SIZE 524288\n");
+      printf("#define L2_LINESIZE 64\n");
+      printf("#define DTB_DEFAULT_ENTRIES 64\n");
+      printf("#define DTB_SIZE 4096\n");
+      printf("#define L2_ASSOCIATIVE 16\n");
+      break;
+
                case CPU_THUNDERX:
                        printf("#define THUNDERX\n");
                        printf("#define L1_DATA_SIZE 32768\n");
@@ -249,10 +271,6 @@ void get_cpuconfig(void)
 
                case CPU_THUNDERX2T99:
                        printf("#define VULCAN                        \n");
-                       printf("#define HAVE_VFP                      \n");
-                       printf("#define HAVE_VFPV3                    \n");
-                       printf("#define HAVE_NEON                     \n");
-                       printf("#define HAVE_VFPV4                    \n");
                        printf("#define L1_CODE_SIZE         32768    \n");
                        printf("#define L1_CODE_LINESIZE     64       \n");
                        printf("#define L1_CODE_ASSOCIATIVE  8        \n");
index 31f41d6..146f1f3 100644 (file)
--- a/getarch.c
+++ b/getarch.c
@@ -927,11 +927,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ARCHCONFIG   "-DARMV8 " \
        "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
        "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
-       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " 
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
 #define LIBNAME   "armv8"
 #define CORENAME  "ARMV8"
 #endif
 
+#ifdef FORCE_CORTEXA53
+#define FORCE
+#define ARCHITECTURE    "ARM64"
+#define SUBARCHITECTURE "CORTEXA53"
+#define SUBDIRNAME      "arm64"
+#define ARCHCONFIG   "-DCORTEXA53 " \
+       "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME   "cortexa53"
+#define CORENAME  "CORTEXA53"
+#else
+#endif
+
 #ifdef FORCE_CORTEXA57
 #define FORCE
 #define ARCHITECTURE    "ARM64"
@@ -942,26 +959,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
        "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
        "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
-       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
 #define LIBNAME   "cortexa57"
 #define CORENAME  "CORTEXA57"
 #else
 #endif
 
-#ifdef FORCE_VULCAN
+#ifdef FORCE_CORTEXA72
 #define FORCE
 #define ARCHITECTURE    "ARM64"
-#define SUBARCHITECTURE "VULCAN"
+#define SUBARCHITECTURE "CORTEXA72"
 #define SUBDIRNAME      "arm64"
-#define ARCHCONFIG   "-DVULCAN " \
-       "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
-       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
-       "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
-       "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
+#define ARCHCONFIG   "-DCORTEXA72 " \
+       "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
+       "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME   "cortexa72"
+#define CORENAME  "CORTEXA72"
+#else
+#endif
+
+#ifdef FORCE_CORTEXA73
+#define FORCE
+#define ARCHITECTURE    "ARM64"
+#define SUBARCHITECTURE "CORTEXA73"
+#define SUBDIRNAME      "arm64"
+#define ARCHCONFIG   "-DCORTEXA73 " \
+       "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
+       "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME   "cortexa73"
+#define CORENAME  "CORTEXA73"
+#else
+#endif
+
+#ifdef FORCE_FALKOR
+#define FORCE
+#define ARCHITECTURE    "ARM64"
+#define SUBARCHITECTURE "FALKOR"
+#define SUBDIRNAME      "arm64"
+#define ARCHCONFIG   "-DFALKOR " \
+       "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
+       "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
        "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
-       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
-#define LIBNAME   "vulcan"
-#define CORENAME  "VULCAN"
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME   "falkor"
+#define CORENAME  "FALKOR"
 #else
 #endif
 
@@ -973,13 +1021,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ARCHCONFIG   "-DTHUNDERX " \
        "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
        "-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
-       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
 #define LIBNAME   "thunderx"
 #define CORENAME  "THUNDERX"
 #else
 #endif
 
 #ifdef FORCE_THUNDERX2T99
+#define ARMV8
 #define FORCE
 #define ARCHITECTURE    "ARM64"
 #define SUBARCHITECTURE "THUNDERX2T99"
@@ -990,7 +1040,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
        "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
        "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
-       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
 #define LIBNAME   "thunderx2t99"
 #define CORENAME  "THUNDERX2T99"
 #else
index bcecd00..5c70390 100644 (file)
@@ -46,7 +46,7 @@ CAMAXKERNEL  = zamax.S
 ZAMAXKERNEL  = zamax.S
 
 SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = daxpy_thunderx2t99.S
+DAXPYKERNEL  = axpy.S
 CAXPYKERNEL  = zaxpy.S
 ZAXPYKERNEL  = zaxpy.S
 
@@ -71,39 +71,37 @@ CGEMVTKERNEL = zgemv_t.S
 ZGEMVTKERNEL = zgemv_t.S
 
 
-SASUMKERNEL    = sasum_thunderx2t99.c
-DASUMKERNEL    = dasum_thunderx2t99.c
-CASUMKERNEL    = casum_thunderx2t99.c
-ZASUMKERNEL    = zasum_thunderx2t99.c
+SASUMKERNEL    = asum.S
+DASUMKERNEL    = asum.S
+CASUMKERNEL    = casum.S
+ZASUMKERNEL    = zasum.S
 
-SCOPYKERNEL    = copy_thunderx2t99.c
-DCOPYKERNEL    = copy_thunderx2t99.c
-CCOPYKERNEL    = copy_thunderx2t99.c
-ZCOPYKERNEL    = copy_thunderx2t99.c
+SCOPYKERNEL    = copy.S
+DCOPYKERNEL    = copy.S
+CCOPYKERNEL    = copy.S
+ZCOPYKERNEL    = copy.S
 
-SSWAPKERNEL    = swap_thunderx2t99.S
-DSWAPKERNEL    = swap_thunderx2t99.S
-CSWAPKERNEL    = swap_thunderx2t99.S
-ZSWAPKERNEL    = swap_thunderx2t99.S
+SSWAPKERNEL    = swap.S
+DSWAPKERNEL    = swap.S
+CSWAPKERNEL    = swap.S
+ZSWAPKERNEL    = swap.S
 
-ISAMAXKERNEL   = iamax_thunderx2t99.c
-IDAMAXKERNEL   = iamax_thunderx2t99.c
-ICAMAXKERNEL   = izamax_thunderx2t99.c
-IZAMAXKERNEL   = izamax_thunderx2t99.c
+ISAMAXKERNEL   = iamax.S
+IDAMAXKERNEL   = iamax.S
+ICAMAXKERNEL   = izamax.S
+IZAMAXKERNEL   = izamax.S
 
 ifneq ($(OS_DARWIN)$(CROSS),11)
-SNRM2KERNEL    = scnrm2_thunderx2t99.c
-CNRM2KERNEL    = scnrm2_thunderx2t99.c
-#DNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-#ZNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-DNRM2KERNEL    = dznrm2_thunderx2t99.c
-ZNRM2KERNEL    = dznrm2_thunderx2t99.c
+SNRM2KERNEL    = nrm2.S
+CNRM2KERNEL    = nrm2.S
+DNRM2KERNEL    = znrm2.S
+ZNRM2KERNEL    = znrm2.S
 endif
 
-DDOTKERNEL     = dot_thunderx2t99.c
-SDOTKERNEL     = dot_thunderx2t99.c
-CDOTKERNEL     = zdot_thunderx2t99.c
-ZDOTKERNEL     = zdot_thunderx2t99.c
+DDOTKERNEL     = dot.S
+SDOTKERNEL     = dot.S
+CDOTKERNEL     = zdot.S
+ZDOTKERNEL     = zdot.S
 DSDOTKERNEL    = dot.S
 
 ifneq ($(OS_DARWIN)$(CROSS),11)
@@ -175,22 +173,6 @@ ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
-DGEMMKERNEL    = dgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
-SGEMMKERNEL    =  sgemm_kernel_16x4_thunderx2t99.S
-endif
-
-ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
-CGEMMKERNEL    =  cgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
-ZGEMMKERNEL    =  zgemm_kernel_4x4_thunderx2t99.S
-endif
-
 else
 
 STRMMKERNEL    = ../generic/trmmkernel_2x2.c
diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53
new file mode 100644 (file)
index 0000000..c1d33fa
--- /dev/null
@@ -0,0 +1,3 @@
+include $(KERNELDIR)/KERNEL.ARMV8
+
+
diff --git a/kernel/arm64/KERNEL.CORTEXA72 b/kernel/arm64/KERNEL.CORTEXA72
new file mode 100644 (file)
index 0000000..007b2ce
--- /dev/null
@@ -0,0 +1,3 @@
+include $(KERNELDIR)/KERNEL.CORTEXA57
+
+
diff --git a/kernel/arm64/KERNEL.CORTEXA73 b/kernel/arm64/KERNEL.CORTEXA73
new file mode 100644 (file)
index 0000000..007b2ce
--- /dev/null
@@ -0,0 +1,3 @@
+include $(KERNELDIR)/KERNEL.CORTEXA57
+
+
diff --git a/kernel/arm64/KERNEL.FALKOR b/kernel/arm64/KERNEL.FALKOR
new file mode 100644 (file)
index 0000000..007b2ce
--- /dev/null
@@ -0,0 +1,3 @@
+include $(KERNELDIR)/KERNEL.CORTEXA57
+
+
diff --git a/kernel/arm64/KERNEL.VULCAN b/kernel/arm64/KERNEL.VULCAN
deleted file mode 100644 (file)
index 8b02739..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-include $(KERNELDIR)/KERNEL.THUNDERX2T99
-
-
diff --git a/param.h b/param.h
index d1b2115..8f56cda 100644 (file)
--- a/param.h
+++ b/param.h
@@ -2543,8 +2543,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
+// Common ARMv8 parameters
+#if defined(ARMV8)
 
-#if defined(CORTEXA57)
 #define SNUMOPT                2
 #define DNUMOPT                2
 
@@ -2552,46 +2553,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M  16
-#define SGEMM_DEFAULT_UNROLL_N  4
-
-#define DGEMM_DEFAULT_UNROLL_M  8
-#define DGEMM_DEFAULT_UNROLL_N  4
-
-#define CGEMM_DEFAULT_UNROLL_M  8
-#define CGEMM_DEFAULT_UNROLL_N  4
-
-#define ZGEMM_DEFAULT_UNROLL_M  4
-#define ZGEMM_DEFAULT_UNROLL_N  4
-
-#define SGEMM_DEFAULT_P        512
-#define DGEMM_DEFAULT_P        256
-#define CGEMM_DEFAULT_P 256
-#define ZGEMM_DEFAULT_P 128
-
-#define SGEMM_DEFAULT_Q 1024
-#define DGEMM_DEFAULT_Q 512
-#define CGEMM_DEFAULT_Q 512
-#define ZGEMM_DEFAULT_Q 512
-
-#define SGEMM_DEFAULT_R 4096
-#define DGEMM_DEFAULT_R 4096
-#define CGEMM_DEFAULT_R 4096
-#define ZGEMM_DEFAULT_R 2048
-
-
 #define SYMV_P 16
-#endif
-
-#if defined(ARMV8)
 
+// Darwin / Cross
 #if defined(OS_DARWIN) && defined(CROSS)
-#define SNUMOPT                2
-#define DNUMOPT                2
-
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
 
 #define SGEMM_DEFAULT_UNROLL_M  2
 #define SGEMM_DEFAULT_UNROLL_N  2
@@ -2620,15 +2585,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#define SYMV_P 16
-#else
+#else // Linux / Native
 
-#define SNUMOPT                2
-#define DNUMOPT                2
-
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#if defined(CORTEXA53) || defined(CORTEXA57) || \
+    defined(CORTEXA72) || defined(CORTEXA73) || \
+    defined(FALKOR)
 
 #define SGEMM_DEFAULT_UNROLL_M  16
 #define SGEMM_DEFAULT_UNROLL_N  4
@@ -2642,33 +2603,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M  4
 #define ZGEMM_DEFAULT_UNROLL_N  4
 
-#define SGEMM_DEFAULT_P        128
-#define DGEMM_DEFAULT_P        160
-#define CGEMM_DEFAULT_P 128
+#define SGEMM_DEFAULT_P        512
+#define DGEMM_DEFAULT_P        256
+#define CGEMM_DEFAULT_P 256
 #define ZGEMM_DEFAULT_P 128
 
-#define SGEMM_DEFAULT_Q 352
-#define DGEMM_DEFAULT_Q 128
-#define CGEMM_DEFAULT_Q 224
-#define ZGEMM_DEFAULT_Q 112
+#define SGEMM_DEFAULT_Q 1024
+#define DGEMM_DEFAULT_Q 512
+#define CGEMM_DEFAULT_Q 512
+#define ZGEMM_DEFAULT_Q 512
 
 #define SGEMM_DEFAULT_R 4096
 #define DGEMM_DEFAULT_R 4096
 #define CGEMM_DEFAULT_R 4096
-#define ZGEMM_DEFAULT_R 4096
-
-#define SYMV_P 16
-#endif
-
-#endif
-
-#if defined(THUNDERX)
-#define SNUMOPT                2
-#define DNUMOPT                2
+#define ZGEMM_DEFAULT_R 2048
 
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#elif defined(THUNDERX)
 
 #define SGEMM_DEFAULT_UNROLL_M  4
 #define SGEMM_DEFAULT_UNROLL_N  4
@@ -2697,17 +2647,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
+#elif defined(THUNDERX2T99)
 
-#define SYMV_P 16
-#endif
+#define SGEMM_DEFAULT_UNROLL_M  16
+#define SGEMM_DEFAULT_UNROLL_N  4
 
-#if defined(THUNDERX2T99) || defined(VULCAN)
-#define SNUMOPT                2
-#define DNUMOPT                2
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  4
 
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  4
+
+#define ZGEMM_DEFAULT_UNROLL_M  4
+#define ZGEMM_DEFAULT_UNROLL_N  4
+
+#define SGEMM_DEFAULT_P        128
+#define DGEMM_DEFAULT_P        160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
+
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#else // Other/undetected ARMv8 cores
 
 #define SGEMM_DEFAULT_UNROLL_M  16
 #define SGEMM_DEFAULT_UNROLL_N  4
@@ -2736,8 +2705,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#define SYMV_P 16
-#endif
+#endif // Cores
+
+#endif // Linux / Darwin
+
+#endif // ARMv8
 
 #if defined(ARMV5)
 #define SNUMOPT                2