FCOMMON_OPT += -march=armv8-a
endif
+# Per-core compiler flags. Each core selects a baseline -march plus a
+# core-specific -mtune. FALKOR and THUNDERX2T99 use armv8.1-a; every
+# other core listed here uses armv8-a.
+ifeq ($(CORE), CORTEXA53)
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
+endif
+
ifeq ($(CORE), CORTEXA57)
-CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
-FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
+endif
+
+ifeq ($(CORE), CORTEXA72)
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
-ifeq ($(CORE), VULCAN)
-CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
-FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
+ifeq ($(CORE), CORTEXA73)
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
ifeq ($(CORE), THUNDERX)
-CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
-FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
+CCOMMON_OPT += -march=armv8-a -mtune=thunderx
+FCOMMON_OPT += -march=armv8-a -mtune=thunderx
+endif
+
+ifeq ($(CORE), FALKOR)
+CCOMMON_OPT += -march=armv8.1-a -mtune=falkor
+FCOMMON_OPT += -march=armv8.1-a -mtune=falkor
endif
ifeq ($(CORE), THUNDERX2T99)
-CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
-FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
+CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
+FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
8.ARM 64-bit CPU:
ARMV8
+CORTEXA53
CORTEXA57
-VULCAN
+CORTEXA72
+CORTEXA73
+FALKOR
THUNDERX
THUNDERX2T99
+// Numeric core identifiers returned by detect(). get_cpuconfig() uses the
+// returned value directly as an index into cpuname[], so these numbers must
+// stay in the same order as the entries of that table.
#define CPU_UNKNOWN 0
#define CPU_ARMV8 1
-#define CPU_CORTEXA57 2
-#define CPU_VULCAN 3
-#define CPU_THUNDERX 4
-#define CPU_THUNDERX2T99 5
+// Arm
+#define CPU_CORTEXA53 2
+#define CPU_CORTEXA57 3
+#define CPU_CORTEXA72 4
+#define CPU_CORTEXA73 5
+// Qualcomm
+#define CPU_FALKOR 6
+// Cavium
+#define CPU_THUNDERX 7
+#define CPU_THUNDERX2T99 8
+// Upper-case core names, one entry per CPU_* constant and in the same order.
+// get_cpuconfig() indexes this table with the detected core id to print
+// "#define <name>" for several cores.
static char *cpuname[] = {
"UNKNOWN",
"ARMV8" ,
+ "CORTEXA53",
"CORTEXA57",
- "VULCAN",
+ "CORTEXA72",
+ "CORTEXA73",
+ "FALKOR",
"THUNDERX",
"THUNDERX2T99"
};
+// Lower-case core names, listed in the same order as cpuname[].
+// NOTE(review): presumably also indexed by the CPU_* constants; no use of
+// this table is visible in this change - confirm against the callers.
static char *cpuname_lower[] = {
"unknown",
- "armv8" ,
+ "armv8",
+ "cortexa53",
"cortexa57",
- "vulcan",
+ "cortexa72",
+ "cortexa73",
+ "falkor",
"thunderx",
"thunderx2t99"
};
fclose(infile);
if(cpu_part != NULL && cpu_implementer != NULL) {
- if (strstr(cpu_implementer, "0x41") &&
- (strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08")))
- return CPU_CORTEXA57; //or compatible, ex. A72
- else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
- return CPU_VULCAN;
- else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
+ // Arm
+ if (strstr(cpu_implementer, "0x41")) {
+ if (strstr(cpu_part, "0xd03"))
+ return CPU_CORTEXA53;
+ else if (strstr(cpu_part, "0xd07"))
+ return CPU_CORTEXA57;
+ else if (strstr(cpu_part, "0xd08"))
+ return CPU_CORTEXA72;
+ else if (strstr(cpu_part, "0xd09"))
+ return CPU_CORTEXA73;
+ }
+ // Qualcomm
+ else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
+ return CPU_FALKOR;
+ // Cavium
+ else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
return CPU_THUNDERX;
- else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
+ else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99;
}
void get_cpuconfig(void)
{
+ // All arches should define ARMv8
+ printf("#define ARMV8\n");
+ printf("#define HAVE_NEON\n"); // This shouldn't be necessary
+ printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
+
int d = detect();
switch (d)
{
+ case CPU_CORTEXA53:
+ printf("#define %s\n", cpuname[d]);
+ // Fall-through
case CPU_ARMV8:
- printf("#define ARMV8\n");
- printf("#define L1_DATA_SIZE 32768\n");
- printf("#define L1_DATA_LINESIZE 64\n");
- printf("#define L2_SIZE 262144\n");
- printf("#define L2_LINESIZE 64\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
- printf("#define DTB_SIZE 4096\n");
- printf("#define L2_ASSOCIATIVE 4\n");
- break;
-
- case CPU_VULCAN:
- printf("#define VULCAN \n");
- printf("#define HAVE_VFP \n");
- printf("#define HAVE_VFPV3 \n");
- printf("#define HAVE_NEON \n");
- printf("#define HAVE_VFPV4 \n");
- printf("#define L1_CODE_SIZE 32768 \n");
- printf("#define L1_CODE_LINESIZE 64 \n");
- printf("#define L1_CODE_ASSOCIATIVE 8 \n");
- printf("#define L1_DATA_SIZE 32768 \n");
- printf("#define L1_DATA_LINESIZE 64 \n");
- printf("#define L1_DATA_ASSOCIATIVE 8 \n");
- printf("#define L2_SIZE 262144 \n");
- printf("#define L2_LINESIZE 64 \n");
- printf("#define L2_ASSOCIATIVE 8 \n");
- printf("#define L3_SIZE 33554432 \n");
- printf("#define L3_LINESIZE 64 \n");
- printf("#define L3_ASSOCIATIVE 32 \n");
- printf("#define DTB_DEFAULT_ENTRIES 64 \n");
- printf("#define DTB_SIZE 4096 \n");
+ // Minimum parameters for ARMv8 (based on A53)
+ printf("#define L1_DATA_SIZE 32768\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L2_SIZE 262144\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_CORTEXA57:
- printf("#define CORTEXA57\n");
- printf("#define HAVE_VFP\n");
- printf("#define HAVE_VFPV3\n");
- printf("#define HAVE_NEON\n");
- printf("#define HAVE_VFPV4\n");
+ case CPU_CORTEXA72:
+ case CPU_CORTEXA73:
+ // Common minimum settings for these Arm cores
+ // Can change a lot, but we need to be conservative
+ // TODO: detect info from /sys if possible
+ printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n");
- printf("#define L2_SIZE 2097152\n");
+ printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
+ case CPU_FALKOR:
+ printf("#define FALKOR\n");
+ printf("#define L1_CODE_SIZE 65536\n");
+ printf("#define L1_CODE_LINESIZE 64\n");
+ printf("#define L1_DATA_SIZE 32768\n");
+ printf("#define L1_DATA_LINESIZE 128\n");
+ printf("#define L2_SIZE 524288\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ printf("#define L2_ASSOCIATIVE 16\n");
+ break;
+
case CPU_THUNDERX:
printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n");
case CPU_THUNDERX2T99:
- printf("#define VULCAN \n");
- printf("#define HAVE_VFP \n");
- printf("#define HAVE_VFPV3 \n");
- printf("#define HAVE_NEON \n");
- printf("#define HAVE_VFPV4 \n");
+ // Emit the core's own macro. The VULCAN target is removed by this change
+ // and the per-core GEMM parameter block tests defined(THUNDERX2T99) only,
+ // so printing VULCAN here would leave an auto-detected ThunderX2 on the
+ // "Other/undetected ARMv8 cores" parameters.
+ printf("#define THUNDERX2T99 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
#define ARCHCONFIG "-DARMV8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
- "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "armv8"
#define CORENAME "ARMV8"
#endif
+#ifdef FORCE_CORTEXA53
+#define FORCE
+#define ARCHITECTURE "ARM64"
+#define SUBARCHITECTURE "CORTEXA53"
+#define SUBDIRNAME "arm64"
+/* Settings used when the Cortex-A53 target is forced instead of detected.
+ * NOTE(review): get_cpuconfig() prints L2_ASSOCIATIVE 4 for CPU_CORTEXA53
+ * (it falls through to the CPU_ARMV8 case), while this string passes
+ * -DL2_ASSOCIATIVE=16 - confirm which value is intended. */
+#define ARCHCONFIG "-DCORTEXA53 " \
+ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
+ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
+ "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME "cortexa53"
+#define CORENAME "CORTEXA53"
+#else
+#endif
+
#ifdef FORCE_CORTEXA57
#define FORCE
#define ARCHITECTURE "ARM64"
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
- "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa57"
#define CORENAME "CORTEXA57"
#else
#endif
-#ifdef FORCE_VULCAN
+#ifdef FORCE_CORTEXA72
#define FORCE
#define ARCHITECTURE "ARM64"
-#define SUBARCHITECTURE "VULCAN"
+#define SUBARCHITECTURE "CORTEXA72"
#define SUBDIRNAME "arm64"
-#define ARCHCONFIG "-DVULCAN " \
- "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
- "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
- "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
- "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
+/* Settings used when the Cortex-A72 target is forced instead of detected.
+ * NOTE(review): get_cpuconfig() prints L2_SIZE 524288 for CPU_CORTEXA72,
+ * while this string passes -DL2_SIZE=2097152 - confirm which is intended. */
+#define ARCHCONFIG "-DCORTEXA72 " \
+ "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
+ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
+ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME "cortexa72"
+#define CORENAME "CORTEXA72"
+#else
+#endif
+
+#ifdef FORCE_CORTEXA73
+#define FORCE
+#define ARCHITECTURE "ARM64"
+#define SUBARCHITECTURE "CORTEXA73"
+#define SUBDIRNAME "arm64"
+/* Settings used when the Cortex-A73 target is forced instead of detected.
+ * NOTE(review): get_cpuconfig() prints L2_SIZE 524288 for CPU_CORTEXA73,
+ * while this string passes -DL2_SIZE=2097152 - confirm which is intended. */
+#define ARCHCONFIG "-DCORTEXA73 " \
+ "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
+ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
+ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME "cortexa73"
+#define CORENAME "CORTEXA73"
+#else
+#endif
+
+#ifdef FORCE_FALKOR
+#define FORCE
+#define ARCHITECTURE "ARM64"
+#define SUBARCHITECTURE "FALKOR"
+#define SUBDIRNAME "arm64"
+/* Settings used when the Falkor target is forced instead of detected.
+ * NOTE(review): these cache values differ from what get_cpuconfig() prints
+ * for CPU_FALKOR (L1_CODE_SIZE 65536, L1_DATA_LINESIZE 128, L2_SIZE 524288);
+ * this string passes 49152, 64 and 2097152 respectively - confirm which set
+ * is intended. */
+#define ARCHCONFIG "-DFALKOR " \
+ "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
+ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
+ "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
- "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
-#define LIBNAME "vulcan"
-#define CORENAME "VULCAN"
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
+#define LIBNAME "falkor"
+#define CORENAME "FALKOR"
#else
#endif
#define ARCHCONFIG "-DTHUNDERX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
- "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx"
#define CORENAME "THUNDERX"
#else
#endif
#ifdef FORCE_THUNDERX2T99
+#define ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "THUNDERX2T99"
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
- "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx2t99"
#define CORENAME "THUNDERX2T99"
#else
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
-DAXPYKERNEL = daxpy_thunderx2t99.S
+DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
ZGEMVTKERNEL = zgemv_t.S
-SASUMKERNEL = sasum_thunderx2t99.c
-DASUMKERNEL = dasum_thunderx2t99.c
-CASUMKERNEL = casum_thunderx2t99.c
-ZASUMKERNEL = zasum_thunderx2t99.c
+SASUMKERNEL = asum.S
+DASUMKERNEL = asum.S
+CASUMKERNEL = casum.S
+ZASUMKERNEL = zasum.S
-SCOPYKERNEL = copy_thunderx2t99.c
-DCOPYKERNEL = copy_thunderx2t99.c
-CCOPYKERNEL = copy_thunderx2t99.c
-ZCOPYKERNEL = copy_thunderx2t99.c
+SCOPYKERNEL = copy.S
+DCOPYKERNEL = copy.S
+CCOPYKERNEL = copy.S
+ZCOPYKERNEL = copy.S
-SSWAPKERNEL = swap_thunderx2t99.S
-DSWAPKERNEL = swap_thunderx2t99.S
-CSWAPKERNEL = swap_thunderx2t99.S
-ZSWAPKERNEL = swap_thunderx2t99.S
+SSWAPKERNEL = swap.S
+DSWAPKERNEL = swap.S
+CSWAPKERNEL = swap.S
+ZSWAPKERNEL = swap.S
-ISAMAXKERNEL = iamax_thunderx2t99.c
-IDAMAXKERNEL = iamax_thunderx2t99.c
-ICAMAXKERNEL = izamax_thunderx2t99.c
-IZAMAXKERNEL = izamax_thunderx2t99.c
+ISAMAXKERNEL = iamax.S
+IDAMAXKERNEL = iamax.S
+ICAMAXKERNEL = izamax.S
+IZAMAXKERNEL = izamax.S
ifneq ($(OS_DARWIN)$(CROSS),11)
-SNRM2KERNEL = scnrm2_thunderx2t99.c
-CNRM2KERNEL = scnrm2_thunderx2t99.c
-#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
-#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
-DNRM2KERNEL = dznrm2_thunderx2t99.c
-ZNRM2KERNEL = dznrm2_thunderx2t99.c
+# Real precisions (S, D) use nrm2.S; complex precisions (C, Z) use znrm2.S.
+# This follows the same real/complex split as the DOT and IAMAX kernels in
+# this list, not the single/double split of the removed thunderx2t99 files.
+SNRM2KERNEL = nrm2.S
+DNRM2KERNEL = nrm2.S
+CNRM2KERNEL = znrm2.S
+ZNRM2KERNEL = znrm2.S
endif
-DDOTKERNEL = dot_thunderx2t99.c
-SDOTKERNEL = dot_thunderx2t99.c
-CDOTKERNEL = zdot_thunderx2t99.c
-ZDOTKERNEL = zdot_thunderx2t99.c
+DDOTKERNEL = dot.S
+SDOTKERNEL = dot.S
+CDOTKERNEL = zdot.S
+ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
ifneq ($(OS_DARWIN)$(CROSS),11)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
-DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
-SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
-endif
-
-ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
-CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
-ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
-endif
-
else
STRMMKERNEL = ../generic/trmmkernel_2x2.c
--- /dev/null
+include $(KERNELDIR)/KERNEL.ARMV8
+
+
--- /dev/null
+include $(KERNELDIR)/KERNEL.CORTEXA57
+
+
--- /dev/null
+include $(KERNELDIR)/KERNEL.CORTEXA57
+
+
--- /dev/null
+include $(KERNELDIR)/KERNEL.CORTEXA57
+
+
+++ /dev/null
-include $(KERNELDIR)/KERNEL.THUNDERX2T99
-
-
#define SYMV_P 16
#endif
+// Common ARMv8 parameters
+#if defined(ARMV8)
-#if defined(CORTEXA57)
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
-#define SGEMM_DEFAULT_UNROLL_M 16
-#define SGEMM_DEFAULT_UNROLL_N 4
-
-#define DGEMM_DEFAULT_UNROLL_M 8
-#define DGEMM_DEFAULT_UNROLL_N 4
-
-#define CGEMM_DEFAULT_UNROLL_M 8
-#define CGEMM_DEFAULT_UNROLL_N 4
-
-#define ZGEMM_DEFAULT_UNROLL_M 4
-#define ZGEMM_DEFAULT_UNROLL_N 4
-
-#define SGEMM_DEFAULT_P 512
-#define DGEMM_DEFAULT_P 256
-#define CGEMM_DEFAULT_P 256
-#define ZGEMM_DEFAULT_P 128
-
-#define SGEMM_DEFAULT_Q 1024
-#define DGEMM_DEFAULT_Q 512
-#define CGEMM_DEFAULT_Q 512
-#define ZGEMM_DEFAULT_Q 512
-
-#define SGEMM_DEFAULT_R 4096
-#define DGEMM_DEFAULT_R 4096
-#define CGEMM_DEFAULT_R 4096
-#define ZGEMM_DEFAULT_R 2048
-
-
#define SYMV_P 16
-#endif
-
-#if defined(ARMV8)
+// Darwin / Cross
#if defined(OS_DARWIN) && defined(CROSS)
-#define SNUMOPT 2
-#define DNUMOPT 2
-
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
-#define SYMV_P 16
-#else
+#else // Linux / Native
-#define SNUMOPT 2
-#define DNUMOPT 2
-
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#if defined(CORTEXA53) || defined(CORTEXA57) || \
+ defined(CORTEXA72) || defined(CORTEXA73) || \
+ defined(FALKOR)
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
-#define SGEMM_DEFAULT_P 128
-#define DGEMM_DEFAULT_P 160
-#define CGEMM_DEFAULT_P 128
+#define SGEMM_DEFAULT_P 512
+#define DGEMM_DEFAULT_P 256
+#define CGEMM_DEFAULT_P 256
#define ZGEMM_DEFAULT_P 128
-#define SGEMM_DEFAULT_Q 352
-#define DGEMM_DEFAULT_Q 128
-#define CGEMM_DEFAULT_Q 224
-#define ZGEMM_DEFAULT_Q 112
+#define SGEMM_DEFAULT_Q 1024
+#define DGEMM_DEFAULT_Q 512
+#define CGEMM_DEFAULT_Q 512
+#define ZGEMM_DEFAULT_Q 512
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
-#define ZGEMM_DEFAULT_R 4096
-
-#define SYMV_P 16
-#endif
-
-#endif
-
-#if defined(THUNDERX)
-#define SNUMOPT 2
-#define DNUMOPT 2
+#define ZGEMM_DEFAULT_R 2048
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#elif defined(THUNDERX)
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
+#elif defined(THUNDERX2T99)
-#define SYMV_P 16
-#endif
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 4
-#if defined(THUNDERX2T99) || defined(VULCAN)
-#define SNUMOPT 2
-#define DNUMOPT 2
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
-#define GEMM_DEFAULT_OFFSET_A 0
-#define GEMM_DEFAULT_OFFSET_B 0
-#define GEMM_DEFAULT_ALIGN 0x03fffUL
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
+
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#else // Other/undetected ARMv8 cores
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
-#define SYMV_P 16
-#endif
+#endif // Cores
+
+#endif // Linux / Darwin
+
+#endif // ARMv8
#if defined(ARMV5)
#define SNUMOPT 2