FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
+# Compiler flags for CORE = NEOVERSEN1.
+# Use a72 tunings on older compilers because -mtune=neoverse-n1 is only
+# available in GCC >= 9.
+ifeq ($(CORE), NEOVERSEN1)
+ifeq ($(GCCVERSIONGTEQ7), 1)
+ifeq ($(GCCVERSIONGTEQ9), 1)
+# GCCVERSIONGTEQ9 is 1: tune for Neoverse N1 directly.
+CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
+FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
+else
+# GCCVERSIONGTEQ7 is 1 but GCCVERSIONGTEQ9 is not: keep armv8.2-a, tune as Cortex-A72.
+CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+endif
+else
+# GCCVERSIONGTEQ7 is not 1: fall back to armv8-a with Cortex-A72 tuning.
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+endif
+endif
+
ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
+# Compiler version probes. Each variable holds the output of `expr`
+# (1 when the comparison is true, 0 otherwise) applied to one dot-separated
+# field of `$(CC) -dumpversion`: field 1 for the GCCVERSION* variables,
+# field 2 for GCCMINORVERSIONGTEQ7.
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
+GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
+DYNAMIC_CORE += NEOVERSEN1
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
CORTEXA57
CORTEXA72
CORTEXA73
+NEOVERSEN1
FALKOR
THUNDERX
THUNDERX2T99
if (DYNAMIC_ARCH)
if (ARM64)
- set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180)
+ set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1)
endif ()
if (POWER)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
+ elseif ("${TCORE}" STREQUAL "NEOVERSEN1")
+ # Append the NEOVERSEN1 cache/TLB description and feature macros to the
+ # generated target configuration, then set the GEMM unroll factors.
+ # L1_DATA_ASSOCIATIVE is 4 so that this description agrees with the other
+ # NEOVERSEN1 cache descriptions (CPU detection output and the
+ # FORCE_NEOVERSEN1 ARCHCONFIG string), which both use 4.
+ file(APPEND ${TARGET_CONF_TEMP}
+ "#define L1_CODE_SIZE\t65536\n"
+ "#define L1_CODE_LINESIZE\t64\n"
+ "#define L1_CODE_ASSOCIATIVE\t4\n"
+ "#define L1_DATA_SIZE\t65536\n"
+ "#define L1_DATA_LINESIZE\t64\n"
+ "#define L1_DATA_ASSOCIATIVE\t4\n"
+ "#define L2_SIZE\t1048576\n\n"
+ "#define L2_LINESIZE\t64\n"
+ "#define L2_ASSOCIATIVE\t16\n"
+ "#define DTB_DEFAULT_ENTRIES\t64\n"
+ "#define DTB_SIZE\t4096\n"
+ "#define HAVE_VFPV4\n"
+ "#define HAVE_VFPV3\n"
+ "#define HAVE_VFP\n"
+ "#define HAVE_NEON\n"
+ "#define ARMV8\n")
+ set(SGEMM_UNROLL_M 16)
+ set(SGEMM_UNROLL_N 4)
+ set(DGEMM_UNROLL_M 8)
+ set(DGEMM_UNROLL_N 4)
+ set(CGEMM_UNROLL_M 8)
+ set(CGEMM_UNROLL_N 4)
+ set(ZGEMM_UNROLL_M 4)
+ set(ZGEMM_UNROLL_N 4)
+ set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "FALKOR")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
#define CPU_CORTEXA57 3
#define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
+#define CPU_NEOVERSEN1 11
// Qualcomm
#define CPU_FALKOR 6
// Cavium
"THUNDERX",
"THUNDERX2T99",
"TSV110",
- "EMAG8180"
+ "EMAG8180",
+ "NEOVERSEN1"
};
static char *cpuname_lower[] = {
"thunderx",
"thunderx2t99",
"tsv110",
- "emag8180"
+ "emag8180",
+ "neoversen1"
};
int get_feature(char *search)
return CPU_CORTEXA72;
else if (strstr(cpu_part, "0xd09"))
return CPU_CORTEXA73;
+ else if (strstr(cpu_part, "0xd0c"))
+ return CPU_NEOVERSEN1;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
+ case CPU_NEOVERSEN1:
+ // Print the core-name macro (taken from cpuname[d]) followed by the
+ // L1 code/data cache, L2 cache and DTB parameter macros for this core.
+ printf("#define %s\n", cpuname[d]);
+ printf("#define L1_CODE_SIZE 65536\n");
+ printf("#define L1_CODE_LINESIZE 64\n");
+ printf("#define L1_CODE_ASSOCIATIVE 4\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L1_DATA_ASSOCIATIVE 4\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define L2_ASSOCIATIVE 16\n");
+ printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define DTB_SIZE 4096\n");
+ break;
case CPU_FALKOR:
printf("#define FALKOR\n");
extern gotoblas_t gotoblas_THUNDERX2T99;
extern gotoblas_t gotoblas_TSV110;
extern gotoblas_t gotoblas_EMAG8180;
+extern gotoblas_t gotoblas_NEOVERSEN1;
extern void openblas_warning(int verbose, const char * msg);
-#define NUM_CORETYPES 10
+#define NUM_CORETYPES 11
/*
* In case asm/hwcap.h is outdated on the build system, make sure
"thunderx2t99",
"tsv110",
"emag8180",
+ "neoversen1",
"unknown"
};
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
+ if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
return corename[NUM_CORETYPES];
}
case 7: return (&gotoblas_THUNDERX2T99);
case 8: return (&gotoblas_TSV110);
case 9: return (&gotoblas_EMAG8180);
+ case 10: return (&gotoblas_NEOVERSEN1);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
return &gotoblas_CORTEXA72;
case 0xd09: // Cortex A73
return &gotoblas_CORTEXA73;
+ case 0xd0c: // Neoverse N1
+ return &gotoblas_NEOVERSEN1;
}
break;
case 0x42: // Broadcom
#else
#endif
+#ifdef FORCE_NEOVERSEN1
+/* Build-configuration strings used when the NEOVERSEN1 target is forced. */
+#define FORCE
+#define ARCHITECTURE "ARM64"
+#define SUBARCHITECTURE "NEOVERSEN1"
+#define SUBDIRNAME "arm64"
+/* The adjacent string literals below are concatenated into one option string,
+ * so every fragment that is followed by another must end with a space;
+ * otherwise the last -D option and -march fuse into a single bogus token. */
+#define ARCHCONFIG "-DNEOVERSEN1 " \
+ "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
+ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
+ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \
+ "-march=armv8.2-a -mtune=cortex-a72"
+#define LIBNAME "neoversen1"
+#define CORENAME "NEOVERSEN1"
+#else
+#endif
+
+
#ifdef FORCE_FALKOR
#define FORCE
#define ARCHITECTURE "ARM64"
--- /dev/null
+# Kernel source selection for NEOVERSEN1.
+# Each <NAME>KERNEL variable names the source file used for that routine.
+
+# min/max and index-of-min/max routines: C sources from ../arm.
+SAMINKERNEL = ../arm/amin.c
+DAMINKERNEL = ../arm/amin.c
+CAMINKERNEL = ../arm/zamin.c
+ZAMINKERNEL = ../arm/zamin.c
+
+SMAXKERNEL = ../arm/max.c
+DMAXKERNEL = ../arm/max.c
+
+SMINKERNEL = ../arm/min.c
+DMINKERNEL = ../arm/min.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL = ../arm/imax.c
+IDMAXKERNEL = ../arm/imax.c
+
+ISMINKERNEL = ../arm/imin.c
+IDMINKERNEL = ../arm/imin.c
+
+# TRSM kernels: all four precisions use the same C sources from ../generic,
+# one per LN/LT/RN/RT variant.
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+# Routines taken from this directory (no path prefix): .S assembly sources,
+# except DAXPY which uses the thunderx2t99-named file.
+SAMAXKERNEL = amax.S
+DAMAXKERNEL = amax.S
+CAMAXKERNEL = zamax.S
+ZAMAXKERNEL = zamax.S
+
+SAXPYKERNEL = axpy.S
+DAXPYKERNEL = daxpy_thunderx2t99.S
+CAXPYKERNEL = zaxpy.S
+ZAXPYKERNEL = zaxpy.S
+
+SROTKERNEL = rot.S
+DROTKERNEL = rot.S
+CROTKERNEL = zrot.S
+ZROTKERNEL = zrot.S
+
+SSCALKERNEL = scal.S
+DSCALKERNEL = scal.S
+CSCALKERNEL = zscal.S
+ZSCALKERNEL = zscal.S
+
+SGEMVNKERNEL = gemv_n.S
+DGEMVNKERNEL = gemv_n.S
+CGEMVNKERNEL = zgemv_n.S
+ZGEMVNKERNEL = zgemv_n.S
+
+SGEMVTKERNEL = gemv_t.S
+DGEMVTKERNEL = gemv_t.S
+CGEMVTKERNEL = zgemv_t.S
+ZGEMVTKERNEL = zgemv_t.S
+
+
+# Routines that reuse the *_thunderx2t99 source files.
+SASUMKERNEL = sasum_thunderx2t99.c
+DASUMKERNEL = dasum_thunderx2t99.c
+CASUMKERNEL = casum_thunderx2t99.c
+ZASUMKERNEL = zasum_thunderx2t99.c
+
+SCOPYKERNEL = copy_thunderx2t99.c
+DCOPYKERNEL = copy_thunderx2t99.c
+CCOPYKERNEL = copy_thunderx2t99.c
+ZCOPYKERNEL = copy_thunderx2t99.c
+
+SSWAPKERNEL = swap_thunderx2t99.S
+DSWAPKERNEL = swap_thunderx2t99.S
+CSWAPKERNEL = swap_thunderx2t99.S
+ZSWAPKERNEL = swap_thunderx2t99.S
+
+ISAMAXKERNEL = iamax_thunderx2t99.c
+IDAMAXKERNEL = iamax_thunderx2t99.c
+ICAMAXKERNEL = izamax_thunderx2t99.c
+IZAMAXKERNEL = izamax_thunderx2t99.c
+
+SNRM2KERNEL = scnrm2_thunderx2t99.c
+DNRM2KERNEL = dznrm2_thunderx2t99.c
+CNRM2KERNEL = scnrm2_thunderx2t99.c
+ZNRM2KERNEL = dznrm2_thunderx2t99.c
+
+DDOTKERNEL = dot_thunderx2t99.c
+SDOTKERNEL = dot_thunderx2t99.c
+CDOTKERNEL = zdot_thunderx2t99.c
+ZDOTKERNEL = zdot_thunderx2t99.c
+# DSDOT is the one dot variant that uses dot.S rather than a thunderx2t99 file.
+DSDOTKERNEL = dot.S
+
+DGEMM_BETA = dgemm_beta.S
+SGEMM_BETA = sgemm_beta.S
+
+# SGEMM/STRMM: kernel file names are built from the configured
+# SGEMM_UNROLL_M x SGEMM_UNROLL_N unroll factors.
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+# The SGEMMI*COPY routines (sized by UNROLL_M) are only defined when
+# UNROLL_M differs from UNROLL_N.
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+# An assembly tcopy is selected only for unroll 16; otherwise use ../generic.
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
+# An assembly ncopy is selected only for unroll 4; otherwise use ../generic.
+ifeq ($(SGEMM_UNROLL_M), 4)
+SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+endif
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+# The SGEMMO*COPY routines (sized by UNROLL_N) are always defined, using the
+# same assembly-vs-generic selection as above.
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
+ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+endif
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# DGEMM/DTRMM: kernel file names are built from the configured
+# DGEMM_UNROLL_M x DGEMM_UNROLL_N unroll factors.
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+
+# The DGEMMI*COPY routines (sized by UNROLL_M) are only defined when
+# UNROLL_M differs from UNROLL_N.
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+
+# Assembly ncopy/tcopy are selected only for unroll 8; otherwise use ../generic.
+ifeq ($(DGEMM_UNROLL_M), 8)
+DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
+DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
+else
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+endif
+
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+# The DGEMMO*COPY routines (sized by UNROLL_N) are always defined; assembly
+# versions are selected only for unroll 4.
+ifeq ($(DGEMM_UNROLL_N), 4)
+DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
+DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
+else
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+endif
+
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# CGEMM/CTRMM: assembly kernels named after the configured unroll factors.
+# All copy routines come from the ../generic zgemm_{n,t}copy C sources.
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+# The CGEMMI*COPY routines are only defined when UNROLL_M differs from UNROLL_N.
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# ZGEMM/ZTRMM: same layout as the CGEMM section, driven by ZGEMM_UNROLL_M/N.
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+# The ZGEMMI*COPY routines are only defined when UNROLL_M differs from UNROLL_N.
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
+#elif defined(NEOVERSEN1)
+// Default GEMM parameters selected when NEOVERSEN1 is defined:
+// per-precision unroll factors (M x N), followed by the P, Q and R values.
+// NOTE(review): P/Q/R are presumably the GEMM blocking sizes - confirm
+// against the code that consumes these macros.
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
+
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+
#else // Other/undetected ARMv8 cores
#define SGEMM_DEFAULT_UNROLL_M 16