endif
endif
+# NEOVERSEV1 flag selection. -mtune=neoverse-v1 is only available in
+# GCC >= 9.4, so compilers that fail the checks below get fallback tunings:
+#   GCCVERSIONGTEQ9 and (GCCMINORVERSIONGTEQ4 or GCCVERSIONGTEQ10): armv8.4-a, neoverse-v1
+#   GCCVERSIONGTEQ9 only: armv8.4-a, -mtune=native
+#   GCCVERSIONGTEQ7 or ISCLANG, without GCCVERSIONGTEQ9: armv8.2-a, cortex-a72
+#   none of the above: armv8-a, cortex-a72
+# FCOMMON_OPT gets the same flags as CCOMMON_OPT unless F_COMPILER is NAG.
+ifeq ($(CORE), NEOVERSEV1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
+ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
+CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
+endif
+else
+CCOMMON_OPT += -march=armv8.4-a -mtune=native
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.4-a -mtune=native
+endif
+endif
+else
+CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+endif
+endif
+else
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+endif
+endif
+endif
+
+# NEOVERSEN2 flag selection. -mtune=neoverse-n2 is only available in
+# GCC >= 9.4, so compilers that fail the checks below get fallback tunings:
+#   GCCVERSIONGTEQ9 and (GCCMINORVERSIONGTEQ4 or GCCVERSIONGTEQ10): armv8.5-a, neoverse-n2
+#   GCCVERSIONGTEQ9 only: armv8.5-a, -mtune=native
+#   GCCVERSIONGTEQ7 or ISCLANG, without GCCVERSIONGTEQ9: armv8.2-a, cortex-a72
+#   none of the above: armv8-a, cortex-a72
+# FCOMMON_OPT gets the same flags as CCOMMON_OPT unless F_COMPILER is NAG.
+ifeq ($(CORE), NEOVERSEN2)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
+ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
+CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
+endif
+else
+CCOMMON_OPT += -march=armv8.5-a -mtune=native
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.5-a -mtune=native
+endif
+endif
+else
+CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+endif
+endif
+else
+CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+endif
+endif
+endif
+
# Use a53 tunings because a55 is only available in GCC>=8.1
ifeq ($(CORE), CORTEXA55)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
endif
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
+GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
endif
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
DYNAMIC_CORE += NEOVERSEN1
+DYNAMIC_CORE += NEOVERSEV1
+DYNAMIC_CORE += NEOVERSEN2
DYNAMIC_CORE += CORTEXA55
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX
CORTEXA72
CORTEXA73
NEOVERSEN1
+NEOVERSEV1
+NEOVERSEN2
CORTEXA55
EMAG8180
FALKOR
if (DYNAMIC_ARCH)
if (ARM64)
- set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
+ set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110)
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
endif ()
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
+ "#define L1_DATA_ASSOCIATIVE\t4\n"
+ "#define L2_SIZE\t1048576\n\n"
+ "#define L2_LINESIZE\t64\n"
+ "#define L2_ASSOCIATIVE\t8\n"
+ "#define DTB_DEFAULT_ENTRIES\t48\n"
+ "#define DTB_SIZE\t4096\n"
+ "#define HAVE_VFPV4\n"
+ "#define HAVE_VFPV3\n"
+ "#define HAVE_VFP\n"
+ "#define HAVE_NEON\n"
+ "#define ARMV8\n")
+ set(SGEMM_UNROLL_M 16)
+ set(SGEMM_UNROLL_N 4)
+ set(DGEMM_UNROLL_M 8)
+ set(DGEMM_UNROLL_N 4)
+ set(CGEMM_UNROLL_M 8)
+ set(CGEMM_UNROLL_N 4)
+ set(ZGEMM_UNROLL_M 4)
+ set(ZGEMM_UNROLL_N 4)
+ set(SYMV_P 16)
+ elseif ("${TCORE}" STREQUAL "NEOVERSEV1")
+ # NEOVERSEV1: append the cache/TLB and feature macros (including HAVE_SVE) to
+ # TARGET_CONF_TEMP, then set the GEMM unroll factors and SYMV_P.
+ file(APPEND ${TARGET_CONF_TEMP}
+ "#define L1_CODE_SIZE\t65536\n"
+ "#define L1_CODE_LINESIZE\t64\n"
+ "#define L1_CODE_ASSOCIATIVE\t4\n"
+ "#define L1_DATA_SIZE\t65536\n"
+ "#define L1_DATA_LINESIZE\t64\n"
+ "#define L1_DATA_ASSOCIATIVE\t4\n"
+ "#define L2_SIZE\t1048576\n\n"
+ "#define L2_LINESIZE\t64\n"
+ "#define L2_ASSOCIATIVE\t8\n"
+ "#define DTB_DEFAULT_ENTRIES\t48\n"
+ "#define DTB_SIZE\t4096\n"
+ "#define HAVE_VFPV4\n"
+ "#define HAVE_VFPV3\n"
+ "#define HAVE_VFP\n"
+ "#define HAVE_NEON\n"
+ "#define HAVE_SVE\n"
+ "#define ARMV8\n")
+ set(SGEMM_UNROLL_M 16)
+ set(SGEMM_UNROLL_N 4)
+ set(DGEMM_UNROLL_M 8)
+ set(DGEMM_UNROLL_N 4)
+ set(CGEMM_UNROLL_M 8)
+ set(CGEMM_UNROLL_N 4)
+ set(ZGEMM_UNROLL_M 4)
+ set(ZGEMM_UNROLL_N 4)
+ set(SYMV_P 16)
+ elseif ("${TCORE}" STREQUAL "NEOVERSEN2")
+ # NEOVERSEN2: append the cache/TLB and feature macros (including HAVE_SVE) to
+ # TARGET_CONF_TEMP. L1_DATA_ASSOCIATIVE is 4 here, matching the value the
+ # CPU_NEOVERSEN2 config printer and the FORCE_NEOVERSEN2 ARCHCONFIG use.
+ file(APPEND ${TARGET_CONF_TEMP}
+ "#define L1_CODE_SIZE\t65536\n"
+ "#define L1_CODE_LINESIZE\t64\n"
+ "#define L1_CODE_ASSOCIATIVE\t4\n"
+ "#define L1_DATA_SIZE\t65536\n"
+ "#define L1_DATA_LINESIZE\t64\n"
- "#define L1_DATA_ASSOCIATIVE\t2\n"
+ "#define L1_DATA_ASSOCIATIVE\t4\n"
"#define L2_SIZE\t1048576\n\n"
"#define L2_LINESIZE\t64\n"
- "#define L2_ASSOCIATIVE\t16\n"
- "#define DTB_DEFAULT_ENTRIES\t64\n"
+ "#define L2_ASSOCIATIVE\t8\n"
+ "#define DTB_DEFAULT_ENTRIES\t48\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
+ "#define HAVE_SVE\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
#define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
#define CPU_NEOVERSEN1 11
+// NOTE(review): the cpuname[] / lowercase name tables list NEOVERSEV1 and NEOVERSEN2
+// directly after NEOVERSEN1 (ID 11); if those tables are indexed by these IDs,
+// 16 and 17 will not line up with the table positions - confirm.
+#define CPU_NEOVERSEV1 16
+#define CPU_NEOVERSEN2 17
// Qualcomm
#define CPU_FALKOR 6
// Cavium
"TSV110",
"EMAG8180",
"NEOVERSEN1",
+ // Each entry needs its own trailing comma: adjacent string literals are
+ // concatenated, which would merge these names with the next entry.
+ // NOTE(review): confirm these positions match CPU_NEOVERSEV1/CPU_NEOVERSEN2 if the table is indexed by CPU id.
+ "NEOVERSEV1",
+ "NEOVERSEN2",
"THUNDERX3T110",
"VORTEX",
"CORTEXA55",
"tsv110",
"emag8180",
"neoversen1",
+ "neoversev1",
+ "neoversen2",
"thunderx3t110",
"vortex",
"cortexa55",
return CPU_CORTEXA73;
else if (strstr(cpu_part, "0xd0c"))
return CPU_NEOVERSEN1;
+ // CPU part 0xd40 is reported as Neoverse V1.
+ else if (strstr(cpu_part, "0xd40"))
+ return CPU_NEOVERSEV1;
+ // CPU part 0xd49 is reported as Neoverse N2.
+ else if (strstr(cpu_part, "0xd49"))
+ return CPU_NEOVERSEN2;
else if (strstr(cpu_part, "0xd05"))
return CPU_CORTEXA55;
}
printf("#define L1_DATA_ASSOCIATIVE 4\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
- printf("#define L2_ASSOCIATIVE 16\n");
- printf("#define DTB_DEFAULT_ENTRIES 64\n");
+ printf("#define L2_ASSOCIATIVE 8\n");
+ printf("#define DTB_DEFAULT_ENTRIES 48\n");
printf("#define DTB_SIZE 4096\n");
break;
+ case CPU_NEOVERSEV1:
+ // Print the config macros for Neoverse V1. The core macro is emitted as a
+ // literal (as the FALKOR case does) so it does not depend on the position
+ // of "NEOVERSEV1" in the cpuname[] table.
+ printf("#define NEOVERSEV1\n");
+ printf("#define L1_CODE_SIZE 65536\n");
+ printf("#define L1_CODE_LINESIZE 64\n");
+ printf("#define L1_CODE_ASSOCIATIVE 4\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L1_DATA_ASSOCIATIVE 4\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define L2_ASSOCIATIVE 8\n");
+ printf("#define DTB_DEFAULT_ENTRIES 48\n");
+ printf("#define DTB_SIZE 4096\n");
+ break;
+
+ case CPU_NEOVERSEN2:
+ // Print the config macros for Neoverse N2. The core macro is emitted as a
+ // literal (as the FALKOR case does) so it does not depend on the position
+ // of "NEOVERSEN2" in the cpuname[] table.
+ printf("#define NEOVERSEN2\n");
+ printf("#define L1_CODE_SIZE 65536\n");
+ printf("#define L1_CODE_LINESIZE 64\n");
+ printf("#define L1_CODE_ASSOCIATIVE 4\n");
+ printf("#define L1_DATA_SIZE 65536\n");
+ printf("#define L1_DATA_LINESIZE 64\n");
+ printf("#define L1_DATA_ASSOCIATIVE 4\n");
+ printf("#define L2_SIZE 1048576\n");
+ printf("#define L2_LINESIZE 64\n");
+ printf("#define L2_ASSOCIATIVE 8\n");
+ printf("#define DTB_DEFAULT_ENTRIES 48\n");
+ printf("#define DTB_SIZE 4096\n");
+ break;
+
case CPU_FALKOR:
printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n");
"tsv110",
"emag8180",
"neoversen1",
+ "neoversev1",
+ "neoversen2",
"thunderx3t110",
"cortexa55",
"unknown"
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \
- "-march=armv8.2-a -mtune=cortex-a72"
+ "-march=armv8.2-a -mtune=neoverse-n1"
#define LIBNAME "neoversen1"
#define CORENAME "NEOVERSEN1"
#else
#endif
+#ifdef FORCE_NEOVERSEV1
+/* Forced NEOVERSEV1 target. L2_ASSOCIATIVE and DTB_DEFAULT_ENTRIES are 8 and 48,
+ * the values the CPU_NEOVERSEV1 config printer and the CMake NEOVERSEV1 branch
+ * emit, so a forced build and an auto-detected build get the same configuration. */
+#define FORCE
+#define ARCHITECTURE "ARM64"
+#define SUBARCHITECTURE "NEOVERSEV1"
+#define SUBDIRNAME "arm64"
+#define ARCHCONFIG "-DNEOVERSEV1 " \
+ "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
+ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
+ "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
+ "-march=armv8.4-a -mtune=neoverse-v1"
+#define LIBNAME "neoversev1"
+#define CORENAME "NEOVERSEV1"
+#else
+#endif
+
+
+#ifdef FORCE_NEOVERSEN2
+/* Forced NEOVERSEN2 target. L2_ASSOCIATIVE and DTB_DEFAULT_ENTRIES are 8 and 48,
+ * the values the CPU_NEOVERSEN2 config printer and the CMake NEOVERSEN2 branch
+ * emit, so a forced build and an auto-detected build get the same configuration. */
+#define FORCE
+#define ARCHITECTURE "ARM64"
+#define SUBARCHITECTURE "NEOVERSEN2"
+#define SUBDIRNAME "arm64"
+#define ARCHCONFIG "-DNEOVERSEN2 " \
+ "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
+ "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
+ "-DDTB_DEFAULT_ENTRIES=48 -DDTB_SIZE=4096 " \
+ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
+ "-march=armv8.5-a -mtune=neoverse-n2"
+#define LIBNAME "neoversen2"
+#define CORENAME "NEOVERSEN2"
+#else
+#endif
+
#ifdef FORCE_CORTEXA55
#define FORCE
#define ARCHITECTURE "ARM64"
--- /dev/null
+# *AMIN, *MAX, *MIN and their index (I*) variants: C sources from ../arm.
+SAMINKERNEL = ../arm/amin.c
+DAMINKERNEL = ../arm/amin.c
+CAMINKERNEL = ../arm/zamin.c
+ZAMINKERNEL = ../arm/zamin.c
+
+SMAXKERNEL = ../arm/max.c
+DMAXKERNEL = ../arm/max.c
+
+SMINKERNEL = ../arm/min.c
+DMINKERNEL = ../arm/min.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL = ../arm/imax.c
+IDMAXKERNEL = ../arm/imax.c
+
+ISMINKERNEL = ../arm/imin.c
+IDMINKERNEL = ../arm/imin.c
+
+# TRSM kernels: every precision and variant uses the C sources from ../generic.
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+# AMAX, AXPY, ROT, SCAL and GEMV: assembly (.S) sources given without a
+# directory prefix. DAXPY is the only one using a *_thunderx2t99 source.
+SAMAXKERNEL = amax.S
+DAMAXKERNEL = amax.S
+CAMAXKERNEL = zamax.S
+ZAMAXKERNEL = zamax.S
+
+SAXPYKERNEL = axpy.S
+DAXPYKERNEL = daxpy_thunderx2t99.S
+CAXPYKERNEL = zaxpy.S
+ZAXPYKERNEL = zaxpy.S
+
+SROTKERNEL = rot.S
+DROTKERNEL = rot.S
+CROTKERNEL = zrot.S
+ZROTKERNEL = zrot.S
+
+SSCALKERNEL = scal.S
+DSCALKERNEL = scal.S
+CSCALKERNEL = zscal.S
+ZSCALKERNEL = zscal.S
+
+SGEMVNKERNEL = gemv_n.S
+DGEMVNKERNEL = gemv_n.S
+CGEMVNKERNEL = zgemv_n.S
+ZGEMVNKERNEL = zgemv_n.S
+
+SGEMVTKERNEL = gemv_t.S
+DGEMVTKERNEL = gemv_t.S
+CGEMVTKERNEL = zgemv_t.S
+ZGEMVTKERNEL = zgemv_t.S
+
+
+# ASUM, COPY, SWAP, I*AMAX, NRM2 and DOT: *_thunderx2t99 sources; DSDOT uses dot.S.
+SASUMKERNEL = sasum_thunderx2t99.c
+DASUMKERNEL = dasum_thunderx2t99.c
+CASUMKERNEL = casum_thunderx2t99.c
+ZASUMKERNEL = zasum_thunderx2t99.c
+
+SCOPYKERNEL = copy_thunderx2t99.c
+DCOPYKERNEL = copy_thunderx2t99.c
+CCOPYKERNEL = copy_thunderx2t99.c
+ZCOPYKERNEL = copy_thunderx2t99.c
+
+SSWAPKERNEL = swap_thunderx2t99.S
+DSWAPKERNEL = swap_thunderx2t99.S
+CSWAPKERNEL = swap_thunderx2t99.S
+ZSWAPKERNEL = swap_thunderx2t99.S
+
+ISAMAXKERNEL = iamax_thunderx2t99.c
+IDAMAXKERNEL = iamax_thunderx2t99.c
+ICAMAXKERNEL = izamax_thunderx2t99.c
+IZAMAXKERNEL = izamax_thunderx2t99.c
+
+SNRM2KERNEL = scnrm2_thunderx2t99.c
+DNRM2KERNEL = dznrm2_thunderx2t99.c
+CNRM2KERNEL = scnrm2_thunderx2t99.c
+ZNRM2KERNEL = dznrm2_thunderx2t99.c
+
+DDOTKERNEL = dot_thunderx2t99.c
+SDOTKERNEL = dot_thunderx2t99.c
+CDOTKERNEL = zdot_thunderx2t99.c
+ZDOTKERNEL = zdot_thunderx2t99.c
+DSDOTKERNEL = dot.S
+
+DGEMM_BETA = dgemm_beta.S
+SGEMM_BETA = sgemm_beta.S
+
+# SGEMM/STRMM kernel file names are built from SGEMM_UNROLL_M and SGEMM_UNROLL_N.
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+# The I*COPY sources and objects are only set when UNROLL_M differs from UNROLL_N.
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+# ITCOPY: assembly source when UNROLL_M is 16, otherwise the ../generic C file.
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
+# INCOPY: assembly source when UNROLL_M is 4, otherwise the ../generic C file.
+ifeq ($(SGEMM_UNROLL_M), 4)
+SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+endif
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+# OTCOPY/ONCOPY follow the same rule, keyed on UNROLL_N (16 for tcopy, 4 for ncopy).
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
+ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+endif
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# DGEMM/DTRMM kernel file names are built from DGEMM_UNROLL_M and DGEMM_UNROLL_N.
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+
+# The I*COPY sources and objects are only set when UNROLL_M differs from UNROLL_N.
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+
+# Assembly copy sources when UNROLL_M is 8, otherwise the ../generic C files.
+ifeq ($(DGEMM_UNROLL_M), 8)
+DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
+DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
+else
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+endif
+
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+# Assembly copy sources when UNROLL_N is 4, otherwise the ../generic C files.
+ifeq ($(DGEMM_UNROLL_N), 4)
+DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
+DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
+else
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+endif
+
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# CGEMM/CTRMM: assembly kernels named from the unroll factors; all copy
+# routines use the ../generic zgemm_{n,t}copy C files.
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+# The I*COPY sources and objects are only set when UNROLL_M differs from UNROLL_N.
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# ZGEMM/ZTRMM: same layout as the CGEMM entries, keyed on the ZGEMM unroll factors.
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
--- /dev/null
+# *AMIN, *MAX, *MIN and their index (I*) variants: C sources from ../arm.
+SAMINKERNEL = ../arm/amin.c
+DAMINKERNEL = ../arm/amin.c
+CAMINKERNEL = ../arm/zamin.c
+ZAMINKERNEL = ../arm/zamin.c
+
+SMAXKERNEL = ../arm/max.c
+DMAXKERNEL = ../arm/max.c
+
+SMINKERNEL = ../arm/min.c
+DMINKERNEL = ../arm/min.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL = ../arm/imax.c
+IDMAXKERNEL = ../arm/imax.c
+
+ISMINKERNEL = ../arm/imin.c
+IDMINKERNEL = ../arm/imin.c
+
+# TRSM kernels: every precision and variant uses the C sources from ../generic.
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+# AMAX, AXPY, ROT, SCAL and GEMV: assembly (.S) sources given without a
+# directory prefix. DAXPY is the only one using a *_thunderx2t99 source.
+SAMAXKERNEL = amax.S
+DAMAXKERNEL = amax.S
+CAMAXKERNEL = zamax.S
+ZAMAXKERNEL = zamax.S
+
+SAXPYKERNEL = axpy.S
+DAXPYKERNEL = daxpy_thunderx2t99.S
+CAXPYKERNEL = zaxpy.S
+ZAXPYKERNEL = zaxpy.S
+
+SROTKERNEL = rot.S
+DROTKERNEL = rot.S
+CROTKERNEL = zrot.S
+ZROTKERNEL = zrot.S
+
+SSCALKERNEL = scal.S
+DSCALKERNEL = scal.S
+CSCALKERNEL = zscal.S
+ZSCALKERNEL = zscal.S
+
+SGEMVNKERNEL = gemv_n.S
+DGEMVNKERNEL = gemv_n.S
+CGEMVNKERNEL = zgemv_n.S
+ZGEMVNKERNEL = zgemv_n.S
+
+SGEMVTKERNEL = gemv_t.S
+DGEMVTKERNEL = gemv_t.S
+CGEMVTKERNEL = zgemv_t.S
+ZGEMVTKERNEL = zgemv_t.S
+
+
+# ASUM, COPY, SWAP, I*AMAX, NRM2 and DOT: *_thunderx2t99 sources; DSDOT uses dot.S.
+SASUMKERNEL = sasum_thunderx2t99.c
+DASUMKERNEL = dasum_thunderx2t99.c
+CASUMKERNEL = casum_thunderx2t99.c
+ZASUMKERNEL = zasum_thunderx2t99.c
+
+SCOPYKERNEL = copy_thunderx2t99.c
+DCOPYKERNEL = copy_thunderx2t99.c
+CCOPYKERNEL = copy_thunderx2t99.c
+ZCOPYKERNEL = copy_thunderx2t99.c
+
+SSWAPKERNEL = swap_thunderx2t99.S
+DSWAPKERNEL = swap_thunderx2t99.S
+CSWAPKERNEL = swap_thunderx2t99.S
+ZSWAPKERNEL = swap_thunderx2t99.S
+
+ISAMAXKERNEL = iamax_thunderx2t99.c
+IDAMAXKERNEL = iamax_thunderx2t99.c
+ICAMAXKERNEL = izamax_thunderx2t99.c
+IZAMAXKERNEL = izamax_thunderx2t99.c
+
+SNRM2KERNEL = scnrm2_thunderx2t99.c
+DNRM2KERNEL = dznrm2_thunderx2t99.c
+CNRM2KERNEL = scnrm2_thunderx2t99.c
+ZNRM2KERNEL = dznrm2_thunderx2t99.c
+
+DDOTKERNEL = dot_thunderx2t99.c
+SDOTKERNEL = dot_thunderx2t99.c
+CDOTKERNEL = zdot_thunderx2t99.c
+ZDOTKERNEL = zdot_thunderx2t99.c
+DSDOTKERNEL = dot.S
+
+DGEMM_BETA = dgemm_beta.S
+SGEMM_BETA = sgemm_beta.S
+
+# SGEMM/STRMM kernel file names are built from SGEMM_UNROLL_M and SGEMM_UNROLL_N.
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+# The I*COPY sources and objects are only set when UNROLL_M differs from UNROLL_N.
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+# ITCOPY: assembly source when UNROLL_M is 16, otherwise the ../generic C file.
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
+# INCOPY: assembly source when UNROLL_M is 4, otherwise the ../generic C file.
+ifeq ($(SGEMM_UNROLL_M), 4)
+SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
+else
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+endif
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+# OTCOPY/ONCOPY follow the same rule, keyed on UNROLL_N (16 for tcopy, 4 for ncopy).
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
+ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
+else
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+endif
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# DGEMM/DTRMM kernel file names are built from DGEMM_UNROLL_M and DGEMM_UNROLL_N.
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+
+# The I*COPY sources and objects are only set when UNROLL_M differs from UNROLL_N.
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+
+# Assembly copy sources when UNROLL_M is 8, otherwise the ../generic C files.
+ifeq ($(DGEMM_UNROLL_M), 8)
+DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
+DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
+else
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+endif
+
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+# Assembly copy sources when UNROLL_N is 4, otherwise the ../generic C files.
+ifeq ($(DGEMM_UNROLL_N), 4)
+DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
+DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
+else
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+endif
+
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# CGEMM/CTRMM: assembly kernels named from the unroll factors; all copy
+# routines use the ../generic zgemm_{n,t}copy C files.
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+# The I*COPY sources and objects are only set when UNROLL_M differs from UNROLL_N.
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+# ZGEMM/ZTRMM: same layout as the CGEMM entries, keyed on the ZGEMM unroll factors.
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
+#elif defined(NEOVERSEV1) || defined(NEOVERSEN2)
+
+/* Neoverse V1 and Neoverse N2 use the same GEMM unroll factors and P/Q/R defaults. */
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
+
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
#elif defined(ARMV8SVE) || defined(A64FX)
/* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".