-SAMAXKERNEL = amax.S
-DAMAXKERNEL = amax.S
-CAMAXKERNEL = zamax.S
-ZAMAXKERNEL = zamax.S
-
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
-ISAMAXKERNEL = iamax.S
-IDAMAXKERNEL = iamax.S
-ICAMAXKERNEL = izamax.S
-IZAMAXKERNEL = izamax.S
-
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
-SASUMKERNEL = asum.S
-DASUMKERNEL = asum.S
-CASUMKERNEL = casum.S
-ZASUMKERNEL = zasum.S
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-SAXPYKERNEL = axpy.S
-DAXPYKERNEL = axpy.S
-CAXPYKERNEL = zaxpy.S
-ZAXPYKERNEL = zaxpy.S
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-SCOPYKERNEL = copy.S
-DCOPYKERNEL = copy.S
-CCOPYKERNEL = copy.S
-ZCOPYKERNEL = copy.S
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-SDOTKERNEL = dot.S
-DDOTKERNEL = dot.S
-CDOTKERNEL = zdot.S
-ZDOTKERNEL = zdot.S
-DSDOTKERNEL = dot.S
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-ifneq ($(OS_DARWIN)$(CROSS),11)
-SNRM2KERNEL = nrm2.S
-DNRM2KERNEL = nrm2.S
-CNRM2KERNEL = znrm2.S
-ZNRM2KERNEL = znrm2.S
-endif
+SAMAXKERNEL = amax.S
+DAMAXKERNEL = amax.S
+CAMAXKERNEL = zamax.S
+ZAMAXKERNEL = zamax.S
+
+SAXPYKERNEL = axpy.S
+DAXPYKERNEL = daxpy_thunderx2t99.S
+CAXPYKERNEL = zaxpy.S
+ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
-SSWAPKERNEL = swap.S
-DSWAPKERNEL = swap.S
-CSWAPKERNEL = swap.S
-ZSWAPKERNEL = swap.S
-
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
-STRMMKERNEL = ../generic/trmmkernel_4x4.c
+
+SASUMKERNEL = sasum_thunderx2t99.c
+DASUMKERNEL = dasum_thunderx2t99.c
+CASUMKERNEL = casum_thunderx2t99.c
+ZASUMKERNEL = zasum_thunderx2t99.c
+
+SCOPYKERNEL = copy_thunderx2t99.c
+DCOPYKERNEL = copy_thunderx2t99.c
+CCOPYKERNEL = copy_thunderx2t99.c
+ZCOPYKERNEL = copy_thunderx2t99.c
+
+SSWAPKERNEL = swap_thunderx2t99.S
+DSWAPKERNEL = swap_thunderx2t99.S
+CSWAPKERNEL = swap_thunderx2t99.S
+ZSWAPKERNEL = swap_thunderx2t99.S
+
+ISAMAXKERNEL = iamax_thunderx2t99.c
+IDAMAXKERNEL = iamax_thunderx2t99.c
+ICAMAXKERNEL = izamax_thunderx2t99.c
+IZAMAXKERNEL = izamax_thunderx2t99.c
+
+ifneq ($(OS_DARWIN)$(CROSS),11)
+SNRM2KERNEL = scnrm2_thunderx2t99.c
+CNRM2KERNEL = scnrm2_thunderx2t99.c
+#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
+#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
+DNRM2KERNEL = dznrm2_thunderx2t99.c
+ZNRM2KERNEL = dznrm2_thunderx2t99.c
+endif
+
+DDOTKERNEL = dot_thunderx2t99.c
+SDOTKERNEL = dot_thunderx2t99.c
+CDOTKERNEL = zdot_thunderx2t99.c
+ZDOTKERNEL = zdot_thunderx2t99.c
+DSDOTKERNEL = dot.S
+
+ifneq ($(OS_DARWIN)$(CROSS),11)
+
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ = sgemm_incopy.o
+SGEMMITCOPYOBJ = sgemm_itcopy.o
+endif
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
+
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+
+ifeq ($(DGEMM_UNROLL_M), 8)
+DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
+DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
+else
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+endif
+
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+endif
+
+ifeq ($(DGEMM_UNROLL_N), 4)
+DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
+DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
+else
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+endif
+
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
+endif
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy.o
+ZGEMMITCOPYOBJ = zgemm_itcopy.o
+endif
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
+
+ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
+DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
+endif
+
+ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
+SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
+endif
+
+ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
+CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
+endif
+
+ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
+ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
+endif
+
+else
+
+STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
-ifneq ($(OS_DARWIN)$(CROSS),11)
-SGEMMKERNEL = sgemm_kernel_4x4.S
-else
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
-endif
-SGEMMONCOPY = ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMONCOPY = ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
-STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-
-
-
+endif
#endif
#if defined(ARMV8)
+
+#if defined(OS_DARWIN) && defined(CROSS)
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
-#if defined(OS_DARWIN) && defined(CROSS)
#define SGEMM_DEFAULT_UNROLL_M 2
-#define SGEMM_DEFAULT_UNROLL N 2
-#else
-#define SGEMM_DEFAULT_UNROLL_M 4
-#define SGEMM_DEFAULT_UNROLL_N 4
-#endif
+#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
+#define SYMV_P 16
+#else
+
+#define SNUMOPT 2
+#define DNUMOPT 2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 4
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+
+#define SGEMM_DEFAULT_P sgemm_p
+#define DGEMM_DEFAULT_P dgemm_p
+#define CGEMM_DEFAULT_P cgemm_p
+#define ZGEMM_DEFAULT_P zgemm_p
+
+#define SGEMM_DEFAULT_Q sgemm_q
+#define DGEMM_DEFAULT_Q dgemm_q
+#define CGEMM_DEFAULT_Q cgemm_q
+#define ZGEMM_DEFAULT_Q zgemm_q
+
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_R dgemm_r
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
#define SYMV_P 16
#endif
+#endif
+
#if defined(THUNDERX)
#define SNUMOPT 2
#define DNUMOPT 2