From 21f46a1cf2cefbdedf89878e3a6324578d0fe8ca Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Wed, 17 Oct 2018 08:11:27 -0700 Subject: [PATCH] ARM64: Use THUNDERX2T99 Neon Kernels for ARMV8 Currently the generic ARMV8 target uses C implementations for many routines. Replace these with the neon implementations written for THUNDERX2T99 target which are upto 6x faster for certain routines. --- driver/others/parameter.c | 4 +- interface/swap.c | 2 +- kernel/arm64/KERNEL.ARMV8 | 220 ++++++++++++++++++++++++++++++++-------------- param.h | 47 ++++++++-- 4 files changed, 196 insertions(+), 77 deletions(-) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index e7332c0..0f2364d 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -730,7 +730,7 @@ void blas_set_parameter(void){ #if defined(ARCH_ARM64) -#if defined(VULCAN) || defined(THUNDERX2T99) +#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) unsigned long dgemm_prefetch_size_a; unsigned long dgemm_prefetch_size_b; unsigned long dgemm_prefetch_size_c; @@ -738,7 +738,7 @@ unsigned long dgemm_prefetch_size_c; void blas_set_parameter(void) { -#if defined(VULCAN) || defined(THUNDERX2T99) +#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8) dgemm_p = 160; dgemm_q = 128; dgemm_r = 4096; diff --git a/interface/swap.c b/interface/swap.c index f7642ed..17a9868 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 4c6d6fb..7e7a900 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -1,8 +1,3 @@ -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - SAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c @@ -14,11 +9,6 @@ DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c DMINKERNEL = ../arm/min.c -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - ISAMINKERNEL = ../arm/iamin.c IDAMINKERNEL = ../arm/iamin.c ICAMINKERNEL = ../arm/izamin.c @@ -30,33 +20,35 @@ IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c IDMINKERNEL = ../arm/imin.c -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -SDOTKERNEL = dot.S -DDOTKERNEL = dot.S -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -ifneq ($(OS_DARWIN)$(CROSS),11) -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S -endif +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S SROTKERNEL = rot.S DROTKERNEL = rot.S @@ -68,11 +60,6 @@ DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - SGEMVNKERNEL = gemv_n.S DGEMVNKERNEL = gemv_n.S CGEMVNKERNEL = zgemv_n.S @@ -83,18 +70,137 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S -STRMMKERNEL = ../generic/trmmkernel_4x4.c + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +ifneq ($(OS_DARWIN)$(CROSS),11) +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c +endif + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifneq ($(OS_DARWIN)$(CROSS),11) + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif + +else + +STRMMKERNEL = ../generic/trmmkernel_2x2.c DTRMMKERNEL = ../generic/trmmkernel_2x2.c CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ifneq ($(OS_DARWIN)$(CROSS),11) -SGEMMKERNEL = sgemm_kernel_4x4.S -else SGEMMKERNEL = ../generic/gemmkernel_2x2.c -endif -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o @@ -116,26 +222,4 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - +endif diff --git a/param.h b/param.h index ded9fe0..c7952e1 100644 --- a/param.h +++ b/param.h @@ -2583,6 +2583,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(ARMV8) + +#if defined(OS_DARWIN) && defined(CROSS) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2590,13 +2592,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#if defined(OS_DARWIN) && defined(CROSS) #define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL N 2 -#else -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 -#endif +#define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_N 2 @@ -2622,10 +2619,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#define SYMV_P 16 +#else + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p + +#define SGEMM_DEFAULT_Q sgemm_q +#define DGEMM_DEFAULT_Q dgemm_q +#define CGEMM_DEFAULT_Q cgemm_q +#define ZGEMM_DEFAULT_Q zgemm_q + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r #define SYMV_P 16 #endif +#endif + #if defined(THUNDERX) #define SNUMOPT 2 #define DNUMOPT 2 -- 2.7.4