ARM64: Use THUNDERX2T99 Neon Kernels for ARMV8
authorAshwin Sekhar T K <ashwin.sekhar@cavium.com>
Wed, 17 Oct 2018 15:11:27 +0000 (08:11 -0700)
committerAshwin Sekhar T K <ashwin.sekhar@cavium.com>
Wed, 17 Oct 2018 17:44:37 +0000 (10:44 -0700)
Currently the generic ARMV8 target uses C implementations
for many routines. Replace these with the neon implementations
written for THUNDERX2T99 target which are upto 6x faster for
certain routines.

driver/others/parameter.c
interface/swap.c
kernel/arm64/KERNEL.ARMV8
param.h

index e7332c0..0f2364d 100644 (file)
@@ -730,7 +730,7 @@ void blas_set_parameter(void){
 
 #if defined(ARCH_ARM64)
 
-#if defined(VULCAN) || defined(THUNDERX2T99)
+#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8)
 unsigned long dgemm_prefetch_size_a;
 unsigned long dgemm_prefetch_size_b;
 unsigned long dgemm_prefetch_size_c;
@@ -738,7 +738,7 @@ unsigned long dgemm_prefetch_size_c;
 
 void blas_set_parameter(void)
 {
-#if defined(VULCAN) || defined(THUNDERX2T99)
+#if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8)
   dgemm_p = 160;
   dgemm_q = 128;
   dgemm_r = 4096;
index f7642ed..17a9868 100644 (file)
@@ -42,7 +42,7 @@
 #include "functable.h"
 #endif
 
-#if defined(THUNDERX2T99) || defined(VULCAN)
+#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8)
 // Multithreaded swap gives performance benefits in ThunderX2T99
 #else
 // Disable multi-threading as it does not show any performance
index 4c6d6fb..7e7a900 100644 (file)
@@ -1,8 +1,3 @@
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
 SAMINKERNEL  = ../arm/amin.c
 DAMINKERNEL  = ../arm/amin.c
 CAMINKERNEL  = ../arm/zamin.c
@@ -14,11 +9,6 @@ DMAXKERNEL   = ../arm/max.c
 SMINKERNEL   = ../arm/min.c
 DMINKERNEL   = ../arm/min.c
 
-ISAMAXKERNEL = iamax.S
-IDAMAXKERNEL = iamax.S
-ICAMAXKERNEL = izamax.S
-IZAMAXKERNEL = izamax.S
-
 ISAMINKERNEL = ../arm/iamin.c
 IDAMINKERNEL = ../arm/iamin.c
 ICAMINKERNEL = ../arm/izamin.c
@@ -30,33 +20,35 @@ IDMAXKERNEL  = ../arm/imax.c
 ISMINKERNEL  = ../arm/imin.c
 IDMINKERNEL  = ../arm/imin.c
 
-SASUMKERNEL  = asum.S
-DASUMKERNEL  = asum.S
-CASUMKERNEL  = casum.S
-ZASUMKERNEL  = zasum.S
+STRSMKERNEL_LN =  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT =  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN =  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT =  ../generic/trsm_kernel_RT.c
 
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 
-SCOPYKERNEL  = copy.S
-DCOPYKERNEL  = copy.S
-CCOPYKERNEL  = copy.S
-ZCOPYKERNEL  = copy.S
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 
-SDOTKERNEL   = dot.S
-DDOTKERNEL   = dot.S
-CDOTKERNEL   = zdot.S
-ZDOTKERNEL   = zdot.S
-DSDOTKERNEL  = dot.S
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 
-ifneq ($(OS_DARWIN)$(CROSS),11)
-SNRM2KERNEL  = nrm2.S
-DNRM2KERNEL  = nrm2.S
-CNRM2KERNEL  = znrm2.S
-ZNRM2KERNEL  = znrm2.S
-endif
+SAMAXKERNEL  = amax.S
+DAMAXKERNEL  = amax.S
+CAMAXKERNEL  = zamax.S
+ZAMAXKERNEL  = zamax.S
+
+SAXPYKERNEL  = axpy.S
+DAXPYKERNEL  = daxpy_thunderx2t99.S
+CAXPYKERNEL  = zaxpy.S
+ZAXPYKERNEL  = zaxpy.S
 
 SROTKERNEL   = rot.S
 DROTKERNEL   = rot.S
@@ -68,11 +60,6 @@ DSCALKERNEL  = scal.S
 CSCALKERNEL  = zscal.S
 ZSCALKERNEL  = zscal.S
 
-SSWAPKERNEL  = swap.S
-DSWAPKERNEL  = swap.S
-CSWAPKERNEL  = swap.S
-ZSWAPKERNEL  = swap.S
-
 SGEMVNKERNEL = gemv_n.S
 DGEMVNKERNEL = gemv_n.S
 CGEMVNKERNEL = zgemv_n.S
@@ -83,18 +70,137 @@ DGEMVTKERNEL = gemv_t.S
 CGEMVTKERNEL = zgemv_t.S
 ZGEMVTKERNEL = zgemv_t.S
 
-STRMMKERNEL    = ../generic/trmmkernel_4x4.c
+
+SASUMKERNEL    = sasum_thunderx2t99.c
+DASUMKERNEL    = dasum_thunderx2t99.c
+CASUMKERNEL    = casum_thunderx2t99.c
+ZASUMKERNEL    = zasum_thunderx2t99.c
+
+SCOPYKERNEL    = copy_thunderx2t99.c
+DCOPYKERNEL    = copy_thunderx2t99.c
+CCOPYKERNEL    = copy_thunderx2t99.c
+ZCOPYKERNEL    = copy_thunderx2t99.c
+
+SSWAPKERNEL    = swap_thunderx2t99.S
+DSWAPKERNEL    = swap_thunderx2t99.S
+CSWAPKERNEL    = swap_thunderx2t99.S
+ZSWAPKERNEL    = swap_thunderx2t99.S
+
+ISAMAXKERNEL   = iamax_thunderx2t99.c
+IDAMAXKERNEL   = iamax_thunderx2t99.c
+ICAMAXKERNEL   = izamax_thunderx2t99.c
+IZAMAXKERNEL   = izamax_thunderx2t99.c
+
+ifneq ($(OS_DARWIN)$(CROSS),11)
+SNRM2KERNEL    = scnrm2_thunderx2t99.c
+CNRM2KERNEL    = scnrm2_thunderx2t99.c
+#DNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
+#ZNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
+DNRM2KERNEL    = dznrm2_thunderx2t99.c
+ZNRM2KERNEL    = dznrm2_thunderx2t99.c
+endif
+
+DDOTKERNEL     = dot_thunderx2t99.c
+SDOTKERNEL     = dot_thunderx2t99.c
+CDOTKERNEL     = zdot_thunderx2t99.c
+ZDOTKERNEL     = zdot_thunderx2t99.c
+DSDOTKERNEL    = dot.S
+
+ifneq ($(OS_DARWIN)$(CROSS),11)
+
+SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ =  sgemm_incopy.o
+SGEMMITCOPYOBJ =  sgemm_itcopy.o
+endif
+SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+SGEMMONCOPYOBJ =  sgemm_oncopy.o
+SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+
+DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+
+ifeq ($(DGEMM_UNROLL_M), 8)
+DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
+DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
+else
+DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+endif
+
+DGEMMINCOPYOBJ =  dgemm_incopy.o
+DGEMMITCOPYOBJ =  dgemm_itcopy.o
+endif
+
+ifeq ($(DGEMM_UNROLL_N), 4)
+DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
+DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
+else
+DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+endif
+
+DGEMMONCOPYOBJ =  dgemm_oncopy.o
+DGEMMOTCOPYOBJ =  dgemm_otcopy.o
+
+CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ =  cgemm_incopy.o
+CGEMMITCOPYOBJ =  cgemm_itcopy.o
+endif
+CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ =  cgemm_oncopy.o
+CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+
+ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ =  zgemm_incopy.o
+ZGEMMITCOPYOBJ =  zgemm_itcopy.o
+endif
+ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ =  zgemm_oncopy.o
+ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
+
+ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
+DGEMMKERNEL    = dgemm_kernel_8x4_thunderx2t99.S
+endif
+
+ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
+SGEMMKERNEL    =  sgemm_kernel_16x4_thunderx2t99.S
+endif
+
+ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
+CGEMMKERNEL    =  cgemm_kernel_8x4_thunderx2t99.S
+endif
+
+ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
+ZGEMMKERNEL    =  zgemm_kernel_4x4_thunderx2t99.S
+endif
+
+else
+
+STRMMKERNEL    = ../generic/trmmkernel_2x2.c
 DTRMMKERNEL    = ../generic/trmmkernel_2x2.c
 CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 ZTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
 
-ifneq ($(OS_DARWIN)$(CROSS),11)
-SGEMMKERNEL    =  sgemm_kernel_4x4.S
-else
 SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
-endif
-SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
 SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 
@@ -116,26 +222,4 @@ ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
 ZGEMMONCOPYOBJ =  zgemm_oncopy.o
 ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
 
-STRSMKERNEL_LN =  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT =  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN =  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT =  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
-
-
-
-
+endif
diff --git a/param.h b/param.h
index ded9fe0..c7952e1 100644 (file)
--- a/param.h
+++ b/param.h
@@ -2583,6 +2583,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #if defined(ARMV8)
+
+#if defined(OS_DARWIN) && defined(CROSS)
 #define SNUMOPT                2
 #define DNUMOPT                2
 
@@ -2590,13 +2592,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#if defined(OS_DARWIN) && defined(CROSS)
 #define SGEMM_DEFAULT_UNROLL_M  2
-#define SGEMM_DEFAULT_UNROLL N  2
-#else
-#define SGEMM_DEFAULT_UNROLL_M  4
-#define SGEMM_DEFAULT_UNROLL_N  4
-#endif
+#define SGEMM_DEFAULT_UNROLL_N  2
 
 #define DGEMM_DEFAULT_UNROLL_M  2
 #define DGEMM_DEFAULT_UNROLL_N  2
@@ -2622,10 +2619,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
+#define SYMV_P 16
+#else
+
+#define SNUMOPT                2
+#define DNUMOPT                2
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M  16
+#define SGEMM_DEFAULT_UNROLL_N  4
+
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  4
+
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  4
+
+#define ZGEMM_DEFAULT_UNROLL_M  4
+#define ZGEMM_DEFAULT_UNROLL_N  4
+
+#define SGEMM_DEFAULT_P        sgemm_p
+#define DGEMM_DEFAULT_P        dgemm_p
+#define CGEMM_DEFAULT_P cgemm_p
+#define ZGEMM_DEFAULT_P zgemm_p
+
+#define SGEMM_DEFAULT_Q sgemm_q
+#define DGEMM_DEFAULT_Q dgemm_q
+#define CGEMM_DEFAULT_Q cgemm_q
+#define ZGEMM_DEFAULT_Q zgemm_q
+
+#define SGEMM_DEFAULT_R sgemm_r
+#define DGEMM_DEFAULT_R dgemm_r
+#define CGEMM_DEFAULT_R cgemm_r
+#define ZGEMM_DEFAULT_R zgemm_r
 
 #define SYMV_P 16
 #endif
 
+#endif
+
 #if defined(THUNDERX)
 #define SNUMOPT                2
 #define DNUMOPT                2