fix UNROLL_MN and add to targets for SVE
authorBine Brank <binebrank@gmail.com>
Sat, 11 Dec 2021 15:37:23 +0000 (16:37 +0100)
committerBine Brank <binebrank@gmail.com>
Sat, 11 Dec 2021 15:37:23 +0000 (16:37 +0100)
kernel/arm64/KERNEL.A64FX
kernel/arm64/KERNEL.ARMV8SVE
param.h

index ee66fea..80be4dd 100644 (file)
@@ -114,8 +114,8 @@ DSDOTKERNEL    = dot.S
 DGEMM_BETA     = dgemm_beta.S
 SGEMM_BETA     = sgemm_beta.S
 
-SGEMMKERNEL    =  sgemm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_8x$(SGEMM_UNROLL_N).S
+SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
+STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
 
 SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
 SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
@@ -127,6 +127,11 @@ SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
+STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
+STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
+STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
+
 SSYMMUCOPY_M    =  symm_ucopy_sve.c
 SSYMMLCOPY_M    =  symm_lcopy_sve.c
 
index 1f605d1..0364a92 100644 (file)
@@ -114,35 +114,27 @@ DSDOTKERNEL    = dot.S
 DGEMM_BETA     = dgemm_beta.S
 SGEMM_BETA     = sgemm_beta.S
 
-SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
+SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
+STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
+
+SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
+SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
+SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
+SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
+
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ifeq ($(SGEMM_UNROLL_N), 16)
-SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
-endif
-ifeq ($(SGEMM_UNROLL_N), 4)
-SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
-endif
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
+STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
+STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
+STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
+
+SSYMMUCOPY_M    =  symm_ucopy_sve.c
+SSYMMLCOPY_M    =  symm_lcopy_sve.c
+
 DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
 DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
 
diff --git a/param.h b/param.h
index e9419bd..f7b8eb0 100644 (file)
--- a/param.h
+++ b/param.h
@@ -3296,14 +3296,22 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(ARMV8SVE) || defined(A64FX)
 
+/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
+Until then, just keep it different from SGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
 #define SGEMM_DEFAULT_UNROLL_M  4
 #define SGEMM_DEFAULT_UNROLL_N  8
+/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N).
+ * Since we don't define SGEMM_UNROLL_M correctly, we have to set this macro manually.
+ * If SVE size is ever more than 1024, this should be increased as well. */
+#define SGEMM_DEFAULT_UNROLL_MN  32
 
 /* When all BLAS3 routines are implemeted with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl".
 Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */
 #define DGEMM_DEFAULT_UNROLL_M  2 
 #define DGEMM_DEFAULT_UNROLL_N  8
 
+#define DGEMM_DEFAULT_UNROLL_MN  32
+
 #define CGEMM_DEFAULT_UNROLL_M  8
 #define CGEMM_DEFAULT_UNROLL_N  4