some clean-up & commentary

author Bine Brank <binebrank@gmail.com>

Sun, 21 Nov 2021 13:56:27 +0000 (14:56 +0100)

committer Bine Brank <binebrank@gmail.com>

Sun, 21 Nov 2021 13:56:27 +0000 (14:56 +0100)
author Bine Brank <binebrank@gmail.com>
Sun, 21 Nov 2021 13:56:27 +0000 (14:56 +0100)
committer Bine Brank <binebrank@gmail.com>
Sun, 21 Nov 2021 13:56:27 +0000 (14:56 +0100)
diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE

index 572c96f..dbf11fd 100644 (file)
--- a/kernel/arm64/KERNEL.ARMV8SVE
+++ b/kernel/arm64/KERNEL.ARMV8SVE
@@ -143,7 +143,7 @@ endif
  SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
  SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
  
-DGEMMKERNEL    =  dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
+DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
  DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
  
  DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S

index 94682ae..bbbd0fd 100644 (file)
--- a/kernel/arm64/dgemm_kernel_sve_v1x8.S
+++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S
@@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  #define alpha0         d10
  #define alphaZ         z2.d
  
-#define A_PRE_SIZE     2560
+#define A_PRE_SIZE     1536
  #define B_PRE_SIZE     512
  #define C_PRE_SIZE     128
  
@@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  .macro KERNELv1x8_I
      ld1d  z0.d, p1/z, [pA] 
      ld1d  z1.d, p1/z, [pA, lanes, lsl #3]   // next one
-    //incb  pA, all, mul #2
         add     pA, pA, lanes, lsl #4   // pA = pA + lanes * 2 * 8
  
      ld1rd  z8.d, p0/z,  [pB]
@@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      ptrue p0.d                  // create true predicate 
  
         mov     pB, origPB
-
+// Loop over N
         mov     counterJ, origN
         asr     counterJ, counterJ, #3          // J = J / 8
         cmp     counterJ, #0
         ble     .Ldgemm_kernel_L4_BEGIN
  
  /******************************************************************************/
+/* Repeat this as long as there are 8 left in N */
  
         .align 5
  .Ldgemm_kernel_L8_BEGIN:
@@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
  .Ldgemm_kernel_L8_Mv1_BEGIN:
  
+/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
      mov counterI, #0
-    whilelt p1.d, counterI, origM               //SVE instruction
+    whilelt p1.d, counterI, origM   
      cntp lanes, p0, p1.d                        // lanes contain number of active SVE lanes in M dimension
  
         .align 5
@@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         bgt     .Ldgemm_kernel_L8_BEGIN
  
  /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */
  
         .align 5
  .Ldgemm_kernel_L4_BEGIN:
@@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         add     origPB, origPB, temp    // B = B + K * 4 * 8
  
  /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */
  
         .align 5
  .Ldgemm_kernel_L2_BEGIN:
@@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8
  
  /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */
  
         .align 5
  .Ldgemm_kernel_L1_BEGIN:
diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S

index 59e4155..023d5ba 100644 (file)
--- a/kernel/arm64/dgemm_kernel_sve_v2x8.S
+++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S
@@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *******************************************************************************/
  
+/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8.
+However, the data layout is the same as for the kernel 1*SVE_LEN x 8.
+This means that we sweep two panels of packed A when iterating in a loop over K.
+With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */
+
  #define ASSEMBLER
  #include "common.h"
  
@@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  #define alpha0         d10
  #define alphaZ         z7.d
  
-#define A_PRE_SIZE     2560
+#define A_PRE_SIZE     1536
  #define B_PRE_SIZE     512
  #define C_PRE_SIZE     128
  
@@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
  //v00 ALPHA -> pA10_0
  //v01 pA10_1
-//v02 
-//v03 
+//v02 pA20_0
+//v03 pA20_1
  //v04 
  //v05 
  //v06 
@@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  //v21 must save C5
  //v22 must save C6
  //v23 must save C7
+//v24 must save C8
+//v25 must save C9
+//v26 must save C10
+//v27 must save C11
+//v28 must save C12
+//v29 must save C13
+//v30 must save C14
+//v31 must save C15
  
  /*******************************************************************************
  * Macro definitions
@@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  .macro KERNELv1x8_I
      ld1d  z0.d, p1/z, [pA1] 
      ld1d  z1.d, p1/z, [pA1, lanes, lsl #3]   // next one
-    //incb  pA1, all, mul #2
         add     pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8
  
      ld1rd  z8.d, p0/z,  [pB]
@@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      ptrue p0.d                  // create true predicate 
  
         mov     pB, origPB
-
+// Loop over N
         mov     counterJ, origN
         asr     counterJ, counterJ, #3          // J = J / 8
         cmp     counterJ, #0
         ble     .Ldgemm_kernel_L4_BEGIN
  
  /******************************************************************************/
+/* Repeat this as long as there are 8 left in N */
  
         .align 5
  .Ldgemm_kernel_L8_BEGIN:
@@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  .Ldgemm_kernel_L8_Mv2_BEGIN:
  
      mov counterI, #0
-    cmp origM, vec_lenx2
+    cmp origM, vec_lenx2        // Check if M < 2*SVE_LEN
      blt .Ldgemm_kernel_L8_Mv1_BEGIN
  
      mov counterI, origM
  
+/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */
      mul temp, vec_len, origK                // generate address of pA2
         add     pA2, pA1, temp, lsl #3                  // pA1 = start of A array
         prfm    PLDL1KEEP, [pA2]
@@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      cmp counterI, origM
      beq .Ldgemm_kernel_L8_END
  
-//////////////////////////////////
+//////////////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x8 kernel.
  .Ldgemm_kernel_L8_Mv1_BEGIN:
  
      whilelt p1.d, counterI, origM               //SVE instruction
@@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         bgt     .Ldgemm_kernel_L8_BEGIN
  
  /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */
  
         .align 5
  .Ldgemm_kernel_L4_BEGIN:
@@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      beq .Ldgemm_kernel_L4_END
  
  //////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x4 kernel.
  .Ldgemm_kernel_L4_Mv1_BEGIN:
  
      whilelt p1.d, counterI, origM               //SVE instruction
@@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         add     origPB, origPB, temp    // B = B + K * 4 * 8
  
  /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */
  
         .align 5
  .Ldgemm_kernel_L2_BEGIN:
@@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
  
  //////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x2 kernel.
  .Ldgemm_kernel_L2_Mv1_BEGIN:
  
      whilelt p1.d, counterI, origM               //SVE instruction
@@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8
  
  /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */
  
         .align 5
  .Ldgemm_kernel_L1_BEGIN:
@@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
  
  //////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x1 kernel.
  .Ldgemm_kernel_L1_Mv1_BEGIN:
  
      whilelt p1.d, counterI, origM               //SVE instruction
diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c

index 3428121..1f812c7 100644 (file)
--- a/kernel/arm64/dgemm_ncopy_sve_v1.c
+++ b/kernel/arm64/dgemm_ncopy_sve_v1.c
@@ -40,40 +40,40 @@
  #include "common.h"
  #include <arm_sve.h>
  
-// TODO: write in assembly with proper unrolling
+// TODO: write in assembly with proper unrolling of inner loop
  int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  
-  BLASLONG j;
-  IFLOAT *aoffset, *aoffset1, *boffset;
+    BLASLONG j;
+    IFLOAT *aoffset, *aoffset1, *boffset;
  
-  svint64_t lda_vec = svindex_s64(0LL, lda);
-  uint64_t sve_size = svcntd();
+    svint64_t lda_vec = svindex_s64(0LL, lda);
+    uint64_t sve_size = svcntd();
  
-  aoffset = a;
-  boffset = b;
+    aoffset = a;
+    boffset = b;
  
-  j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
-  uint64_t active = svcntp_b64(svptrue_b64(), pg);
-  do {
+    j = 0;
+    svbool_t pg = svwhilelt_b64(j, n);
+    uint64_t active = svcntp_b64(svptrue_b64(), pg);
+    do {
  
-      aoffset1 = aoffset;
+        aoffset1 = aoffset;
  
-      uint64_t i_cnt = m;
-      while (i_cnt--) {
-          svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
-          svst1_f64(pg, (double *) boffset, a_vec);
-          aoffset1++;
-          boffset += active;
-      }
-      aoffset += sve_size * lda;
+        uint64_t i_cnt = m;
+        while (i_cnt--) {
+            svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
+            svst1_f64(pg, (double *) boffset, a_vec);
+            aoffset1++;
+            boffset += active;
+        }
+        aoffset += sve_size * lda;
  
-      j += svcntd();
-      pg = svwhilelt_b64(j, n);
-      active = svcntp_b64(svptrue_b64(), pg);
+        j += svcntd();
+        pg = svwhilelt_b64(j, n);
+        active = svcntp_b64(svptrue_b64(), pg);
  
  
-  } while (svptest_any(svptrue_b64(), pg));
+    } while (svptest_any(svptrue_b64(), pg));
  
-  return 0;
+    return 0;
  }
diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c

index 33e69bf..cb645a1 100644 (file)
--- a/kernel/arm64/dgemm_tcopy_sve_v1.c
+++ b/kernel/arm64/dgemm_tcopy_sve_v1.c
@@ -40,38 +40,38 @@
  #include "common.h"
  #include <arm_sve.h>
  
-// TODO: write in assembly with proper unrolling
+// TODO: write in assembly with proper unrolling of inner loop
  int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  
-  BLASLONG j;
-  IFLOAT *aoffset, *aoffset1, *boffset;
+    BLASLONG j;
+    IFLOAT *aoffset, *aoffset1, *boffset;
  
-  uint64_t sve_size = svcntd();
+    uint64_t sve_size = svcntd();
  
-  aoffset = a;
-  boffset = b;
+    aoffset = a;
+    boffset = b;
  
-  j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
-  uint64_t active = svcntp_b64(svptrue_b64(), pg);
-  do {
+    j = 0;
+    svbool_t pg = svwhilelt_b64(j, n);
+    uint64_t active = svcntp_b64(svptrue_b64(), pg);
+    do {
  
-      aoffset1 = aoffset;
+        aoffset1 = aoffset;
  
-      uint64_t i_cnt = m;
-      while (i_cnt--) {
-          svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
-          svst1_f64(pg, (double *) boffset, a_vec);
-          aoffset1 += lda;
-          boffset += active;
-      }
-      aoffset += sve_size;
+        uint64_t i_cnt = m;
+        while (i_cnt--) {
+            svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
+            svst1_f64(pg, (double *) boffset, a_vec);
+            aoffset1 += lda;
+            boffset += active;
+        }
+        aoffset += sve_size;
  
-      j += svcntd();
-      pg = svwhilelt_b64(j, n);
-      active = svcntp_b64(svptrue_b64(), pg);
+        j += svcntd();
+        pg = svwhilelt_b64(j, n);
+        active = svcntp_b64(svptrue_b64(), pg);
  
-  } while (svptest_any(svptrue_b64(), pg));
+    } while (svptest_any(svptrue_b64(), pg));
  
-  return 0;
+    return 0;
  }
diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S

index 1d4df08..1f8c9b2 100644 (file)
--- a/kernel/arm64/dtrmm_kernel_sve_v1x8.S
+++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S
@@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  #define alpha0         d10
  #define alphaZ         z2.d
  
-#define A_PRE_SIZE     2560
+#define A_PRE_SIZE     1536
  #define B_PRE_SIZE     512
  #define C_PRE_SIZE     128
  
@@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  .macro KERNELv1x8_I
      ld1d  z0.d, p1/z, [pA] 
      ld1d  z1.d, p1/z, [pA, lanes, lsl #3]   // next one
-    //incb  pA, all, mul #2
         add     pA, pA, lanes, lsl #4   // pA = pA + lanes * 2 * 8
  
      ld1rd  z8.d, p0/z,  [pB]
@@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  #endif
  
         mov     pB, origPB
-
+// Loop over N
         mov     counterJ, origN
         asr     counterJ, counterJ, #3          // J = J / 8
         cmp     counterJ, #0
         ble     .Ldtrmm_kernel_L4_BEGIN
  
  /******************************************************************************/
+/* Repeat this as long as there are 8 left in N */
  
         .align 5
  .Ldtrmm_kernel_L8_BEGIN:
@@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
  .Ldtrmm_kernel_L8_Mv1_BEGIN:
  
+/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
      mov counterI, #0
-    whilelt p1.d, counterI, origM               //SVE instruction
-    cntp lanes, p0, p1.d
+    whilelt p1.d, counterI, origM      
+    cntp lanes, p0, p1.d                        // lanes contain number of active SVE lanes in M dimension
  
         .align 5
  .Ldtrmm_kernel_L8_Mv1_20:
@@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         bgt     .Ldtrmm_kernel_L8_BEGIN
  
  /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */
  
         .align 5
  .Ldtrmm_kernel_L4_BEGIN:
@@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  #endif
  
  /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */
  
         .align 5
  .Ldtrmm_kernel_L2_BEGIN:
@@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  #endif
  
  /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */
  
         .align 5
  .Ldtrmm_kernel_L1_BEGIN:
diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c

index e454e28..6c38cb3 100644 (file)
--- a/kernel/arm64/trmm_lncopy_sve_v1.c
+++ b/kernel/arm64/trmm_lncopy_sve_v1.c
@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
  
      BLASLONG i, js;
      BLASLONG X;
-    //printf("Using trmm_ln.\n");
  
      int sve_len = svcntd();
      svint64_t index = svindex_s64(0LL, lda);
@@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
          }
  
          i = 0;
-        /* svbool_t pm = svwhilelt_b64(i, m); */
-        /* int m_active = svcntp_b64(svptrue_b64(), pm); */
          do 
          {
-            if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+            if (X > posY) {
                  svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
                  svst1(pn, b, aj_vec);
                  ao ++;
@@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                      X ++;
                      i ++;
                  } else {
+                    /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
  #ifdef UNIT
                      int temp = 0;
                      for (int j = 0; j < n_active; j++) {
@@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                  }
          } while (i < m);
  
-        //printf("\n");
-
-
          posY += n_active;
          js += n_active;
          pn = svwhilelt_b64(js, n);
diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c

index 86433f2..365be06 100644 (file)
--- a/kernel/arm64/trmm_ltcopy_sve_v1.c
+++ b/kernel/arm64/trmm_ltcopy_sve_v1.c
@@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
      BLASLONG i, js;
      BLASLONG X;
  
-    //printf("Using trmm_lt.\n");
-
      int sve_len = svcntd();
  
      FLOAT *ao;
@@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
          }
  
          i = 0;
-        /* svbool_t pm = svwhilelt_b64(i, m); */
-        /* int m_active = svcntp_b64(svptrue_b64(), pm); */
          do 
          {
-            if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+            if (X > posY) {
                  ao ++;
                  b += n_active;
                  X ++;
@@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                      X ++;
                      i ++;
                  } else {
+                    /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
  #ifdef UNIT
                      int temp = 0;
                      for (int j = 0; j < n_active; j++) {
@@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                  }
          } while (i < m);
  
-        //printf("\n");
-
  
          posY += n_active;
          js += n_active;
diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c

index 21f392b..502b799 100644 (file)
--- a/kernel/arm64/trmm_uncopy_sve_v1.c
+++ b/kernel/arm64/trmm_uncopy_sve_v1.c
@@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
  
      BLASLONG i, js;
      BLASLONG X;
-    //printf("Using trmm_un.\n");
-    //printf("Using m %ld, n %ld.\n", m, n);
-    //printf("Using lda %ld.\n", lda);
-    //printf("Using posX %ld, posY %ld.\n", posX, posY);
  
      int sve_len = svcntd();
      svint64_t index = svindex_s64(0LL, lda);
@@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
          }
  
          i = 0;
-        /* svbool_t pm = svwhilelt_b64(i, m); */
-        /* int m_active = svcntp_b64(svptrue_b64(), pm); */
          do 
          {
-            if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+            if (X < posY) {
                  svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
                  svst1(pn, b, aj_vec);
                  ao ++;
@@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                      X ++;
                      i ++;
                  } else {
+                    /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
  #ifdef UNIT
                      int temp = 0;
                      for (int j = 0; j < n_active; j++) {
@@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                  }
          } while (i < m);
  
-        //printf("\n");
-
-
          posY += n_active;
          js += n_active;
          pn = svwhilelt_b64(js, n);
diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c

index 38b88dc..b45cbd7 100644 (file)
--- a/kernel/arm64/trmm_utcopy_sve_v1.c
+++ b/kernel/arm64/trmm_utcopy_sve_v1.c
@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
  
      BLASLONG i, js;
      BLASLONG X;
-    //printf("Using trmm_ut.\n");
  
      int sve_len = svcntd();
  
@@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
          }
  
          i = 0;
-        /* svbool_t pm = svwhilelt_b64(i, m); */
-        /* int m_active = svcntp_b64(svptrue_b64(), pm); */
          do 
          {
-            if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+            if (X < posY) {
                  ao ++;
                  b += n_active;
                  X ++;
@@ -83,7 +80,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                      b += n_active;
                      X ++;
                      i ++;
-                } else {
+                } else { 
+                    /* I did not find a way to unroll this while preserving vector-length-agnostic code. */
  #ifdef UNIT
                      int temp = 0;
                      for (int j = 0; j < n_active; j++) {
@@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                  }
          } while (i < m);
  
-        //printf("\n");
-
-
          posY += n_active;
          js += n_active;
          pn = svwhilelt_b64(js, n);
author	Bine Brank <binebrank@gmail.com>
	Sun, 21 Nov 2021 13:56:27 +0000 (14:56 +0100)
committer	Bine Brank <binebrank@gmail.com>
	Sun, 21 Nov 2021 13:56:27 +0000 (14:56 +0100)
kernel/arm64/KERNEL.ARMV8SVE		patch \| blob \| history
kernel/arm64/dgemm_kernel_sve_v1x8.S		patch \| blob \| history
kernel/arm64/dgemm_kernel_sve_v2x8.S		patch \| blob \| history
kernel/arm64/dgemm_ncopy_sve_v1.c		patch \| blob \| history
kernel/arm64/dgemm_tcopy_sve_v1.c		patch \| blob \| history
kernel/arm64/dtrmm_kernel_sve_v1x8.S		patch \| blob \| history
kernel/arm64/trmm_lncopy_sve_v1.c		patch \| blob \| history
kernel/arm64/trmm_ltcopy_sve_v1.c		patch \| blob \| history
kernel/arm64/trmm_uncopy_sve_v1.c		patch \| blob \| history
kernel/arm64/trmm_utcopy_sve_v1.c		patch \| blob \| history