From b58d4f31abf55446d4707036df0a0c5c7ef26047 Mon Sep 17 00:00:00 2001 From: Bine Brank Date: Sun, 21 Nov 2021 14:56:27 +0100 Subject: [PATCH] some clean-up & commentary --- kernel/arm64/KERNEL.ARMV8SVE | 2 +- kernel/arm64/dgemm_kernel_sve_v1x8.S | 15 ++++++----- kernel/arm64/dgemm_kernel_sve_v2x8.S | 38 +++++++++++++++++++-------- kernel/arm64/dgemm_ncopy_sve_v1.c | 50 ++++++++++++++++++------------------ kernel/arm64/dgemm_tcopy_sve_v1.c | 48 +++++++++++++++++----------------- kernel/arm64/dtrmm_kernel_sve_v1x8.S | 17 ++++++------ kernel/arm64/trmm_lncopy_sve_v1.c | 9 ++----- kernel/arm64/trmm_ltcopy_sve_v1.c | 9 ++----- kernel/arm64/trmm_uncopy_sve_v1.c | 12 ++------- kernel/arm64/trmm_utcopy_sve_v1.c | 11 +++----- 10 files changed, 104 insertions(+), 107 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 572c96f..dbf11fd 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -143,7 +143,7 @@ endif SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S DGEMMINCOPY = dgemm_ncopy_sve_v1.c diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S index 94682ae..bbbd0fd 100644 --- a/kernel/arm64/dgemm_kernel_sve_v1x8.S +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaZ z2.d -#define A_PRE_SIZE 2560 +#define A_PRE_SIZE 1536 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 @@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA] ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one - //incb pA, all, mul #2 add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] @@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ptrue p0.d // create true predicate mov pB, origPB - +// Loop over N mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ .align 5 .Ldgemm_kernel_L8_BEGIN: @@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Ldgemm_kernel_L8_Mv1_BEGIN: +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ mov counterI, #0 - whilelt p1.d, counterI, origM //SVE instruction + whilelt p1.d, counterI, origM cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 @@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bgt .Ldgemm_kernel_L8_BEGIN /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ .align 5 .Ldgemm_kernel_L4_BEGIN: @@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ .align 5 .Ldgemm_kernel_L2_BEGIN: @@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ .align 5 .Ldgemm_kernel_L1_BEGIN: diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S index 59e4155..023d5ba 100644 --- a/kernel/arm64/dgemm_kernel_sve_v2x8.S +++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S @@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ +/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. +With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ + #define ASSEMBLER #include "common.h" @@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaZ z7.d -#define A_PRE_SIZE 2560 +#define A_PRE_SIZE 1536 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 @@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v00 ALPHA -> pA10_0 //v01 pA10_1 -//v02 -//v03 +//v02 pA20_0 +//v03 pA20_1 //v04 //v05 //v06 @@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v21 must save C5 //v22 must save C6 //v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 /******************************************************************************* * Macro definitions @@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA1] ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one - //incb pA1, all, mul #2 add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] @@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ptrue p0.d // create true predicate mov pB, origPB - +// Loop over N mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ .align 5 .Ldgemm_kernel_L8_BEGIN: @@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.Ldgemm_kernel_L8_Mv2_BEGIN: mov counterI, #0 - cmp origM, vec_lenx2 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN blt .Ldgemm_kernel_L8_Mv1_BEGIN mov counterI, origM +/* As long as at least 2*SVE_LEN rows of M remain, we process them with the V2x8 kernel */ mul temp, vec_len, origK // generate address of pA2 add pA2, pA1, temp, lsl #3 // pA1 = start of A array prfm PLDL1KEEP, [pA2] @@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp counterI, origM beq .Ldgemm_kernel_L8_END -////////////////////////////////// +////////////////////////////////////////// +// Fewer than 2*SVE_LEN rows of M remain. We handle them with the V1x8 kernel. .Ldgemm_kernel_L8_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction @@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bgt .Ldgemm_kernel_L8_BEGIN /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ .align 5 .Ldgemm_kernel_L4_BEGIN: @@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq .Ldgemm_kernel_L4_END ////////////////////////////////// +// Fewer than 2*SVE_LEN rows of M remain. We handle them with the V1x4 kernel. .Ldgemm_kernel_L4_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction @@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ .align 5 .Ldgemm_kernel_L2_BEGIN: @@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ////////////////////////////////// +// Fewer than 2*SVE_LEN rows of M remain. We handle them with the V1x2 kernel. .Ldgemm_kernel_L2_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction @@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ .align 5 .Ldgemm_kernel_L1_BEGIN: @@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ////////////////////////////////// +// Fewer than 2*SVE_LEN rows of M remain. We handle them with the V1x1 kernel.
.Ldgemm_kernel_L1_Mv1_BEGIN: whilelt p1.d, counterI, origM //SVE instruction diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c index 3428121..1f812c7 100644 --- a/kernel/arm64/dgemm_ncopy_sve_v1.c +++ b/kernel/arm64/dgemm_ncopy_sve_v1.c @@ -40,40 +40,40 @@ #include "common.h" #include -// TODO: write in assembly with proper unrolling +// TODO: write in assembly with proper unrolling of inner loop int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; - svint64_t lda_vec = svindex_s64(0LL, lda); - uint64_t sve_size = svcntd(); + svint64_t lda_vec = svindex_s64(0LL, lda); + uint64_t sve_size = svcntd(); - aoffset = a; - boffset = b; + aoffset = a; + boffset = b; - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { - aoffset1 = aoffset; + aoffset1 = aoffset; - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1++; - boffset += active; - } - aoffset += sve_size * lda; + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1++; + boffset += active; + } + aoffset += sve_size * lda; - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); - } while (svptest_any(svptrue_b64(), pg)); + } while (svptest_any(svptrue_b64(), pg)); - return 0; + return 0; } diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c index 33e69bf..cb645a1 100644 --- a/kernel/arm64/dgemm_tcopy_sve_v1.c +++ b/kernel/arm64/dgemm_tcopy_sve_v1.c @@ -40,38 +40,38 @@ #include "common.h" #include -// TODO: write in assembly with proper unrolling +// TODO: write in assembly with proper unrolling of inner loop int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ - BLASLONG j; - IFLOAT *aoffset, *aoffset1, *boffset; + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; - uint64_t sve_size = svcntd(); + uint64_t sve_size = svcntd(); - aoffset = a; - boffset = b; + aoffset = a; + boffset = b; - j = 0; - svbool_t pg = svwhilelt_b64(j, n); - uint64_t active = svcntp_b64(svptrue_b64(), pg); - do { + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { - aoffset1 = aoffset; + aoffset1 = aoffset; - uint64_t i_cnt = m; - while (i_cnt--) { - svfloat64_t a_vec = svld1(pg, (double *)aoffset1); - svst1_f64(pg, (double *) boffset, a_vec); - aoffset1 += lda; - boffset += active; - } - aoffset += sve_size; + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1(pg, (double *)aoffset1); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; - j += svcntd(); - pg = svwhilelt_b64(j, n); - active = svcntp_b64(svptrue_b64(), pg); + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); - } while (svptest_any(svptrue_b64(), pg)); + } while (svptest_any(svptrue_b64(), pg)); - return 0; + return 0; } diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S index 1d4df08..1f8c9b2 
100644 --- a/kernel/arm64/dtrmm_kernel_sve_v1x8.S +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaZ z2.d -#define A_PRE_SIZE 2560 +#define A_PRE_SIZE 1536 #define B_PRE_SIZE 512 #define C_PRE_SIZE 128 @@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNELv1x8_I ld1d z0.d, p1/z, [pA] ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one - //incb pA, all, mul #2 add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 ld1rd z8.d, p0/z, [pB] @@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif mov pB, origPB - +// Loop over N mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 ble .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ .align 5 .Ldtrmm_kernel_L8_BEGIN: @@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Ldtrmm_kernel_L8_Mv1_BEGIN: +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ mov counterI, #0 - whilelt p1.d, counterI, origM //SVE instruction - cntp lanes, p0, p1.d + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension .align 5 .Ldtrmm_kernel_L8_Mv1_20: @@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bgt .Ldtrmm_kernel_L8_BEGIN /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ .align 5 .Ldtrmm_kernel_L4_BEGIN: @@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ .align 5 .Ldtrmm_kernel_L2_BEGIN: @@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif /******************************************************************************/ -/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ .align 5 .Ldtrmm_kernel_L1_BEGIN: diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index e454e28..6c38cb3 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_ln.\n"); int sve_len = svcntd(); svint64_t index = svindex_s64(0LL, lda); @@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X > posY) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); ao ++; @@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - - posY += n_active; js += n_active; pn = svwhilelt_b64(js, n); diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index 86433f2..365be06 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_lt.\n"); - int sve_len = svcntd(); FLOAT *ao; @@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X > posY) { ao ++; b += n_active; X ++; @@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - posY += n_active; js += n_active; diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index 21f392b..502b799 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_un.\n"); - //printf("Using m %ld, n %ld.\n", m, n); - //printf("Using lda %ld.\n", lda); - //printf("Using posX %ld, posY %ld.\n", posX, posY); int sve_len = svcntd(); svint64_t index = svindex_s64(0LL, lda); @@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X < posY) { svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); svst1(pn, b, aj_vec); ao ++; @@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - - posY += n_active; js += n_active; pn = svwhilelt_b64(js, n); diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index 38b88dc..b45cbd7 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON BLASLONG i, js; BLASLONG X; - //printf("Using trmm_ut.\n"); int sve_len = svcntd(); @@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - /* svbool_t pm = svwhilelt_b64(i, m); */ - /* int m_active = svcntp_b64(svptrue_b64(), pm); */ do { - if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl + if (X < posY) { ao ++; b += n_active; X ++; @@ -83,7 +80,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON b += n_active; X ++; i ++; - } else { + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ #ifdef UNIT int temp = 0; for (int j = 0; j < n_active; j++) { @@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } } while (i < m); - //printf("\n"); - - posY += n_active; js += n_active; pn = svwhilelt_b64(js, n); -- 2.7.4
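Reviewer note (supplementary; not part of the applied patch): the new header comment in dgemm_kernel_sve_v2x8.S explains that the 2*SVE_LEN x 8 kernel keeps the 1*SVE_LEN-wide packed-A layout so that the existing dgemm_ncopy_sve_v1.c / dgemm_tcopy_sve_v1.c routines can be reused; the kernel simply walks two consecutive packed panels (pA2 = pA1 + SVE_LEN * K elements) while iterating over K. For readers less familiar with the SVE ACLE intrinsics used in those copy routines, the sketch below restates the vector-length-agnostic ncopy packing pattern from the patch as a standalone C function. It is an illustrative sketch only: the name pack_ncopy_sketch and its signature are hypothetical (the real routine is the CNAME function in dgemm_ncopy_sve_v1.c), and it assumes a column-major double-precision matrix with leading dimension lda and an SVE-capable toolchain (e.g. -march=armv8.2-a+sve).

#include <arm_sve.h>
#include <stdint.h>

/* Hypothetical standalone sketch of the packing pattern in dgemm_ncopy_sve_v1.c.
 * Packs an m x n block of A (column major, leading dimension lda) into b,
 * one panel of svcntd() columns at a time. The tail panel in n is handled by
 * the predicate produced by svwhilelt, so no scalar clean-up loop is needed
 * and the code stays vector-length agnostic. */
static void pack_ncopy_sketch(int64_t m, int64_t n,
                              const double *a, int64_t lda, double *b)
{
    svint64_t lda_vec = svindex_s64(0, lda);   /* gather offsets 0, lda, 2*lda, ... */
    uint64_t sve_size = svcntd();              /* doubles per SVE vector            */

    int64_t j = 0;
    svbool_t pg = svwhilelt_b64(j, n);         /* active columns of this panel      */
    do {
        uint64_t active = svcntp_b64(svptrue_b64(), pg);
        const double *col = a + j * lda;       /* first column of the current panel */
        for (int64_t i = 0; i < m; i++) {
            /* gather element i from each active column, store them contiguously */
            svfloat64_t a_vec = svld1_gather_index(pg, col + i, lda_vec);
            svst1_f64(pg, b, a_vec);
            b += active;
        }
        j += (int64_t)sve_size;
        pg = svwhilelt_b64(j, n);
    } while (svptest_any(svptrue_b64(), pg));
}

Because each full panel occupies svcntd() * K doubles in the packed buffer, the second panel starts exactly SVE_LEN * K elements after the first, which is the offset the v2x8 kernel adds to pA1 to form pA2 before entering its main loop.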