From: Ashwin Sekhar T K Date: Tue, 24 Oct 2017 10:47:11 +0000 (+0000) Subject: ARM64: Convert all labels to local labels X-Git-Tag: v0.3.0~74^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a0128aa489720ac2fd883dbeebfecffd4812ff99;p=platform%2Fupstream%2Fopenblas.git ARM64: Convert all labels to local labels While debugging/profiling applications using perf or other tools, the kernels appear scattered in the profile reports. This is because the labels within the kernels are not local and each label is shown as a separate function. To avoid this, all the labels within the kernels are changed to local labels. --- diff --git a/kernel/arm64/amax.S b/kernel/arm64/amax.S index c02321a..f535ddf 100644 --- a/kernel/arm64/amax.S +++ b/kernel/arm64/amax.S @@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble amax_kernel_zero + ble .Lamax_kernel_zero cmp INC_X, xzr - ble amax_kernel_zero + ble .Lamax_kernel_zero cmp INC_X, #1 - bne amax_kernel_S_BEGIN + bne .Lamax_kernel_S_BEGIN -amax_kernel_F_BEGIN: +.Lamax_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq amax_kernel_F1_INIT + beq .Lamax_kernel_F1_INIT INIT_F4 subs I, I, #1 - beq amax_kernel_F1 + beq .Lamax_kernel_F1 -amax_kernel_F4: +.Lamax_kernel_F4: KERNEL_F4 subs I, I, #1 - bne amax_kernel_F4 + bne .Lamax_kernel_F4 -amax_kernel_F1: +.Lamax_kernel_F1: ands I, N, #3 - ble amax_kernel_L999 + ble .Lamax_kernel_L999 -amax_kernel_F10: +.Lamax_kernel_F10: KERNEL_F1 subs I, I, #1 - bne amax_kernel_F10 + bne .Lamax_kernel_F10 ret -amax_kernel_F1_INIT: +.Lamax_kernel_F1_INIT: INIT_F1 subs N, N, #1 - b amax_kernel_F1 + b .Lamax_kernel_F1 -amax_kernel_S_BEGIN: +.Lamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble amax_kernel_L999 + ble .Lamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble amax_kernel_S1 + ble .Lamax_kernel_S1 -amax_kernel_S4: +.Lamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -223,25 +223,25 @@ amax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne amax_kernel_S4 + bne .Lamax_kernel_S4 -amax_kernel_S1: +.Lamax_kernel_S1: ands I, N, #3 - ble amax_kernel_L999 + ble .Lamax_kernel_L999 -amax_kernel_S10: +.Lamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne amax_kernel_S10 + bne .Lamax_kernel_S10 -amax_kernel_L999: +.Lamax_kernel_L999: ret -amax_kernel_zero: +.Lamax_kernel_zero: fmov MAXF, REG0 ret diff --git a/kernel/arm64/asum.S b/kernel/arm64/asum.S index bee8927..e88eb07 100644 --- a/kernel/arm64/asum.S +++ b/kernel/arm64/asum.S @@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif cmp N, xzr - ble asum_kernel_L999 + ble .Lasum_kernel_L999 cmp INC_X, xzr - ble asum_kernel_L999 + ble .Lasum_kernel_L999 cmp INC_X, #1 - bne asum_kernel_S_BEGIN + bne .Lasum_kernel_S_BEGIN -asum_kernel_F_BEGIN: +.Lasum_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq asum_kernel_F1 + beq .Lasum_kernel_F1 -asum_kernel_F8: +.Lasum_kernel_F8: KERNEL_F8 subs I, I, #1 - bne asum_kernel_F8 + bne .Lasum_kernel_F8 KERNEL_F8_FINALIZE -asum_kernel_F1: +.Lasum_kernel_F1: ands I, N, #7 - ble asum_kernel_L999 + ble .Lasum_kernel_L999 -asum_kernel_F10: +.Lasum_kernel_F10: KERNEL_F1 subs I, I, #1 - bne asum_kernel_F10 + bne .Lasum_kernel_F10 -asum_kernel_L999: +.Lasum_kernel_L999: ret -asum_kernel_S_BEGIN: +.Lasum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble asum_kernel_S1 + ble .Lasum_kernel_S1 -asum_kernel_S4: +.Lasum_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -175,19 +175,19 @@ asum_kernel_S4: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S4 + bne .Lasum_kernel_S4 -asum_kernel_S1: +.Lasum_kernel_S1: ands I, N, #3 - ble asum_kernel_L999 + ble .Lasum_kernel_L999 -asum_kernel_S10: +.Lasum_kernel_S10: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S10 + bne .Lasum_kernel_S10 ret diff --git a/kernel/arm64/axpy.S b/kernel/arm64/axpy.S index 554902c..8094351 100644 --- a/kernel/arm64/axpy.S +++ b/kernel/arm64/axpy.S @@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble axpy_kernel_L999 + ble .Laxpy_kernel_L999 fcmp DA, #0.0 - beq axpy_kernel_L999 + beq .Laxpy_kernel_L999 cmp INC_X, #1 - bne axpy_kernel_S_BEGIN + bne .Laxpy_kernel_S_BEGIN cmp INC_Y, #1 - bne axpy_kernel_S_BEGIN + bne .Laxpy_kernel_S_BEGIN -axpy_kernel_F_BEGIN: +.Laxpy_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq axpy_kernel_F1 + beq .Laxpy_kernel_F1 -axpy_kernel_F8: +.Laxpy_kernel_F8: KERNEL_F8 subs I, I, #1 - bne axpy_kernel_F8 + bne .Laxpy_kernel_F8 -axpy_kernel_F1: +.Laxpy_kernel_F1: ands I, N, #7 - ble axpy_kernel_L999 + ble .Laxpy_kernel_L999 -axpy_kernel_F10: +.Laxpy_kernel_F10: KERNEL_F1 subs I, I, #1 - bne axpy_kernel_F10 + bne .Laxpy_kernel_F10 mov w0, wzr ret -axpy_kernel_S_BEGIN: +.Laxpy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble axpy_kernel_S1 + ble .Laxpy_kernel_S1 -axpy_kernel_S4: +.Laxpy_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -189,21 +189,21 @@ axpy_kernel_S4: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S4 + bne .Laxpy_kernel_S4 -axpy_kernel_S1: +.Laxpy_kernel_S1: ands I, N, #3 - ble axpy_kernel_L999 + ble .Laxpy_kernel_L999 -axpy_kernel_S10: +.Laxpy_kernel_S10: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S10 + bne .Laxpy_kernel_S10 -axpy_kernel_L999: +.Laxpy_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/casum.S b/kernel/arm64/casum.S index 8f09eec..7c82827 100644 --- a/kernel/arm64/casum.S +++ b/kernel/arm64/casum.S @@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmov s1, SUMF cmp N, xzr - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 cmp INC_X, xzr - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 cmp INC_X, #1 - bne asum_kernel_S_BEGIN + bne .Lcasum_kernel_S_BEGIN -asum_kernel_F_BEGIN: +.Lcasum_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq asum_kernel_F1 + beq .Lcasum_kernel_F1 -asum_kernel_F8: +.Lcasum_kernel_F8: KERNEL_F8 subs I, I, #1 - bne asum_kernel_F8 + bne .Lcasum_kernel_F8 KERNEL_F8_FINALIZE -asum_kernel_F1: +.Lcasum_kernel_F1: ands I, N, #7 - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 -asum_kernel_F10: +.Lcasum_kernel_F10: KERNEL_F1 subs I, I, #1 - bne asum_kernel_F10 + bne .Lcasum_kernel_F10 -asum_kernel_L999: +.Lcasum_kernel_L999: ret -asum_kernel_S_BEGIN: +.Lcasum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble asum_kernel_S1 + ble .Lcasum_kernel_S1 -asum_kernel_S4: +.Lcasum_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -151,19 +151,19 @@ asum_kernel_S4: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S4 + bne .Lcasum_kernel_S4 -asum_kernel_S1: +.Lcasum_kernel_S1: ands I, N, #3 - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 -asum_kernel_S10: +.Lcasum_kernel_S10: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S10 + bne .Lcasum_kernel_S10 ret diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S index 7f2ddea..bbf0c75 100644 --- a/kernel/arm64/cgemm_kernel_4x4.S +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble cgemm_kernel_L2_BEGIN + ble .Lcgemm_kernel_L2_BEGIN /******************************************************************************/ -cgemm_kernel_L4_BEGIN: +.Lcgemm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array add ppA, temp, pA -cgemm_kernel_L4_M8_BEGIN: +.Lcgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L4_M4_BEGIN + ble .Lcgemm_kernel_L4_M4_BEGIN -cgemm_kernel_L4_M8_20: +.Lcgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt cgemm_kernel_L4_M8_32 + blt .Lcgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 // subtract 2 - ble cgemm_kernel_L4_M8_22a + ble .Lcgemm_kernel_L4_M8_22a .align 5 -cgemm_kernel_L4_M8_22: +.Lcgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M8_22 + bgt .Lcgemm_kernel_L4_M8_22 -cgemm_kernel_L4_M8_22a: +.Lcgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_32: +.Lcgemm_kernel_L4_M8_32: tst counterL, #1 - ble cgemm_kernel_L4_M8_40 + ble .Lcgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_40: +.Lcgemm_kernel_L4_M8_40: INIT8x4 -cgemm_kernel_L4_M8_44: +.Lcgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble cgemm_kernel_L4_M8_100 + ble .Lcgemm_kernel_L4_M8_100 -cgemm_kernel_L4_M8_46: +.Lcgemm_kernel_L4_M8_46: KERNEL8x4_SUB -cgemm_kernel_L4_M8_100: +.Lcgemm_kernel_L4_M8_100: SAVE8x4 -cgemm_kernel_L4_M8_END: +.Lcgemm_kernel_L4_M8_END: lsl temp, origK, #5 // k * 4 * 8 add pA, pA, temp add ppA, ppA, temp subs counterI, counterI, #1 - bne cgemm_kernel_L4_M8_20 + bne .Lcgemm_kernel_L4_M8_20 -cgemm_kernel_L4_M4_BEGIN: +.Lcgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #4 - ble cgemm_kernel_L4_M2_BEGIN + ble .Lcgemm_kernel_L4_M2_BEGIN -cgemm_kernel_L4_M4_20: +.Lcgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L4_M4_40 + ble .Lcgemm_kernel_L4_M4_40 -cgemm_kernel_L4_M4_22: +.Lcgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_22 + bgt .Lcgemm_kernel_L4_M4_22 -cgemm_kernel_L4_M4_40: +.Lcgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M4_100 + ble .Lcgemm_kernel_L4_M4_100 -cgemm_kernel_L4_M4_42: +.Lcgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_42 + bgt .Lcgemm_kernel_L4_M4_42 -cgemm_kernel_L4_M4_100: +.Lcgemm_kernel_L4_M4_100: SAVE4x4 -cgemm_kernel_L4_M4_END: +.Lcgemm_kernel_L4_M4_END: -cgemm_kernel_L4_M2_BEGIN: +.Lcgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L4_M1_BEGIN + ble .Lcgemm_kernel_L4_M1_BEGIN -cgemm_kernel_L4_M2_20: +.Lcgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M2_40 + ble .Lcgemm_kernel_L4_M2_40 -cgemm_kernel_L4_M2_22: +.Lcgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_22 + bgt .Lcgemm_kernel_L4_M2_22 -cgemm_kernel_L4_M2_40: +.Lcgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M2_100 + ble .Lcgemm_kernel_L4_M2_100 -cgemm_kernel_L4_M2_42: +.Lcgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_42 + bgt .Lcgemm_kernel_L4_M2_42 -cgemm_kernel_L4_M2_100: +.Lcgemm_kernel_L4_M2_100: SAVE2x4 -cgemm_kernel_L4_M2_END: +.Lcgemm_kernel_L4_M2_END: -cgemm_kernel_L4_M1_BEGIN: +.Lcgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble 
cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END -cgemm_kernel_L4_M1_20: +.Lcgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M1_40 + ble .Lcgemm_kernel_L4_M1_40 -cgemm_kernel_L4_M1_22: +.Lcgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_22 + bgt .Lcgemm_kernel_L4_M1_22 -cgemm_kernel_L4_M1_40: +.Lcgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M1_100 + ble .Lcgemm_kernel_L4_M1_100 -cgemm_kernel_L4_M1_42: +.Lcgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_42 + bgt .Lcgemm_kernel_L4_M1_42 -cgemm_kernel_L4_M1_100: +.Lcgemm_kernel_L4_M1_100: SAVE1x4 -cgemm_kernel_L4_END: +.Lcgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt cgemm_kernel_L4_BEGIN + bgt .Lcgemm_kernel_L4_BEGIN /******************************************************************************/ -cgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble cgemm_kernel_L999 // error, N was less than 4? + ble .Lcgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble cgemm_kernel_L1_BEGIN + ble .Lcgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction -cgemm_kernel_L2_M4_BEGIN: +.Lcgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble cgemm_kernel_L2_M2_BEGIN + ble .Lcgemm_kernel_L2_M2_BEGIN -cgemm_kernel_L2_M4_20: +.Lcgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M4_40 + ble .Lcgemm_kernel_L2_M4_40 .align 5 -cgemm_kernel_L2_M4_22: +.Lcgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_22 + bgt .Lcgemm_kernel_L2_M4_22 -cgemm_kernel_L2_M4_40: +.Lcgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M4_100 + ble .Lcgemm_kernel_L2_M4_100 -cgemm_kernel_L2_M4_42: +.Lcgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_42 + bgt .Lcgemm_kernel_L2_M4_42 -cgemm_kernel_L2_M4_100: +.Lcgemm_kernel_L2_M4_100: SAVE4x2 -cgemm_kernel_L2_M4_END: +.Lcgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L2_M4_20 + bgt .Lcgemm_kernel_L2_M4_20 -cgemm_kernel_L2_M2_BEGIN: +.Lcgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L2_M1_BEGIN + ble .Lcgemm_kernel_L2_M1_BEGIN -cgemm_kernel_L2_M2_20: +.Lcgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M2_40 + ble .Lcgemm_kernel_L2_M2_40 -cgemm_kernel_L2_M2_22: +.Lcgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_22 + bgt .Lcgemm_kernel_L2_M2_22 -cgemm_kernel_L2_M2_40: +.Lcgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 
- ble cgemm_kernel_L2_M2_100 + ble .Lcgemm_kernel_L2_M2_100 -cgemm_kernel_L2_M2_42: +.Lcgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_42 + bgt .Lcgemm_kernel_L2_M2_42 -cgemm_kernel_L2_M2_100: +.Lcgemm_kernel_L2_M2_100: SAVE2x2 -cgemm_kernel_L2_M2_END: +.Lcgemm_kernel_L2_M2_END: -cgemm_kernel_L2_M1_BEGIN: +.Lcgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END -cgemm_kernel_L2_M1_20: +.Lcgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L2_M1_40 + ble .Lcgemm_kernel_L2_M1_40 -cgemm_kernel_L2_M1_22: +.Lcgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_22 + bgt .Lcgemm_kernel_L2_M1_22 -cgemm_kernel_L2_M1_40: +.Lcgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M1_100 + ble .Lcgemm_kernel_L2_M1_100 -cgemm_kernel_L2_M1_42: +.Lcgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_42 + bgt .Lcgemm_kernel_L2_M1_42 -cgemm_kernel_L2_M1_100: +.Lcgemm_kernel_L2_M1_100: SAVE1x2 -cgemm_kernel_L2_END: +.Lcgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -cgemm_kernel_L1_BEGIN: +.Lcgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble cgemm_kernel_L999 // done + ble .Lcgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN: -cgemm_kernel_L1_M4_BEGIN: +.Lcgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble cgemm_kernel_L1_M2_BEGIN + ble .Lcgemm_kernel_L1_M2_BEGIN -cgemm_kernel_L1_M4_20: +.Lcgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M4_40 + ble .Lcgemm_kernel_L1_M4_40 .align 5 -cgemm_kernel_L1_M4_22: +.Lcgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_22 + bgt .Lcgemm_kernel_L1_M4_22 -cgemm_kernel_L1_M4_40: +.Lcgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M4_100 + ble .Lcgemm_kernel_L1_M4_100 -cgemm_kernel_L1_M4_42: +.Lcgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_42 + bgt .Lcgemm_kernel_L1_M4_42 -cgemm_kernel_L1_M4_100: +.Lcgemm_kernel_L1_M4_100: SAVE4x1 -cgemm_kernel_L1_M4_END: +.Lcgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L1_M4_20 + bgt .Lcgemm_kernel_L1_M4_20 -cgemm_kernel_L1_M2_BEGIN: +.Lcgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L1_M1_BEGIN + ble .Lcgemm_kernel_L1_M1_BEGIN -cgemm_kernel_L1_M2_20: +.Lcgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M2_40 + ble .Lcgemm_kernel_L1_M2_40 -cgemm_kernel_L1_M2_22: +.Lcgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_22 + bgt .Lcgemm_kernel_L1_M2_22 
-cgemm_kernel_L1_M2_40: +.Lcgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M2_100 + ble .Lcgemm_kernel_L1_M2_100 -cgemm_kernel_L1_M2_42: +.Lcgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_42 + bgt .Lcgemm_kernel_L1_M2_42 -cgemm_kernel_L1_M2_100: +.Lcgemm_kernel_L1_M2_100: SAVE2x1 -cgemm_kernel_L1_M2_END: +.Lcgemm_kernel_L1_M2_END: -cgemm_kernel_L1_M1_BEGIN: +.Lcgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END -cgemm_kernel_L1_M1_20: +.Lcgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M1_40 + ble .Lcgemm_kernel_L1_M1_40 -cgemm_kernel_L1_M1_22: +.Lcgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_22 + bgt .Lcgemm_kernel_L1_M1_22 -cgemm_kernel_L1_M1_40: +.Lcgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M1_100 + ble .Lcgemm_kernel_L1_M1_100 -cgemm_kernel_L1_M1_42: +.Lcgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_42 + bgt .Lcgemm_kernel_L1_M1_42 -cgemm_kernel_L1_M1_100: +.Lcgemm_kernel_L1_M1_100: SAVE1x1 -cgemm_kernel_L1_END: +.Lcgemm_kernel_L1_END: -cgemm_kernel_L999: +.Lcgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index 5d14628..24e08a6 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble cgemm_kernel_L2_BEGIN + ble .Lcgemm_kernel_L2_BEGIN /******************************************************************************/ -cgemm_kernel_L4_BEGIN: +.Lcgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -cgemm_kernel_L4_M8_BEGIN: +.Lcgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L4_M4_BEGIN + ble .Lcgemm_kernel_L4_M4_BEGIN .align 5 -cgemm_kernel_L4_M8_20: +.Lcgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 - blt cgemm_kernel_L4_M8_32 + blt .Lcgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble cgemm_kernel_L4_M8_22a + ble .Lcgemm_kernel_L4_M8_22a .align 5 -cgemm_kernel_L4_M8_22: +.Lcgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M8_22 + bgt .Lcgemm_kernel_L4_M8_22 .align 5 -cgemm_kernel_L4_M8_22a: +.Lcgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 .align 5 -cgemm_kernel_L4_M8_32: +.Lcgemm_kernel_L4_M8_32: tst counterL, #1 - ble cgemm_kernel_L4_M8_40 + ble .Lcgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_40: +.Lcgemm_kernel_L4_M8_40: INIT8x4 -cgemm_kernel_L4_M8_44: +.Lcgemm_kernel_L4_M8_44: ands counterL , origK, #7 - ble cgemm_kernel_L4_M8_100 + ble .Lcgemm_kernel_L4_M8_100 .align 5 -cgemm_kernel_L4_M8_46: +.Lcgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne cgemm_kernel_L4_M8_46 + bne .Lcgemm_kernel_L4_M8_46 -cgemm_kernel_L4_M8_100: +.Lcgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 -cgemm_kernel_L4_M8_END: +.Lcgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne cgemm_kernel_L4_M8_20 + bne .Lcgemm_kernel_L4_M8_20 -cgemm_kernel_L4_M4_BEGIN: +.Lcgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #4 - ble cgemm_kernel_L4_M2_BEGIN + ble .Lcgemm_kernel_L4_M2_BEGIN -cgemm_kernel_L4_M4_20: +.Lcgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt cgemm_kernel_L4_M4_32 + blt .Lcgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble cgemm_kernel_L4_M4_22a + ble .Lcgemm_kernel_L4_M4_22a .align 5 -cgemm_kernel_L4_M4_22: +.Lcgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_22 + bgt .Lcgemm_kernel_L4_M4_22 -cgemm_kernel_L4_M4_22a: +.Lcgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_32: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_32: tst counterL, #1 - ble cgemm_kernel_L4_M4_40 + ble .Lcgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_40: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_40: INIT4x4 -cgemm_kernel_L4_M4_44: +.Lcgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble cgemm_kernel_L4_M4_100 + ble .Lcgemm_kernel_L4_M4_100 -cgemm_kernel_L4_M4_46: +.Lcgemm_kernel_L4_M4_46: KERNEL4x4_SUB -cgemm_kernel_L4_M4_100: +.Lcgemm_kernel_L4_M4_100: SAVE4x4 -cgemm_kernel_L4_M4_END: +.Lcgemm_kernel_L4_M4_END: -cgemm_kernel_L4_M2_BEGIN: +.Lcgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L4_M1_BEGIN + ble .Lcgemm_kernel_L4_M1_BEGIN -cgemm_kernel_L4_M2_20: +.Lcgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M2_40 + ble .Lcgemm_kernel_L4_M2_40 -cgemm_kernel_L4_M2_22: +.Lcgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_22 + bgt .Lcgemm_kernel_L4_M2_22 -cgemm_kernel_L4_M2_40: +.Lcgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M2_100 + ble .Lcgemm_kernel_L4_M2_100 -cgemm_kernel_L4_M2_42: +.Lcgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_42 + bgt .Lcgemm_kernel_L4_M2_42 -cgemm_kernel_L4_M2_100: +.Lcgemm_kernel_L4_M2_100: SAVE2x4 -cgemm_kernel_L4_M2_END: +.Lcgemm_kernel_L4_M2_END: -cgemm_kernel_L4_M1_BEGIN: +.Lcgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END -cgemm_kernel_L4_M1_20: +.Lcgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M1_40 + ble .Lcgemm_kernel_L4_M1_40 -cgemm_kernel_L4_M1_22: +.Lcgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_22 + bgt .Lcgemm_kernel_L4_M1_22 -cgemm_kernel_L4_M1_40: +.Lcgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M1_100 + ble .Lcgemm_kernel_L4_M1_100 -cgemm_kernel_L4_M1_42: +.Lcgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_42 + bgt .Lcgemm_kernel_L4_M1_42 -cgemm_kernel_L4_M1_100: +.Lcgemm_kernel_L4_M1_100: SAVE1x4 -cgemm_kernel_L4_END: +.Lcgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt cgemm_kernel_L4_BEGIN + bgt .Lcgemm_kernel_L4_BEGIN /******************************************************************************/ -cgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lcgemm_kernel_L2_BEGIN: // less than 2 left 
in N direction mov counterJ , origN tst counterJ , #3 - ble cgemm_kernel_L999 // error, N was less than 4? + ble .Lcgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble cgemm_kernel_L1_BEGIN + ble .Lcgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -cgemm_kernel_L2_M8_BEGIN: +.Lcgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L2_M4_BEGIN + ble .Lcgemm_kernel_L2_M4_BEGIN -cgemm_kernel_L2_M8_20: +.Lcgemm_kernel_L2_M8_20: INIT8x2 @@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M8_40 + ble .Lcgemm_kernel_L2_M8_40 .align 5 -cgemm_kernel_L2_M8_22: +.Lcgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_22 + bgt .Lcgemm_kernel_L2_M8_22 -cgemm_kernel_L2_M8_40: +.Lcgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M8_100 + ble .Lcgemm_kernel_L2_M8_100 -cgemm_kernel_L2_M8_42: +.Lcgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_42 + bgt .Lcgemm_kernel_L2_M8_42 -cgemm_kernel_L2_M8_100: +.Lcgemm_kernel_L2_M8_100: SAVE8x2 -cgemm_kernel_L2_M8_END: +.Lcgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L2_M8_20 + bgt .Lcgemm_kernel_L2_M8_20 -cgemm_kernel_L2_M4_BEGIN: +.Lcgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L2_M2_BEGIN + ble .Lcgemm_kernel_L2_M2_BEGIN -cgemm_kernel_L2_M4_20: +.Lcgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M4_40 + ble .Lcgemm_kernel_L2_M4_40 .align 5 -cgemm_kernel_L2_M4_22: +.Lcgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_22 + bgt .Lcgemm_kernel_L2_M4_22 -cgemm_kernel_L2_M4_40: +.Lcgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M4_100 + ble .Lcgemm_kernel_L2_M4_100 -cgemm_kernel_L2_M4_42: +.Lcgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_42 + bgt .Lcgemm_kernel_L2_M4_42 -cgemm_kernel_L2_M4_100: +.Lcgemm_kernel_L2_M4_100: SAVE4x2 -cgemm_kernel_L2_M4_END: +.Lcgemm_kernel_L2_M4_END: -cgemm_kernel_L2_M2_BEGIN: +.Lcgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L2_M1_BEGIN + ble .Lcgemm_kernel_L2_M1_BEGIN -cgemm_kernel_L2_M2_20: +.Lcgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M2_40 + ble .Lcgemm_kernel_L2_M2_40 -cgemm_kernel_L2_M2_22: +.Lcgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_22 + bgt .Lcgemm_kernel_L2_M2_22 -cgemm_kernel_L2_M2_40: +.Lcgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M2_100 + ble .Lcgemm_kernel_L2_M2_100 
-cgemm_kernel_L2_M2_42: +.Lcgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_42 + bgt .Lcgemm_kernel_L2_M2_42 -cgemm_kernel_L2_M2_100: +.Lcgemm_kernel_L2_M2_100: SAVE2x2 -cgemm_kernel_L2_M2_END: +.Lcgemm_kernel_L2_M2_END: -cgemm_kernel_L2_M1_BEGIN: +.Lcgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END -cgemm_kernel_L2_M1_20: +.Lcgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L2_M1_40 + ble .Lcgemm_kernel_L2_M1_40 -cgemm_kernel_L2_M1_22: +.Lcgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_22 + bgt .Lcgemm_kernel_L2_M1_22 -cgemm_kernel_L2_M1_40: +.Lcgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M1_100 + ble .Lcgemm_kernel_L2_M1_100 -cgemm_kernel_L2_M1_42: +.Lcgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_42 + bgt .Lcgemm_kernel_L2_M1_42 -cgemm_kernel_L2_M1_100: +.Lcgemm_kernel_L2_M1_100: SAVE1x2 -cgemm_kernel_L2_END: +.Lcgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -cgemm_kernel_L1_BEGIN: +.Lcgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble cgemm_kernel_L999 // done + ble .Lcgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -cgemm_kernel_L1_M8_BEGIN: +.Lcgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L1_M4_BEGIN + ble .Lcgemm_kernel_L1_M4_BEGIN -cgemm_kernel_L1_M8_20: +.Lcgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M8_40 + ble .Lcgemm_kernel_L1_M8_40 .align 5 -cgemm_kernel_L1_M8_22: +.Lcgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_22 + bgt .Lcgemm_kernel_L1_M8_22 -cgemm_kernel_L1_M8_40: +.Lcgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M8_100 + ble .Lcgemm_kernel_L1_M8_100 -cgemm_kernel_L1_M8_42: +.Lcgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_42 + bgt .Lcgemm_kernel_L1_M8_42 -cgemm_kernel_L1_M8_100: +.Lcgemm_kernel_L1_M8_100: SAVE8x1 -cgemm_kernel_L1_M8_END: +.Lcgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L1_M8_20 + bgt .Lcgemm_kernel_L1_M8_20 -cgemm_kernel_L1_M4_BEGIN: +.Lcgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L1_M2_BEGIN + ble .Lcgemm_kernel_L1_M2_BEGIN -cgemm_kernel_L1_M4_20: +.Lcgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M4_40 + ble .Lcgemm_kernel_L1_M4_40 .align 5 -cgemm_kernel_L1_M4_22: +.Lcgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_22 + bgt .Lcgemm_kernel_L1_M4_22 
-cgemm_kernel_L1_M4_40: +.Lcgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M4_100 + ble .Lcgemm_kernel_L1_M4_100 -cgemm_kernel_L1_M4_42: +.Lcgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_42 + bgt .Lcgemm_kernel_L1_M4_42 -cgemm_kernel_L1_M4_100: +.Lcgemm_kernel_L1_M4_100: SAVE4x1 -cgemm_kernel_L1_M4_END: +.Lcgemm_kernel_L1_M4_END: -cgemm_kernel_L1_M2_BEGIN: +.Lcgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L1_M1_BEGIN + ble .Lcgemm_kernel_L1_M1_BEGIN -cgemm_kernel_L1_M2_20: +.Lcgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M2_40 + ble .Lcgemm_kernel_L1_M2_40 -cgemm_kernel_L1_M2_22: +.Lcgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_22 + bgt .Lcgemm_kernel_L1_M2_22 -cgemm_kernel_L1_M2_40: +.Lcgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M2_100 + ble .Lcgemm_kernel_L1_M2_100 -cgemm_kernel_L1_M2_42: +.Lcgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_42 + bgt .Lcgemm_kernel_L1_M2_42 -cgemm_kernel_L1_M2_100: +.Lcgemm_kernel_L1_M2_100: SAVE2x1 -cgemm_kernel_L1_M2_END: +.Lcgemm_kernel_L1_M2_END: -cgemm_kernel_L1_M1_BEGIN: +.Lcgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END -cgemm_kernel_L1_M1_20: +.Lcgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M1_40 + ble .Lcgemm_kernel_L1_M1_40 -cgemm_kernel_L1_M1_22: +.Lcgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_22 + bgt .Lcgemm_kernel_L1_M1_22 -cgemm_kernel_L1_M1_40: +.Lcgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M1_100 + ble .Lcgemm_kernel_L1_M1_100 -cgemm_kernel_L1_M1_42: +.Lcgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_42 + bgt .Lcgemm_kernel_L1_M1_42 -cgemm_kernel_L1_M1_100: +.Lcgemm_kernel_L1_M1_100: SAVE1x1 -cgemm_kernel_L1_END: +.Lcgemm_kernel_L1_END: -cgemm_kernel_L999: +.Lcgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S index 367cd02..29a68ff 100644 --- a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S @@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble cgemm_kernel_L2_BEGIN + ble .Lcgemm_kernel_L2_BEGIN /******************************************************************************/ -cgemm_kernel_L4_BEGIN: +.Lcgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -cgemm_kernel_L4_M8_BEGIN: +.Lcgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L4_M4_BEGIN + ble .Lcgemm_kernel_L4_M4_BEGIN .align 5 -cgemm_kernel_L4_M8_20: +.Lcgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #5 // origK / 32 cmp counterL , #2 - blt cgemm_kernel_L4_M8_32 + blt .Lcgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20: KERNEL8x4_M1_M2_x8 subs counterL, counterL, #2 // subtract 2 - ble cgemm_kernel_L4_M8_22a + ble .Lcgemm_kernel_L4_M8_22a .align 5 -cgemm_kernel_L4_M8_22: +.Lcgemm_kernel_L4_M8_22: KERNEL8x4_M1_M2_x16 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M8_22 + bgt .Lcgemm_kernel_L4_M8_22 .align 5 -cgemm_kernel_L4_M8_22a: +.Lcgemm_kernel_L4_M8_22a: KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 @@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 .align 5 -cgemm_kernel_L4_M8_32: +.Lcgemm_kernel_L4_M8_32: tst counterL, #1 - ble cgemm_kernel_L4_M8_40 + ble .Lcgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_40: +.Lcgemm_kernel_L4_M8_40: INIT8x4 -cgemm_kernel_L4_M8_44: +.Lcgemm_kernel_L4_M8_44: ands counterL , origK, #31 - ble cgemm_kernel_L4_M8_100 + ble .Lcgemm_kernel_L4_M8_100 .align 5 -cgemm_kernel_L4_M8_46: +.Lcgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne cgemm_kernel_L4_M8_46 + bne .Lcgemm_kernel_L4_M8_46 -cgemm_kernel_L4_M8_100: +.Lcgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 -cgemm_kernel_L4_M8_END: +.Lcgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne cgemm_kernel_L4_M8_20 + bne .Lcgemm_kernel_L4_M8_20 -cgemm_kernel_L4_M4_BEGIN: +.Lcgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #4 - ble cgemm_kernel_L4_M2_BEGIN + ble .Lcgemm_kernel_L4_M2_BEGIN -cgemm_kernel_L4_M4_20: +.Lcgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt cgemm_kernel_L4_M4_32 + blt .Lcgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble cgemm_kernel_L4_M4_22a + ble .Lcgemm_kernel_L4_M4_22a .align 5 -cgemm_kernel_L4_M4_22: +.Lcgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_22 + bgt .Lcgemm_kernel_L4_M4_22 -cgemm_kernel_L4_M4_22a: +.Lcgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_32: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_32: tst counterL, #1 - ble cgemm_kernel_L4_M4_40 + ble .Lcgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_40: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_40: INIT4x4 -cgemm_kernel_L4_M4_44: +.Lcgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble cgemm_kernel_L4_M4_100 + ble .Lcgemm_kernel_L4_M4_100 -cgemm_kernel_L4_M4_46: +.Lcgemm_kernel_L4_M4_46: KERNEL4x4_SUB -cgemm_kernel_L4_M4_100: +.Lcgemm_kernel_L4_M4_100: SAVE4x4 -cgemm_kernel_L4_M4_END: +.Lcgemm_kernel_L4_M4_END: -cgemm_kernel_L4_M2_BEGIN: +.Lcgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L4_M1_BEGIN + ble .Lcgemm_kernel_L4_M1_BEGIN -cgemm_kernel_L4_M2_20: +.Lcgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M2_40 + ble .Lcgemm_kernel_L4_M2_40 -cgemm_kernel_L4_M2_22: +.Lcgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_22 + bgt .Lcgemm_kernel_L4_M2_22 -cgemm_kernel_L4_M2_40: +.Lcgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M2_100 + ble .Lcgemm_kernel_L4_M2_100 -cgemm_kernel_L4_M2_42: +.Lcgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_42 + bgt .Lcgemm_kernel_L4_M2_42 -cgemm_kernel_L4_M2_100: +.Lcgemm_kernel_L4_M2_100: SAVE2x4 -cgemm_kernel_L4_M2_END: +.Lcgemm_kernel_L4_M2_END: -cgemm_kernel_L4_M1_BEGIN: +.Lcgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END -cgemm_kernel_L4_M1_20: +.Lcgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M1_40 + ble .Lcgemm_kernel_L4_M1_40 -cgemm_kernel_L4_M1_22: +.Lcgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_22 + bgt .Lcgemm_kernel_L4_M1_22 -cgemm_kernel_L4_M1_40: +.Lcgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M1_100 + ble .Lcgemm_kernel_L4_M1_100 -cgemm_kernel_L4_M1_42: +.Lcgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_42 + bgt .Lcgemm_kernel_L4_M1_42 -cgemm_kernel_L4_M1_100: +.Lcgemm_kernel_L4_M1_100: SAVE1x4 -cgemm_kernel_L4_END: +.Lcgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt cgemm_kernel_L4_BEGIN + bgt .Lcgemm_kernel_L4_BEGIN /******************************************************************************/ -cgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lcgemm_kernel_L2_BEGIN: // less than 2 left 
in N direction mov counterJ , origN tst counterJ , #3 - ble cgemm_kernel_L999 // error, N was less than 4? + ble .Lcgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble cgemm_kernel_L1_BEGIN + ble .Lcgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -cgemm_kernel_L2_M8_BEGIN: +.Lcgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L2_M4_BEGIN + ble .Lcgemm_kernel_L2_M4_BEGIN -cgemm_kernel_L2_M8_20: +.Lcgemm_kernel_L2_M8_20: INIT8x2 @@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M8_40 + ble .Lcgemm_kernel_L2_M8_40 .align 5 -cgemm_kernel_L2_M8_22: +.Lcgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_22 + bgt .Lcgemm_kernel_L2_M8_22 -cgemm_kernel_L2_M8_40: +.Lcgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M8_100 + ble .Lcgemm_kernel_L2_M8_100 -cgemm_kernel_L2_M8_42: +.Lcgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_42 + bgt .Lcgemm_kernel_L2_M8_42 -cgemm_kernel_L2_M8_100: +.Lcgemm_kernel_L2_M8_100: SAVE8x2 -cgemm_kernel_L2_M8_END: +.Lcgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L2_M8_20 + bgt .Lcgemm_kernel_L2_M8_20 -cgemm_kernel_L2_M4_BEGIN: +.Lcgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L2_M2_BEGIN + ble .Lcgemm_kernel_L2_M2_BEGIN -cgemm_kernel_L2_M4_20: +.Lcgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M4_40 + ble .Lcgemm_kernel_L2_M4_40 .align 5 -cgemm_kernel_L2_M4_22: +.Lcgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_22 + bgt .Lcgemm_kernel_L2_M4_22 -cgemm_kernel_L2_M4_40: +.Lcgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M4_100 + ble .Lcgemm_kernel_L2_M4_100 -cgemm_kernel_L2_M4_42: +.Lcgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_42 + bgt .Lcgemm_kernel_L2_M4_42 -cgemm_kernel_L2_M4_100: +.Lcgemm_kernel_L2_M4_100: SAVE4x2 -cgemm_kernel_L2_M4_END: +.Lcgemm_kernel_L2_M4_END: -cgemm_kernel_L2_M2_BEGIN: +.Lcgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L2_M1_BEGIN + ble .Lcgemm_kernel_L2_M1_BEGIN -cgemm_kernel_L2_M2_20: +.Lcgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M2_40 + ble .Lcgemm_kernel_L2_M2_40 -cgemm_kernel_L2_M2_22: +.Lcgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_22 + bgt .Lcgemm_kernel_L2_M2_22 -cgemm_kernel_L2_M2_40: +.Lcgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M2_100 + ble .Lcgemm_kernel_L2_M2_100 
-cgemm_kernel_L2_M2_42: +.Lcgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_42 + bgt .Lcgemm_kernel_L2_M2_42 -cgemm_kernel_L2_M2_100: +.Lcgemm_kernel_L2_M2_100: SAVE2x2 -cgemm_kernel_L2_M2_END: +.Lcgemm_kernel_L2_M2_END: -cgemm_kernel_L2_M1_BEGIN: +.Lcgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END -cgemm_kernel_L2_M1_20: +.Lcgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L2_M1_40 + ble .Lcgemm_kernel_L2_M1_40 -cgemm_kernel_L2_M1_22: +.Lcgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_22 + bgt .Lcgemm_kernel_L2_M1_22 -cgemm_kernel_L2_M1_40: +.Lcgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M1_100 + ble .Lcgemm_kernel_L2_M1_100 -cgemm_kernel_L2_M1_42: +.Lcgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_42 + bgt .Lcgemm_kernel_L2_M1_42 -cgemm_kernel_L2_M1_100: +.Lcgemm_kernel_L2_M1_100: SAVE1x2 -cgemm_kernel_L2_END: +.Lcgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -cgemm_kernel_L1_BEGIN: +.Lcgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble cgemm_kernel_L999 // done + ble .Lcgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -cgemm_kernel_L1_M8_BEGIN: +.Lcgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L1_M4_BEGIN + ble .Lcgemm_kernel_L1_M4_BEGIN -cgemm_kernel_L1_M8_20: +.Lcgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M8_40 + ble .Lcgemm_kernel_L1_M8_40 .align 5 -cgemm_kernel_L1_M8_22: +.Lcgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_22 + bgt .Lcgemm_kernel_L1_M8_22 -cgemm_kernel_L1_M8_40: +.Lcgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M8_100 + ble .Lcgemm_kernel_L1_M8_100 -cgemm_kernel_L1_M8_42: +.Lcgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_42 + bgt .Lcgemm_kernel_L1_M8_42 -cgemm_kernel_L1_M8_100: +.Lcgemm_kernel_L1_M8_100: SAVE8x1 -cgemm_kernel_L1_M8_END: +.Lcgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L1_M8_20 + bgt .Lcgemm_kernel_L1_M8_20 -cgemm_kernel_L1_M4_BEGIN: +.Lcgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L1_M2_BEGIN + ble .Lcgemm_kernel_L1_M2_BEGIN -cgemm_kernel_L1_M4_20: +.Lcgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M4_40 + ble .Lcgemm_kernel_L1_M4_40 .align 5 -cgemm_kernel_L1_M4_22: +.Lcgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_22 + bgt .Lcgemm_kernel_L1_M4_22 
-cgemm_kernel_L1_M4_40: +.Lcgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M4_100 + ble .Lcgemm_kernel_L1_M4_100 -cgemm_kernel_L1_M4_42: +.Lcgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_42 + bgt .Lcgemm_kernel_L1_M4_42 -cgemm_kernel_L1_M4_100: +.Lcgemm_kernel_L1_M4_100: SAVE4x1 -cgemm_kernel_L1_M4_END: +.Lcgemm_kernel_L1_M4_END: -cgemm_kernel_L1_M2_BEGIN: +.Lcgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L1_M1_BEGIN + ble .Lcgemm_kernel_L1_M1_BEGIN -cgemm_kernel_L1_M2_20: +.Lcgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M2_40 + ble .Lcgemm_kernel_L1_M2_40 -cgemm_kernel_L1_M2_22: +.Lcgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_22 + bgt .Lcgemm_kernel_L1_M2_22 -cgemm_kernel_L1_M2_40: +.Lcgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M2_100 + ble .Lcgemm_kernel_L1_M2_100 -cgemm_kernel_L1_M2_42: +.Lcgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_42 + bgt .Lcgemm_kernel_L1_M2_42 -cgemm_kernel_L1_M2_100: +.Lcgemm_kernel_L1_M2_100: SAVE2x1 -cgemm_kernel_L1_M2_END: +.Lcgemm_kernel_L1_M2_END: -cgemm_kernel_L1_M1_BEGIN: +.Lcgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END -cgemm_kernel_L1_M1_20: +.Lcgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M1_40 + ble .Lcgemm_kernel_L1_M1_40 -cgemm_kernel_L1_M1_22: +.Lcgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_22 + bgt .Lcgemm_kernel_L1_M1_22 -cgemm_kernel_L1_M1_40: +.Lcgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M1_100 + ble .Lcgemm_kernel_L1_M1_100 -cgemm_kernel_L1_M1_42: +.Lcgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_42 + bgt .Lcgemm_kernel_L1_M1_42 -cgemm_kernel_L1_M1_100: +.Lcgemm_kernel_L1_M1_100: SAVE1x1 -cgemm_kernel_L1_END: +.Lcgemm_kernel_L1_END: -cgemm_kernel_L999: +.Lcgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S index 70eab96..b8c6bfc 100644 --- a/kernel/arm64/copy.S +++ b/kernel/arm64/copy.S @@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE cmp N, xzr - ble copy_kernel_L999 + ble .Lcopy_kernel_L999 cmp INC_X, #1 - bne copy_kernel_S_BEGIN + bne .Lcopy_kernel_S_BEGIN cmp INC_Y, #1 - bne copy_kernel_S_BEGIN + bne .Lcopy_kernel_S_BEGIN -copy_kernel_F_BEGIN: +.Lcopy_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq copy_kernel_F1 + beq .Lcopy_kernel_F1 -copy_kernel_F4: +.Lcopy_kernel_F4: KERNEL_F4 subs I, I, #1 - bne copy_kernel_F4 + bne .Lcopy_kernel_F4 -copy_kernel_F1: +.Lcopy_kernel_F1: ands I, N, #3 - ble copy_kernel_L999 + ble .Lcopy_kernel_L999 -copy_kernel_F10: +.Lcopy_kernel_F10: KERNEL_F1 subs I, I, #1 - bne copy_kernel_F10 + bne .Lcopy_kernel_F10 mov w0, wzr ret -copy_kernel_S_BEGIN: +.Lcopy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble copy_kernel_S1 + ble .Lcopy_kernel_S1 -copy_kernel_S4: +.Lcopy_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -210,21 +210,21 @@ copy_kernel_S4: KERNEL_S1 subs I, I, #1 - bne copy_kernel_S4 + bne .Lcopy_kernel_S4 -copy_kernel_S1: +.Lcopy_kernel_S1: ands I, N, #3 - ble copy_kernel_L999 + ble .Lcopy_kernel_L999 -copy_kernel_S10: +.Lcopy_kernel_S10: KERNEL_S1 subs I, I, #1 - bne copy_kernel_S10 + bne .Lcopy_kernel_S10 -copy_kernel_L999: +.Lcopy_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S index 3de2725..79d33e9 100644 --- a/kernel/arm64/ctrmm_kernel_4x4.S +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble ctrmm_kernel_L2_BEGIN + ble .Lctrmm_kernel_L2_BEGIN /******************************************************************************/ -ctrmm_kernel_L4_BEGIN: +.Lctrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -ctrmm_kernel_L4_M4_BEGIN: +.Lctrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble ctrmm_kernel_L4_M2_BEGIN + ble .Lctrmm_kernel_L4_M2_BEGIN -ctrmm_kernel_L4_M4_20: +.Lctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt ctrmm_kernel_L4_M4_32 + blt .Lctrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble ctrmm_kernel_L4_M4_22a + ble .Lctrmm_kernel_L4_M4_22a .align 5 -ctrmm_kernel_L4_M4_22: +.Lctrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M4_22 + bgt .Lctrmm_kernel_L4_M4_22 -ctrmm_kernel_L4_M4_22a: +.Lctrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b ctrmm_kernel_L4_M4_44 + b .Lctrmm_kernel_L4_M4_44 -ctrmm_kernel_L4_M4_32: +.Lctrmm_kernel_L4_M4_32: tst counterL, #1 - ble ctrmm_kernel_L4_M4_40 + ble .Lctrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b ctrmm_kernel_L4_M4_44 + b .Lctrmm_kernel_L4_M4_44 -ctrmm_kernel_L4_M4_40: +.Lctrmm_kernel_L4_M4_40: INIT4x4 -ctrmm_kernel_L4_M4_44: +.Lctrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble ctrmm_kernel_L4_M4_100 + ble .Lctrmm_kernel_L4_M4_100 -ctrmm_kernel_L4_M4_46: +.Lctrmm_kernel_L4_M4_46: KERNEL4x4_SUB -ctrmm_kernel_L4_M4_100: +.Lctrmm_kernel_L4_M4_100: SAVE4x4 @@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L4_M4_END: +.Lctrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne ctrmm_kernel_L4_M4_20 + bne .Lctrmm_kernel_L4_M4_20 -ctrmm_kernel_L4_M2_BEGIN: +.Lctrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L4_M1_BEGIN + ble .Lctrmm_kernel_L4_M1_BEGIN -ctrmm_kernel_L4_M2_20: +.Lctrmm_kernel_L4_M2_20: INIT2x4 @@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L4_M2_40 + ble .Lctrmm_kernel_L4_M2_40 -ctrmm_kernel_L4_M2_22: +.Lctrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M2_22 + bgt .Lctrmm_kernel_L4_M2_22 -ctrmm_kernel_L4_M2_40: +.Lctrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M2_100 + ble .Lctrmm_kernel_L4_M2_100 -ctrmm_kernel_L4_M2_42: +.Lctrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M2_42 + bgt .Lctrmm_kernel_L4_M2_42 -ctrmm_kernel_L4_M2_100: +.Lctrmm_kernel_L4_M2_100: SAVE2x4 @@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L4_M2_END: +.Lctrmm_kernel_L4_M2_END: -ctrmm_kernel_L4_M1_BEGIN: +.Lctrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END -ctrmm_kernel_L4_M1_20: +.Lctrmm_kernel_L4_M1_20: INIT1x4 @@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L4_M1_40 + ble .Lctrmm_kernel_L4_M1_40 -ctrmm_kernel_L4_M1_22: +.Lctrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M1_22 + bgt .Lctrmm_kernel_L4_M1_22 -ctrmm_kernel_L4_M1_40: +.Lctrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M1_100 + ble .Lctrmm_kernel_L4_M1_100 -ctrmm_kernel_L4_M1_42: +.Lctrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M1_42 + bgt .Lctrmm_kernel_L4_M1_42 -ctrmm_kernel_L4_M1_100: +.Lctrmm_kernel_L4_M1_100: SAVE1x4 @@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 
#endif -ctrmm_kernel_L4_END: +.Lctrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt ctrmm_kernel_L4_BEGIN + bgt .Lctrmm_kernel_L4_BEGIN /******************************************************************************/ -ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble ctrmm_kernel_L999 // error, N was less than 4? + ble .Lctrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble ctrmm_kernel_L1_BEGIN + ble .Lctrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -ctrmm_kernel_L2_M4_BEGIN: +.Lctrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble ctrmm_kernel_L2_M2_BEGIN + ble .Lctrmm_kernel_L2_M2_BEGIN -ctrmm_kernel_L2_M4_20: +.Lctrmm_kernel_L2_M4_20: INIT4x2 @@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M4_40 + ble .Lctrmm_kernel_L2_M4_40 .align 5 -ctrmm_kernel_L2_M4_22: +.Lctrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M4_22 + bgt .Lctrmm_kernel_L2_M4_22 -ctrmm_kernel_L2_M4_40: +.Lctrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M4_100 + ble .Lctrmm_kernel_L2_M4_100 -ctrmm_kernel_L2_M4_42: +.Lctrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M4_42 + bgt .Lctrmm_kernel_L2_M4_42 -ctrmm_kernel_L2_M4_100: +.Lctrmm_kernel_L2_M4_100: SAVE4x2 @@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L2_M4_END: +.Lctrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt ctrmm_kernel_L2_M4_20 + bgt .Lctrmm_kernel_L2_M4_20 -ctrmm_kernel_L2_M2_BEGIN: +.Lctrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L2_M1_BEGIN + ble .Lctrmm_kernel_L2_M1_BEGIN -ctrmm_kernel_L2_M2_20: +.Lctrmm_kernel_L2_M2_20: INIT2x2 @@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M2_40 + ble .Lctrmm_kernel_L2_M2_40 -ctrmm_kernel_L2_M2_22: +.Lctrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M2_22 + bgt .Lctrmm_kernel_L2_M2_22 -ctrmm_kernel_L2_M2_40: +.Lctrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M2_100 + ble .Lctrmm_kernel_L2_M2_100 -ctrmm_kernel_L2_M2_42: +.Lctrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M2_42 + bgt .Lctrmm_kernel_L2_M2_42 -ctrmm_kernel_L2_M2_100: +.Lctrmm_kernel_L2_M2_100: SAVE2x2 @@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L2_M2_END: +.Lctrmm_kernel_L2_M2_END: -ctrmm_kernel_L2_M1_BEGIN: +.Lctrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END -ctrmm_kernel_L2_M1_20: +.Lctrmm_kernel_L2_M1_20: INIT1x2 @@ 
-1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble ctrmm_kernel_L2_M1_40 + ble .Lctrmm_kernel_L2_M1_40 -ctrmm_kernel_L2_M1_22: +.Lctrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M1_22 + bgt .Lctrmm_kernel_L2_M1_22 -ctrmm_kernel_L2_M1_40: +.Lctrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M1_100 + ble .Lctrmm_kernel_L2_M1_100 -ctrmm_kernel_L2_M1_42: +.Lctrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M1_42 + bgt .Lctrmm_kernel_L2_M1_42 -ctrmm_kernel_L2_M1_100: +.Lctrmm_kernel_L2_M1_100: SAVE1x2 @@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -ctrmm_kernel_L2_END: +.Lctrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END: /******************************************************************************/ -ctrmm_kernel_L1_BEGIN: +.Lctrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble ctrmm_kernel_L999 // done + ble .Lctrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN: mov pA, origPA // pA = A -ctrmm_kernel_L1_M4_BEGIN: +.Lctrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble ctrmm_kernel_L1_M2_BEGIN + ble .Lctrmm_kernel_L1_M2_BEGIN -ctrmm_kernel_L1_M4_20: +.Lctrmm_kernel_L1_M4_20: INIT4x1 @@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M4_40 + ble .Lctrmm_kernel_L1_M4_40 .align 5 -ctrmm_kernel_L1_M4_22: +.Lctrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M4_22 + bgt .Lctrmm_kernel_L1_M4_22 -ctrmm_kernel_L1_M4_40: +.Lctrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M4_100 + ble .Lctrmm_kernel_L1_M4_100 -ctrmm_kernel_L1_M4_42: +.Lctrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M4_42 + bgt .Lctrmm_kernel_L1_M4_42 -ctrmm_kernel_L1_M4_100: +.Lctrmm_kernel_L1_M4_100: SAVE4x1 @@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L1_M4_END: +.Lctrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt ctrmm_kernel_L1_M4_20 + bgt .Lctrmm_kernel_L1_M4_20 -ctrmm_kernel_L1_M2_BEGIN: +.Lctrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L1_M1_BEGIN + ble .Lctrmm_kernel_L1_M1_BEGIN -ctrmm_kernel_L1_M2_20: +.Lctrmm_kernel_L1_M2_20: INIT2x1 @@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M2_40 + ble .Lctrmm_kernel_L1_M2_40 -ctrmm_kernel_L1_M2_22: +.Lctrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M2_22 + bgt .Lctrmm_kernel_L1_M2_22 -ctrmm_kernel_L1_M2_40: +.Lctrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M2_100 + ble .Lctrmm_kernel_L1_M2_100 -ctrmm_kernel_L1_M2_42: 
+.Lctrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M2_42 + bgt .Lctrmm_kernel_L1_M2_42 -ctrmm_kernel_L1_M2_100: +.Lctrmm_kernel_L1_M2_100: SAVE2x1 @@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L1_M2_END: +.Lctrmm_kernel_L1_M2_END: -ctrmm_kernel_L1_M1_BEGIN: +.Lctrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END -ctrmm_kernel_L1_M1_20: +.Lctrmm_kernel_L1_M1_20: INIT1x1 @@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M1_40 + ble .Lctrmm_kernel_L1_M1_40 -ctrmm_kernel_L1_M1_22: +.Lctrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M1_22 + bgt .Lctrmm_kernel_L1_M1_22 -ctrmm_kernel_L1_M1_40: +.Lctrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M1_100 + ble .Lctrmm_kernel_L1_M1_100 -ctrmm_kernel_L1_M1_42: +.Lctrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M1_42 + bgt .Lctrmm_kernel_L1_M1_42 -ctrmm_kernel_L1_M1_100: +.Lctrmm_kernel_L1_M1_100: SAVE1x1 -ctrmm_kernel_L1_END: +.Lctrmm_kernel_L1_END: -ctrmm_kernel_L999: +.Lctrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index 680fb56..5c08273 100644 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble ctrmm_kernel_L2_BEGIN + ble .Lctrmm_kernel_L2_BEGIN /******************************************************************************/ -ctrmm_kernel_L4_BEGIN: +.Lctrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -ctrmm_kernel_L4_M8_BEGIN: +.Lctrmm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble ctrmm_kernel_L4_M4_BEGIN + ble .Lctrmm_kernel_L4_M4_BEGIN -ctrmm_kernel_L4_M8_20: +.Lctrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20: asr counterL , tempK, #3 cmp counterL , #2 - blt ctrmm_kernel_L4_M8_32 + blt .Lctrmm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble ctrmm_kernel_L4_M8_22a + ble .Lctrmm_kernel_L4_M8_22a .align 5 -ctrmm_kernel_L4_M8_22: +.Lctrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M8_22 + bgt .Lctrmm_kernel_L4_M8_22 .align 5 -ctrmm_kernel_L4_M8_22a: +.Lctrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b ctrmm_kernel_L4_M8_44 + b .Lctrmm_kernel_L4_M8_44 .align 5 -ctrmm_kernel_L4_M8_32: +.Lctrmm_kernel_L4_M8_32: tst counterL, #1 - ble ctrmm_kernel_L4_M8_40 + ble .Lctrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b ctrmm_kernel_L4_M8_44 + b .Lctrmm_kernel_L4_M8_44 -ctrmm_kernel_L4_M8_40: +.Lctrmm_kernel_L4_M8_40: INIT8x4 -ctrmm_kernel_L4_M8_44: +.Lctrmm_kernel_L4_M8_44: ands counterL , tempK, #7 - ble ctrmm_kernel_L4_M8_100 + ble .Lctrmm_kernel_L4_M8_100 .align 5 -ctrmm_kernel_L4_M8_46: +.Lctrmm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne ctrmm_kernel_L4_M8_46 + bne .Lctrmm_kernel_L4_M8_46 -ctrmm_kernel_L4_M8_100: +.Lctrmm_kernel_L4_M8_100: SAVE8x4 @@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -ctrmm_kernel_L4_M8_END: +.Lctrmm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne ctrmm_kernel_L4_M8_20 + bne .Lctrmm_kernel_L4_M8_20 -ctrmm_kernel_L4_M4_BEGIN: +.Lctrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END tst counterI, #4 - ble ctrmm_kernel_L4_M2_BEGIN + ble .Lctrmm_kernel_L4_M2_BEGIN -ctrmm_kernel_L4_M4_20: +.Lctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt ctrmm_kernel_L4_M4_32 + blt .Lctrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble ctrmm_kernel_L4_M4_22a + ble .Lctrmm_kernel_L4_M4_22a .align 5 -ctrmm_kernel_L4_M4_22: +.Lctrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M4_22 + bgt .Lctrmm_kernel_L4_M4_22 -ctrmm_kernel_L4_M4_22a: +.Lctrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b ctrmm_kernel_L4_M4_44 -ctrmm_kernel_L4_M4_32: + b .Lctrmm_kernel_L4_M4_44 +.Lctrmm_kernel_L4_M4_32: tst counterL, #1 - ble ctrmm_kernel_L4_M4_40 + ble .Lctrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b ctrmm_kernel_L4_M4_44 -ctrmm_kernel_L4_M4_40: + b .Lctrmm_kernel_L4_M4_44 +.Lctrmm_kernel_L4_M4_40: INIT4x4 -ctrmm_kernel_L4_M4_44: +.Lctrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble ctrmm_kernel_L4_M4_100 + ble .Lctrmm_kernel_L4_M4_100 -ctrmm_kernel_L4_M4_46: +.Lctrmm_kernel_L4_M4_46: KERNEL4x4_SUB -ctrmm_kernel_L4_M4_100: +.Lctrmm_kernel_L4_M4_100: SAVE4x4 @@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L4_M4_END: +.Lctrmm_kernel_L4_M4_END: -ctrmm_kernel_L4_M2_BEGIN: +.Lctrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L4_M1_BEGIN + ble .Lctrmm_kernel_L4_M1_BEGIN -ctrmm_kernel_L4_M2_20: +.Lctrmm_kernel_L4_M2_20: INIT2x4 @@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L4_M2_40 + ble .Lctrmm_kernel_L4_M2_40 -ctrmm_kernel_L4_M2_22: +.Lctrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M2_22 + bgt .Lctrmm_kernel_L4_M2_22 -ctrmm_kernel_L4_M2_40: +.Lctrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M2_100 + ble .Lctrmm_kernel_L4_M2_100 -ctrmm_kernel_L4_M2_42: +.Lctrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M2_42 + bgt .Lctrmm_kernel_L4_M2_42 -ctrmm_kernel_L4_M2_100: +.Lctrmm_kernel_L4_M2_100: SAVE2x4 @@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L4_M2_END: +.Lctrmm_kernel_L4_M2_END: -ctrmm_kernel_L4_M1_BEGIN: +.Lctrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END -ctrmm_kernel_L4_M1_20: +.Lctrmm_kernel_L4_M1_20: INIT1x4 @@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L4_M1_40 + ble .Lctrmm_kernel_L4_M1_40 -ctrmm_kernel_L4_M1_22: +.Lctrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M1_22 + bgt .Lctrmm_kernel_L4_M1_22 -ctrmm_kernel_L4_M1_40: +.Lctrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M1_100 + ble .Lctrmm_kernel_L4_M1_100 -ctrmm_kernel_L4_M1_42: +.Lctrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M1_42 + bgt .Lctrmm_kernel_L4_M1_42 -ctrmm_kernel_L4_M1_100: +.Lctrmm_kernel_L4_M1_100: SAVE1x4 @@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 #endif -ctrmm_kernel_L4_END: +.Lctrmm_kernel_L4_END: lsl temp, origK, #5 add 
origPB, origPB, temp // B = B + K * 4 * 8 @@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt ctrmm_kernel_L4_BEGIN + bgt .Lctrmm_kernel_L4_BEGIN /******************************************************************************/ -ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble ctrmm_kernel_L999 // error, N was less than 4? + ble .Lctrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble ctrmm_kernel_L1_BEGIN + ble .Lctrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction #endif mov pA, origPA // pA = A -ctrmm_kernel_L2_M8_BEGIN: +.Lctrmm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble ctrmm_kernel_L2_M4_BEGIN + ble .Lctrmm_kernel_L2_M4_BEGIN -ctrmm_kernel_L2_M8_20: +.Lctrmm_kernel_L2_M8_20: INIT8x2 @@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M8_40 + ble .Lctrmm_kernel_L2_M8_40 .align 5 -ctrmm_kernel_L2_M8_22: +.Lctrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M8_22 + bgt .Lctrmm_kernel_L2_M8_22 -ctrmm_kernel_L2_M8_40: +.Lctrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M8_100 + ble .Lctrmm_kernel_L2_M8_100 -ctrmm_kernel_L2_M8_42: +.Lctrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M8_42 + bgt .Lctrmm_kernel_L2_M8_42 -ctrmm_kernel_L2_M8_100: +.Lctrmm_kernel_L2_M8_100: SAVE8x2 @@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100: add tempOffset, tempOffset, #8 #endif -ctrmm_kernel_L2_M8_END: +.Lctrmm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt ctrmm_kernel_L2_M8_20 + bgt .Lctrmm_kernel_L2_M8_20 -ctrmm_kernel_L2_M4_BEGIN: +.Lctrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble ctrmm_kernel_L2_M2_BEGIN + ble .Lctrmm_kernel_L2_M2_BEGIN -ctrmm_kernel_L2_M4_20: +.Lctrmm_kernel_L2_M4_20: INIT4x2 @@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M4_40 + ble .Lctrmm_kernel_L2_M4_40 .align 5 -ctrmm_kernel_L2_M4_22: +.Lctrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M4_22 + bgt .Lctrmm_kernel_L2_M4_22 -ctrmm_kernel_L2_M4_40: +.Lctrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M4_100 + ble .Lctrmm_kernel_L2_M4_100 -ctrmm_kernel_L2_M4_42: +.Lctrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M4_42 + bgt .Lctrmm_kernel_L2_M4_42 -ctrmm_kernel_L2_M4_100: +.Lctrmm_kernel_L2_M4_100: SAVE4x2 @@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L2_M4_END: +.Lctrmm_kernel_L2_M4_END: -ctrmm_kernel_L2_M2_BEGIN: +.Lctrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L2_M1_BEGIN + ble .Lctrmm_kernel_L2_M1_BEGIN 
-ctrmm_kernel_L2_M2_20: +.Lctrmm_kernel_L2_M2_20: INIT2x2 @@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M2_40 + ble .Lctrmm_kernel_L2_M2_40 -ctrmm_kernel_L2_M2_22: +.Lctrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M2_22 + bgt .Lctrmm_kernel_L2_M2_22 -ctrmm_kernel_L2_M2_40: +.Lctrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M2_100 + ble .Lctrmm_kernel_L2_M2_100 -ctrmm_kernel_L2_M2_42: +.Lctrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M2_42 + bgt .Lctrmm_kernel_L2_M2_42 -ctrmm_kernel_L2_M2_100: +.Lctrmm_kernel_L2_M2_100: SAVE2x2 @@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L2_M2_END: +.Lctrmm_kernel_L2_M2_END: -ctrmm_kernel_L2_M1_BEGIN: +.Lctrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END -ctrmm_kernel_L2_M1_20: +.Lctrmm_kernel_L2_M1_20: INIT1x2 @@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble ctrmm_kernel_L2_M1_40 + ble .Lctrmm_kernel_L2_M1_40 -ctrmm_kernel_L2_M1_22: +.Lctrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M1_22 + bgt .Lctrmm_kernel_L2_M1_22 -ctrmm_kernel_L2_M1_40: +.Lctrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M1_100 + ble .Lctrmm_kernel_L2_M1_100 -ctrmm_kernel_L2_M1_42: +.Lctrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M1_42 + bgt .Lctrmm_kernel_L2_M1_42 -ctrmm_kernel_L2_M1_100: +.Lctrmm_kernel_L2_M1_100: SAVE1x2 @@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -ctrmm_kernel_L2_END: +.Lctrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END: /******************************************************************************/ -ctrmm_kernel_L1_BEGIN: +.Lctrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble ctrmm_kernel_L999 // done + ble .Lctrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next @@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -ctrmm_kernel_L1_M8_BEGIN: +.Lctrmm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble ctrmm_kernel_L1_M4_BEGIN + ble .Lctrmm_kernel_L1_M4_BEGIN -ctrmm_kernel_L1_M8_20: +.Lctrmm_kernel_L1_M8_20: INIT8x1 @@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M8_40 + ble .Lctrmm_kernel_L1_M8_40 .align 5 -ctrmm_kernel_L1_M8_22: +.Lctrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M8_22 + bgt .Lctrmm_kernel_L1_M8_22 -ctrmm_kernel_L1_M8_40: +.Lctrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M8_100 + ble .Lctrmm_kernel_L1_M8_100 -ctrmm_kernel_L1_M8_42: +.Lctrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt 
ctrmm_kernel_L1_M8_42 + bgt .Lctrmm_kernel_L1_M8_42 -ctrmm_kernel_L1_M8_100: +.Lctrmm_kernel_L1_M8_100: SAVE8x1 @@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100: add tempOffset, tempOffset, #8 #endif -ctrmm_kernel_L1_M8_END: +.Lctrmm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt ctrmm_kernel_L1_M8_20 + bgt .Lctrmm_kernel_L1_M8_20 -ctrmm_kernel_L1_M4_BEGIN: +.Lctrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble ctrmm_kernel_L1_M2_BEGIN + ble .Lctrmm_kernel_L1_M2_BEGIN -ctrmm_kernel_L1_M4_20: +.Lctrmm_kernel_L1_M4_20: INIT4x1 @@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M4_40 + ble .Lctrmm_kernel_L1_M4_40 .align 5 -ctrmm_kernel_L1_M4_22: +.Lctrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M4_22 + bgt .Lctrmm_kernel_L1_M4_22 -ctrmm_kernel_L1_M4_40: +.Lctrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M4_100 + ble .Lctrmm_kernel_L1_M4_100 -ctrmm_kernel_L1_M4_42: +.Lctrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M4_42 + bgt .Lctrmm_kernel_L1_M4_42 -ctrmm_kernel_L1_M4_100: +.Lctrmm_kernel_L1_M4_100: SAVE4x1 @@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L1_M4_END: +.Lctrmm_kernel_L1_M4_END: -ctrmm_kernel_L1_M2_BEGIN: +.Lctrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L1_M1_BEGIN + ble .Lctrmm_kernel_L1_M1_BEGIN -ctrmm_kernel_L1_M2_20: +.Lctrmm_kernel_L1_M2_20: INIT2x1 @@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M2_40 + ble .Lctrmm_kernel_L1_M2_40 -ctrmm_kernel_L1_M2_22: +.Lctrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M2_22 + bgt .Lctrmm_kernel_L1_M2_22 -ctrmm_kernel_L1_M2_40: +.Lctrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M2_100 + ble .Lctrmm_kernel_L1_M2_100 -ctrmm_kernel_L1_M2_42: +.Lctrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M2_42 + bgt .Lctrmm_kernel_L1_M2_42 -ctrmm_kernel_L1_M2_100: +.Lctrmm_kernel_L1_M2_100: SAVE2x1 @@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L1_M2_END: +.Lctrmm_kernel_L1_M2_END: -ctrmm_kernel_L1_M1_BEGIN: +.Lctrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END -ctrmm_kernel_L1_M1_20: +.Lctrmm_kernel_L1_M1_20: INIT1x1 @@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M1_40 + ble .Lctrmm_kernel_L1_M1_40 -ctrmm_kernel_L1_M1_22: +.Lctrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M1_22 + bgt .Lctrmm_kernel_L1_M1_22 -ctrmm_kernel_L1_M1_40: +.Lctrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble 
ctrmm_kernel_L1_M1_100 + ble .Lctrmm_kernel_L1_M1_100 -ctrmm_kernel_L1_M1_42: +.Lctrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M1_42 + bgt .Lctrmm_kernel_L1_M1_42 -ctrmm_kernel_L1_M1_100: +.Lctrmm_kernel_L1_M1_100: SAVE1x1 -ctrmm_kernel_L1_END: +.Lctrmm_kernel_L1_END: -ctrmm_kernel_L999: +.Lctrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/daxpy_thunderx2t99.S b/kernel/arm64/daxpy_thunderx2t99.S index 5eb2ec0..b8d0af5 100644 --- a/kernel/arm64/daxpy_thunderx2t99.S +++ b/kernel/arm64/daxpy_thunderx2t99.S @@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble axpy_kernel_L999 + ble .Ldaxpy_kernel_L999 fcmp DA, #0.0 - beq axpy_kernel_L999 + beq .Ldaxpy_kernel_L999 cmp INC_X, #1 - bne axpy_kernel_S_BEGIN + bne .Ldaxpy_kernel_S_BEGIN cmp INC_Y, #1 - bne axpy_kernel_S_BEGIN + bne .Ldaxpy_kernel_S_BEGIN -axpy_kernel_F_BEGIN: +.Ldaxpy_kernel_F_BEGIN: asr I, N, #5 cmp I, xzr - beq axpy_kernel_F1 + beq .Ldaxpy_kernel_F1 .align 5 -axpy_kernel_F32: +.Ldaxpy_kernel_F32: KERNEL_F32 subs I, I, #1 - bne axpy_kernel_F32 + bne .Ldaxpy_kernel_F32 -axpy_kernel_F1: +.Ldaxpy_kernel_F1: ands I, N, #31 - ble axpy_kernel_L999 + ble .Ldaxpy_kernel_L999 -axpy_kernel_F10: +.Ldaxpy_kernel_F10: KERNEL_F1 subs I, I, #1 - bne axpy_kernel_F10 + bne .Ldaxpy_kernel_F10 - b axpy_kernel_L999 + b .Ldaxpy_kernel_L999 -axpy_kernel_S_BEGIN: +.Ldaxpy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble axpy_kernel_S1 + ble .Ldaxpy_kernel_S1 -axpy_kernel_S4: +.Ldaxpy_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -176,21 +176,21 @@ axpy_kernel_S4: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S4 + bne .Ldaxpy_kernel_S4 -axpy_kernel_S1: +.Ldaxpy_kernel_S1: ands I, N, #3 - ble axpy_kernel_L999 + ble .Ldaxpy_kernel_L999 -axpy_kernel_S10: +.Ldaxpy_kernel_S10: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S10 + bne .Ldaxpy_kernel_S10 -axpy_kernel_L999: +.Ldaxpy_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S index 44b0f7f..3491670 100644 --- a/kernel/arm64/dgemm_kernel_4x4.S +++ b/kernel/arm64/dgemm_kernel_4x4.S @@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN: //------------------------------------------------------------------------------ -dgemm_kernel_L4_M8_BEGIN: +.Ldgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L4_M4_BEGIN + ble .Ldgemm_kernel_L4_M4_BEGIN .align 5 -dgemm_kernel_L4_M8_20: +.Ldgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #2 // L = K / 4 cmp counterL , #2 - blt dgemm_kernel_L4_M8_32 + blt .Ldgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble dgemm_kernel_L4_M8_22a + ble .Ldgemm_kernel_L4_M8_22a .align 5 -dgemm_kernel_L4_M8_22: +.Ldgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M8_22 + bgt .Ldgemm_kernel_L4_M8_22 .align 5 -dgemm_kernel_L4_M8_22a: +.Ldgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 .align 5 -dgemm_kernel_L4_M8_32: +.Ldgemm_kernel_L4_M8_32: tst counterL, #1 - ble dgemm_kernel_L4_M8_40 + ble .Ldgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 -dgemm_kernel_L4_M8_40: +.Ldgemm_kernel_L4_M8_40: INIT8x4 -dgemm_kernel_L4_M8_44: +.Ldgemm_kernel_L4_M8_44: ands counterL , origK, #3 - ble dgemm_kernel_L4_M8_100 + ble .Ldgemm_kernel_L4_M8_100 .align 5 -dgemm_kernel_L4_M8_46: +.Ldgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dgemm_kernel_L4_M8_46 + bne .Ldgemm_kernel_L4_M8_46 -dgemm_kernel_L4_M8_100: +.Ldgemm_kernel_L4_M8_100: lsl temp, origK, #5 prfm PLDL1KEEP, [pA, temp] prfm PLDL1KEEP, [ppA, temp] @@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100: SAVE8x4 -dgemm_kernel_L4_M8_END: +.Ldgemm_kernel_L4_M8_END: lsl temp, origK, #5 // k * 4 * 8 add pA, pA, temp add ppA, ppA, temp subs counterI, counterI, #1 - bne dgemm_kernel_L4_M8_20 + bne .Ldgemm_kernel_L4_M8_20 -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #4 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 -dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_42: +.Ldgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_42 + bgt .Ldgemm_kernel_L4_M4_42 -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = 
counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L4_BEGIN + bgt .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? 
tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M4_20 + bgt .Ldgemm_kernel_L2_M4_20 -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 -dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 
/******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN: -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M4_20 + bgt .Ldgemm_kernel_L1_M4_20 -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: 
SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S index b04dbb5..ced26b4 100644 --- a/kernel/arm64/dgemm_kernel_4x8.S +++ b/kernel/arm64/dgemm_kernel_4x8.S @@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 - ble dgemm_kernel_L4_BEGIN + ble .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ -dgemm_kernel_L8_BEGIN: +.Ldgemm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 mov pA, origPA // pA = start of A array -dgemm_kernel_L8_M4_BEGIN: +.Ldgemm_kernel_L8_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dgemm_kernel_L8_M2_BEGIN + ble .Ldgemm_kernel_L8_M2_BEGIN -dgemm_kernel_L8_M4_20: +.Ldgemm_kernel_L8_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt dgemm_kernel_L8_M4_32 + blt .Ldgemm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 - ble dgemm_kernel_L8_M4_22a + ble .Ldgemm_kernel_L8_M4_22a .align 5 -dgemm_kernel_L8_M4_22: +.Ldgemm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M4_22 + bgt .Ldgemm_kernel_L8_M4_22 -dgemm_kernel_L8_M4_22a: +.Ldgemm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E - b dgemm_kernel_L8_M4_44 + b .Ldgemm_kernel_L8_M4_44 -dgemm_kernel_L8_M4_32: +.Ldgemm_kernel_L8_M4_32: tst counterL, #1 - ble dgemm_kernel_L8_M4_40 + ble .Ldgemm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E - b dgemm_kernel_L8_M4_44 + b .Ldgemm_kernel_L8_M4_44 -dgemm_kernel_L8_M4_40: +.Ldgemm_kernel_L8_M4_40: INIT4x8 -dgemm_kernel_L8_M4_44: +.Ldgemm_kernel_L8_M4_44: ands counterL , origK, #1 - ble dgemm_kernel_L8_M4_100 + ble .Ldgemm_kernel_L8_M4_100 -dgemm_kernel_L8_M4_46: +.Ldgemm_kernel_L8_M4_46: KERNEL4x8_SUB -dgemm_kernel_L8_M4_100: +.Ldgemm_kernel_L8_M4_100: SAVE4x8 -dgemm_kernel_L8_M4_END: +.Ldgemm_kernel_L8_M4_END: subs counterI, counterI, #1 - bne dgemm_kernel_L8_M4_20 + bne .Ldgemm_kernel_L8_M4_20 -dgemm_kernel_L8_M2_BEGIN: +.Ldgemm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L8_END + ble .Ldgemm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L8_M1_BEGIN + ble .Ldgemm_kernel_L8_M1_BEGIN -dgemm_kernel_L8_M2_20: +.Ldgemm_kernel_L8_M2_20: INIT2x8 @@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L8_M2_40 + ble .Ldgemm_kernel_L8_M2_40 -dgemm_kernel_L8_M2_22: +.Ldgemm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB @@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M2_22 + bgt .Ldgemm_kernel_L8_M2_22 -dgemm_kernel_L8_M2_40: +.Ldgemm_kernel_L8_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L8_M2_100 + ble .Ldgemm_kernel_L8_M2_100 -dgemm_kernel_L8_M2_42: +.Ldgemm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M2_42 + bgt .Ldgemm_kernel_L8_M2_42 -dgemm_kernel_L8_M2_100: +.Ldgemm_kernel_L8_M2_100: SAVE2x8 -dgemm_kernel_L8_M2_END: +.Ldgemm_kernel_L8_M2_END: -dgemm_kernel_L8_M1_BEGIN: +.Ldgemm_kernel_L8_M1_BEGIN: 
tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L8_END + ble .Ldgemm_kernel_L8_END -dgemm_kernel_L8_M1_20: +.Ldgemm_kernel_L8_M1_20: INIT1x8 @@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L8_M1_40 + ble .Ldgemm_kernel_L8_M1_40 -dgemm_kernel_L8_M1_22: +.Ldgemm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB @@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M1_22 + bgt .Ldgemm_kernel_L8_M1_22 -dgemm_kernel_L8_M1_40: +.Ldgemm_kernel_L8_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L8_M1_100 + ble .Ldgemm_kernel_L8_M1_100 -dgemm_kernel_L8_M1_42: +.Ldgemm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M1_42 + bgt .Ldgemm_kernel_L8_M1_42 -dgemm_kernel_L8_M1_100: +.Ldgemm_kernel_L8_M1_100: SAVE1x8 -dgemm_kernel_L8_END: +.Ldgemm_kernel_L8_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 8 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L8_BEGIN + bgt .Ldgemm_kernel_L8_BEGIN /******************************************************************************/ -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 - ble dgemm_kernel_L999 + ble .Ldgemm_kernel_L999 tst counterJ , #4 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 mov pA, origPA // pA = start of A array -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt dgemm_kernel_L4_M4_32 + blt .Ldgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble dgemm_kernel_L4_M4_22a + ble .Ldgemm_kernel_L4_M4_22a .align 5 -dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_22a: +.Ldgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b dgemm_kernel_L4_M4_44 + b .Ldgemm_kernel_L4_M4_44 -dgemm_kernel_L4_M4_32: +.Ldgemm_kernel_L4_M4_32: tst counterL, #1 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b dgemm_kernel_L4_M4_44 + b .Ldgemm_kernel_L4_M4_44 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: INIT4x4 -dgemm_kernel_L4_M4_44: +.Ldgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_46: +.Ldgemm_kernel_L4_M4_46: KERNEL4x4_SUB -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne dgemm_kernel_L4_M4_20 + bne .Ldgemm_kernel_L4_M4_20 -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 @@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 @@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N 
direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 @@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M4_20 + bgt .Ldgemm_kernel_L2_M4_20 -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 @@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 @@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 
-dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M4_20 + bgt .Ldgemm_kernel_L1_M4_20 -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 @@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 @@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + 
bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S index 3fd74fc..af3aa02 100644 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN /******************************************************************************/ .align 5 -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dgemm_kernel_L4_M8_BEGIN: +.Ldgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L4_M4_BEGIN + ble .Ldgemm_kernel_L4_M4_BEGIN .align 5 -dgemm_kernel_L4_M8_20: +.Ldgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? - blt dgemm_kernel_L4_M8_32 + blt .Ldgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble dgemm_kernel_L4_M8_22a + ble .Ldgemm_kernel_L4_M8_22a .align 5 -dgemm_kernel_L4_M8_22: +.Ldgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M8_22 + bgt .Ldgemm_kernel_L4_M8_22 .align 5 -dgemm_kernel_L4_M8_22a: +.Ldgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 .align 5 -dgemm_kernel_L4_M8_32: +.Ldgemm_kernel_L4_M8_32: tst counterL, #1 - ble dgemm_kernel_L4_M8_40 + ble .Ldgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 -dgemm_kernel_L4_M8_40: +.Ldgemm_kernel_L4_M8_40: INIT8x4 -dgemm_kernel_L4_M8_44: +.Ldgemm_kernel_L4_M8_44: ands counterL , origK, #7 - ble dgemm_kernel_L4_M8_100 + ble .Ldgemm_kernel_L4_M8_100 .align 5 -dgemm_kernel_L4_M8_46: +.Ldgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dgemm_kernel_L4_M8_46 + bne .Ldgemm_kernel_L4_M8_46 -dgemm_kernel_L4_M8_100: +.Ldgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 -dgemm_kernel_L4_M8_END: +.Ldgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne dgemm_kernel_L4_M8_20 + bne .Ldgemm_kernel_L4_M8_20 -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #4 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: INIT4x4 @@ -1019,10 +1019,10 @@ 
dgemm_kernel_L4_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 .align 5 -dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22: prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_42: +.Ldgemm_kernel_L4_M4_42: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_42 + bgt .Ldgemm_kernel_L4_M4_42 -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 @@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 .align 5 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 @@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 .align 5 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB @@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L4_BEGIN + bgt .Ldgemm_kernel_L4_BEGIN 
/******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dgemm_kernel_L2_M8_BEGIN: +.Ldgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L2_M4_BEGIN + ble .Ldgemm_kernel_L2_M4_BEGIN .align 5 -dgemm_kernel_L2_M8_20: +.Ldgemm_kernel_L2_M8_20: INIT8x2 @@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M8_40 + ble .Ldgemm_kernel_L2_M8_40 .align 5 -dgemm_kernel_L2_M8_22: +.Ldgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_22 + bgt .Ldgemm_kernel_L2_M8_22 -dgemm_kernel_L2_M8_40: +.Ldgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M8_100 + ble .Ldgemm_kernel_L2_M8_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M8_42: +.Ldgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_42 + bgt .Ldgemm_kernel_L2_M8_42 -dgemm_kernel_L2_M8_100: +.Ldgemm_kernel_L2_M8_100: SAVE8x2 -dgemm_kernel_L2_M8_END: +.Ldgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M8_20 + bgt .Ldgemm_kernel_L2_M8_20 -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 @@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB @@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 @@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = 
counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 @@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 -dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A -dgemm_kernel_L1_M8_BEGIN: +.Ldgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L1_M4_BEGIN + ble .Ldgemm_kernel_L1_M4_BEGIN .align 5 -dgemm_kernel_L1_M8_20: +.Ldgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M8_40 + ble .Ldgemm_kernel_L1_M8_40 .align 5 -dgemm_kernel_L1_M8_22: +.Ldgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_22 + bgt .Ldgemm_kernel_L1_M8_22 -dgemm_kernel_L1_M8_40: +.Ldgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M8_100 + ble .Ldgemm_kernel_L1_M8_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M8_42: +.Ldgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_42 + bgt .Ldgemm_kernel_L1_M8_42 -dgemm_kernel_L1_M8_100: +.Ldgemm_kernel_L1_M8_100: SAVE8x1 
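// Note on the prefetch hints used throughout these kernels (illustrative
// annotation, not part of the patch): in an instruction such as
//
//         prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
//
// the operation name decodes as PLD (prefetch for load) + L1 (target cache
// level) + KEEP (temporal data, retain in cache), so the A/B panels needed by
// upcoming iterations are pulled toward L1 while the current FMA block runs.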
-dgemm_kernel_L1_M8_END: +.Ldgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M8_20 + bgt .Ldgemm_kernel_L1_M8_20 -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB @@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 @@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 @@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] @@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, 
counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S index 86865d8..598db6e 100644 --- a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S @@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN /******************************************************************************/ .align 5 -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dgemm_kernel_L4_M8_BEGIN: +.Ldgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L4_M4_BEGIN + ble .Ldgemm_kernel_L4_M4_BEGIN .align 5 -dgemm_kernel_L4_M8_20: +.Ldgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #7 // L = K / 128 cmp counterL , #2 // is there at least 4 to do? - blt dgemm_kernel_L4_M8_32 + blt .Ldgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20: KERNEL8x4_M1_M2_x1 subs counterL, counterL, #2 // subtract 2 - ble dgemm_kernel_L4_M8_22a + ble .Ldgemm_kernel_L4_M8_22a .align 5 -dgemm_kernel_L4_M8_22: +.Ldgemm_kernel_L4_M8_22: KERNEL8x4_M1_M2_x64 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M8_22 + bgt .Ldgemm_kernel_L4_M8_22 .align 5 -dgemm_kernel_L4_M8_22a: +.Ldgemm_kernel_L4_M8_22a: KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x16 @@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 .align 5 -dgemm_kernel_L4_M8_32: +.Ldgemm_kernel_L4_M8_32: tst counterL, #1 - ble dgemm_kernel_L4_M8_40 + ble .Ldgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 -dgemm_kernel_L4_M8_40: +.Ldgemm_kernel_L4_M8_40: INIT8x4 -dgemm_kernel_L4_M8_44: +.Ldgemm_kernel_L4_M8_44: ands counterL , origK, #127 - ble dgemm_kernel_L4_M8_100 + ble .Ldgemm_kernel_L4_M8_100 .align 5 -dgemm_kernel_L4_M8_46: +.Ldgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dgemm_kernel_L4_M8_46 + bne .Ldgemm_kernel_L4_M8_46 -dgemm_kernel_L4_M8_100: +.Ldgemm_kernel_L4_M8_100: prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] @@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100: SAVE8x4 -dgemm_kernel_L4_M8_END: +.Ldgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne dgemm_kernel_L4_M8_20 + bne .Ldgemm_kernel_L4_M8_20 -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #4 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: INIT4x4 @@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 .align 5 
-dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22: prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_42: +.Ldgemm_kernel_L4_M4_42: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_42 + bgt .Ldgemm_kernel_L4_M4_42 -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 @@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 .align 5 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 @@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 .align 5 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL1x4_SUB @@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L4_BEGIN + bgt .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst 
counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dgemm_kernel_L2_M8_BEGIN: +.Ldgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L2_M4_BEGIN + ble .Ldgemm_kernel_L2_M4_BEGIN .align 5 -dgemm_kernel_L2_M8_20: +.Ldgemm_kernel_L2_M8_20: INIT8x2 @@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M8_40 + ble .Ldgemm_kernel_L2_M8_40 .align 5 -dgemm_kernel_L2_M8_22: +.Ldgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_22 + bgt .Ldgemm_kernel_L2_M8_22 -dgemm_kernel_L2_M8_40: +.Ldgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M8_100 + ble .Ldgemm_kernel_L2_M8_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M8_42: +.Ldgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_42 + bgt .Ldgemm_kernel_L2_M8_42 -dgemm_kernel_L2_M8_100: +.Ldgemm_kernel_L2_M8_100: SAVE8x2 -dgemm_kernel_L2_M8_END: +.Ldgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M8_20 + bgt .Ldgemm_kernel_L2_M8_20 -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 @@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x2_SUB @@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 @@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22: 
KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 @@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 -dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A -dgemm_kernel_L1_M8_BEGIN: +.Ldgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L1_M4_BEGIN + ble .Ldgemm_kernel_L1_M4_BEGIN .align 5 -dgemm_kernel_L1_M8_20: +.Ldgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M8_40 + ble .Ldgemm_kernel_L1_M8_40 .align 5 -dgemm_kernel_L1_M8_22: +.Ldgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_22 + bgt .Ldgemm_kernel_L1_M8_22 -dgemm_kernel_L1_M8_40: +.Ldgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M8_100 + ble .Ldgemm_kernel_L1_M8_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M8_42: +.Ldgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_42 + bgt .Ldgemm_kernel_L1_M8_42 -dgemm_kernel_L1_M8_100: +.Ldgemm_kernel_L1_M8_100: SAVE8x1 -dgemm_kernel_L1_M8_END: +.Ldgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M8_20 + bgt .Ldgemm_kernel_L1_M8_20 -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L1_END + 
ble .Ldgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x1_SUB @@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 @@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 @@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] @@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, 
#(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_ncopy_4.S b/kernel/arm64/dgemm_ncopy_4.S index c98a732..29d274d 100644 --- a/kernel/arm64/dgemm_ncopy_4.S +++ b/kernel/arm64/dgemm_ncopy_4.S @@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl LDA, LDA, #3 // LDA = LDA * SIZE -dgemm_ncopy_L4_BEGIN: +.Ldgemm_ncopy_L4_BEGIN: asr J, N, #2 // J = N / 4 cmp J, #0 - ble dgemm_ncopy_L2_BEGIN + ble .Ldgemm_ncopy_L2_BEGIN .align 5 -dgemm_ncopy_L4_M4_BEGIN: +.Ldgemm_ncopy_L4_M4_BEGIN: mov A01, A00 add A02, A01, LDA @@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN: asr I, M, #2 // I = M / 4 cmp I, #0 - ble dgemm_ncopy_L4_M4_40 + ble .Ldgemm_ncopy_L4_M4_40 .align 5 -dgemm_ncopy_L4_M4_20: +.Ldgemm_ncopy_L4_M4_20: COPY4x4 subs I , I , #1 - bne dgemm_ncopy_L4_M4_20 + bne .Ldgemm_ncopy_L4_M4_20 -dgemm_ncopy_L4_M4_40: +.Ldgemm_ncopy_L4_M4_40: and I, M , #3 cmp I, #0 - ble dgemm_ncopy_L4_M4_END + ble .Ldgemm_ncopy_L4_M4_END .align 5 -dgemm_ncopy_L4_M4_60: +.Ldgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 - bne dgemm_ncopy_L4_M4_60 + bne .Ldgemm_ncopy_L4_M4_60 -dgemm_ncopy_L4_M4_END: +.Ldgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- - bne dgemm_ncopy_L4_M4_BEGIN + bne .Ldgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ -dgemm_ncopy_L2_BEGIN: +.Ldgemm_ncopy_L2_BEGIN: tst N, #3 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 tst N, #2 - ble dgemm_ncopy_L1_BEGIN + ble .Ldgemm_ncopy_L1_BEGIN -dgemm_ncopy_L2_M4_BEGIN: +.Ldgemm_ncopy_L2_M4_BEGIN: mov A01, A00 add A02, A01, LDA add A00, A02, LDA asr I, M, #2 // I = M / 4 cmp I, #0 - ble dgemm_ncopy_L2_M4_40 + ble .Ldgemm_ncopy_L2_M4_40 .align 5 -dgemm_ncopy_L2_M4_20: +.Ldgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne dgemm_ncopy_L2_M4_20 + bne .Ldgemm_ncopy_L2_M4_20 -dgemm_ncopy_L2_M4_40: +.Ldgemm_ncopy_L2_M4_40: and I, M , #3 cmp I, #0 - ble dgemm_ncopy_L2_M4_END + ble .Ldgemm_ncopy_L2_M4_END .align 5 -dgemm_ncopy_L2_M4_60: +.Ldgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 - bne dgemm_ncopy_L2_M4_60 + bne .Ldgemm_ncopy_L2_M4_60 -dgemm_ncopy_L2_M4_END: +.Ldgemm_ncopy_L2_M4_END: /*********************************************************************************************/ -dgemm_ncopy_L1_BEGIN: +.Ldgemm_ncopy_L1_BEGIN: tst N, #1 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 -dgemm_ncopy_L1_M4_BEGIN: +.Ldgemm_ncopy_L1_M4_BEGIN: mov A01, A00 asr I, M, #2 // I = M / 4 cmp I, #0 - ble dgemm_ncopy_L1_M4_40 + ble .Ldgemm_ncopy_L1_M4_40 .align 5 -dgemm_ncopy_L1_M4_20: +.Ldgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne dgemm_ncopy_L1_M4_20 + bne .Ldgemm_ncopy_L1_M4_20 -dgemm_ncopy_L1_M4_40: +.Ldgemm_ncopy_L1_M4_40: and I, M , #3 cmp I, #0 - ble dgemm_ncopy_L1_M4_END + ble .Ldgemm_ncopy_L1_M4_END .align 5 -dgemm_ncopy_L1_M4_60: +.Ldgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 - bne dgemm_ncopy_L1_M4_60 + bne .Ldgemm_ncopy_L1_M4_60 -dgemm_ncopy_L1_M4_END: +.Ldgemm_ncopy_L1_M4_END: -dgemm_ncopy_L999: +.Ldgemm_ncopy_L999: mov x0, #0 RESTORE_REGS diff --git a/kernel/arm64/dgemm_ncopy_8.S b/kernel/arm64/dgemm_ncopy_8.S index 1f237b4..3664248 100644 --- a/kernel/arm64/dgemm_ncopy_8.S +++ b/kernel/arm64/dgemm_ncopy_8.S @@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lsl LDA, LDA, #3 // LDA = LDA * SIZE -dgemm_ncopy_L8_BEGIN: +.Ldgemm_ncopy_L8_BEGIN: asr J, N, #3 // J = N / 8 cmp J, #0 - ble dgemm_ncopy_L4_BEGIN + ble .Ldgemm_ncopy_L4_BEGIN -dgemm_ncopy_L8_M8_BEGIN: +.Ldgemm_ncopy_L8_M8_BEGIN: mov A01, A00 add A02, A01, LDA @@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN: asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L8_M8_40 + ble .Ldgemm_ncopy_L8_M8_40 -dgemm_ncopy_L8_M8_20: +.Ldgemm_ncopy_L8_M8_20: COPY8x8 subs I , I , #1 - bne dgemm_ncopy_L8_M8_20 + bne .Ldgemm_ncopy_L8_M8_20 -dgemm_ncopy_L8_M8_40: +.Ldgemm_ncopy_L8_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L8_M8_END + ble .Ldgemm_ncopy_L8_M8_END -dgemm_ncopy_L8_M8_60: +.Ldgemm_ncopy_L8_M8_60: COPY1x8 subs I , I , #1 - bne dgemm_ncopy_L8_M8_60 + bne .Ldgemm_ncopy_L8_M8_60 -dgemm_ncopy_L8_M8_END: +.Ldgemm_ncopy_L8_M8_END: subs J , J, #1 // j-- - bne dgemm_ncopy_L8_M8_BEGIN + bne .Ldgemm_ncopy_L8_M8_BEGIN /*********************************************************************************************/ -dgemm_ncopy_L4_BEGIN: +.Ldgemm_ncopy_L4_BEGIN: tst N, #7 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 tst N, #4 - ble dgemm_ncopy_L2_BEGIN + ble .Ldgemm_ncopy_L2_BEGIN -dgemm_ncopy_L4_M8_BEGIN: +.Ldgemm_ncopy_L4_M8_BEGIN: mov A01, A00 add A02, A01, LDA @@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN: asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L4_M8_40 + ble .Ldgemm_ncopy_L4_M8_40 -dgemm_ncopy_L4_M8_20: +.Ldgemm_ncopy_L4_M8_20: COPY8x4 subs I , I , #1 - bne dgemm_ncopy_L4_M8_20 + bne .Ldgemm_ncopy_L4_M8_20 -dgemm_ncopy_L4_M8_40: +.Ldgemm_ncopy_L4_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L4_M8_END + ble .Ldgemm_ncopy_L4_M8_END -dgemm_ncopy_L4_M8_60: +.Ldgemm_ncopy_L4_M8_60: COPY1x4 subs I , I , #1 - bne dgemm_ncopy_L4_M8_60 + bne .Ldgemm_ncopy_L4_M8_60 -dgemm_ncopy_L4_M8_END: +.Ldgemm_ncopy_L4_M8_END: /*********************************************************************************************/ -dgemm_ncopy_L2_BEGIN: +.Ldgemm_ncopy_L2_BEGIN: tst N, #3 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 tst N, #2 - ble dgemm_ncopy_L1_BEGIN + ble .Ldgemm_ncopy_L1_BEGIN -dgemm_ncopy_L2_M8_BEGIN: +.Ldgemm_ncopy_L2_M8_BEGIN: mov A01, A00 add A02, A01, LDA add A00, A02, LDA asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L2_M8_40 + ble .Ldgemm_ncopy_L2_M8_40 -dgemm_ncopy_L2_M8_20: +.Ldgemm_ncopy_L2_M8_20: COPY8x2 subs I , I , #1 - bne dgemm_ncopy_L2_M8_20 + bne .Ldgemm_ncopy_L2_M8_20 -dgemm_ncopy_L2_M8_40: +.Ldgemm_ncopy_L2_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L2_M8_END + ble .Ldgemm_ncopy_L2_M8_END -dgemm_ncopy_L2_M8_60: +.Ldgemm_ncopy_L2_M8_60: COPY1x2 subs I , I , #1 - bne dgemm_ncopy_L2_M8_60 + bne .Ldgemm_ncopy_L2_M8_60 -dgemm_ncopy_L2_M8_END: +.Ldgemm_ncopy_L2_M8_END: /*********************************************************************************************/ -dgemm_ncopy_L1_BEGIN: +.Ldgemm_ncopy_L1_BEGIN: tst N, #1 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 -dgemm_ncopy_L1_M8_BEGIN: +.Ldgemm_ncopy_L1_M8_BEGIN: mov A01, A00 asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L1_M8_40 + ble .Ldgemm_ncopy_L1_M8_40 -dgemm_ncopy_L1_M8_20: +.Ldgemm_ncopy_L1_M8_20: COPY8x1 subs I , I , #1 - bne dgemm_ncopy_L1_M8_20 + bne .Ldgemm_ncopy_L1_M8_20 -dgemm_ncopy_L1_M8_40: +.Ldgemm_ncopy_L1_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L1_M8_END + ble .Ldgemm_ncopy_L1_M8_END -dgemm_ncopy_L1_M8_60: +.Ldgemm_ncopy_L1_M8_60: COPY1x1 subs I , I , #1 - bne dgemm_ncopy_L1_M8_60 + bne .Ldgemm_ncopy_L1_M8_60 -dgemm_ncopy_L1_M8_END: +.Ldgemm_ncopy_L1_M8_END: 
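// The quotient/remainder bookkeeping in these copy routines is the usual
// shift-and-mask idiom for a power-of-two block size (illustrative
// restatement of the surrounding code, not new patch content):
//
//         asr     I, M, #3           // I = M / 8  (arithmetic shift right)
//         and     I, M, #7           // I = M % 8  (mask the low three bits)
//
// so the 8-row main loop runs M/8 times and the 1-row tail loop picks up the
// remaining M%8 iterations.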
-dgemm_ncopy_L999: +.Ldgemm_ncopy_L999: mov x0, #0 RESTORE_REGS diff --git a/kernel/arm64/dgemm_tcopy_4.S b/kernel/arm64/dgemm_tcopy_4.S index 5b2ed43..7c91352 100644 --- a/kernel/arm64/dgemm_tcopy_4.S +++ b/kernel/arm64/dgemm_tcopy_4.S @@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl M4, M, #5 // M4 = M * 4 * SIZE -dgemm_tcopy_L4_BEGIN: +.Ldgemm_tcopy_L4_BEGIN: asr J, M, #2 // J = M / 4 cmp J, #0 - ble dgemm_tcopy_L2_BEGIN + ble .Ldgemm_tcopy_L2_BEGIN .align 5 -dgemm_tcopy_L4_M4_BEGIN: +.Ldgemm_tcopy_L4_M4_BEGIN: mov A01, A add A02, A01, LDA @@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN: asr I, N, #2 // I = N / 4 cmp I, #0 - ble dgemm_tcopy_L4_M4_40 + ble .Ldgemm_tcopy_L4_M4_40 .align 5 -dgemm_tcopy_L4_M4_20: +.Ldgemm_tcopy_L4_M4_20: COPY4x4 subs I , I , #1 - bne dgemm_tcopy_L4_M4_20 + bne .Ldgemm_tcopy_L4_M4_20 -dgemm_tcopy_L4_M4_40: +.Ldgemm_tcopy_L4_M4_40: tst N , #2 - ble dgemm_tcopy_L4_M4_60 + ble .Ldgemm_tcopy_L4_M4_60 COPY2x4 -dgemm_tcopy_L4_M4_60: +.Ldgemm_tcopy_L4_M4_60: tst N, #1 - ble dgemm_tcopy_L4_M4_END + ble .Ldgemm_tcopy_L4_M4_END COPY1x4 -dgemm_tcopy_L4_M4_END: +.Ldgemm_tcopy_L4_M4_END: subs J , J, #1 // j-- - bne dgemm_tcopy_L4_M4_BEGIN + bne .Ldgemm_tcopy_L4_M4_BEGIN /*********************************************************************************************/ -dgemm_tcopy_L2_BEGIN: +.Ldgemm_tcopy_L2_BEGIN: tst M, #3 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 tst M, #2 - ble dgemm_tcopy_L1_BEGIN + ble .Ldgemm_tcopy_L1_BEGIN -dgemm_tcopy_L2_M4_BEGIN: +.Ldgemm_tcopy_L2_M4_BEGIN: mov A01, A add A02, A01, LDA add A, A02, LDA @@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN: asr I, N, #2 // I = N / 4 cmp I, #0 - ble dgemm_tcopy_L2_M4_40 + ble .Ldgemm_tcopy_L2_M4_40 .align 5 -dgemm_tcopy_L2_M4_20: +.Ldgemm_tcopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne dgemm_tcopy_L2_M4_20 + bne .Ldgemm_tcopy_L2_M4_20 -dgemm_tcopy_L2_M4_40: +.Ldgemm_tcopy_L2_M4_40: tst N , #2 - ble dgemm_tcopy_L2_M4_60 + ble .Ldgemm_tcopy_L2_M4_60 COPY2x2 -dgemm_tcopy_L2_M4_60: +.Ldgemm_tcopy_L2_M4_60: tst N , #1 - ble dgemm_tcopy_L2_M4_END + ble .Ldgemm_tcopy_L2_M4_END COPY1x2 -dgemm_tcopy_L2_M4_END: +.Ldgemm_tcopy_L2_M4_END: /*********************************************************************************************/ -dgemm_tcopy_L1_BEGIN: +.Ldgemm_tcopy_L1_BEGIN: tst M, #1 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 -dgemm_tcopy_L1_M4_BEGIN: +.Ldgemm_tcopy_L1_M4_BEGIN: mov A01, A // A01 = A mov B01, B asr I, N, #2 // I = M / 4 cmp I, #0 - ble dgemm_tcopy_L1_M4_40 + ble .Ldgemm_tcopy_L1_M4_40 .align 5 -dgemm_tcopy_L1_M4_20: +.Ldgemm_tcopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne dgemm_tcopy_L1_M4_20 + bne .Ldgemm_tcopy_L1_M4_20 -dgemm_tcopy_L1_M4_40: +.Ldgemm_tcopy_L1_M4_40: tst N , #2 - ble dgemm_tcopy_L1_M4_60 + ble .Ldgemm_tcopy_L1_M4_60 COPY2x1 -dgemm_tcopy_L1_M4_60: +.Ldgemm_tcopy_L1_M4_60: tst N , #1 - ble dgemm_tcopy_L1_M4_END + ble .Ldgemm_tcopy_L1_M4_END COPY1x1 -dgemm_tcopy_L1_M4_END: +.Ldgemm_tcopy_L1_M4_END: -dgemm_tcopy_L999: +.Ldgemm_tcopy_L999: mov x0, #0 // set return value RESTORE_REGS ret diff --git a/kernel/arm64/dgemm_tcopy_8.S b/kernel/arm64/dgemm_tcopy_8.S index 1c57e30..9ab51ff 100644 --- a/kernel/arm64/dgemm_tcopy_8.S +++ b/kernel/arm64/dgemm_tcopy_8.S @@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lsl M8, M, #6 // M8 = M * 8 * SIZE -dgemm_tcopy_L8_BEGIN: +.Ldgemm_tcopy_L8_BEGIN: asr J, M, #3 // J = M / 4 cmp J, #0 - ble dgemm_tcopy_L4_BEGIN + ble .Ldgemm_tcopy_L4_BEGIN .align 5 -dgemm_tcopy_L8_M8_BEGIN: +.Ldgemm_tcopy_L8_M8_BEGIN: mov A01, A add A02, A01, LDA @@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN: asr I, N, #3 // I = N / 8 cmp I, #0 - ble dgemm_tcopy_L8_M8_40 + ble .Ldgemm_tcopy_L8_M8_40 .align 5 -dgemm_tcopy_L8_M8_20: +.Ldgemm_tcopy_L8_M8_20: COPY8x8 subs I , I , #1 - bne dgemm_tcopy_L8_M8_20 + bne .Ldgemm_tcopy_L8_M8_20 -dgemm_tcopy_L8_M8_40: +.Ldgemm_tcopy_L8_M8_40: tst N , #4 - ble dgemm_tcopy_L8_M8_60 + ble .Ldgemm_tcopy_L8_M8_60 COPY4x8 -dgemm_tcopy_L8_M8_60: +.Ldgemm_tcopy_L8_M8_60: tst N , #2 - ble dgemm_tcopy_L8_M8_80 + ble .Ldgemm_tcopy_L8_M8_80 COPY2x8 -dgemm_tcopy_L8_M8_80: +.Ldgemm_tcopy_L8_M8_80: tst N, #1 - ble dgemm_tcopy_L8_M8_END + ble .Ldgemm_tcopy_L8_M8_END COPY1x8 -dgemm_tcopy_L8_M8_END: +.Ldgemm_tcopy_L8_M8_END: subs J , J, #1 // j-- - bne dgemm_tcopy_L8_M8_BEGIN + bne .Ldgemm_tcopy_L8_M8_BEGIN /*********************************************************************************************/ -dgemm_tcopy_L4_BEGIN: +.Ldgemm_tcopy_L4_BEGIN: tst M, #7 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 tst M, #4 - ble dgemm_tcopy_L2_BEGIN + ble .Ldgemm_tcopy_L2_BEGIN -dgemm_tcopy_L4_M8_BEGIN: +.Ldgemm_tcopy_L4_M8_BEGIN: mov A01, A add A02, A01, LDA @@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN: asr I, N, #3 // I = N / 8 cmp I, #0 - ble dgemm_tcopy_L4_M8_40 + ble .Ldgemm_tcopy_L4_M8_40 .align 5 -dgemm_tcopy_L4_M8_20: +.Ldgemm_tcopy_L4_M8_20: COPY8x4 subs I , I , #1 - bne dgemm_tcopy_L4_M8_20 + bne .Ldgemm_tcopy_L4_M8_20 -dgemm_tcopy_L4_M8_40: +.Ldgemm_tcopy_L4_M8_40: tst N , #4 - ble dgemm_tcopy_L4_M8_60 + ble .Ldgemm_tcopy_L4_M8_60 COPY4x4 -dgemm_tcopy_L4_M8_60: +.Ldgemm_tcopy_L4_M8_60: tst N , #2 - ble dgemm_tcopy_L4_M8_80 + ble .Ldgemm_tcopy_L4_M8_80 COPY2x4 -dgemm_tcopy_L4_M8_80: +.Ldgemm_tcopy_L4_M8_80: tst N, #1 - ble dgemm_tcopy_L4_M8_END + ble .Ldgemm_tcopy_L4_M8_END COPY1x4 -dgemm_tcopy_L4_M8_END: +.Ldgemm_tcopy_L4_M8_END: /*********************************************************************************************/ -dgemm_tcopy_L2_BEGIN: +.Ldgemm_tcopy_L2_BEGIN: tst M, #3 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 tst M, #2 - ble dgemm_tcopy_L1_BEGIN + ble .Ldgemm_tcopy_L1_BEGIN -dgemm_tcopy_L2_M8_BEGIN: +.Ldgemm_tcopy_L2_M8_BEGIN: mov A01, A add A02, A01, LDA add A, A02, LDA @@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN: asr I, N, #3 // I = N / 8 cmp I, #0 - ble dgemm_tcopy_L2_M8_40 + ble .Ldgemm_tcopy_L2_M8_40 .align 5 -dgemm_tcopy_L2_M8_20: +.Ldgemm_tcopy_L2_M8_20: COPY8x2 subs I , I , #1 - bne dgemm_tcopy_L2_M8_20 + bne .Ldgemm_tcopy_L2_M8_20 -dgemm_tcopy_L2_M8_40: +.Ldgemm_tcopy_L2_M8_40: tst N , #4 - ble dgemm_tcopy_L2_M8_60 + ble .Ldgemm_tcopy_L2_M8_60 COPY4x2 -dgemm_tcopy_L2_M8_60: +.Ldgemm_tcopy_L2_M8_60: tst N , #2 - ble dgemm_tcopy_L2_M8_80 + ble .Ldgemm_tcopy_L2_M8_80 COPY2x2 -dgemm_tcopy_L2_M8_80: +.Ldgemm_tcopy_L2_M8_80: tst N , #1 - ble dgemm_tcopy_L2_M8_END + ble .Ldgemm_tcopy_L2_M8_END COPY1x2 -dgemm_tcopy_L2_M8_END: +.Ldgemm_tcopy_L2_M8_END: /*********************************************************************************************/ -dgemm_tcopy_L1_BEGIN: +.Ldgemm_tcopy_L1_BEGIN: tst M, #1 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 -dgemm_tcopy_L1_M8_BEGIN: +.Ldgemm_tcopy_L1_M8_BEGIN: mov A01, A // A01 = A mov B01, B asr I, N, #3 // I = M / 8 cmp I, #0 - ble dgemm_tcopy_L1_M8_40 + ble .Ldgemm_tcopy_L1_M8_40 .align 5 
-dgemm_tcopy_L1_M8_20: +.Ldgemm_tcopy_L1_M8_20: COPY8x1 subs I , I , #1 - bne dgemm_tcopy_L1_M8_20 + bne .Ldgemm_tcopy_L1_M8_20 -dgemm_tcopy_L1_M8_40: +.Ldgemm_tcopy_L1_M8_40: tst N , #4 - ble dgemm_tcopy_L1_M8_60 + ble .Ldgemm_tcopy_L1_M8_60 COPY4x1 -dgemm_tcopy_L1_M8_60: +.Ldgemm_tcopy_L1_M8_60: tst N , #2 - ble dgemm_tcopy_L1_M8_80 + ble .Ldgemm_tcopy_L1_M8_80 COPY2x1 -dgemm_tcopy_L1_M8_80: +.Ldgemm_tcopy_L1_M8_80: tst N , #1 - ble dgemm_tcopy_L1_M8_END + ble .Ldgemm_tcopy_L1_M8_END COPY1x1 -dgemm_tcopy_L1_M8_END: +.Ldgemm_tcopy_L1_M8_END: -dgemm_tcopy_L999: +.Ldgemm_tcopy_L999: mov x0, #0 // set return value RESTORE_REGS ret diff --git a/kernel/arm64/dot.S b/kernel/arm64/dot.S index 35d4779..a1a5bf2 100644 --- a/kernel/arm64/dot.S +++ b/kernel/arm64/dot.S @@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif cmp N, xzr - ble dot_kernel_L999 + ble .Ldot_kernel_L999 cmp INC_X, #1 - bne dot_kernel_S_BEGIN + bne .Ldot_kernel_S_BEGIN cmp INC_Y, #1 - bne dot_kernel_S_BEGIN + bne .Ldot_kernel_S_BEGIN -dot_kernel_F_BEGIN: +.Ldot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq dot_kernel_F1 + beq .Ldot_kernel_F1 -dot_kernel_F4: +.Ldot_kernel_F4: KERNEL_F4 subs I, I, #1 - bne dot_kernel_F4 + bne .Ldot_kernel_F4 KERNEL_F4_FINALIZE -dot_kernel_F1: +.Ldot_kernel_F1: ands I, N, #3 - ble dot_kernel_L999 + ble .Ldot_kernel_L999 -dot_kernel_F10: +.Ldot_kernel_F10: KERNEL_F1 subs I, I, #1 - bne dot_kernel_F10 + bne .Ldot_kernel_F10 ret -dot_kernel_S_BEGIN: +.Ldot_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble dot_kernel_S1 + ble .Ldot_kernel_S1 -dot_kernel_S4: +.Ldot_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -206,21 +206,21 @@ dot_kernel_S4: KERNEL_S1 subs I, I, #1 - bne dot_kernel_S4 + bne .Ldot_kernel_S4 -dot_kernel_S1: +.Ldot_kernel_S1: ands I, N, #3 - ble dot_kernel_L999 + ble .Ldot_kernel_L999 -dot_kernel_S10: +.Ldot_kernel_S10: KERNEL_S1 subs I, I, #1 - bne dot_kernel_S10 + bne .Ldot_kernel_S10 -dot_kernel_L999: +.Ldot_kernel_L999: ret diff --git a/kernel/arm64/dtrmm_kernel_4x4.S b/kernel/arm64/dtrmm_kernel_4x4.S index 34fb8c2..b528aeb 100644 --- a/kernel/arm64/dtrmm_kernel_4x4.S +++ b/kernel/arm64/dtrmm_kernel_4x4.S @@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dtrmm_kernel_L2_BEGIN + ble .Ldtrmm_kernel_L2_BEGIN /******************************************************************************/ -dtrmm_kernel_L4_BEGIN: +.Ldtrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dtrmm_kernel_L4_M4_BEGIN: +.Ldtrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L4_M2_BEGIN + ble .Ldtrmm_kernel_L4_M2_BEGIN -dtrmm_kernel_L4_M4_20: +.Ldtrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt dtrmm_kernel_L4_M4_32 + blt .Ldtrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble dtrmm_kernel_L4_M4_22a + ble .Ldtrmm_kernel_L4_M4_22a .align 5 -dtrmm_kernel_L4_M4_22: +.Ldtrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_22 + bgt .Ldtrmm_kernel_L4_M4_22 -dtrmm_kernel_L4_M4_22a: +.Ldtrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_32: +.Ldtrmm_kernel_L4_M4_32: tst counterL, #1 - ble dtrmm_kernel_L4_M4_40 + ble .Ldtrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_40: +.Ldtrmm_kernel_L4_M4_40: INIT4x4 -dtrmm_kernel_L4_M4_44: +.Ldtrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble dtrmm_kernel_L4_M4_100 + ble .Ldtrmm_kernel_L4_M4_100 -dtrmm_kernel_L4_M4_46: +.Ldtrmm_kernel_L4_M4_46: KERNEL4x4_SUB -dtrmm_kernel_L4_M4_100: +.Ldtrmm_kernel_L4_M4_100: SAVE4x4 @@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L4_M4_END: +.Ldtrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L4_M4_20 + bne .Ldtrmm_kernel_L4_M4_20 -dtrmm_kernel_L4_M2_BEGIN: +.Ldtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L4_M1_BEGIN + ble .Ldtrmm_kernel_L4_M1_BEGIN -dtrmm_kernel_L4_M2_20: +.Ldtrmm_kernel_L4_M2_20: INIT2x4 @@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M2_40 + ble .Ldtrmm_kernel_L4_M2_40 -dtrmm_kernel_L4_M2_22: +.Ldtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_22 + bgt .Ldtrmm_kernel_L4_M2_22 -dtrmm_kernel_L4_M2_40: +.Ldtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M2_100 + ble .Ldtrmm_kernel_L4_M2_100 -dtrmm_kernel_L4_M2_42: +.Ldtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_42 + bgt .Ldtrmm_kernel_L4_M2_42 -dtrmm_kernel_L4_M2_100: +.Ldtrmm_kernel_L4_M2_100: SAVE2x4 @@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L4_M2_END: +.Ldtrmm_kernel_L4_M2_END: -dtrmm_kernel_L4_M1_BEGIN: +.Ldtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END -dtrmm_kernel_L4_M1_20: +.Ldtrmm_kernel_L4_M1_20: INIT1x4 @@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M1_40 + ble .Ldtrmm_kernel_L4_M1_40 -dtrmm_kernel_L4_M1_22: +.Ldtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_22 + bgt .Ldtrmm_kernel_L4_M1_22 -dtrmm_kernel_L4_M1_40: +.Ldtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M1_100 + ble .Ldtrmm_kernel_L4_M1_100 -dtrmm_kernel_L4_M1_42: +.Ldtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_42 + bgt .Ldtrmm_kernel_L4_M1_42 -dtrmm_kernel_L4_M1_100: +.Ldtrmm_kernel_L4_M1_100: SAVE1x4 @@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 #endif 
-dtrmm_kernel_L4_END: +.Ldtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -838,19 +838,19 @@ dtrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt dtrmm_kernel_L4_BEGIN + bgt .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ -dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dtrmm_kernel_L999 // error, N was less than 4? + ble .Ldtrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dtrmm_kernel_L1_BEGIN + ble .Ldtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dtrmm_kernel_L2_M4_BEGIN: +.Ldtrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dtrmm_kernel_L2_M2_BEGIN + ble .Ldtrmm_kernel_L2_M2_BEGIN -dtrmm_kernel_L2_M4_20: +.Ldtrmm_kernel_L2_M4_20: INIT4x2 @@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M4_40 + ble .Ldtrmm_kernel_L2_M4_40 .align 5 -dtrmm_kernel_L2_M4_22: +.Ldtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_22 + bgt .Ldtrmm_kernel_L2_M4_22 -dtrmm_kernel_L2_M4_40: +.Ldtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M4_100 + ble .Ldtrmm_kernel_L2_M4_100 -dtrmm_kernel_L2_M4_42: +.Ldtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_42 + bgt .Ldtrmm_kernel_L2_M4_42 -dtrmm_kernel_L2_M4_100: +.Ldtrmm_kernel_L2_M4_100: SAVE4x2 @@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L2_M4_END: +.Ldtrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L2_M4_20 + bgt .Ldtrmm_kernel_L2_M4_20 -dtrmm_kernel_L2_M2_BEGIN: +.Ldtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M1_BEGIN + ble .Ldtrmm_kernel_L2_M1_BEGIN -dtrmm_kernel_L2_M2_20: +.Ldtrmm_kernel_L2_M2_20: INIT2x2 @@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M2_40 + ble .Ldtrmm_kernel_L2_M2_40 -dtrmm_kernel_L2_M2_22: +.Ldtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_22 + bgt .Ldtrmm_kernel_L2_M2_22 -dtrmm_kernel_L2_M2_40: +.Ldtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M2_100 + ble .Ldtrmm_kernel_L2_M2_100 -dtrmm_kernel_L2_M2_42: +.Ldtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_42 + bgt .Ldtrmm_kernel_L2_M2_42 -dtrmm_kernel_L2_M2_100: +.Ldtrmm_kernel_L2_M2_100: SAVE2x2 @@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L2_M2_END: +.Ldtrmm_kernel_L2_M2_END: -dtrmm_kernel_L2_M1_BEGIN: +.Ldtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END -dtrmm_kernel_L2_M1_20: +.Ldtrmm_kernel_L2_M1_20: INIT1x2 @@ -1065,9 +1065,9 @@ 
dtrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dtrmm_kernel_L2_M1_40 + ble .Ldtrmm_kernel_L2_M1_40 -dtrmm_kernel_L2_M1_22: +.Ldtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_22 + bgt .Ldtrmm_kernel_L2_M1_22 -dtrmm_kernel_L2_M1_40: +.Ldtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M1_100 + ble .Ldtrmm_kernel_L2_M1_100 -dtrmm_kernel_L2_M1_42: +.Ldtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_42 + bgt .Ldtrmm_kernel_L2_M1_42 -dtrmm_kernel_L2_M1_100: +.Ldtrmm_kernel_L2_M1_100: SAVE1x2 @@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L2_END: +.Ldtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END: /******************************************************************************/ -dtrmm_kernel_L1_BEGIN: +.Ldtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dtrmm_kernel_L999 // done + ble .Ldtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN: mov pA, origPA // pA = A -dtrmm_kernel_L1_M4_BEGIN: +.Ldtrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L1_M2_BEGIN + ble .Ldtrmm_kernel_L1_M2_BEGIN -dtrmm_kernel_L1_M4_20: +.Ldtrmm_kernel_L1_M4_20: INIT4x1 @@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M4_40 + ble .Ldtrmm_kernel_L1_M4_40 .align 5 -dtrmm_kernel_L1_M4_22: +.Ldtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_22 + bgt .Ldtrmm_kernel_L1_M4_22 -dtrmm_kernel_L1_M4_40: +.Ldtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M4_100 + ble .Ldtrmm_kernel_L1_M4_100 -dtrmm_kernel_L1_M4_42: +.Ldtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_42 + bgt .Ldtrmm_kernel_L1_M4_42 -dtrmm_kernel_L1_M4_100: +.Ldtrmm_kernel_L1_M4_100: SAVE4x1 @@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L1_M4_END: +.Ldtrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L1_M4_20 + bgt .Ldtrmm_kernel_L1_M4_20 -dtrmm_kernel_L1_M2_BEGIN: +.Ldtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M1_BEGIN + ble .Ldtrmm_kernel_L1_M1_BEGIN -dtrmm_kernel_L1_M2_20: +.Ldtrmm_kernel_L1_M2_20: INIT2x1 @@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M2_40 + ble .Ldtrmm_kernel_L1_M2_40 -dtrmm_kernel_L1_M2_22: +.Ldtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_22 + bgt .Ldtrmm_kernel_L1_M2_22 -dtrmm_kernel_L1_M2_40: +.Ldtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M2_100 + ble .Ldtrmm_kernel_L1_M2_100 -dtrmm_kernel_L1_M2_42: +.Ldtrmm_kernel_L1_M2_42: 
KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_42 + bgt .Ldtrmm_kernel_L1_M2_42 -dtrmm_kernel_L1_M2_100: +.Ldtrmm_kernel_L1_M2_100: SAVE2x1 @@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L1_M2_END: +.Ldtrmm_kernel_L1_M2_END: -dtrmm_kernel_L1_M1_BEGIN: +.Ldtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END -dtrmm_kernel_L1_M1_20: +.Ldtrmm_kernel_L1_M1_20: INIT1x1 @@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M1_40 + ble .Ldtrmm_kernel_L1_M1_40 -dtrmm_kernel_L1_M1_22: +.Ldtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_22 + bgt .Ldtrmm_kernel_L1_M1_22 -dtrmm_kernel_L1_M1_40: +.Ldtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M1_100 + ble .Ldtrmm_kernel_L1_M1_100 -dtrmm_kernel_L1_M1_42: +.Ldtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_42 + bgt .Ldtrmm_kernel_L1_M1_42 -dtrmm_kernel_L1_M1_100: +.Ldtrmm_kernel_L1_M1_100: SAVE1x1 -dtrmm_kernel_L1_END: +.Ldtrmm_kernel_L1_END: -dtrmm_kernel_L999: +.Ldtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S index 4aecf28..47956de 100644 --- a/kernel/arm64/dtrmm_kernel_4x8.S +++ b/kernel/arm64/dtrmm_kernel_4x8.S @@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 - ble dtrmm_kernel_L4_BEGIN + ble .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ -dtrmm_kernel_L8_BEGIN: +.Ldtrmm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 @@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN: mov pA, origPA // pA = start of A array -dtrmm_kernel_L8_M4_BEGIN: +.Ldtrmm_kernel_L8_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L8_M2_BEGIN + ble .Ldtrmm_kernel_L8_M2_BEGIN -dtrmm_kernel_L8_M4_20: +.Ldtrmm_kernel_L8_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20: asr counterL, tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt dtrmm_kernel_L8_M4_32 + blt .Ldtrmm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 - ble dtrmm_kernel_L8_M4_22a + ble .Ldtrmm_kernel_L8_M4_22a .align 5 -dtrmm_kernel_L8_M4_22: +.Ldtrmm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M4_22 + bgt .Ldtrmm_kernel_L8_M4_22 -dtrmm_kernel_L8_M4_22a: +.Ldtrmm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E - b dtrmm_kernel_L8_M4_44 + b .Ldtrmm_kernel_L8_M4_44 -dtrmm_kernel_L8_M4_32: +.Ldtrmm_kernel_L8_M4_32: tst counterL, #1 - ble dtrmm_kernel_L8_M4_40 + ble .Ldtrmm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E - b dtrmm_kernel_L8_M4_44 + b .Ldtrmm_kernel_L8_M4_44 -dtrmm_kernel_L8_M4_40: +.Ldtrmm_kernel_L8_M4_40: INIT4x8 -dtrmm_kernel_L8_M4_44: +.Ldtrmm_kernel_L8_M4_44: ands counterL, tempK, #1 - ble dtrmm_kernel_L8_M4_100 + ble .Ldtrmm_kernel_L8_M4_100 -dtrmm_kernel_L8_M4_46: +.Ldtrmm_kernel_L8_M4_46: KERNEL4x8_SUB -dtrmm_kernel_L8_M4_100: +.Ldtrmm_kernel_L8_M4_100: SAVE4x8 @@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L8_M4_END: +.Ldtrmm_kernel_L8_M4_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L8_M4_20 + bne .Ldtrmm_kernel_L8_M4_20 -dtrmm_kernel_L8_M2_BEGIN: +.Ldtrmm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L8_END + ble .Ldtrmm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L8_M1_BEGIN + ble .Ldtrmm_kernel_L8_M1_BEGIN -dtrmm_kernel_L8_M2_20: +.Ldtrmm_kernel_L8_M2_20: INIT2x8 @@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20: asr counterL, tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L8_M2_40 + ble .Ldtrmm_kernel_L8_M2_40 -dtrmm_kernel_L8_M2_22: +.Ldtrmm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB @@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M2_22 + bgt .Ldtrmm_kernel_L8_M2_22 -dtrmm_kernel_L8_M2_40: +.Ldtrmm_kernel_L8_M2_40: ands counterL, tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L8_M2_100 + ble .Ldtrmm_kernel_L8_M2_100 -dtrmm_kernel_L8_M2_42: +.Ldtrmm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M2_42 + bgt .Ldtrmm_kernel_L8_M2_42 -dtrmm_kernel_L8_M2_100: +.Ldtrmm_kernel_L8_M2_100: SAVE2x8 @@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L8_M2_END: +.Ldtrmm_kernel_L8_M2_END: -dtrmm_kernel_L8_M1_BEGIN: +.Ldtrmm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L8_END + ble .Ldtrmm_kernel_L8_END -dtrmm_kernel_L8_M1_20: +.Ldtrmm_kernel_L8_M1_20: INIT1x8 @@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20: asr counterL, tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L8_M1_40 + ble .Ldtrmm_kernel_L8_M1_40 -dtrmm_kernel_L8_M1_22: +.Ldtrmm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB @@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M1_22 + bgt .Ldtrmm_kernel_L8_M1_22 -dtrmm_kernel_L8_M1_40: +.Ldtrmm_kernel_L8_M1_40: ands counterL, tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L8_M1_100 + ble .Ldtrmm_kernel_L8_M1_100 -dtrmm_kernel_L8_M1_42: +.Ldtrmm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M1_42 + bgt .Ldtrmm_kernel_L8_M1_42 -dtrmm_kernel_L8_M1_100: +.Ldtrmm_kernel_L8_M1_100: SAVE1x8 @@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100: add tempOffset, tempOffset, #1 
#endif -dtrmm_kernel_L8_END: +.Ldtrmm_kernel_L8_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 8 * 8 @@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END: #endif subs counterJ, counterJ , #1 // j-- - bgt dtrmm_kernel_L8_BEGIN + bgt .Ldtrmm_kernel_L8_BEGIN /******************************************************************************/ -dtrmm_kernel_L4_BEGIN: +.Ldtrmm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 - ble dtrmm_kernel_L999 + ble .Ldtrmm_kernel_L999 tst counterJ , #4 - ble dtrmm_kernel_L2_BEGIN + ble .Ldtrmm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dtrmm_kernel_L4_M4_BEGIN: +.Ldtrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L4_M2_BEGIN + ble .Ldtrmm_kernel_L4_M2_BEGIN -dtrmm_kernel_L4_M4_20: +.Ldtrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20: asr counterL, tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt dtrmm_kernel_L4_M4_32 + blt .Ldtrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble dtrmm_kernel_L4_M4_22a + ble .Ldtrmm_kernel_L4_M4_22a .align 5 -dtrmm_kernel_L4_M4_22: +.Ldtrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_22 + bgt .Ldtrmm_kernel_L4_M4_22 -dtrmm_kernel_L4_M4_22a: +.Ldtrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_32: +.Ldtrmm_kernel_L4_M4_32: tst counterL, #1 - ble dtrmm_kernel_L4_M4_40 + ble .Ldtrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_40: +.Ldtrmm_kernel_L4_M4_40: INIT4x4 -dtrmm_kernel_L4_M4_44: +.Ldtrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble dtrmm_kernel_L4_M4_100 + ble .Ldtrmm_kernel_L4_M4_100 -dtrmm_kernel_L4_M4_46: +.Ldtrmm_kernel_L4_M4_46: KERNEL4x4_SUB -dtrmm_kernel_L4_M4_100: +.Ldtrmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L4_M4_END: +.Ldtrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L4_M4_20 + bne .Ldtrmm_kernel_L4_M4_20 -dtrmm_kernel_L4_M2_BEGIN: +.Ldtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L4_M1_BEGIN + ble .Ldtrmm_kernel_L4_M1_BEGIN -dtrmm_kernel_L4_M2_20: +.Ldtrmm_kernel_L4_M2_20: INIT2x4 @@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M2_40 + ble .Ldtrmm_kernel_L4_M2_40 -dtrmm_kernel_L4_M2_22: +.Ldtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_22 + bgt .Ldtrmm_kernel_L4_M2_22 -dtrmm_kernel_L4_M2_40: +.Ldtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M2_100 + ble .Ldtrmm_kernel_L4_M2_100 -dtrmm_kernel_L4_M2_42: +.Ldtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_42 + bgt 
.Ldtrmm_kernel_L4_M2_42 -dtrmm_kernel_L4_M2_100: +.Ldtrmm_kernel_L4_M2_100: SAVE2x4 @@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L4_M2_END: +.Ldtrmm_kernel_L4_M2_END: -dtrmm_kernel_L4_M1_BEGIN: +.Ldtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END -dtrmm_kernel_L4_M1_20: +.Ldtrmm_kernel_L4_M1_20: INIT1x4 @@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M1_40 + ble .Ldtrmm_kernel_L4_M1_40 -dtrmm_kernel_L4_M1_22: +.Ldtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_22 + bgt .Ldtrmm_kernel_L4_M1_22 -dtrmm_kernel_L4_M1_40: +.Ldtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M1_100 + ble .Ldtrmm_kernel_L4_M1_100 -dtrmm_kernel_L4_M1_42: +.Ldtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_42 + bgt .Ldtrmm_kernel_L4_M1_42 -dtrmm_kernel_L4_M1_100: +.Ldtrmm_kernel_L4_M1_100: SAVE1x4 @@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L4_END: +.Ldtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END: /******************************************************************************/ -dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dtrmm_kernel_L999 // error, N was less than 4? + ble .Ldtrmm_kernel_L999 // error, N was less than 4? 
tst counterJ , #2 - ble dtrmm_kernel_L1_BEGIN + ble .Ldtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dtrmm_kernel_L2_M4_BEGIN: +.Ldtrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dtrmm_kernel_L2_M2_BEGIN + ble .Ldtrmm_kernel_L2_M2_BEGIN -dtrmm_kernel_L2_M4_20: +.Ldtrmm_kernel_L2_M4_20: INIT4x2 @@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M4_40 + ble .Ldtrmm_kernel_L2_M4_40 .align 5 -dtrmm_kernel_L2_M4_22: +.Ldtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_22 + bgt .Ldtrmm_kernel_L2_M4_22 -dtrmm_kernel_L2_M4_40: +.Ldtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M4_100 + ble .Ldtrmm_kernel_L2_M4_100 -dtrmm_kernel_L2_M4_42: +.Ldtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_42 + bgt .Ldtrmm_kernel_L2_M4_42 -dtrmm_kernel_L2_M4_100: +.Ldtrmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L2_M4_END: +.Ldtrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L2_M4_20 + bgt .Ldtrmm_kernel_L2_M4_20 -dtrmm_kernel_L2_M2_BEGIN: +.Ldtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M1_BEGIN + ble .Ldtrmm_kernel_L2_M1_BEGIN -dtrmm_kernel_L2_M2_20: +.Ldtrmm_kernel_L2_M2_20: INIT2x2 @@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M2_40 + ble .Ldtrmm_kernel_L2_M2_40 -dtrmm_kernel_L2_M2_22: +.Ldtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_22 + bgt .Ldtrmm_kernel_L2_M2_22 -dtrmm_kernel_L2_M2_40: +.Ldtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M2_100 + ble .Ldtrmm_kernel_L2_M2_100 -dtrmm_kernel_L2_M2_42: +.Ldtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_42 + bgt .Ldtrmm_kernel_L2_M2_42 -dtrmm_kernel_L2_M2_100: +.Ldtrmm_kernel_L2_M2_100: SAVE2x2 @@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L2_M2_END: +.Ldtrmm_kernel_L2_M2_END: -dtrmm_kernel_L2_M1_BEGIN: +.Ldtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END -dtrmm_kernel_L2_M1_20: +.Ldtrmm_kernel_L2_M1_20: INIT1x2 @@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dtrmm_kernel_L2_M1_40 + ble .Ldtrmm_kernel_L2_M1_40 -dtrmm_kernel_L2_M1_22: +.Ldtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_22 + bgt .Ldtrmm_kernel_L2_M1_22 -dtrmm_kernel_L2_M1_40: +.Ldtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = 
counterL % 8 - ble dtrmm_kernel_L2_M1_100 + ble .Ldtrmm_kernel_L2_M1_100 -dtrmm_kernel_L2_M1_42: +.Ldtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_42 + bgt .Ldtrmm_kernel_L2_M1_42 -dtrmm_kernel_L2_M1_100: +.Ldtrmm_kernel_L2_M1_100: SAVE1x2 @@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L2_END: +.Ldtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END: /******************************************************************************/ -dtrmm_kernel_L1_BEGIN: +.Ldtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dtrmm_kernel_L999 // done + ble .Ldtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -dtrmm_kernel_L1_M4_BEGIN: +.Ldtrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L1_M2_BEGIN + ble .Ldtrmm_kernel_L1_M2_BEGIN -dtrmm_kernel_L1_M4_20: +.Ldtrmm_kernel_L1_M4_20: INIT4x1 @@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M4_40 + ble .Ldtrmm_kernel_L1_M4_40 .align 5 -dtrmm_kernel_L1_M4_22: +.Ldtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_22 + bgt .Ldtrmm_kernel_L1_M4_22 -dtrmm_kernel_L1_M4_40: +.Ldtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M4_100 + ble .Ldtrmm_kernel_L1_M4_100 -dtrmm_kernel_L1_M4_42: +.Ldtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_42 + bgt .Ldtrmm_kernel_L1_M4_42 -dtrmm_kernel_L1_M4_100: +.Ldtrmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L1_M4_END: +.Ldtrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L1_M4_20 + bgt .Ldtrmm_kernel_L1_M4_20 -dtrmm_kernel_L1_M2_BEGIN: +.Ldtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M1_BEGIN + ble .Ldtrmm_kernel_L1_M1_BEGIN -dtrmm_kernel_L1_M2_20: +.Ldtrmm_kernel_L1_M2_20: INIT2x1 @@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M2_40 + ble .Ldtrmm_kernel_L1_M2_40 -dtrmm_kernel_L1_M2_22: +.Ldtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_22 + bgt .Ldtrmm_kernel_L1_M2_22 -dtrmm_kernel_L1_M2_40: +.Ldtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M2_100 + ble .Ldtrmm_kernel_L1_M2_100 -dtrmm_kernel_L1_M2_42: +.Ldtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_42 + bgt .Ldtrmm_kernel_L1_M2_42 -dtrmm_kernel_L1_M2_100: +.Ldtrmm_kernel_L1_M2_100: SAVE2x1 @@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L1_M2_END: +.Ldtrmm_kernel_L1_M2_END: -dtrmm_kernel_L1_M1_BEGIN: 
+.Ldtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END -dtrmm_kernel_L1_M1_20: +.Ldtrmm_kernel_L1_M1_20: INIT1x1 @@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M1_40 + ble .Ldtrmm_kernel_L1_M1_40 -dtrmm_kernel_L1_M1_22: +.Ldtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_22 + bgt .Ldtrmm_kernel_L1_M1_22 -dtrmm_kernel_L1_M1_40: +.Ldtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M1_100 + ble .Ldtrmm_kernel_L1_M1_100 -dtrmm_kernel_L1_M1_42: +.Ldtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_42 + bgt .Ldtrmm_kernel_L1_M1_42 -dtrmm_kernel_L1_M1_100: +.Ldtrmm_kernel_L1_M1_100: SAVE1x1 -dtrmm_kernel_L1_END: +.Ldtrmm_kernel_L1_END: -dtrmm_kernel_L999: +.Ldtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index 2b81737..0ac5a5f 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dtrmm_kernel_L2_BEGIN + ble .Ldtrmm_kernel_L2_BEGIN /******************************************************************************/ -dtrmm_kernel_L4_BEGIN: +.Ldtrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -dtrmm_kernel_L4_M8_BEGIN: +.Ldtrmm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dtrmm_kernel_L4_M4_BEGIN + ble .Ldtrmm_kernel_L4_M4_BEGIN .align 5 -dtrmm_kernel_L4_M8_20: +.Ldtrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20: asr counterL , tempK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? 
- blt dtrmm_kernel_L4_M8_32 + blt .Ldtrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K @@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble dtrmm_kernel_L4_M8_22a + ble .Ldtrmm_kernel_L4_M8_22a .align 5 -dtrmm_kernel_L4_M8_22: +.Ldtrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M8_22 + bgt .Ldtrmm_kernel_L4_M8_22 .align 5 -dtrmm_kernel_L4_M8_22a: +.Ldtrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b dtrmm_kernel_L4_M8_44 + b .Ldtrmm_kernel_L4_M8_44 .align 5 -dtrmm_kernel_L4_M8_32: +.Ldtrmm_kernel_L4_M8_32: tst counterL, #1 - ble dtrmm_kernel_L4_M8_40 + ble .Ldtrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b dtrmm_kernel_L4_M8_44 + b .Ldtrmm_kernel_L4_M8_44 -dtrmm_kernel_L4_M8_40: +.Ldtrmm_kernel_L4_M8_40: INIT8x4 -dtrmm_kernel_L4_M8_44: +.Ldtrmm_kernel_L4_M8_44: ands counterL , tempK, #7 - ble dtrmm_kernel_L4_M8_100 + ble .Ldtrmm_kernel_L4_M8_100 .align 5 -dtrmm_kernel_L4_M8_46: +.Ldtrmm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dtrmm_kernel_L4_M8_46 + bne .Ldtrmm_kernel_L4_M8_46 -dtrmm_kernel_L4_M8_100: +.Ldtrmm_kernel_L4_M8_100: SAVE8x4 @@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -dtrmm_kernel_L4_M8_END: +.Ldtrmm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L4_M8_20 + bne .Ldtrmm_kernel_L4_M8_20 -dtrmm_kernel_L4_M4_BEGIN: +.Ldtrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #4 - ble dtrmm_kernel_L4_M2_BEGIN + ble .Ldtrmm_kernel_L4_M2_BEGIN -dtrmm_kernel_L4_M4_20: +.Ldtrmm_kernel_L4_M4_20: INIT4x4 @@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M4_40 + ble .Ldtrmm_kernel_L4_M4_40 -dtrmm_kernel_L4_M4_22: +.Ldtrmm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_22 + bgt .Ldtrmm_kernel_L4_M4_22 -dtrmm_kernel_L4_M4_40: +.Ldtrmm_kernel_L4_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M4_100 + ble .Ldtrmm_kernel_L4_M4_100 -dtrmm_kernel_L4_M4_42: +.Ldtrmm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_42 + bgt .Ldtrmm_kernel_L4_M4_42 -dtrmm_kernel_L4_M4_100: +.Ldtrmm_kernel_L4_M4_100: SAVE4x4 @@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L4_M4_END: +.Ldtrmm_kernel_L4_M4_END: -dtrmm_kernel_L4_M2_BEGIN: +.Ldtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L4_M1_BEGIN + ble .Ldtrmm_kernel_L4_M1_BEGIN -dtrmm_kernel_L4_M2_20: +.Ldtrmm_kernel_L4_M2_20: INIT2x4 @@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M2_40 + ble .Ldtrmm_kernel_L4_M2_40 -dtrmm_kernel_L4_M2_22: +.Ldtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_22 + bgt 
.Ldtrmm_kernel_L4_M2_22 -dtrmm_kernel_L4_M2_40: +.Ldtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M2_100 + ble .Ldtrmm_kernel_L4_M2_100 -dtrmm_kernel_L4_M2_42: +.Ldtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_42 + bgt .Ldtrmm_kernel_L4_M2_42 -dtrmm_kernel_L4_M2_100: +.Ldtrmm_kernel_L4_M2_100: SAVE2x4 @@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L4_M2_END: +.Ldtrmm_kernel_L4_M2_END: -dtrmm_kernel_L4_M1_BEGIN: +.Ldtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END -dtrmm_kernel_L4_M1_20: +.Ldtrmm_kernel_L4_M1_20: INIT1x4 @@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M1_40 + ble .Ldtrmm_kernel_L4_M1_40 -dtrmm_kernel_L4_M1_22: +.Ldtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_22 + bgt .Ldtrmm_kernel_L4_M1_22 -dtrmm_kernel_L4_M1_40: +.Ldtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M1_100 + ble .Ldtrmm_kernel_L4_M1_100 -dtrmm_kernel_L4_M1_42: +.Ldtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_42 + bgt .Ldtrmm_kernel_L4_M1_42 -dtrmm_kernel_L4_M1_100: +.Ldtrmm_kernel_L4_M1_100: SAVE1x4 @@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L4_END: +.Ldtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt dtrmm_kernel_L4_BEGIN + bgt .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ -dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dtrmm_kernel_L999 // error, N was less than 4? + ble .Ldtrmm_kernel_L999 // error, N was less than 4? 
tst counterJ , #2 - ble dtrmm_kernel_L1_BEGIN + ble .Ldtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction #endif mov pA, origPA // pA = A -dtrmm_kernel_L2_M8_BEGIN: +.Ldtrmm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dtrmm_kernel_L2_M4_BEGIN + ble .Ldtrmm_kernel_L2_M4_BEGIN -dtrmm_kernel_L2_M8_20: +.Ldtrmm_kernel_L2_M8_20: INIT8x2 @@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M8_40 + ble .Ldtrmm_kernel_L2_M8_40 .align 5 -dtrmm_kernel_L2_M8_22: +.Ldtrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M8_22 + bgt .Ldtrmm_kernel_L2_M8_22 -dtrmm_kernel_L2_M8_40: +.Ldtrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M8_100 + ble .Ldtrmm_kernel_L2_M8_100 -dtrmm_kernel_L2_M8_42: +.Ldtrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M8_42 + bgt .Ldtrmm_kernel_L2_M8_42 -dtrmm_kernel_L2_M8_100: +.Ldtrmm_kernel_L2_M8_100: SAVE8x2 @@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100: add tempOffset, tempOffset, #8 #endif -dtrmm_kernel_L2_M8_END: +.Ldtrmm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L2_M8_20 + bgt .Ldtrmm_kernel_L2_M8_20 -dtrmm_kernel_L2_M4_BEGIN: +.Ldtrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M2_BEGIN + ble .Ldtrmm_kernel_L2_M2_BEGIN -dtrmm_kernel_L2_M4_20: +.Ldtrmm_kernel_L2_M4_20: INIT4x2 @@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M4_40 + ble .Ldtrmm_kernel_L2_M4_40 .align 5 -dtrmm_kernel_L2_M4_22: +.Ldtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_22 + bgt .Ldtrmm_kernel_L2_M4_22 -dtrmm_kernel_L2_M4_40: +.Ldtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M4_100 + ble .Ldtrmm_kernel_L2_M4_100 -dtrmm_kernel_L2_M4_42: +.Ldtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_42 + bgt .Ldtrmm_kernel_L2_M4_42 -dtrmm_kernel_L2_M4_100: +.Ldtrmm_kernel_L2_M4_100: SAVE4x2 @@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L2_M4_END: +.Ldtrmm_kernel_L2_M4_END: -dtrmm_kernel_L2_M2_BEGIN: +.Ldtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M1_BEGIN + ble .Ldtrmm_kernel_L2_M1_BEGIN -dtrmm_kernel_L2_M2_20: +.Ldtrmm_kernel_L2_M2_20: INIT2x2 @@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M2_40 + ble .Ldtrmm_kernel_L2_M2_40 -dtrmm_kernel_L2_M2_22: +.Ldtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_22 + bgt .Ldtrmm_kernel_L2_M2_22 -dtrmm_kernel_L2_M2_40: +.Ldtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = 
counterL % 8 - ble dtrmm_kernel_L2_M2_100 + ble .Ldtrmm_kernel_L2_M2_100 -dtrmm_kernel_L2_M2_42: +.Ldtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_42 + bgt .Ldtrmm_kernel_L2_M2_42 -dtrmm_kernel_L2_M2_100: +.Ldtrmm_kernel_L2_M2_100: SAVE2x2 @@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L2_M2_END: +.Ldtrmm_kernel_L2_M2_END: -dtrmm_kernel_L2_M1_BEGIN: +.Ldtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END -dtrmm_kernel_L2_M1_20: +.Ldtrmm_kernel_L2_M1_20: INIT1x2 @@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dtrmm_kernel_L2_M1_40 + ble .Ldtrmm_kernel_L2_M1_40 -dtrmm_kernel_L2_M1_22: +.Ldtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_22 + bgt .Ldtrmm_kernel_L2_M1_22 -dtrmm_kernel_L2_M1_40: +.Ldtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M1_100 + ble .Ldtrmm_kernel_L2_M1_100 -dtrmm_kernel_L2_M1_42: +.Ldtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_42 + bgt .Ldtrmm_kernel_L2_M1_42 -dtrmm_kernel_L2_M1_100: +.Ldtrmm_kernel_L2_M1_100: SAVE1x2 @@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L2_END: +.Ldtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END: /******************************************************************************/ -dtrmm_kernel_L1_BEGIN: +.Ldtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dtrmm_kernel_L999 // done + ble .Ldtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next @@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -dtrmm_kernel_L1_M8_BEGIN: +.Ldtrmm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dtrmm_kernel_L1_M4_BEGIN + ble .Ldtrmm_kernel_L1_M4_BEGIN -dtrmm_kernel_L1_M8_20: +.Ldtrmm_kernel_L1_M8_20: INIT8x1 @@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M8_40 + ble .Ldtrmm_kernel_L1_M8_40 .align 5 -dtrmm_kernel_L1_M8_22: +.Ldtrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M8_22 + bgt .Ldtrmm_kernel_L1_M8_22 -dtrmm_kernel_L1_M8_40: +.Ldtrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M8_100 + ble .Ldtrmm_kernel_L1_M8_100 -dtrmm_kernel_L1_M8_42: +.Ldtrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M8_42 + bgt .Ldtrmm_kernel_L1_M8_42 -dtrmm_kernel_L1_M8_100: +.Ldtrmm_kernel_L1_M8_100: SAVE8x1 @@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100: add tempOffset, tempOffset, #8 #endif -dtrmm_kernel_L1_M8_END: +.Ldtrmm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L1_M8_20 + bgt .Ldtrmm_kernel_L1_M8_20 -dtrmm_kernel_L1_M4_BEGIN: +.Ldtrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M2_BEGIN + 
ble .Ldtrmm_kernel_L1_M2_BEGIN -dtrmm_kernel_L1_M4_20: +.Ldtrmm_kernel_L1_M4_20: INIT4x1 @@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M4_40 + ble .Ldtrmm_kernel_L1_M4_40 .align 5 -dtrmm_kernel_L1_M4_22: +.Ldtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_22 + bgt .Ldtrmm_kernel_L1_M4_22 -dtrmm_kernel_L1_M4_40: +.Ldtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M4_100 + ble .Ldtrmm_kernel_L1_M4_100 -dtrmm_kernel_L1_M4_42: +.Ldtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_42 + bgt .Ldtrmm_kernel_L1_M4_42 -dtrmm_kernel_L1_M4_100: +.Ldtrmm_kernel_L1_M4_100: SAVE4x1 @@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L1_M4_END: +.Ldtrmm_kernel_L1_M4_END: -dtrmm_kernel_L1_M2_BEGIN: +.Ldtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M1_BEGIN + ble .Ldtrmm_kernel_L1_M1_BEGIN -dtrmm_kernel_L1_M2_20: +.Ldtrmm_kernel_L1_M2_20: INIT2x1 @@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M2_40 + ble .Ldtrmm_kernel_L1_M2_40 -dtrmm_kernel_L1_M2_22: +.Ldtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_22 + bgt .Ldtrmm_kernel_L1_M2_22 -dtrmm_kernel_L1_M2_40: +.Ldtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M2_100 + ble .Ldtrmm_kernel_L1_M2_100 -dtrmm_kernel_L1_M2_42: +.Ldtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_42 + bgt .Ldtrmm_kernel_L1_M2_42 -dtrmm_kernel_L1_M2_100: +.Ldtrmm_kernel_L1_M2_100: SAVE2x1 @@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L1_M2_END: +.Ldtrmm_kernel_L1_M2_END: -dtrmm_kernel_L1_M1_BEGIN: +.Ldtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END -dtrmm_kernel_L1_M1_20: +.Ldtrmm_kernel_L1_M1_20: INIT1x1 @@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M1_40 + ble .Ldtrmm_kernel_L1_M1_40 -dtrmm_kernel_L1_M1_22: +.Ldtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_22 + bgt .Ldtrmm_kernel_L1_M1_22 -dtrmm_kernel_L1_M1_40: +.Ldtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M1_100 + ble .Ldtrmm_kernel_L1_M1_100 -dtrmm_kernel_L1_M1_42: +.Ldtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_42 + bgt .Ldtrmm_kernel_L1_M1_42 -dtrmm_kernel_L1_M1_100: +.Ldtrmm_kernel_L1_M1_100: SAVE1x1 -dtrmm_kernel_L1_END: +.Ldtrmm_kernel_L1_END: -dtrmm_kernel_L999: +.Ldtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S index 162f721..658551f 100644 --- a/kernel/arm64/gemv_n.S +++ 
b/kernel/arm64/gemv_n.S @@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE_REGS cmp N, xzr - ble gemv_n_kernel_L999 + ble .Lgemv_n_kernel_L999 cmp M, xzr - ble gemv_n_kernel_L999 + ble .Lgemv_n_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_X, INC_X, #SHZ mov J, N cmp INC_Y, #1 - bne gemv_n_kernel_S_BEGIN + bne .Lgemv_n_kernel_S_BEGIN -gemv_n_kernel_F_LOOP: +.Lgemv_n_kernel_F_LOOP: ld1 TEMPV, [X], INC_X fmul TEMP, ALPHA, TEMP @@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP: mov Y_IPTR, Y mov Y_OPTR, Y -gemv_n_kernel_F32: +.Lgemv_n_kernel_F32: asr I, M, #5 cmp I, xzr - beq gemv_n_kernel_F4 + beq .Lgemv_n_kernel_F4 -gemv_n_kernel_F320: +.Lgemv_n_kernel_F320: KERNEL_F16 KERNEL_F16 subs I, I, #1 - bne gemv_n_kernel_F320 + bne .Lgemv_n_kernel_F320 -gemv_n_kernel_F4: +.Lgemv_n_kernel_F4: ands I, M, #31 asr I, I, #2 cmp I, xzr - beq gemv_n_kernel_F1 + beq .Lgemv_n_kernel_F1 -gemv_n_kernel_F40: +.Lgemv_n_kernel_F40: KERNEL_F4 subs I, I, #1 - bne gemv_n_kernel_F40 + bne .Lgemv_n_kernel_F40 -gemv_n_kernel_F1: +.Lgemv_n_kernel_F1: ands I, M, #3 - ble gemv_n_kernel_F_END + ble .Lgemv_n_kernel_F_END -gemv_n_kernel_F10: +.Lgemv_n_kernel_F10: KERNEL_F1 subs I, I, #1 - bne gemv_n_kernel_F10 + bne .Lgemv_n_kernel_F10 -gemv_n_kernel_F_END: +.Lgemv_n_kernel_F_END: add A, A, LDA subs J, J, #1 - bne gemv_n_kernel_F_LOOP + bne .Lgemv_n_kernel_F_LOOP - b gemv_n_kernel_L999 + b .Lgemv_n_kernel_L999 -gemv_n_kernel_S_BEGIN: +.Lgemv_n_kernel_S_BEGIN: INIT_S -gemv_n_kernel_S_LOOP: +.Lgemv_n_kernel_S_LOOP: ld1 TEMPV, [X], INC_X fmul TEMP, ALPHA, TEMP @@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble gemv_n_kernel_S1 + ble .Lgemv_n_kernel_S1 -gemv_n_kernel_S4: +.Lgemv_n_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -298,27 +298,27 @@ gemv_n_kernel_S4: KERNEL_S1 subs I, I, #1 - bne gemv_n_kernel_S4 + bne .Lgemv_n_kernel_S4 -gemv_n_kernel_S1: +.Lgemv_n_kernel_S1: ands I, M, #3 - ble gemv_n_kernel_S_END + ble .Lgemv_n_kernel_S_END -gemv_n_kernel_S10: +.Lgemv_n_kernel_S10: KERNEL_S1 subs I, I, #1 - bne gemv_n_kernel_S10 + bne .Lgemv_n_kernel_S10 -gemv_n_kernel_S_END: +.Lgemv_n_kernel_S_END: add A, A, LDA subs J, J, #1 - bne gemv_n_kernel_S_LOOP + bne .Lgemv_n_kernel_S_LOOP -gemv_n_kernel_L999: +.Lgemv_n_kernel_L999: mov w0, wzr diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S index 28325f7..b04367a 100644 --- a/kernel/arm64/gemv_t.S +++ b/kernel/arm64/gemv_t.S @@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
SAVE_REGS cmp N, xzr - ble gemv_t_kernel_L999 + ble .Lgemv_t_kernel_L999 cmp M, xzr - ble gemv_t_kernel_L999 + ble .Lgemv_t_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_Y, INC_Y, #SHZ mov J, N cmp INC_X, #1 - bne gemv_t_kernel_S_BEGIN + bne .Lgemv_t_kernel_S_BEGIN -gemv_t_kernel_F_LOOP: +.Lgemv_t_kernel_F_LOOP: fmov TEMP, REG0 fmov TEMP1, REG0 @@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP: mov A_PTR, A mov X_PTR, X -gemv_t_kernel_F32: +.Lgemv_t_kernel_F32: asr I, M, #5 cmp I, xzr - beq gemv_t_kernel_F4 + beq .Lgemv_t_kernel_F4 -gemv_t_kernel_F320: +.Lgemv_t_kernel_F320: KERNEL_F32 subs I, I, #1 - bne gemv_t_kernel_F320 + bne .Lgemv_t_kernel_F320 KERNEL_F32_FINALIZE -gemv_t_kernel_F4: +.Lgemv_t_kernel_F4: ands I, M, #31 asr I, I, #2 cmp I, xzr - beq gemv_t_kernel_F1 + beq .Lgemv_t_kernel_F1 -gemv_t_kernel_F40: +.Lgemv_t_kernel_F40: KERNEL_F4 subs I, I, #1 - bne gemv_t_kernel_F40 + bne .Lgemv_t_kernel_F40 -gemv_t_kernel_F1: +.Lgemv_t_kernel_F1: KERNEL_F4_FINALIZE ands I, M, #3 - ble gemv_t_kernel_F_END + ble .Lgemv_t_kernel_F_END -gemv_t_kernel_F10: +.Lgemv_t_kernel_F10: KERNEL_F1 subs I, I, #1 - bne gemv_t_kernel_F10 + bne .Lgemv_t_kernel_F10 -gemv_t_kernel_F_END: +.Lgemv_t_kernel_F_END: ld1 TMPV1, [Y] add A, A, LDA subs J, J, #1 fmadd TMP1, ALPHA, TEMP, TMP1 st1 TMPV1, [Y], INC_Y - bne gemv_t_kernel_F_LOOP + bne .Lgemv_t_kernel_F_LOOP - b gemv_t_kernel_L999 + b .Lgemv_t_kernel_L999 -gemv_t_kernel_S_BEGIN: +.Lgemv_t_kernel_S_BEGIN: INIT_S -gemv_t_kernel_S_LOOP: +.Lgemv_t_kernel_S_LOOP: fmov TEMP, REG0 mov A_PTR, A @@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble gemv_t_kernel_S1 + ble .Lgemv_t_kernel_S1 -gemv_t_kernel_S4: +.Lgemv_t_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -329,30 +329,30 @@ gemv_t_kernel_S4: KERNEL_S1 subs I, I, #1 - bne gemv_t_kernel_S4 + bne .Lgemv_t_kernel_S4 -gemv_t_kernel_S1: +.Lgemv_t_kernel_S1: ands I, M, #3 - ble gemv_t_kernel_S_END + ble .Lgemv_t_kernel_S_END -gemv_t_kernel_S10: +.Lgemv_t_kernel_S10: KERNEL_S1 subs I, I, #1 - bne gemv_t_kernel_S10 + bne .Lgemv_t_kernel_S10 -gemv_t_kernel_S_END: +.Lgemv_t_kernel_S_END: ld1 TMPV1, [Y] add A, A, LDA subs J, J, #1 fmadd TMP1, ALPHA, TEMP, TMP1 st1 TMPV1, [Y], INC_Y - bne gemv_t_kernel_S_LOOP + bne .Lgemv_t_kernel_S_LOOP -gemv_t_kernel_L999: +.Lgemv_t_kernel_L999: RESTORE_REGS diff --git a/kernel/arm64/iamax.S b/kernel/arm64/iamax.S index 6c0d84f..31d0cd6 100644 --- a/kernel/arm64/iamax.S +++ b/kernel/arm64/iamax.S @@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE cmp N, xzr - ble iamax_kernel_zero + ble .Liamax_kernel_zero cmp INC_X, xzr - ble iamax_kernel_zero + ble .Liamax_kernel_zero cmp INC_X, #1 - bne iamax_kernel_S_BEGIN + bne .Liamax_kernel_S_BEGIN mov x7, X -iamax_kernel_F_BEGIN: +.Liamax_kernel_F_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 asr I, N, #3 cmp I, xzr - beq iamax_kernel_F1 + beq .Liamax_kernel_F1 add Z, Z, #1 -iamax_kernel_F8: +.Liamax_kernel_F8: KERNEL_F8 subs I, I, #1 - bne iamax_kernel_F8 + bne .Liamax_kernel_F8 KERNEL_F8_FINALIZE sub Z, Z, #1 -iamax_kernel_F1: +.Liamax_kernel_F1: ands I, N, #7 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 -iamax_kernel_F10: +.Liamax_kernel_F10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_F10 + bne .Liamax_kernel_F10 - b iamax_kernel_L999 + b .Liamax_kernel_L999 -iamax_kernel_S_BEGIN: +.Liamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble iamax_kernel_S1 + ble .Liamax_kernel_S1 -iamax_kernel_S4: +.Liamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -293,25 +293,25 @@ iamax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S4 + bne .Liamax_kernel_S4 -iamax_kernel_S1: +.Liamax_kernel_S1: ands I, N, #3 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 -iamax_kernel_S10: +.Liamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S10 + bne .Liamax_kernel_S10 -iamax_kernel_L999: +.Liamax_kernel_L999: mov x0, INDEX ret -iamax_kernel_zero: +.Liamax_kernel_zero: mov x0, xzr ret diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S index 9b252ec..42fa4e7 100644 --- a/kernel/arm64/izamax.S +++ b/kernel/arm64/izamax.S @@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble iamax_kernel_zero + ble .Lizamax_kernel_zero cmp INC_X, xzr - ble iamax_kernel_zero + ble .Lizamax_kernel_zero cmp INC_X, #1 - bne iamax_kernel_S_BEGIN + bne .Lizamax_kernel_S_BEGIN mov x7, X -iamax_kernel_F_BEGIN: +.Lizamax_kernel_F_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 asr I, N, #3 cmp I, xzr - ble iamax_kernel_F1 + ble .Lizamax_kernel_F1 add Z, Z, #1 -iamax_kernel_F8: +.Lizamax_kernel_F8: KERNEL_F8 subs I, I, #1 - bne iamax_kernel_F8 + bne .Lizamax_kernel_F8 KERNEL_F8_FINALIZE sub Z, Z, #1 -iamax_kernel_F1: +.Lizamax_kernel_F1: ands I, N, #7 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 -iamax_kernel_F10: +.Lizamax_kernel_F10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_F10 + bne .Lizamax_kernel_F10 - b iamax_kernel_L999 + b .Lizamax_kernel_L999 -iamax_kernel_S_BEGIN: +.Lizamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble iamax_kernel_S1 + ble .Lizamax_kernel_S1 -iamax_kernel_S4: +.Lizamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -341,26 +341,26 @@ iamax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S4 + bne .Lizamax_kernel_S4 -iamax_kernel_S1: +.Lizamax_kernel_S1: ands I, N, #3 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 -iamax_kernel_S10: +.Lizamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S10 + bne .Lizamax_kernel_S10 -iamax_kernel_L999: +.Lizamax_kernel_L999: mov x0, INDEX ret -iamax_kernel_zero: +.Lizamax_kernel_zero: mov x0, xzr ret diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S index 5d06c13..e2cbd4d 100644 --- a/kernel/arm64/nrm2.S +++ b/kernel/arm64/nrm2.S @@ -162,44 +162,44 @@ KERNEL_S1_NEXT: INIT cmp N, #0 - ble nrm2_kernel_L999 + ble .Lnrm2_kernel_L999 cmp INC_X, #0 - beq nrm2_kernel_L999 + beq 
.Lnrm2_kernel_L999 cmp INC_X, #1 - bne nrm2_kernel_S_BEGIN + bne .Lnrm2_kernel_S_BEGIN -nrm2_kernel_F_BEGIN: +.Lnrm2_kernel_F_BEGIN: asr I, N, #3 // I = N / 8 cmp I, xzr - ble nrm2_kernel_F1 + ble .Lnrm2_kernel_F1 -nrm2_kernel_F8: +.Lnrm2_kernel_F8: KERNEL_F8 subs I, I, #1 - bne nrm2_kernel_F8 + bne .Lnrm2_kernel_F8 -nrm2_kernel_F1: +.Lnrm2_kernel_F1: ands I, N, #7 - ble nrm2_kernel_L999 + ble .Lnrm2_kernel_L999 -nrm2_kernel_F10: +.Lnrm2_kernel_F10: KERNEL_F1 subs I, I, #1 - bne nrm2_kernel_F10 + bne .Lnrm2_kernel_F10 - b nrm2_kernel_L999 + b .Lnrm2_kernel_L999 -nrm2_kernel_S_BEGIN: +.Lnrm2_kernel_S_BEGIN: INIT_S @@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN: .align 5 -nrm2_kernel_S10: +.Lnrm2_kernel_S10: KERNEL_S1 subs I, I, #1 - bne nrm2_kernel_S10 + bne .Lnrm2_kernel_S10 -nrm2_kernel_L999: +.Lnrm2_kernel_L999: fsqrt SSQ, SSQ fmul SSQ, SCALE, SSQ diff --git a/kernel/arm64/rot.S b/kernel/arm64/rot.S index 5721252..00c3085 100644 --- a/kernel/arm64/rot.S +++ b/kernel/arm64/rot.S @@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble rot_kernel_L999 + ble .Lrot_kernel_L999 INIT cmp INC_X, #1 - bne rot_kernel_S_BEGIN + bne .Lrot_kernel_S_BEGIN cmp INC_Y, #1 - bne rot_kernel_S_BEGIN + bne .Lrot_kernel_S_BEGIN -rot_kernel_F_BEGIN: +.Lrot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq rot_kernel_F1 + beq .Lrot_kernel_F1 KERNEL_INIT_F4 -rot_kernel_F4: +.Lrot_kernel_F4: KERNEL_F4 subs I, I, #1 - bne rot_kernel_F4 + bne .Lrot_kernel_F4 -rot_kernel_F1: +.Lrot_kernel_F1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lrot_kernel_L999 INIT_F1 -rot_kernel_F10: +.Lrot_kernel_F10: KERNEL_F1 subs I, I, #1 - bne rot_kernel_F10 + bne .Lrot_kernel_F10 mov w0, wzr ret -rot_kernel_S_BEGIN: +.Lrot_kernel_S_BEGIN: INIT_S INIT_F1 @@ -214,9 +214,9 @@ rot_kernel_S_BEGIN: asr I, N, #2 cmp I, xzr - ble rot_kernel_S1 + ble .Lrot_kernel_S1 -rot_kernel_S4: +.Lrot_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -224,22 +224,22 @@ rot_kernel_S4: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S4 + bne .Lrot_kernel_S4 -rot_kernel_S1: +.Lrot_kernel_S1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lrot_kernel_L999 -rot_kernel_S10: +.Lrot_kernel_S10: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S10 + bne .Lrot_kernel_S10 -rot_kernel_L999: +.Lrot_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/scal.S b/kernel/arm64/scal.S index 91d469d..09c41cd 100644 --- a/kernel/arm64/scal.S +++ b/kernel/arm64/scal.S @@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE cmp N, xzr - ble scal_kernel_L999 + ble .Lscal_kernel_L999 fcmp DA, #0.0 - beq scal_kernel_zero + beq .Lscal_kernel_zero cmp INC_X, #1 - bne scal_kernel_S_BEGIN + bne .Lscal_kernel_S_BEGIN -scal_kernel_F_BEGIN: +.Lscal_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq scal_kernel_F1 + beq .Lscal_kernel_F1 KERNEL_INIT_F8 -scal_kernel_F8: +.Lscal_kernel_F8: KERNEL_F8 subs I, I, #1 - bne scal_kernel_F8 + bne .Lscal_kernel_F8 -scal_kernel_F1: +.Lscal_kernel_F1: ands I, N, #7 - ble scal_kernel_L999 + ble .Lscal_kernel_L999 -scal_kernel_F10: +.Lscal_kernel_F10: KERNEL_F1 subs I, I, #1 - bne scal_kernel_F10 + bne .Lscal_kernel_F10 mov w0, wzr ret -scal_kernel_S_BEGIN: +.Lscal_kernel_S_BEGIN: INIT_S mov X_COPY, X asr I, N, #2 cmp I, xzr - ble scal_kernel_S1 + ble .Lscal_kernel_S1 -scal_kernel_S4: +.Lscal_kernel_S4: KERNEL_S4 subs I, I, #1 - bne scal_kernel_S4 + bne .Lscal_kernel_S4 -scal_kernel_S1: +.Lscal_kernel_S1: ands I, N, #3 - ble scal_kernel_L999 + ble .Lscal_kernel_L999 -scal_kernel_S10: +.Lscal_kernel_S10: KERNEL_S1 subs I, I, #1 - bne scal_kernel_S10 + bne .Lscal_kernel_S10 -scal_kernel_L999: +.Lscal_kernel_L999: mov w0, wzr ret -scal_kernel_zero: +.Lscal_kernel_zero: INIT_S -scal_kernel_Z1: +.Lscal_kernel_Z1: st1 DAV, [X], INC_X subs N, N, #1 - bne scal_kernel_Z1 + bne .Lscal_kernel_Z1 mov w0, wzr ret diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 6e3645b..99099ea 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -1070,7 +1070,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -sgemm_kernel_begin: +.Lsgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1098,11 +1098,11 @@ sgemm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN /******************************************************************************/ -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1112,21 +1112,21 @@ sgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -sgemm_kernel_L4_M16_BEGIN: +.Lsgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L4_M8_BEGIN + ble .Lsgemm_kernel_L4_M8_BEGIN .align 5 -sgemm_kernel_L4_M16_20: +.Lsgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 - blt sgemm_kernel_L4_M16_32 + blt .Lsgemm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 @@ -1138,10 +1138,10 @@ sgemm_kernel_L4_M16_20: KERNEL16x4_M2 subs counterL, counterL, #2 - ble sgemm_kernel_L4_M16_22a + ble .Lsgemm_kernel_L4_M16_22a .align 5 -sgemm_kernel_L4_M16_22: +.Lsgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1153,10 +1153,10 @@ sgemm_kernel_L4_M16_22: KERNEL16x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M16_22 + bgt .Lsgemm_kernel_L4_M16_22 .align 5 -sgemm_kernel_L4_M16_22a: +.Lsgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1167,13 +1167,13 @@ sgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 .align 5 -sgemm_kernel_L4_M16_32: +.Lsgemm_kernel_L4_M16_32: tst counterL, #1 - ble sgemm_kernel_L4_M16_40 + ble .Lsgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 @@ -1184,187 +1184,187 @@ sgemm_kernel_L4_M16_32: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 -sgemm_kernel_L4_M16_40: +.Lsgemm_kernel_L4_M16_40: INIT16x4 
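// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the asr/ands pairs and the
// eight back-to-back KERNELnxm_SUB calls seen throughout these hunks form the
// usual unroll-by-8 split that the inline comments describe -- an unrolled
// main loop running K / 8 times, then a one-step tail loop running K % 8
// times.  STEP is a stand-in macro and K is assumed to arrive in x0.  The
// ".align 5" placed before the hottest of these loops puts the loop head on
// a 32-byte boundary (GAS .align takes a power of two on AArch64).

        .macro  STEP                // stand-in for one KERNELnxm_SUB
        fmla    v0.2d, v1.2d, v2.2d
        .endm

unroll_sketch:
        asr     x1, x0, #3          // x1 = K / 8
        cbz     x1, .Ltail8         // K < 8: tail loop only
.Lmain8:
        .rept   8                   // eight kernel steps per trip
        STEP
        .endr
        subs    x1, x1, #1
        bgt     .Lmain8
.Ltail8:
        ands    x1, x0, #7          // x1 = K % 8 (ands also sets the flags)
        ble     .Ldone8
.Lone8:
        STEP
        subs    x1, x1, #1
        bgt     .Lone8
.Ldone8:
        ret
// ----------------------------------------------------------------------------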
-sgemm_kernel_L4_M16_44: +.Lsgemm_kernel_L4_M16_44: ands counterL , origK, #7 - ble sgemm_kernel_L4_M16_100 + ble .Lsgemm_kernel_L4_M16_100 .align 5 -sgemm_kernel_L4_M16_46: +.Lsgemm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 - bne sgemm_kernel_L4_M16_46 + bne .Lsgemm_kernel_L4_M16_46 -sgemm_kernel_L4_M16_100: +.Lsgemm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE16x4 -sgemm_kernel_L4_M16_END: +.Lsgemm_kernel_L4_M16_END: subs counterI, counterI, #1 - bne sgemm_kernel_L4_M16_20 + bne .Lsgemm_kernel_L4_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L4_M8_BEGIN: +.Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #8 - ble sgemm_kernel_L4_M4_BEGIN + ble .Lsgemm_kernel_L4_M4_BEGIN -sgemm_kernel_L4_M8_20: +.Lsgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M8_32 + blt .Lsgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M8_22a + ble .Lsgemm_kernel_L4_M8_22a .align 5 -sgemm_kernel_L4_M8_22: +.Lsgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_22 + bgt .Lsgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M8_22a: +.Lsgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_32: +.Lsgemm_kernel_L4_M8_32: tst counterL, #1 - ble sgemm_kernel_L4_M8_40 + ble .Lsgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_40: +.Lsgemm_kernel_L4_M8_40: INIT8x4 -sgemm_kernel_L4_M8_44: +.Lsgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M8_100 + ble .Lsgemm_kernel_L4_M8_100 -sgemm_kernel_L4_M8_46: +.Lsgemm_kernel_L4_M8_46: KERNEL8x4_SUB -sgemm_kernel_L4_M8_100: +.Lsgemm_kernel_L4_M8_100: SAVE8x4 -sgemm_kernel_L4_M8_END: +.Lsgemm_kernel_L4_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt sgemm_kernel_L4_M4_32 + blt .Lsgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M4_22a + ble .Lsgemm_kernel_L4_M4_22a .align 5 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_22a: +.Lsgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_32: +.Lsgemm_kernel_L4_M4_32: tst counterL, #1 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: INIT4x4 -sgemm_kernel_L4_M4_44: +.Lsgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +.Lsgemm_kernel_L4_M4_46: KERNEL4x4_SUB -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 @@ -1372,9 +1372,9 @@ sgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1387,34 +1387,34 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END -sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 @@ -1422,9 +1422,9 @@ sgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1436,42 +1436,42 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 subs counterJ, counterJ , #1 // j-- - bgt sgemm_kernel_L4_BEGIN + bgt .Lsgemm_kernel_L4_BEGIN 
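// [Editor's note] A short, hedged aside on the mechanism behind these
// renames: on ELF targets the GNU assembler treats any label that begins
// with ".L" as assembler-local and emits no symbol for it, whereas a
// plain label such as sgemm_kernel_L4_M16_22 becomes a local symbol in
// the object file. The sketch below uses hypothetical names, not code
// taken from this patch:
//
//		.text
//		.global	copy8		// exported entry: profilers see this name
//	copy8:
//		mov	x2, #8
//	.Lcopy8_loop:			// local label: no symbol-table entry
//		ldrb	w3, [x0], #1
//		strb	w3, [x1], #1
//		subs	x2, x2, #1
//		bne	.Lcopy8_loop	// same branch encoding as a plain label
//		ret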
/******************************************************************************/

-sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
	mov	counterJ , origN
	tst	counterJ , #3
-	ble	sgemm_kernel_L999
+	ble	.Lsgemm_kernel_L999
	tst	counterJ , #2
-	ble	sgemm_kernel_L1_BEGIN
+	ble	.Lsgemm_kernel_L1_BEGIN
	mov	pCRow0, pC			// pCRow0 = pC
@@ -1479,14 +1479,14 @@ sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
	mov	pA, origPA			// pA = A

-sgemm_kernel_L2_M16_BEGIN:
+.Lsgemm_kernel_L2_M16_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #4		// counterI = counterI / 16
	cmp	counterI,#0
-	ble	sgemm_kernel_L2_M8_BEGIN
+	ble	.Lsgemm_kernel_L2_M8_BEGIN

-sgemm_kernel_L2_M16_20:
+.Lsgemm_kernel_L2_M16_20:
	INIT16x2
@@ -1494,10 +1494,10 @@ sgemm_kernel_L2_M16_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M16_40
+	ble	.Lsgemm_kernel_L2_M16_40

	.align 5
-sgemm_kernel_L2_M16_22:
+.Lsgemm_kernel_L2_M16_22:
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
@@ -1509,41 +1509,41 @@ sgemm_kernel_L2_M16_22:
	KERNEL16x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M16_22
+	bgt	.Lsgemm_kernel_L2_M16_22

-sgemm_kernel_L2_M16_40:
+.Lsgemm_kernel_L2_M16_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M16_100
+	ble	.Lsgemm_kernel_L2_M16_100

-sgemm_kernel_L2_M16_42:
+.Lsgemm_kernel_L2_M16_42:
	KERNEL16x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M16_42
+	bgt	.Lsgemm_kernel_L2_M16_42

-sgemm_kernel_L2_M16_100:
+.Lsgemm_kernel_L2_M16_100:
	SAVE16x2

-sgemm_kernel_L2_M16_END:
+.Lsgemm_kernel_L2_M16_END:
	subs	counterI, counterI, #1
-	bgt	sgemm_kernel_L2_M16_20
+	bgt	.Lsgemm_kernel_L2_M16_20

//------------------------------------------------------------------------------

-sgemm_kernel_L2_M8_BEGIN:
+.Lsgemm_kernel_L2_M8_BEGIN:
	mov	counterI, origM
	tst	counterI , #15
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END
	tst	counterI, #8
-	ble	sgemm_kernel_L2_M4_BEGIN
+	ble	.Lsgemm_kernel_L2_M4_BEGIN

-sgemm_kernel_L2_M8_20:
+.Lsgemm_kernel_L2_M8_20:
	INIT8x2
@@ -1551,10 +1551,10 @@ sgemm_kernel_L2_M8_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M8_40
+	ble	.Lsgemm_kernel_L2_M8_40

	.align 5
-sgemm_kernel_L2_M8_22:
+.Lsgemm_kernel_L2_M8_22:
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
@@ -1566,38 +1566,38 @@ sgemm_kernel_L2_M8_22:
	KERNEL8x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M8_22
+	bgt	.Lsgemm_kernel_L2_M8_22

-sgemm_kernel_L2_M8_40:
+.Lsgemm_kernel_L2_M8_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M8_100
+	ble	.Lsgemm_kernel_L2_M8_100

-sgemm_kernel_L2_M8_42:
+.Lsgemm_kernel_L2_M8_42:
	KERNEL8x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M8_42
+	bgt	.Lsgemm_kernel_L2_M8_42

-sgemm_kernel_L2_M8_100:
+.Lsgemm_kernel_L2_M8_100:
	SAVE8x2

-sgemm_kernel_L2_M8_END:
+.Lsgemm_kernel_L2_M8_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END
	tst	counterI, #4
-	ble	sgemm_kernel_L2_M2_BEGIN
+	ble	.Lsgemm_kernel_L2_M2_BEGIN

-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
	INIT4x2
@@ -1605,10 +1605,10 @@ sgemm_kernel_L2_M4_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M4_40
+	ble	.Lsgemm_kernel_L2_M4_40

	.align 5
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
@@ -1620,40 +1620,40 @@ sgemm_kernel_L2_M4_22:
	KERNEL4x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M4_22
+	bgt	.Lsgemm_kernel_L2_M4_22

-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M4_100
+	ble	.Lsgemm_kernel_L2_M4_100

-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
	KERNEL4x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M4_42
+	bgt	.Lsgemm_kernel_L2_M4_42

-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
	SAVE4x2

-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L2_M1_BEGIN
+	ble	.Lsgemm_kernel_L2_M1_BEGIN

-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
	INIT2x2
@@ -1661,9 +1661,9 @@ sgemm_kernel_L2_M2_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M2_40
+	ble	.Lsgemm_kernel_L2_M2_40

-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
	KERNEL2x2_SUB
	KERNEL2x2_SUB
@@ -1676,34 +1676,34 @@ sgemm_kernel_L2_M2_22:
	KERNEL2x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M2_22
+	bgt	.Lsgemm_kernel_L2_M2_22

-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M2_100
+	ble	.Lsgemm_kernel_L2_M2_100

-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
	KERNEL2x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M2_42
+	bgt	.Lsgemm_kernel_L2_M2_42

-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
	SAVE2x2

-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:

-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END

-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
	INIT1x2
@@ -1711,9 +1711,9 @@ sgemm_kernel_L2_M1_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL, #0
-	ble	sgemm_kernel_L2_M1_40
+	ble	.Lsgemm_kernel_L2_M1_40

-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
@@ -1725,36 +1725,36 @@ sgemm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M1_22
+	bgt	.Lsgemm_kernel_L2_M1_22

-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M1_100
+	ble	.Lsgemm_kernel_L2_M1_100

-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
	KERNEL1x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M1_42
+	bgt	.Lsgemm_kernel_L2_M1_42

-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
	SAVE1x2

-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
	add	origPB, origPB, origK, lsl #3	// B = B + K * 2 * 4

/******************************************************************************/

-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
	mov	counterJ , origN
	tst	counterJ , #1
-	ble	sgemm_kernel_L999		// done
+	ble	.Lsgemm_kernel_L999		// done
	mov	pCRow0, pC			// pCRow0 = C
@@ -1762,14 +1762,14 @@ sgemm_kernel_L1_BEGIN:
	mov	pA, origPA			// pA = A

-sgemm_kernel_L1_M16_BEGIN:
+.Lsgemm_kernel_L1_M16_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #4		// counterI = counterI / 16
	cmp	counterI, #0
-	ble	sgemm_kernel_L1_M8_BEGIN
+	ble	.Lsgemm_kernel_L1_M8_BEGIN

-sgemm_kernel_L1_M16_20:
+.Lsgemm_kernel_L1_M16_20:
	INIT16x1
@@ -1777,10 +1777,10 @@ sgemm_kernel_L1_M16_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M16_40
+	ble	.Lsgemm_kernel_L1_M16_40

	.align 5
-sgemm_kernel_L1_M16_22:
+.Lsgemm_kernel_L1_M16_22:
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
@@ -1792,42 +1792,42 @@ sgemm_kernel_L1_M16_22:
	KERNEL16x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M16_22
+	bgt	.Lsgemm_kernel_L1_M16_22

-sgemm_kernel_L1_M16_40:
+.Lsgemm_kernel_L1_M16_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M16_100
+	ble	.Lsgemm_kernel_L1_M16_100

-sgemm_kernel_L1_M16_42:
+.Lsgemm_kernel_L1_M16_42:
	KERNEL16x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M16_42
+	bgt	.Lsgemm_kernel_L1_M16_42

-sgemm_kernel_L1_M16_100:
+.Lsgemm_kernel_L1_M16_100:
	SAVE16x1

-sgemm_kernel_L1_M16_END:
+.Lsgemm_kernel_L1_M16_END:
	subs	counterI, counterI, #1
-	bgt	sgemm_kernel_L1_M16_20
+	bgt	.Lsgemm_kernel_L1_M16_20

//------------------------------------------------------------------------------

-sgemm_kernel_L1_M8_BEGIN:
+.Lsgemm_kernel_L1_M8_BEGIN:
	mov	counterI, origM
	tst	counterI , #15
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END
	tst	counterI, #8
-	ble	sgemm_kernel_L1_M4_BEGIN
+	ble	.Lsgemm_kernel_L1_M4_BEGIN

-sgemm_kernel_L1_M8_20:
+.Lsgemm_kernel_L1_M8_20:
	INIT8x1
@@ -1835,10 +1835,10 @@ sgemm_kernel_L1_M8_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M8_40
+	ble	.Lsgemm_kernel_L1_M8_40

	.align 5
-sgemm_kernel_L1_M8_22:
+.Lsgemm_kernel_L1_M8_22:
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
@@ -1850,38 +1850,38 @@ sgemm_kernel_L1_M8_22:
	KERNEL8x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M8_22
+	bgt	.Lsgemm_kernel_L1_M8_22

-sgemm_kernel_L1_M8_40:
+.Lsgemm_kernel_L1_M8_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M8_100
+	ble	.Lsgemm_kernel_L1_M8_100

-sgemm_kernel_L1_M8_42:
+.Lsgemm_kernel_L1_M8_42:
	KERNEL8x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M8_42
+	bgt	.Lsgemm_kernel_L1_M8_42

-sgemm_kernel_L1_M8_100:
+.Lsgemm_kernel_L1_M8_100:
	SAVE8x1

-sgemm_kernel_L1_M8_END:
+.Lsgemm_kernel_L1_M8_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END
	tst	counterI, #4
-	ble	sgemm_kernel_L1_M2_BEGIN
+	ble	.Lsgemm_kernel_L1_M2_BEGIN

-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
	INIT4x1
@@ -1889,10 +1889,10 @@ sgemm_kernel_L1_M4_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M4_40
+	ble	.Lsgemm_kernel_L1_M4_40

	.align 5
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
@@ -1904,39 +1904,39 @@ sgemm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M4_22
+	bgt	.Lsgemm_kernel_L1_M4_22

-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M4_100
+	ble	.Lsgemm_kernel_L1_M4_100

-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
	KERNEL4x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M4_42
+	bgt	.Lsgemm_kernel_L1_M4_42

-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
	SAVE4x1

-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L1_M1_BEGIN
+	ble	.Lsgemm_kernel_L1_M1_BEGIN

-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
	INIT2x1
@@ -1944,9 +1944,9 @@ sgemm_kernel_L1_M2_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M2_40
+	ble	.Lsgemm_kernel_L1_M2_40

-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
	KERNEL2x1_SUB
	KERNEL2x1_SUB
@@ -1959,34 +1959,34 @@ sgemm_kernel_L1_M2_22:
	KERNEL2x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M2_22
+	bgt	.Lsgemm_kernel_L1_M2_22

-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M2_100
+	ble	.Lsgemm_kernel_L1_M2_100

-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
	KERNEL2x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M2_42
+	bgt	.Lsgemm_kernel_L1_M2_42

-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
	SAVE2x1

-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:

-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END

-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
	INIT1x1
@@ -1994,9 +1994,9 @@ sgemm_kernel_L1_M1_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M1_40
+	ble	.Lsgemm_kernel_L1_M1_40

-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
@@ -2008,28 +2008,28 @@ sgemm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M1_22
+	bgt	.Lsgemm_kernel_L1_M1_22

-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M1_100
+	ble	.Lsgemm_kernel_L1_M1_100

-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
	KERNEL1x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M1_42
+	bgt	.Lsgemm_kernel_L1_M1_42

-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
	SAVE1x1

-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:

-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
	mov	x0, #0				// set return value
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
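// [Editor's note] The identical conversion is applied next to the
// ThunderX2 T99 variant of this kernel. The rewrite is purely textual,
// but it must change a label's definition and every branch that targets
// it in lockstep, e.g. (hypothetical label name, not from this patch):
//
//	before:				after:
//	saxpy_tail:			.Lsaxpy_tail:
//		subs	I, I, #1		subs	I, I, #1
//		bne	saxpy_tail		bne	.Lsaxpy_tail
//
// Renaming only one side would leave a dangling reference that the
// assembler records as an undefined symbol, surfacing only at link time,
// so definitions and uses are converted together throughout.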
diff --git a/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S b/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
index 0ee10e1..144d4bc 100644
--- a/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
+++ b/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
@@ -1117,7 +1117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	PROLOGUE
-sgemm_kernel_begin:
+.Lsgemm_kernel_begin:
	.align 5
	add	sp, sp, #-(11 * 16)
@@ -1145,11 +1145,11 @@ sgemm_kernel_begin:
	mov	counterJ, origN
	asr	counterJ, counterJ, #2		// J = J / 4
	cmp	counterJ, #0
-	ble	sgemm_kernel_L2_BEGIN
+	ble	.Lsgemm_kernel_L2_BEGIN

/******************************************************************************/

-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
	mov	pCRow0, pC
	add	pCRow1, pCRow0, LDC
	add	pCRow2, pCRow1, LDC
@@ -1159,21 +1159,21 @@ sgemm_kernel_L4_BEGIN:
	mov	pA, origPA			// pA = start of A array

-sgemm_kernel_L4_M16_BEGIN:
+.Lsgemm_kernel_L4_M16_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #4		// counterI = counterI / 16
	cmp	counterI, #0
-	ble	sgemm_kernel_L4_M8_BEGIN
+	ble	.Lsgemm_kernel_L4_M8_BEGIN

	.align 5
-sgemm_kernel_L4_M16_20:
+.Lsgemm_kernel_L4_M16_20:
	mov	pB, origPB
	asr	counterL , origK, #4		// L = K / 16
	cmp	counterL , #2
-	blt	sgemm_kernel_L4_M16_32
+	blt	.Lsgemm_kernel_L4_M16_32
	KERNEL16x4_I
	KERNEL16x4_M2
@@ -1182,18 +1182,18 @@ sgemm_kernel_L4_M16_20:
	KERNEL16x4_M1_M2_x1
	subs	counterL, counterL, #2
-	ble	sgemm_kernel_L4_M16_22a
+	ble	.Lsgemm_kernel_L4_M16_22a

	.align 5
-sgemm_kernel_L4_M16_22:
+.Lsgemm_kernel_L4_M16_22:
	KERNEL16x4_M1_M2_x8
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M16_22
+	bgt	.Lsgemm_kernel_L4_M16_22

	.align 5
-sgemm_kernel_L4_M16_22a:
+.Lsgemm_kernel_L4_M16_22a:
	KERNEL16x4_M1_M2_x4
	KERNEL16x4_M1_M2_x2
@@ -1201,13 +1201,13 @@ sgemm_kernel_L4_M16_22a:
	KERNEL16x4_M1
	KERNEL16x4_E
-	b	sgemm_kernel_L4_M16_44
+	b	.Lsgemm_kernel_L4_M16_44

	.align 5
-sgemm_kernel_L4_M16_32:
+.Lsgemm_kernel_L4_M16_32:
	tst	counterL, #1
-	ble	sgemm_kernel_L4_M16_40
+	ble	.Lsgemm_kernel_L4_M16_40
	KERNEL16x4_I
	KERNEL16x4_M2
@@ -1216,187 +1216,187 @@ sgemm_kernel_L4_M16_32:
	KERNEL16x4_M1
	KERNEL16x4_E
-	b	sgemm_kernel_L4_M16_44
+	b	.Lsgemm_kernel_L4_M16_44

-sgemm_kernel_L4_M16_40:
+.Lsgemm_kernel_L4_M16_40:
	INIT16x4

-sgemm_kernel_L4_M16_44:
+.Lsgemm_kernel_L4_M16_44:
	ands	counterL , origK, #15
-	ble	sgemm_kernel_L4_M16_100
+	ble	.Lsgemm_kernel_L4_M16_100

	.align 5
-sgemm_kernel_L4_M16_46:
+.Lsgemm_kernel_L4_M16_46:
	KERNEL16x4_SUB
	subs	counterL, counterL, #1
-	bne	sgemm_kernel_L4_M16_46
+	bne	.Lsgemm_kernel_L4_M16_46

-sgemm_kernel_L4_M16_100:
+.Lsgemm_kernel_L4_M16_100:
	prfm	PLDL1KEEP, [pA]
	prfm	PLDL1KEEP, [pA, #64]
	prfm	PLDL1KEEP, [origPB]
	SAVE16x4

-sgemm_kernel_L4_M16_END:
+.Lsgemm_kernel_L4_M16_END:
	subs	counterI, counterI, #1
-	bne	sgemm_kernel_L4_M16_20
+	bne	.Lsgemm_kernel_L4_M16_20

//------------------------------------------------------------------------------

-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
	mov	counterI, origM
	tst	counterI , #15
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END
	tst	counterI, #8
-	ble	sgemm_kernel_L4_M4_BEGIN
+	ble	.Lsgemm_kernel_L4_M4_BEGIN

-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
	mov	pB, origPB
	asr	counterL , origK, #1		// L = K / 2
	cmp	counterL , #2			// is there at least 4 to do?
-	blt	sgemm_kernel_L4_M8_32
+	blt	.Lsgemm_kernel_L4_M8_32
	KERNEL8x4_I				// do one in the K
	KERNEL8x4_M2				// do another in the K
	subs	counterL, counterL, #2
-	ble	sgemm_kernel_L4_M8_22a
+	ble	.Lsgemm_kernel_L4_M8_22a

	.align 5
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
	KERNEL8x4_M1
	KERNEL8x4_M2
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M8_22
+	bgt	.Lsgemm_kernel_L4_M8_22

-sgemm_kernel_L4_M8_22a:
+.Lsgemm_kernel_L4_M8_22a:
	KERNEL8x4_M1
	KERNEL8x4_E
-	b	sgemm_kernel_L4_M8_44
+	b	.Lsgemm_kernel_L4_M8_44

-sgemm_kernel_L4_M8_32:
+.Lsgemm_kernel_L4_M8_32:
	tst	counterL, #1
-	ble	sgemm_kernel_L4_M8_40
+	ble	.Lsgemm_kernel_L4_M8_40
	KERNEL8x4_I
	KERNEL8x4_E
-	b	sgemm_kernel_L4_M8_44
+	b	.Lsgemm_kernel_L4_M8_44

-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
	INIT8x4

-sgemm_kernel_L4_M8_44:
+.Lsgemm_kernel_L4_M8_44:
	ands	counterL , origK, #1
-	ble	sgemm_kernel_L4_M8_100
+	ble	.Lsgemm_kernel_L4_M8_100

-sgemm_kernel_L4_M8_46:
+.Lsgemm_kernel_L4_M8_46:
	KERNEL8x4_SUB

-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
	SAVE8x4

-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END
	tst	counterI, #4
-	ble	sgemm_kernel_L4_M2_BEGIN
+	ble	.Lsgemm_kernel_L4_M2_BEGIN

-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
	mov	pB, origPB
	asr	counterL , origK, #1		// L = K / 2
	cmp	counterL , #2			// is there at least 4 to do?
-	blt	sgemm_kernel_L4_M4_32
+	blt	.Lsgemm_kernel_L4_M4_32
	KERNEL4x4_I				// do one in the K
	KERNEL4x4_M2				// do another in the K
	subs	counterL, counterL, #2
-	ble	sgemm_kernel_L4_M4_22a
+	ble	.Lsgemm_kernel_L4_M4_22a

	.align 5
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
	KERNEL4x4_M1
	KERNEL4x4_M2
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M4_22
+	bgt	.Lsgemm_kernel_L4_M4_22

-sgemm_kernel_L4_M4_22a:
+.Lsgemm_kernel_L4_M4_22a:
	KERNEL4x4_M1
	KERNEL4x4_E
-	b	sgemm_kernel_L4_M4_44
+	b	.Lsgemm_kernel_L4_M4_44

-sgemm_kernel_L4_M4_32:
+.Lsgemm_kernel_L4_M4_32:
	tst	counterL, #1
-	ble	sgemm_kernel_L4_M4_40
+	ble	.Lsgemm_kernel_L4_M4_40
	KERNEL4x4_I
	KERNEL4x4_E
-	b	sgemm_kernel_L4_M4_44
+	b	.Lsgemm_kernel_L4_M4_44

-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
	INIT4x4

-sgemm_kernel_L4_M4_44:
+.Lsgemm_kernel_L4_M4_44:
	ands	counterL , origK, #1
-	ble	sgemm_kernel_L4_M4_100
+	ble	.Lsgemm_kernel_L4_M4_100

-sgemm_kernel_L4_M4_46:
+.Lsgemm_kernel_L4_M4_46:
	KERNEL4x4_SUB

-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
	SAVE4x4

-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L4_M1_BEGIN
+	ble	.Lsgemm_kernel_L4_M1_BEGIN

-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
	INIT2x4
@@ -1404,9 +1404,9 @@ sgemm_kernel_L4_M2_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L4_M2_40
+	ble	.Lsgemm_kernel_L4_M2_40

-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
	KERNEL2x4_SUB
	KERNEL2x4_SUB
@@ -1419,34 +1419,34 @@ sgemm_kernel_L4_M2_22:
	KERNEL2x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M2_22
+	bgt	.Lsgemm_kernel_L4_M2_22

-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L4_M2_100
+	ble	.Lsgemm_kernel_L4_M2_100

-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
	KERNEL2x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M2_42
+	bgt	.Lsgemm_kernel_L4_M2_42

-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
	SAVE2x4

-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:

-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END

-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
	INIT1x4
@@ -1454,9 +1454,9 @@ sgemm_kernel_L4_M1_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L4_M1_40
+	ble	.Lsgemm_kernel_L4_M1_40

-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
@@ -1468,42 +1468,42 @@ sgemm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M1_22
+	bgt	.Lsgemm_kernel_L4_M1_22

-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L4_M1_100
+	ble	.Lsgemm_kernel_L4_M1_100

-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
	KERNEL1x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M1_42
+	bgt	.Lsgemm_kernel_L4_M1_42

-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
	SAVE1x4

-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
	add	origPB, origPB, origK, lsl #4	// B = B + K * 4 * 4
	subs	counterJ, counterJ , #1		// j--
-	bgt	sgemm_kernel_L4_BEGIN
+	bgt	.Lsgemm_kernel_L4_BEGIN

/******************************************************************************/

-sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
	mov	counterJ , origN
	tst	counterJ , #3
-	ble	sgemm_kernel_L999
+	ble	.Lsgemm_kernel_L999
	tst	counterJ , #2
-	ble	sgemm_kernel_L1_BEGIN
+	ble	.Lsgemm_kernel_L1_BEGIN
	mov	pCRow0, pC			// pCRow0 = pC
@@ -1511,14 +1511,14 @@ sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
	mov	pA, origPA			// pA = A

-sgemm_kernel_L2_M16_BEGIN:
+.Lsgemm_kernel_L2_M16_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #4		// counterI = counterI / 16
	cmp	counterI,#0
-	ble	sgemm_kernel_L2_M8_BEGIN
+	ble	.Lsgemm_kernel_L2_M8_BEGIN

-sgemm_kernel_L2_M16_20:
+.Lsgemm_kernel_L2_M16_20:
	INIT16x2
@@ -1526,10 +1526,10 @@ sgemm_kernel_L2_M16_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M16_40
+	ble	.Lsgemm_kernel_L2_M16_40

	.align 5
-sgemm_kernel_L2_M16_22:
+.Lsgemm_kernel_L2_M16_22:
	KERNEL16x2_SUB
	KERNEL16x2_SUB
	KERNEL16x2_SUB
@@ -1541,41 +1541,41 @@ sgemm_kernel_L2_M16_22:
	KERNEL16x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M16_22
+	bgt	.Lsgemm_kernel_L2_M16_22

-sgemm_kernel_L2_M16_40:
+.Lsgemm_kernel_L2_M16_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M16_100
+	ble	.Lsgemm_kernel_L2_M16_100

-sgemm_kernel_L2_M16_42:
+.Lsgemm_kernel_L2_M16_42:
	KERNEL16x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M16_42
+	bgt	.Lsgemm_kernel_L2_M16_42

-sgemm_kernel_L2_M16_100:
+.Lsgemm_kernel_L2_M16_100:
	SAVE16x2

-sgemm_kernel_L2_M16_END:
+.Lsgemm_kernel_L2_M16_END:
	subs	counterI, counterI, #1
-	bgt	sgemm_kernel_L2_M16_20
+	bgt	.Lsgemm_kernel_L2_M16_20

//------------------------------------------------------------------------------

-sgemm_kernel_L2_M8_BEGIN:
+.Lsgemm_kernel_L2_M8_BEGIN:
	mov	counterI, origM
	tst	counterI , #15
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END
	tst	counterI, #8
-	ble	sgemm_kernel_L2_M4_BEGIN
+	ble	.Lsgemm_kernel_L2_M4_BEGIN

-sgemm_kernel_L2_M8_20:
+.Lsgemm_kernel_L2_M8_20:
	INIT8x2
@@ -1583,10 +1583,10 @@ sgemm_kernel_L2_M8_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M8_40
+	ble	.Lsgemm_kernel_L2_M8_40

	.align 5
-sgemm_kernel_L2_M8_22:
+.Lsgemm_kernel_L2_M8_22:
	KERNEL8x2_SUB
	KERNEL8x2_SUB
	KERNEL8x2_SUB
@@ -1598,38 +1598,38 @@ sgemm_kernel_L2_M8_22:
	KERNEL8x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M8_22
+	bgt	.Lsgemm_kernel_L2_M8_22

-sgemm_kernel_L2_M8_40:
+.Lsgemm_kernel_L2_M8_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M8_100
+	ble	.Lsgemm_kernel_L2_M8_100

-sgemm_kernel_L2_M8_42:
+.Lsgemm_kernel_L2_M8_42:
	KERNEL8x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M8_42
+	bgt	.Lsgemm_kernel_L2_M8_42

-sgemm_kernel_L2_M8_100:
+.Lsgemm_kernel_L2_M8_100:
	SAVE8x2

-sgemm_kernel_L2_M8_END:
+.Lsgemm_kernel_L2_M8_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END
	tst	counterI, #4
-	ble	sgemm_kernel_L2_M2_BEGIN
+	ble	.Lsgemm_kernel_L2_M2_BEGIN

-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
	INIT4x2
@@ -1637,10 +1637,10 @@ sgemm_kernel_L2_M4_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M4_40
+	ble	.Lsgemm_kernel_L2_M4_40

	.align 5
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
@@ -1652,40 +1652,40 @@ sgemm_kernel_L2_M4_22:
	KERNEL4x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M4_22
+	bgt	.Lsgemm_kernel_L2_M4_22

-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M4_100
+	ble	.Lsgemm_kernel_L2_M4_100

-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
	KERNEL4x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M4_42
+	bgt	.Lsgemm_kernel_L2_M4_42

-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
	SAVE4x2

-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L2_M1_BEGIN
+	ble	.Lsgemm_kernel_L2_M1_BEGIN

-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
	INIT2x2
@@ -1693,9 +1693,9 @@ sgemm_kernel_L2_M2_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M2_40
+	ble	.Lsgemm_kernel_L2_M2_40

-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
	KERNEL2x2_SUB
	KERNEL2x2_SUB
@@ -1708,34 +1708,34 @@ sgemm_kernel_L2_M2_22:
	KERNEL2x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M2_22
+	bgt	.Lsgemm_kernel_L2_M2_22

-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M2_100
+	ble	.Lsgemm_kernel_L2_M2_100

-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
	KERNEL2x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M2_42
+	bgt	.Lsgemm_kernel_L2_M2_42

-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
	SAVE2x2

-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:

-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END

-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
	INIT1x2
@@ -1743,9 +1743,9 @@ sgemm_kernel_L2_M1_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL, #0
-	ble	sgemm_kernel_L2_M1_40
+	ble	.Lsgemm_kernel_L2_M1_40

-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
@@ -1757,36 +1757,36 @@ sgemm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M1_22
+	bgt	.Lsgemm_kernel_L2_M1_22

-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M1_100
+	ble	.Lsgemm_kernel_L2_M1_100

-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
	KERNEL1x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M1_42
+	bgt	.Lsgemm_kernel_L2_M1_42

-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
	SAVE1x2

-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
	add	origPB, origPB, origK, lsl #3	// B = B + K * 2 * 4

/******************************************************************************/

-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
	mov	counterJ , origN
	tst	counterJ , #1
-	ble	sgemm_kernel_L999		// done
+	ble	.Lsgemm_kernel_L999		// done
	mov	pCRow0, pC			// pCRow0 = C
@@ -1794,14 +1794,14 @@ sgemm_kernel_L1_BEGIN:
	mov	pA, origPA			// pA = A

-sgemm_kernel_L1_M16_BEGIN:
+.Lsgemm_kernel_L1_M16_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #4		// counterI = counterI / 16
	cmp	counterI, #0
-	ble	sgemm_kernel_L1_M8_BEGIN
+	ble	.Lsgemm_kernel_L1_M8_BEGIN

-sgemm_kernel_L1_M16_20:
+.Lsgemm_kernel_L1_M16_20:
	INIT16x1
@@ -1809,10 +1809,10 @@ sgemm_kernel_L1_M16_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M16_40
+	ble	.Lsgemm_kernel_L1_M16_40

	.align 5
-sgemm_kernel_L1_M16_22:
+.Lsgemm_kernel_L1_M16_22:
	KERNEL16x1_SUB
	KERNEL16x1_SUB
	KERNEL16x1_SUB
@@ -1824,42 +1824,42 @@ sgemm_kernel_L1_M16_22:
	KERNEL16x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M16_22
+	bgt	.Lsgemm_kernel_L1_M16_22

-sgemm_kernel_L1_M16_40:
+.Lsgemm_kernel_L1_M16_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M16_100
+	ble	.Lsgemm_kernel_L1_M16_100

-sgemm_kernel_L1_M16_42:
+.Lsgemm_kernel_L1_M16_42:
	KERNEL16x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M16_42
+	bgt	.Lsgemm_kernel_L1_M16_42

-sgemm_kernel_L1_M16_100:
+.Lsgemm_kernel_L1_M16_100:
	SAVE16x1

-sgemm_kernel_L1_M16_END:
+.Lsgemm_kernel_L1_M16_END:
	subs	counterI, counterI, #1
-	bgt	sgemm_kernel_L1_M16_20
+	bgt	.Lsgemm_kernel_L1_M16_20

//------------------------------------------------------------------------------

-sgemm_kernel_L1_M8_BEGIN:
+.Lsgemm_kernel_L1_M8_BEGIN:
	mov	counterI, origM
	tst	counterI , #15
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END
	tst	counterI, #8
-	ble	sgemm_kernel_L1_M4_BEGIN
+	ble	.Lsgemm_kernel_L1_M4_BEGIN

-sgemm_kernel_L1_M8_20:
+.Lsgemm_kernel_L1_M8_20:
	INIT8x1
@@ -1867,10 +1867,10 @@ sgemm_kernel_L1_M8_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M8_40
+	ble	.Lsgemm_kernel_L1_M8_40

	.align 5
-sgemm_kernel_L1_M8_22:
+.Lsgemm_kernel_L1_M8_22:
	KERNEL8x1_SUB
	KERNEL8x1_SUB
	KERNEL8x1_SUB
@@ -1882,38 +1882,38 @@ sgemm_kernel_L1_M8_22:
	KERNEL8x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M8_22
+	bgt	.Lsgemm_kernel_L1_M8_22

-sgemm_kernel_L1_M8_40:
+.Lsgemm_kernel_L1_M8_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M8_100
+	ble	.Lsgemm_kernel_L1_M8_100

-sgemm_kernel_L1_M8_42:
+.Lsgemm_kernel_L1_M8_42:
	KERNEL8x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M8_42
+	bgt	.Lsgemm_kernel_L1_M8_42

-sgemm_kernel_L1_M8_100:
+.Lsgemm_kernel_L1_M8_100:
	SAVE8x1

-sgemm_kernel_L1_M8_END:
+.Lsgemm_kernel_L1_M8_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END
	tst	counterI, #4
-	ble	sgemm_kernel_L1_M2_BEGIN
+	ble	.Lsgemm_kernel_L1_M2_BEGIN

-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
	INIT4x1
@@ -1921,10 +1921,10 @@ sgemm_kernel_L1_M4_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M4_40
+	ble	.Lsgemm_kernel_L1_M4_40

	.align 5
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
@@ -1936,39 +1936,39 @@ sgemm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M4_22
+	bgt	.Lsgemm_kernel_L1_M4_22

-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M4_100
+	ble	.Lsgemm_kernel_L1_M4_100

-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
	KERNEL4x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M4_42
+	bgt	.Lsgemm_kernel_L1_M4_42

-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
	SAVE4x1

-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:

//------------------------------------------------------------------------------

-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L1_M1_BEGIN
+	ble	.Lsgemm_kernel_L1_M1_BEGIN

-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
	INIT2x1
@@ -1976,9 +1976,9 @@ sgemm_kernel_L1_M2_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M2_40
+	ble	.Lsgemm_kernel_L1_M2_40

-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
	KERNEL2x1_SUB
	KERNEL2x1_SUB
@@ -1991,34 +1991,34 @@ sgemm_kernel_L1_M2_22:
	KERNEL2x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M2_22
+	bgt	.Lsgemm_kernel_L1_M2_22

-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M2_100
+	ble	.Lsgemm_kernel_L1_M2_100

-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
	KERNEL2x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M2_42
+	bgt	.Lsgemm_kernel_L1_M2_42

-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
	SAVE2x1

-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:

-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END

-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
	INIT1x1
@@ -2026,9 +2026,9 @@ sgemm_kernel_L1_M1_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M1_40
+	ble	.Lsgemm_kernel_L1_M1_40

-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
@@ -2040,28 +2040,28 @@ sgemm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M1_22
+	bgt	.Lsgemm_kernel_L1_M1_22

-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M1_100
+	ble	.Lsgemm_kernel_L1_M1_100

-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
	KERNEL1x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M1_42
+	bgt	.Lsgemm_kernel_L1_M1_42

-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
	SAVE1x1

-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:

-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
	mov	x0, #0				// set return value
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
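// [Editor's note] The hunks in sgemm_kernel_4x4.S below rename labels in
// the same unrolled-loop skeleton that recurs throughout these kernels:
// a head loop running K/8 iterations of an unrolled body, then a tail
// loop for the K%8 remainder. A minimal sketch with assumed registers
// (x2 = K, x4 = counter; the real kernels use macros like KERNEL4x4_SUB):
//
//		asr	x4, x2, #3	// counter = K / 8
//		cbz	x4, .Ltail
//	.Lbody:
//		// ... eight unrolled updates ...
//		subs	x4, x4, #1
//		bgt	.Lbody
//	.Ltail:
//		ands	x4, x2, #7	// counter = K % 8 (sets flags)
//		beq	.Ldone
//	.Ltail1:
//		// ... one update ...
//		subs	x4, x4, #1
//		bgt	.Ltail1
//	.Ldone: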
diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S
index a5cf7ba..76c11f1 100644
--- a/kernel/arm64/sgemm_kernel_4x4.S
+++ b/kernel/arm64/sgemm_kernel_4x4.S
@@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	mov	counterJ, origN
	asr	counterJ, counterJ, #2		// J = J / 4
	cmp	counterJ, #0
-	ble	sgemm_kernel_L2_BEGIN
+	ble	.Lsgemm_kernel_L2_BEGIN

/******************************************************************************/

-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
	mov	pCRow0, pC			// pCRow0 = C
	add	pC, pC, LDC, lsl #2
@@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN:
	add	pA_2, temp, pA_1
	add	pA_3, temp, pA_2

-sgemm_kernel_L4_M16_BEGIN:
+.Lsgemm_kernel_L4_M16_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #4		// counterI = counterI / 16
	cmp	counterI, #0
-	ble	sgemm_kernel_L4_M8_BEGIN
+	ble	.Lsgemm_kernel_L4_M8_BEGIN

-sgemm_kernel_L4_M16_20:
+.Lsgemm_kernel_L4_M16_20:
	mov	pB, origPB
	asr	counterL , origK, #1		// L = K / 2
	cmp	counterL , #2			// is there at least 4 to do?
-	blt	sgemm_kernel_L4_M16_32
+	blt	.Lsgemm_kernel_L4_M16_32
	KERNEL16x4_I				// do one in the K
	KERNEL16x4_M2				// do another in the K
	subs	counterL, counterL, #2
-	ble	sgemm_kernel_L4_M16_22a
+	ble	.Lsgemm_kernel_L4_M16_22a

	.align 5
-sgemm_kernel_L4_M16_22:
+.Lsgemm_kernel_L4_M16_22:
	KERNEL16x4_M1
	KERNEL16x4_M2
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M16_22
+	bgt	.Lsgemm_kernel_L4_M16_22

-sgemm_kernel_L4_M16_22a:
+.Lsgemm_kernel_L4_M16_22a:
	KERNEL16x4_M1
	KERNEL16x4_E
-	b	sgemm_kernel_L4_M16_44
+	b	.Lsgemm_kernel_L4_M16_44

-sgemm_kernel_L4_M16_32:
+.Lsgemm_kernel_L4_M16_32:
	tst	counterL, #1
-	ble	sgemm_kernel_L4_M16_40
+	ble	.Lsgemm_kernel_L4_M16_40
	KERNEL16x4_I
	KERNEL16x4_E
-	b	sgemm_kernel_L4_M16_44
+	b	.Lsgemm_kernel_L4_M16_44

-sgemm_kernel_L4_M16_40:
+.Lsgemm_kernel_L4_M16_40:
	INIT16x4

-sgemm_kernel_L4_M16_44:
+.Lsgemm_kernel_L4_M16_44:
	ands	counterL , origK, #1
-	ble	sgemm_kernel_L4_M16_100
+	ble	.Lsgemm_kernel_L4_M16_100

-sgemm_kernel_L4_M16_46:
+.Lsgemm_kernel_L4_M16_46:
	KERNEL16x4_SUB

-sgemm_kernel_L4_M16_100:
+.Lsgemm_kernel_L4_M16_100:
	SAVE16x4

-sgemm_kernel_L4_M16_END:
+.Lsgemm_kernel_L4_M16_END:
	lsl	temp, origK, #4			// k * 4 * 4 = Four rows of A
	add	pA_0, pA_0, temp
	add	pA_0, pA_0, temp
@@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END:
	add	pA_2, pA_1, temp
	add	pA_3, pA_2, temp
	subs	counterI, counterI, #1
-	bne	sgemm_kernel_L4_M16_20
+	bne	.Lsgemm_kernel_L4_M16_20

-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
	mov	counterI, origM
	tst	counterI , #15
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END
	tst	counterI, #8
-	ble	sgemm_kernel_L4_M4_BEGIN
+	ble	.Lsgemm_kernel_L4_M4_BEGIN

-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
	INIT8x4
	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = counterL / 8
	cmp	counterL, #0
-	ble	sgemm_kernel_L4_M8_40
+	ble	.Lsgemm_kernel_L4_M8_40

-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
	KERNEL8x4_SUB
	KERNEL8x4_SUB
@@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22:
	KERNEL8x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M8_22
+	bgt	.Lsgemm_kernel_L4_M8_22

-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L4_M8_100
+	ble	.Lsgemm_kernel_L4_M8_100

-sgemm_kernel_L4_M8_42:
+.Lsgemm_kernel_L4_M8_42:
	KERNEL8x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M8_42
+	bgt	.Lsgemm_kernel_L4_M8_42

-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
	SAVE8x4

-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
	lsl	temp, origK, #4			// k * 4 * 4
	add	pA_0, pA_0, temp

-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END
	tst	counterI, #4
-	ble	sgemm_kernel_L4_M2_BEGIN
+	ble	.Lsgemm_kernel_L4_M2_BEGIN

-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
	INIT4x4
	mov	pB, origPB
	asr	counterL, origK, #3		// counterL = counterL / 8
	cmp	counterL, #0
-	ble	sgemm_kernel_L4_M4_40
+	ble	.Lsgemm_kernel_L4_M4_40

-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
	KERNEL4x4_SUB
	KERNEL4x4_SUB
@@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22:
	KERNEL4x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M4_22
+	bgt	.Lsgemm_kernel_L4_M4_22

-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L4_M4_100
+	ble	.Lsgemm_kernel_L4_M4_100

-sgemm_kernel_L4_M4_42:
+.Lsgemm_kernel_L4_M4_42:
	KERNEL4x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M4_42
+	bgt	.Lsgemm_kernel_L4_M4_42

-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
	SAVE4x4

-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:

-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L4_M1_BEGIN
+	ble	.Lsgemm_kernel_L4_M1_BEGIN

-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
	INIT2x4
	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L4_M2_40
+	ble	.Lsgemm_kernel_L4_M2_40

-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
	KERNEL2x4_SUB
	KERNEL2x4_SUB
@@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22:
	KERNEL2x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M2_22
+	bgt	.Lsgemm_kernel_L4_M2_22

-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L4_M2_100
+	ble	.Lsgemm_kernel_L4_M2_100

-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
	KERNEL2x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M2_42
+	bgt	.Lsgemm_kernel_L4_M2_42

-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
	SAVE2x4

-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:

-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END

-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
	INIT1x4
	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L4_M1_40
+	ble	.Lsgemm_kernel_L4_M1_40

-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
@@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M1_22
+	bgt	.Lsgemm_kernel_L4_M1_22

-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L4_M1_100
+	ble	.Lsgemm_kernel_L4_M1_100

-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
	KERNEL1x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M1_42
+	bgt	.Lsgemm_kernel_L4_M1_42

-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
	SAVE1x4

-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
	lsl	temp, origK, #4
	add	origPB, origPB, temp		// B = B + K * 4 * 4
	subs	counterJ, counterJ , #1		// j--
-	bgt	sgemm_kernel_L4_BEGIN
+	bgt	.Lsgemm_kernel_L4_BEGIN

/******************************************************************************/

-sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN:   // less than 2 left in N direction
	mov	counterJ , origN
	tst	counterJ , #3
-	ble	sgemm_kernel_L999
+	ble	.Lsgemm_kernel_L999
	tst	counterJ , #2
-	ble	sgemm_kernel_L1_BEGIN
+	ble	.Lsgemm_kernel_L1_BEGIN
	mov	pCRow0, pC			// pCRow0 = pC
@@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN:   // less than 2 left in N direction

-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #2		// counterI = counterI / 4
	cmp	counterI,#0
-	ble	sgemm_kernel_L2_M2_BEGIN
+	ble	.Lsgemm_kernel_L2_M2_BEGIN

-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
	INIT4x2
	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M4_40
+	ble	.Lsgemm_kernel_L2_M4_40

	.align 5
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
	KERNEL4x2_SUB
	KERNEL4x2_SUB
	KERNEL4x2_SUB
@@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22:
	KERNEL4x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M4_22
+	bgt	.Lsgemm_kernel_L2_M4_22

-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M4_100
+	ble	.Lsgemm_kernel_L2_M4_100

-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
	KERNEL4x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M4_42
+	bgt	.Lsgemm_kernel_L2_M4_42

-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
	SAVE4x2

-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:
	subs	counterI, counterI, #1
-	bgt	sgemm_kernel_L2_M4_20
+	bgt	.Lsgemm_kernel_L2_M4_20

-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L2_M1_BEGIN
+	ble	.Lsgemm_kernel_L2_M1_BEGIN

-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
	INIT2x2
	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL,#0
-	ble	sgemm_kernel_L2_M2_40
+	ble	.Lsgemm_kernel_L2_M2_40

-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
	KERNEL2x2_SUB
	KERNEL2x2_SUB
@@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22:
	KERNEL2x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M2_22
+	bgt	.Lsgemm_kernel_L2_M2_22

-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M2_100
+	ble	.Lsgemm_kernel_L2_M2_100

-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
	KERNEL2x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M2_42
+	bgt	.Lsgemm_kernel_L2_M2_42

-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
	SAVE2x2

-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:

-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L2_END
+	ble	.Lsgemm_kernel_L2_END

-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
	INIT1x2
	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL, #0
-	ble	sgemm_kernel_L2_M1_40
+	ble	.Lsgemm_kernel_L2_M1_40

-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
@@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22:
	KERNEL1x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M1_22
+	bgt	.Lsgemm_kernel_L2_M1_22

-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L2_M1_100
+	ble	.Lsgemm_kernel_L2_M1_100

-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
	KERNEL1x2_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L2_M1_42
+	bgt	.Lsgemm_kernel_L2_M1_42

-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
	SAVE1x2

-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
	add	origPB, origPB, origK, lsl #3	// B = B + K * 2 * 4

/******************************************************************************/

-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
	mov	counterJ , origN
	tst	counterJ , #1
-	ble	sgemm_kernel_L999		// done
+	ble	.Lsgemm_kernel_L999		// done
	mov	pCRow0, pC			// pCRow0 = C
@@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN:

-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #2		// counterI = counterI / 4
	cmp	counterI, #0
-	ble	sgemm_kernel_L1_M2_BEGIN
+	ble	.Lsgemm_kernel_L1_M2_BEGIN

-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
	INIT4x1
	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M4_40
+	ble	.Lsgemm_kernel_L1_M4_40

	.align 5
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	KERNEL4x1_SUB
	KERNEL4x1_SUB
@@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22:
	KERNEL4x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M4_22
+	bgt	.Lsgemm_kernel_L1_M4_22

-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M4_100
+	ble	.Lsgemm_kernel_L1_M4_100

-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
	KERNEL4x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M4_42
+	bgt	.Lsgemm_kernel_L1_M4_42

-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
	SAVE4x1

-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:
	subs	counterI, counterI, #1
-	bgt	sgemm_kernel_L1_M4_20
+	bgt	.Lsgemm_kernel_L1_M4_20

-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L1_M1_BEGIN
+	ble	.Lsgemm_kernel_L1_M1_BEGIN

-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
	INIT2x1
	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M2_40
+	ble	.Lsgemm_kernel_L1_M2_40

-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
	KERNEL2x1_SUB
	KERNEL2x1_SUB
@@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22:
	KERNEL2x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M2_22
+	bgt	.Lsgemm_kernel_L1_M2_22

-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M2_100
+	ble	.Lsgemm_kernel_L1_M2_100

-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
	KERNEL2x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M2_42
+	bgt	.Lsgemm_kernel_L1_M2_42

-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
	SAVE2x1

-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:

-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L1_END
+	ble	.Lsgemm_kernel_L1_END

-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
	INIT1x1
	mov	pB, origPB
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L1_M1_40
+	ble	.Lsgemm_kernel_L1_M1_40

-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
@@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22:
	KERNEL1x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M1_22
+	bgt	.Lsgemm_kernel_L1_M1_22

-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L1_M1_100
+	ble	.Lsgemm_kernel_L1_M1_100

-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
	KERNEL1x1_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L1_M1_42
+	bgt	.Lsgemm_kernel_L1_M1_42

-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
	SAVE1x1

-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:

-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
	mov	x0, #0				// set return value
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
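// [Editor's note] None of these renames can change the generated code:
// b and b.cond are PC-relative, and a label name exists only at assembly
// time, so a sequence like
//
//		bne	.Lsgemm_kernel_L1_M1_42
//
// encodes the identical instruction that "bne sgemm_kernel_L1_M1_42"
// did. Only the object file's symbol table differs; listing symbols
// before and after the patch (an assumed verification step, not part of
// the commit itself) would show the old label names as local text
// symbols and the .L names not at all.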
diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S
index bd47bed..6ba64dd 100644
--- a/kernel/arm64/sgemm_kernel_8x8.S
+++ b/kernel/arm64/sgemm_kernel_8x8.S
@@ -1263,7 +1263,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	PROLOGUE
-sgemm_kernel_begin:
+.Lsgemm_kernel_begin:
	.align 5
	add	sp, sp, #-(11 * 16)
@@ -1291,12 +1291,12 @@ sgemm_kernel_begin:
	mov	counterJ, origN
	asr	counterJ, counterJ, #3		// J = J / 8
	cmp	counterJ, #0
-	ble	sgemm_kernel_L4_BEGIN
+	ble	.Lsgemm_kernel_L4_BEGIN

/******************************************************************************/
/******************************************************************************/

-sgemm_kernel_L8_BEGIN:
+.Lsgemm_kernel_L8_BEGIN:
	mov	pCRow0, pC			// pCRow0 = C
	add	pC, pC, LDC, lsl #3
@@ -1304,156 +1304,156 @@ sgemm_kernel_L8_BEGIN:

/******************************************************************************/

-sgemm_kernel_L8_M8_BEGIN:
+.Lsgemm_kernel_L8_M8_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #3		// counterI = counterI / 8
	cmp	counterI, #0
-	ble	sgemm_kernel_L8_M4_BEGIN
+	ble	.Lsgemm_kernel_L8_M4_BEGIN

-sgemm_kernel_L8_M8_20:
+.Lsgemm_kernel_L8_M8_20:
	mov	pB, origPB
	asr	counterL , origK, #1		// L = K / 2
	cmp	counterL , #2			// is there at least 4 to do?
-	blt	sgemm_kernel_L8_M8_32
+	blt	.Lsgemm_kernel_L8_M8_32
	KERNEL8x8_I				// do one in the K
	KERNEL8x8_M2				// do another in the K
	subs	counterL, counterL, #2
-	ble	sgemm_kernel_L8_M8_22a
+	ble	.Lsgemm_kernel_L8_M8_22a

	.align 5
-sgemm_kernel_L8_M8_22:
+.Lsgemm_kernel_L8_M8_22:
	KERNEL8x8_M1
	KERNEL8x8_M2
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L8_M8_22
+	bgt	.Lsgemm_kernel_L8_M8_22

-sgemm_kernel_L8_M8_22a:
+.Lsgemm_kernel_L8_M8_22a:
	KERNEL8x8_M1
	KERNEL8x8_E
-	b	sgemm_kernel_L8_M8_44
+	b	.Lsgemm_kernel_L8_M8_44

-sgemm_kernel_L8_M8_32:
+.Lsgemm_kernel_L8_M8_32:
	tst	counterL, #1
-	ble	sgemm_kernel_L8_M8_40
+	ble	.Lsgemm_kernel_L8_M8_40
	KERNEL8x8_I
	KERNEL8x8_E
-	b	sgemm_kernel_L8_M8_44
+	b	.Lsgemm_kernel_L8_M8_44

-sgemm_kernel_L8_M8_40:
+.Lsgemm_kernel_L8_M8_40:
	INIT8x8

-sgemm_kernel_L8_M8_44:
+.Lsgemm_kernel_L8_M8_44:
	ands	counterL , origK, #1
-	ble	sgemm_kernel_L8_M8_100
+	ble	.Lsgemm_kernel_L8_M8_100

-sgemm_kernel_L8_M8_46:
+.Lsgemm_kernel_L8_M8_46:
	KERNEL8x8_SUB

-sgemm_kernel_L8_M8_100:
+.Lsgemm_kernel_L8_M8_100:
	SAVE8x8

-sgemm_kernel_L8_M8_END:
+.Lsgemm_kernel_L8_M8_END:
	subs	counterI, counterI, #1
-	bne	sgemm_kernel_L8_M8_20
+	bne	.Lsgemm_kernel_L8_M8_20

/******************************************************************************/

-sgemm_kernel_L8_M4_BEGIN:
+.Lsgemm_kernel_L8_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7
-	ble	sgemm_kernel_L8_END
+	ble	.Lsgemm_kernel_L8_END
	tst	counterI, #4
-	ble	sgemm_kernel_L8_M2_BEGIN
+	ble	.Lsgemm_kernel_L8_M2_BEGIN

-sgemm_kernel_L8_M4_20:
+.Lsgemm_kernel_L8_M4_20:
	mov	pB, origPB
	asr	counterL , origK, #1		// L = K / 2
	cmp	counterL , #2			// is there at least 4 to do?
-	blt	sgemm_kernel_L8_M4_32
+	blt	.Lsgemm_kernel_L8_M4_32
	KERNEL4x8_I				// do one in the K
	KERNEL4x8_M2				// do another in the K
	subs	counterL, counterL, #2
-	ble	sgemm_kernel_L8_M4_22a
+	ble	.Lsgemm_kernel_L8_M4_22a

	.align 5
-sgemm_kernel_L8_M4_22:
+.Lsgemm_kernel_L8_M4_22:
	KERNEL4x8_M1
	KERNEL4x8_M2
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L8_M4_22
+	bgt	.Lsgemm_kernel_L8_M4_22

-sgemm_kernel_L8_M4_22a:
+.Lsgemm_kernel_L8_M4_22a:
	KERNEL4x8_M1
	KERNEL4x8_E
-	b	sgemm_kernel_L8_M4_44
+	b	.Lsgemm_kernel_L8_M4_44

-sgemm_kernel_L8_M4_32:
+.Lsgemm_kernel_L8_M4_32:
	tst	counterL, #1
-	ble	sgemm_kernel_L8_M4_40
+	ble	.Lsgemm_kernel_L8_M4_40
	KERNEL4x8_I
	KERNEL4x8_E
-	b	sgemm_kernel_L8_M4_44
+	b	.Lsgemm_kernel_L8_M4_44

-sgemm_kernel_L8_M4_40:
+.Lsgemm_kernel_L8_M4_40:
	INIT4x8

-sgemm_kernel_L8_M4_44:
+.Lsgemm_kernel_L8_M4_44:
	ands	counterL , origK, #1
-	ble	sgemm_kernel_L8_M4_100
+	ble	.Lsgemm_kernel_L8_M4_100

-sgemm_kernel_L8_M4_46:
+.Lsgemm_kernel_L8_M4_46:
	KERNEL4x8_SUB

-sgemm_kernel_L8_M4_100:
+.Lsgemm_kernel_L8_M4_100:
	SAVE4x8

-sgemm_kernel_L8_M4_END:
+.Lsgemm_kernel_L8_M4_END:

/******************************************************************************/

-sgemm_kernel_L8_M2_BEGIN:
+.Lsgemm_kernel_L8_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L8_END
+	ble	.Lsgemm_kernel_L8_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L8_M1_BEGIN
+	ble	.Lsgemm_kernel_L8_M1_BEGIN

-sgemm_kernel_L8_M2_20:
+.Lsgemm_kernel_L8_M2_20:
	INIT2x8
@@ -1461,9 +1461,9 @@ sgemm_kernel_L8_M2_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L8_M2_40
+	ble	.Lsgemm_kernel_L8_M2_40

-sgemm_kernel_L8_M2_22:
+.Lsgemm_kernel_L8_M2_22:
	KERNEL2x8_SUB
	KERNEL2x8_SUB
@@ -1476,35 +1476,35 @@ sgemm_kernel_L8_M2_22:
	KERNEL2x8_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L8_M2_22
+	bgt	.Lsgemm_kernel_L8_M2_22

-sgemm_kernel_L8_M2_40:
+.Lsgemm_kernel_L8_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L8_M2_100
+	ble	.Lsgemm_kernel_L8_M2_100

-sgemm_kernel_L8_M2_42:
+.Lsgemm_kernel_L8_M2_42:
	KERNEL2x8_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L8_M2_42
+	bgt	.Lsgemm_kernel_L8_M2_42

-sgemm_kernel_L8_M2_100:
+.Lsgemm_kernel_L8_M2_100:
	SAVE2x8

-sgemm_kernel_L8_M2_END:
+.Lsgemm_kernel_L8_M2_END:

/******************************************************************************/

-sgemm_kernel_L8_M1_BEGIN:
+.Lsgemm_kernel_L8_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L8_END
+	ble	.Lsgemm_kernel_L8_END

-sgemm_kernel_L8_M1_20:
+.Lsgemm_kernel_L8_M1_20:
	INIT1x8
@@ -1512,9 +1512,9 @@ sgemm_kernel_L8_M1_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L8_M1_40
+	ble	.Lsgemm_kernel_L8_M1_40

-sgemm_kernel_L8_M1_22:
+.Lsgemm_kernel_L8_M1_22:
	KERNEL1x8_SUB
	KERNEL1x8_SUB
	KERNEL1x8_SUB
@@ -1526,43 +1526,43 @@ sgemm_kernel_L8_M1_22:
	KERNEL1x8_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L8_M1_22
+	bgt	.Lsgemm_kernel_L8_M1_22

-sgemm_kernel_L8_M1_40:
+.Lsgemm_kernel_L8_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L8_M1_100
+	ble	.Lsgemm_kernel_L8_M1_100

-sgemm_kernel_L8_M1_42:
+.Lsgemm_kernel_L8_M1_42:
	KERNEL1x8_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L8_M1_42
+	bgt	.Lsgemm_kernel_L8_M1_42

-sgemm_kernel_L8_M1_100:
+.Lsgemm_kernel_L8_M1_100:
	SAVE1x8

-sgemm_kernel_L8_END:
+.Lsgemm_kernel_L8_END:
	lsl	temp, origK, #5			// B = B + K * 4 * 8
	add	origPB, origPB, temp
	subs	counterJ, counterJ , #1		// j--
-	bgt	sgemm_kernel_L8_BEGIN
+	bgt	.Lsgemm_kernel_L8_BEGIN

/******************************************************************************/
/******************************************************************************/

-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
	mov	counterJ , origN
	tst	counterJ , #7
-	ble	sgemm_kernel_L999
+	ble	.Lsgemm_kernel_L999
	tst	counterJ , #4
-	ble	sgemm_kernel_L2_BEGIN
+	ble	.Lsgemm_kernel_L2_BEGIN
	mov	pCRow0, pC			// pCRow0 = pC
@@ -1572,156 +1572,156 @@ sgemm_kernel_L4_BEGIN:

/******************************************************************************/

-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
	mov	counterI, origM
	asr	counterI, counterI, #3		// counterI = counterI / 8
	cmp	counterI, #0
-	ble	sgemm_kernel_L4_M4_BEGIN
+	ble	.Lsgemm_kernel_L4_M4_BEGIN

-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
	mov	pB, origPB
	asr	counterL , origK, #1		// L = K / 2
	cmp	counterL , #2			// is there at least 4 to do?
-	blt	sgemm_kernel_L4_M8_32
+	blt	.Lsgemm_kernel_L4_M8_32
	KERNEL8x4_I				// do one in the K
	KERNEL8x4_M2				// do another in the K
	subs	counterL, counterL, #2
-	ble	sgemm_kernel_L4_M8_22a
+	ble	.Lsgemm_kernel_L4_M8_22a

	.align 5
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
	KERNEL8x4_M1
	KERNEL8x4_M2
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M8_22
+	bgt	.Lsgemm_kernel_L4_M8_22

-sgemm_kernel_L4_M8_22a:
+.Lsgemm_kernel_L4_M8_22a:
	KERNEL8x4_M1
	KERNEL8x4_E
-	b	sgemm_kernel_L4_M8_44
+	b	.Lsgemm_kernel_L4_M8_44

-sgemm_kernel_L4_M8_32:
+.Lsgemm_kernel_L4_M8_32:
	tst	counterL, #1
-	ble	sgemm_kernel_L4_M8_40
+	ble	.Lsgemm_kernel_L4_M8_40
	KERNEL8x4_I
	KERNEL8x4_E
-	b	sgemm_kernel_L4_M8_44
+	b	.Lsgemm_kernel_L4_M8_44

-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
	INIT8x4

-sgemm_kernel_L4_M8_44:
+.Lsgemm_kernel_L4_M8_44:
	ands	counterL , origK, #1
-	ble	sgemm_kernel_L4_M8_100
+	ble	.Lsgemm_kernel_L4_M8_100

-sgemm_kernel_L4_M8_46:
+.Lsgemm_kernel_L4_M8_46:
	KERNEL8x4_SUB

-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
	SAVE8x4

-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
	subs	counterI, counterI, #1
-	bne	sgemm_kernel_L4_M8_20
+	bne	.Lsgemm_kernel_L4_M8_20

/******************************************************************************/

-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
	mov	counterI, origM
	tst	counterI , #7
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END
	tst	counterI, #4
-	ble	sgemm_kernel_L4_M2_BEGIN
+	ble	.Lsgemm_kernel_L4_M2_BEGIN

-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
	mov	pB, origPB
	asr	counterL , origK, #1		// L = K / 2
	cmp	counterL , #2			// is there at least 4 to do?
-	blt	sgemm_kernel_L4_M4_32
+	blt	.Lsgemm_kernel_L4_M4_32
	KERNEL4x4_I				// do one in the K
	KERNEL4x4_M2				// do another in the K
	subs	counterL, counterL, #2
-	ble	sgemm_kernel_L4_M4_22a
+	ble	.Lsgemm_kernel_L4_M4_22a

	.align 5
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
	KERNEL4x4_M1
	KERNEL4x4_M2
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M4_22
+	bgt	.Lsgemm_kernel_L4_M4_22

-sgemm_kernel_L4_M4_22a:
+.Lsgemm_kernel_L4_M4_22a:
	KERNEL4x4_M1
	KERNEL4x4_E
-	b	sgemm_kernel_L4_M4_44
+	b	.Lsgemm_kernel_L4_M4_44

-sgemm_kernel_L4_M4_32:
+.Lsgemm_kernel_L4_M4_32:
	tst	counterL, #1
-	ble	sgemm_kernel_L4_M4_40
+	ble	.Lsgemm_kernel_L4_M4_40
	KERNEL4x4_I
	KERNEL4x4_E
-	b	sgemm_kernel_L4_M4_44
+	b	.Lsgemm_kernel_L4_M4_44

-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
	INIT4x4

-sgemm_kernel_L4_M4_44:
+.Lsgemm_kernel_L4_M4_44:
	ands	counterL , origK, #1
-	ble	sgemm_kernel_L4_M4_100
+	ble	.Lsgemm_kernel_L4_M4_100

-sgemm_kernel_L4_M4_46:
+.Lsgemm_kernel_L4_M4_46:
	KERNEL4x4_SUB

-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
	SAVE4x4

-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:

/******************************************************************************/

-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
	mov	counterI, origM
	tst	counterI , #3
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END
	tst	counterI, #2			// counterI = counterI / 2
-	ble	sgemm_kernel_L4_M1_BEGIN
+	ble	.Lsgemm_kernel_L4_M1_BEGIN

-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
	INIT2x4
@@ -1729,9 +1729,9 @@ sgemm_kernel_L4_M2_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L4_M2_40
+	ble	.Lsgemm_kernel_L4_M2_40

-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
	KERNEL2x4_SUB
	KERNEL2x4_SUB
@@ -1744,35 +1744,35 @@ sgemm_kernel_L4_M2_22:
	KERNEL2x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M2_22
+	bgt	.Lsgemm_kernel_L4_M2_22

-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L4_M2_100
+	ble	.Lsgemm_kernel_L4_M2_100

-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
	KERNEL2x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M2_42
+	bgt	.Lsgemm_kernel_L4_M2_42

-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
	SAVE2x4

-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:

/******************************************************************************/

-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
	tst	counterI, #1			// counterI = counterI % 2
-	ble	sgemm_kernel_L4_END
+	ble	.Lsgemm_kernel_L4_END

-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
	INIT1x4
@@ -1780,9 +1780,9 @@ sgemm_kernel_L4_M1_20:
	asr	counterL , origK, #3		// counterL = counterL / 8
	cmp	counterL , #0
-	ble	sgemm_kernel_L4_M1_40
+	ble	.Lsgemm_kernel_L4_M1_40

-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	KERNEL1x4_SUB
	KERNEL1x4_SUB
@@ -1794,39 +1794,39 @@ sgemm_kernel_L4_M1_22:
	KERNEL1x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M1_22
+	bgt	.Lsgemm_kernel_L4_M1_22

-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
	ands	counterL , origK, #7		// counterL = counterL % 8
-	ble	sgemm_kernel_L4_M1_100
+	ble	.Lsgemm_kernel_L4_M1_100

-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
	KERNEL1x4_SUB
	subs	counterL, counterL, #1
-	bgt	sgemm_kernel_L4_M1_42
+	bgt	.Lsgemm_kernel_L4_M1_42

-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
	SAVE1x4

-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
	add	origPB, origPB, origK, lsl #4	// B = B + K * 4 * 4
/******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1836,14 +1836,14 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction /******************************************************************************/ -sgemm_kernel_L2_M8_BEGIN: +.Lsgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI,#0 - ble sgemm_kernel_L2_M4_BEGIN + ble .Lsgemm_kernel_L2_M4_BEGIN -sgemm_kernel_L2_M8_20: +.Lsgemm_kernel_L2_M8_20: INIT8x2 @@ -1851,10 +1851,10 @@ sgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M8_40 + ble .Lsgemm_kernel_L2_M8_40 .align 5 -sgemm_kernel_L2_M8_22: +.Lsgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1866,42 +1866,42 @@ sgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_22 + bgt .Lsgemm_kernel_L2_M8_22 -sgemm_kernel_L2_M8_40: +.Lsgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M8_100 + ble .Lsgemm_kernel_L2_M8_100 -sgemm_kernel_L2_M8_42: +.Lsgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_42 + bgt .Lsgemm_kernel_L2_M8_42 -sgemm_kernel_L2_M8_100: +.Lsgemm_kernel_L2_M8_100: SAVE8x2 -sgemm_kernel_L2_M8_END: +.Lsgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L2_M8_20 + bgt .Lsgemm_kernel_L2_M8_20 /******************************************************************************/ -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #4 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 @@ -1909,10 +1909,10 @@ sgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1924,39 +1924,39 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: /******************************************************************************/ -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 @@ -1964,9 +1964,9 @@ sgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp 
counterL,#0 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1979,35 +1979,35 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: /******************************************************************************/ -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: +.Lsgemm_kernel_L2_M1_20: INIT1x2 @@ -2015,9 +2015,9 @@ sgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 -sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -2029,37 +2029,37 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble sgemm_kernel_L999 // done + ble .Lsgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -2069,14 +2069,14 @@ sgemm_kernel_L1_BEGIN: /******************************************************************************/ -sgemm_kernel_L1_M8_BEGIN: +.Lsgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 cmp counterI, #0 - ble sgemm_kernel_L1_M4_BEGIN + ble .Lsgemm_kernel_L1_M4_BEGIN -sgemm_kernel_L1_M8_20: +.Lsgemm_kernel_L1_M8_20: INIT8x1 @@ -2084,10 +2084,10 @@ sgemm_kernel_L1_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M8_40 + ble .Lsgemm_kernel_L1_M8_40 .align 5 -sgemm_kernel_L1_M8_22: +.Lsgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -2099,42 +2099,42 @@ sgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_22 + bgt .Lsgemm_kernel_L1_M8_22 -sgemm_kernel_L1_M8_40: +.Lsgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M8_100 + ble .Lsgemm_kernel_L1_M8_100 -sgemm_kernel_L1_M8_42: +.Lsgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_42 + bgt .Lsgemm_kernel_L1_M8_42 -sgemm_kernel_L1_M8_100: +.Lsgemm_kernel_L1_M8_100: SAVE8x1 -sgemm_kernel_L1_M8_END: +.Lsgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L1_M8_20 + bgt .Lsgemm_kernel_L1_M8_20 
/******************************************************************************/ -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #4 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 @@ -2142,10 +2142,10 @@ sgemm_kernel_L1_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2157,39 +2157,39 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: /******************************************************************************/ -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 @@ -2197,9 +2197,9 @@ sgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2212,35 +2212,35 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: /******************************************************************************/ -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 @@ -2248,9 +2248,9 @@ sgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2262,30 +2262,30 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 -sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 -sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: 
+.Lsgemm_kernel_L1_END: /******************************************************************************/ -sgemm_kernel_L999: +.Lsgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 77e0510..985a0a9 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -1035,7 +1035,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -strmm_kernel_begin: +.Lstrmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1066,11 +1066,11 @@ strmm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble strmm_kernel_L2_BEGIN + ble .Lstrmm_kernel_L2_BEGIN /******************************************************************************/ -strmm_kernel_L4_BEGIN: +.Lstrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1084,15 +1084,15 @@ strmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -strmm_kernel_L4_M16_BEGIN: +.Lstrmm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble strmm_kernel_L4_M8_BEGIN + ble .Lstrmm_kernel_L4_M8_BEGIN .align 5 -strmm_kernel_L4_M16_20: +.Lstrmm_kernel_L4_M16_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1114,7 +1114,7 @@ strmm_kernel_L4_M16_20: asr counterL , tempK, #3 cmp counterL , #2 - blt strmm_kernel_L4_M16_32 + blt .Lstrmm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 @@ -1126,10 +1126,10 @@ strmm_kernel_L4_M16_20: KERNEL16x4_M2 subs counterL, counterL, #2 - ble strmm_kernel_L4_M16_22a + ble .Lstrmm_kernel_L4_M16_22a .align 5 -strmm_kernel_L4_M16_22: +.Lstrmm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1141,10 +1141,10 @@ strmm_kernel_L4_M16_22: KERNEL16x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M16_22 + bgt .Lstrmm_kernel_L4_M16_22 .align 5 -strmm_kernel_L4_M16_22a: +.Lstrmm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1155,13 +1155,13 @@ strmm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b strmm_kernel_L4_M16_44 + b .Lstrmm_kernel_L4_M16_44 .align 5 -strmm_kernel_L4_M16_32: +.Lstrmm_kernel_L4_M16_32: tst counterL, #1 - ble strmm_kernel_L4_M16_40 + ble .Lstrmm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 @@ -1172,25 +1172,25 @@ strmm_kernel_L4_M16_32: KERNEL16x4_M1 KERNEL16x4_E - b strmm_kernel_L4_M16_44 + b .Lstrmm_kernel_L4_M16_44 -strmm_kernel_L4_M16_40: +.Lstrmm_kernel_L4_M16_40: INIT16x4 -strmm_kernel_L4_M16_44: +.Lstrmm_kernel_L4_M16_44: ands counterL , tempK, #7 - ble strmm_kernel_L4_M16_100 + ble .Lstrmm_kernel_L4_M16_100 .align 5 -strmm_kernel_L4_M16_46: +.Lstrmm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 - bne strmm_kernel_L4_M16_46 + bne .Lstrmm_kernel_L4_M16_46 -strmm_kernel_L4_M16_100: +.Lstrmm_kernel_L4_M16_100: SAVE16x4 @@ -1213,22 +1213,22 @@ strmm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -strmm_kernel_L4_M16_END: +.Lstrmm_kernel_L4_M16_END: subs counterI, counterI, #1 - bne strmm_kernel_L4_M16_20 + bne .Lstrmm_kernel_L4_M16_20 //------------------------------------------------------------------------------ -strmm_kernel_L4_M8_BEGIN: +.Lstrmm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #8 - ble strmm_kernel_L4_M4_BEGIN + ble .Lstrmm_kernel_L4_M4_BEGIN -strmm_kernel_L4_M8_20: 
+.Lstrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1250,54 +1250,54 @@ strmm_kernel_L4_M8_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L4_M8_32 + blt .Lstrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M8_22a + ble .Lstrmm_kernel_L4_M8_22a .align 5 -strmm_kernel_L4_M8_22: +.Lstrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M8_22 + bgt .Lstrmm_kernel_L4_M8_22 -strmm_kernel_L4_M8_22a: +.Lstrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b strmm_kernel_L4_M8_44 + b .Lstrmm_kernel_L4_M8_44 -strmm_kernel_L4_M8_32: +.Lstrmm_kernel_L4_M8_32: tst counterL, #1 - ble strmm_kernel_L4_M8_40 + ble .Lstrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b strmm_kernel_L4_M8_44 + b .Lstrmm_kernel_L4_M8_44 -strmm_kernel_L4_M8_40: +.Lstrmm_kernel_L4_M8_40: INIT8x4 -strmm_kernel_L4_M8_44: +.Lstrmm_kernel_L4_M8_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M8_100 + ble .Lstrmm_kernel_L4_M8_100 -strmm_kernel_L4_M8_46: +.Lstrmm_kernel_L4_M8_46: KERNEL8x4_SUB -strmm_kernel_L4_M8_100: +.Lstrmm_kernel_L4_M8_100: SAVE8x4 @@ -1317,20 +1317,20 @@ strmm_kernel_L4_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L4_M8_END: +.Lstrmm_kernel_L4_M8_END: //------------------------------------------------------------------------------ -strmm_kernel_L4_M4_BEGIN: +.Lstrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #4 - ble strmm_kernel_L4_M2_BEGIN + ble .Lstrmm_kernel_L4_M2_BEGIN -strmm_kernel_L4_M4_20: +.Lstrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1350,54 +1350,54 @@ strmm_kernel_L4_M4_20: #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt strmm_kernel_L4_M4_32 + blt .Lstrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M4_22a + ble .Lstrmm_kernel_L4_M4_22a .align 5 -strmm_kernel_L4_M4_22: +.Lstrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M4_22 + bgt .Lstrmm_kernel_L4_M4_22 -strmm_kernel_L4_M4_22a: +.Lstrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_32: +.Lstrmm_kernel_L4_M4_32: tst counterL, #1 - ble strmm_kernel_L4_M4_40 + ble .Lstrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_40: +.Lstrmm_kernel_L4_M4_40: INIT4x4 -strmm_kernel_L4_M4_44: +.Lstrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M4_100 + ble .Lstrmm_kernel_L4_M4_100 -strmm_kernel_L4_M4_46: +.Lstrmm_kernel_L4_M4_46: KERNEL4x4_SUB -strmm_kernel_L4_M4_100: +.Lstrmm_kernel_L4_M4_100: SAVE4x4 @@ -1415,20 +1415,20 @@ strmm_kernel_L4_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L4_M4_END: +.Lstrmm_kernel_L4_M4_END: //------------------------------------------------------------------------------ -strmm_kernel_L4_M2_BEGIN: +.Lstrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L4_M1_BEGIN + ble .Lstrmm_kernel_L4_M1_BEGIN -strmm_kernel_L4_M2_20: +.Lstrmm_kernel_L4_M2_20: INIT2x4 @@ -1451,9 +1451,9 @@ strmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M2_40 + ble .Lstrmm_kernel_L4_M2_40 -strmm_kernel_L4_M2_22: +.Lstrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1466,22 +1466,22 @@ strmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_22 + bgt .Lstrmm_kernel_L4_M2_22 -strmm_kernel_L4_M2_40: +.Lstrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M2_100 + ble .Lstrmm_kernel_L4_M2_100 -strmm_kernel_L4_M2_42: +.Lstrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_42 + bgt .Lstrmm_kernel_L4_M2_42 -strmm_kernel_L4_M2_100: +.Lstrmm_kernel_L4_M2_100: SAVE2x4 @@ -1500,15 +1500,15 @@ strmm_kernel_L4_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -strmm_kernel_L4_M2_END: +.Lstrmm_kernel_L4_M2_END: -strmm_kernel_L4_M1_BEGIN: +.Lstrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END -strmm_kernel_L4_M1_20: +.Lstrmm_kernel_L4_M1_20: INIT1x4 @@ -1531,9 +1531,9 @@ strmm_kernel_L4_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M1_40 + ble .Lstrmm_kernel_L4_M1_40 -strmm_kernel_L4_M1_22: +.Lstrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1545,22 +1545,22 @@ strmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_22 + bgt .Lstrmm_kernel_L4_M1_22 -strmm_kernel_L4_M1_40: +.Lstrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M1_100 + ble .Lstrmm_kernel_L4_M1_100 -strmm_kernel_L4_M1_42: +.Lstrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_42 + bgt .Lstrmm_kernel_L4_M1_42 -strmm_kernel_L4_M1_100: +.Lstrmm_kernel_L4_M1_100: SAVE1x4 @@ -1579,26 +1579,26 @@ 
strmm_kernel_L4_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -strmm_kernel_L4_END: +.Lstrmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs counterJ, counterJ , #1 // j-- - bgt strmm_kernel_L4_BEGIN + bgt .Lstrmm_kernel_L4_BEGIN /******************************************************************************/ -strmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble strmm_kernel_L999 + ble .Lstrmm_kernel_L999 tst counterJ , #2 - ble strmm_kernel_L1_BEGIN + ble .Lstrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1609,14 +1609,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction #endif mov pA, origPA // pA = A -strmm_kernel_L2_M16_BEGIN: +.Lstrmm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 - ble strmm_kernel_L2_M8_BEGIN + ble .Lstrmm_kernel_L2_M8_BEGIN -strmm_kernel_L2_M16_20: +.Lstrmm_kernel_L2_M16_20: INIT16x2 @@ -1640,10 +1640,10 @@ strmm_kernel_L2_M16_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M16_40 + ble .Lstrmm_kernel_L2_M16_40 .align 5 -strmm_kernel_L2_M16_22: +.Lstrmm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB @@ -1655,22 +1655,22 @@ strmm_kernel_L2_M16_22: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M16_22 + bgt .Lstrmm_kernel_L2_M16_22 -strmm_kernel_L2_M16_40: +.Lstrmm_kernel_L2_M16_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M16_100 + ble .Lstrmm_kernel_L2_M16_100 -strmm_kernel_L2_M16_42: +.Lstrmm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M16_42 + bgt .Lstrmm_kernel_L2_M16_42 -strmm_kernel_L2_M16_100: +.Lstrmm_kernel_L2_M16_100: SAVE16x2 @@ -1690,22 +1690,22 @@ strmm_kernel_L2_M16_100: add tempOffset, tempOffset, #16 #endif -strmm_kernel_L2_M16_END: +.Lstrmm_kernel_L2_M16_END: subs counterI, counterI, #1 - bgt strmm_kernel_L2_M16_20 + bgt .Lstrmm_kernel_L2_M16_20 //------------------------------------------------------------------------------ -strmm_kernel_L2_M8_BEGIN: +.Lstrmm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #8 - ble strmm_kernel_L2_M4_BEGIN + ble .Lstrmm_kernel_L2_M4_BEGIN -strmm_kernel_L2_M8_20: +.Lstrmm_kernel_L2_M8_20: INIT8x2 @@ -1729,10 +1729,10 @@ strmm_kernel_L2_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M8_40 + ble .Lstrmm_kernel_L2_M8_40 .align 5 -strmm_kernel_L2_M8_22: +.Lstrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1744,22 +1744,22 @@ strmm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M8_22 + bgt .Lstrmm_kernel_L2_M8_22 -strmm_kernel_L2_M8_40: +.Lstrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M8_100 + ble .Lstrmm_kernel_L2_M8_100 -strmm_kernel_L2_M8_42: +.Lstrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M8_42 + bgt .Lstrmm_kernel_L2_M8_42 -strmm_kernel_L2_M8_100: +.Lstrmm_kernel_L2_M8_100: SAVE8x2 @@ -1779,19 +1779,19 @@ strmm_kernel_L2_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L2_M8_END: +.Lstrmm_kernel_L2_M8_END: //------------------------------------------------------------------------------ 
-strmm_kernel_L2_M4_BEGIN: +.Lstrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #4 - ble strmm_kernel_L2_M2_BEGIN + ble .Lstrmm_kernel_L2_M2_BEGIN -strmm_kernel_L2_M4_20: +.Lstrmm_kernel_L2_M4_20: INIT4x2 @@ -1814,10 +1814,10 @@ strmm_kernel_L2_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M4_40 + ble .Lstrmm_kernel_L2_M4_40 .align 5 -strmm_kernel_L2_M4_22: +.Lstrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1829,22 +1829,22 @@ strmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_22 + bgt .Lstrmm_kernel_L2_M4_22 -strmm_kernel_L2_M4_40: +.Lstrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M4_100 + ble .Lstrmm_kernel_L2_M4_100 -strmm_kernel_L2_M4_42: +.Lstrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_42 + bgt .Lstrmm_kernel_L2_M4_42 -strmm_kernel_L2_M4_100: +.Lstrmm_kernel_L2_M4_100: SAVE4x2 @@ -1863,21 +1863,21 @@ strmm_kernel_L2_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L2_M4_END: +.Lstrmm_kernel_L2_M4_END: //------------------------------------------------------------------------------ -strmm_kernel_L2_M2_BEGIN: +.Lstrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L2_M1_BEGIN + ble .Lstrmm_kernel_L2_M1_BEGIN -strmm_kernel_L2_M2_20: +.Lstrmm_kernel_L2_M2_20: INIT2x2 @@ -1900,9 +1900,9 @@ strmm_kernel_L2_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M2_40 + ble .Lstrmm_kernel_L2_M2_40 -strmm_kernel_L2_M2_22: +.Lstrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1915,22 +1915,22 @@ strmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_22 + bgt .Lstrmm_kernel_L2_M2_22 -strmm_kernel_L2_M2_40: +.Lstrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M2_100 + ble .Lstrmm_kernel_L2_M2_100 -strmm_kernel_L2_M2_42: +.Lstrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_42 + bgt .Lstrmm_kernel_L2_M2_42 -strmm_kernel_L2_M2_100: +.Lstrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1949,15 +1949,15 @@ strmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -strmm_kernel_L2_M2_END: +.Lstrmm_kernel_L2_M2_END: -strmm_kernel_L2_M1_BEGIN: +.Lstrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END -strmm_kernel_L2_M1_20: +.Lstrmm_kernel_L2_M1_20: INIT1x2 @@ -1980,9 +1980,9 @@ strmm_kernel_L2_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble strmm_kernel_L2_M1_40 + ble .Lstrmm_kernel_L2_M1_40 -strmm_kernel_L2_M1_22: +.Lstrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1994,22 +1994,22 @@ strmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_22 + bgt .Lstrmm_kernel_L2_M1_22 -strmm_kernel_L2_M1_40: +.Lstrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M1_100 + ble .Lstrmm_kernel_L2_M1_100 -strmm_kernel_L2_M1_42: +.Lstrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_42 + bgt 
.Lstrmm_kernel_L2_M1_42 -strmm_kernel_L2_M1_100: +.Lstrmm_kernel_L2_M1_100: SAVE1x2 @@ -2028,7 +2028,7 @@ strmm_kernel_L2_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -strmm_kernel_L2_END: +.Lstrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -2036,11 +2036,11 @@ strmm_kernel_L2_END: /******************************************************************************/ -strmm_kernel_L1_BEGIN: +.Lstrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble strmm_kernel_L999 // done + ble .Lstrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -2051,14 +2051,14 @@ strmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -strmm_kernel_L1_M16_BEGIN: +.Lstrmm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble strmm_kernel_L1_M8_BEGIN + ble .Lstrmm_kernel_L1_M8_BEGIN -strmm_kernel_L1_M16_20: +.Lstrmm_kernel_L1_M16_20: INIT16x1 @@ -2082,10 +2082,10 @@ strmm_kernel_L1_M16_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M16_40 + ble .Lstrmm_kernel_L1_M16_40 .align 5 -strmm_kernel_L1_M16_22: +.Lstrmm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB @@ -2097,22 +2097,22 @@ strmm_kernel_L1_M16_22: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M16_22 + bgt .Lstrmm_kernel_L1_M16_22 -strmm_kernel_L1_M16_40: +.Lstrmm_kernel_L1_M16_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M16_100 + ble .Lstrmm_kernel_L1_M16_100 -strmm_kernel_L1_M16_42: +.Lstrmm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M16_42 + bgt .Lstrmm_kernel_L1_M16_42 -strmm_kernel_L1_M16_100: +.Lstrmm_kernel_L1_M16_100: SAVE16x1 @@ -2132,23 +2132,23 @@ strmm_kernel_L1_M16_100: add tempOffset, tempOffset, #16 #endif -strmm_kernel_L1_M16_END: +.Lstrmm_kernel_L1_M16_END: subs counterI, counterI, #1 - bgt strmm_kernel_L1_M16_20 + bgt .Lstrmm_kernel_L1_M16_20 //------------------------------------------------------------------------------ -strmm_kernel_L1_M8_BEGIN: +.Lstrmm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #8 - ble strmm_kernel_L1_M4_BEGIN + ble .Lstrmm_kernel_L1_M4_BEGIN -strmm_kernel_L1_M8_20: +.Lstrmm_kernel_L1_M8_20: INIT8x1 @@ -2172,10 +2172,10 @@ strmm_kernel_L1_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M8_40 + ble .Lstrmm_kernel_L1_M8_40 .align 5 -strmm_kernel_L1_M8_22: +.Lstrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -2187,22 +2187,22 @@ strmm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M8_22 + bgt .Lstrmm_kernel_L1_M8_22 -strmm_kernel_L1_M8_40: +.Lstrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M8_100 + ble .Lstrmm_kernel_L1_M8_100 -strmm_kernel_L1_M8_42: +.Lstrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M8_42 + bgt .Lstrmm_kernel_L1_M8_42 -strmm_kernel_L1_M8_100: +.Lstrmm_kernel_L1_M8_100: SAVE8x1 @@ -2222,19 +2222,19 @@ strmm_kernel_L1_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L1_M8_END: +.Lstrmm_kernel_L1_M8_END: //------------------------------------------------------------------------------ -strmm_kernel_L1_M4_BEGIN: +.Lstrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #4 - 
ble strmm_kernel_L1_M2_BEGIN + ble .Lstrmm_kernel_L1_M2_BEGIN -strmm_kernel_L1_M4_20: +.Lstrmm_kernel_L1_M4_20: INIT4x1 @@ -2257,10 +2257,10 @@ strmm_kernel_L1_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M4_40 + ble .Lstrmm_kernel_L1_M4_40 .align 5 -strmm_kernel_L1_M4_22: +.Lstrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2272,22 +2272,22 @@ strmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_22 + bgt .Lstrmm_kernel_L1_M4_22 -strmm_kernel_L1_M4_40: +.Lstrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M4_100 + ble .Lstrmm_kernel_L1_M4_100 -strmm_kernel_L1_M4_42: +.Lstrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_42 + bgt .Lstrmm_kernel_L1_M4_42 -strmm_kernel_L1_M4_100: +.Lstrmm_kernel_L1_M4_100: SAVE4x1 @@ -2306,20 +2306,20 @@ strmm_kernel_L1_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L1_M4_END: +.Lstrmm_kernel_L1_M4_END: //------------------------------------------------------------------------------ -strmm_kernel_L1_M2_BEGIN: +.Lstrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L1_M1_BEGIN + ble .Lstrmm_kernel_L1_M1_BEGIN -strmm_kernel_L1_M2_20: +.Lstrmm_kernel_L1_M2_20: INIT2x1 @@ -2342,9 +2342,9 @@ strmm_kernel_L1_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M2_40 + ble .Lstrmm_kernel_L1_M2_40 -strmm_kernel_L1_M2_22: +.Lstrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2357,22 +2357,22 @@ strmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_22 + bgt .Lstrmm_kernel_L1_M2_22 -strmm_kernel_L1_M2_40: +.Lstrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M2_100 + ble .Lstrmm_kernel_L1_M2_100 -strmm_kernel_L1_M2_42: +.Lstrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_42 + bgt .Lstrmm_kernel_L1_M2_42 -strmm_kernel_L1_M2_100: +.Lstrmm_kernel_L1_M2_100: SAVE2x1 @@ -2391,15 +2391,15 @@ strmm_kernel_L1_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -strmm_kernel_L1_M2_END: +.Lstrmm_kernel_L1_M2_END: -strmm_kernel_L1_M1_BEGIN: +.Lstrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END -strmm_kernel_L1_M1_20: +.Lstrmm_kernel_L1_M1_20: INIT1x1 @@ -2422,9 +2422,9 @@ strmm_kernel_L1_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M1_40 + ble .Lstrmm_kernel_L1_M1_40 -strmm_kernel_L1_M1_22: +.Lstrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2436,28 +2436,28 @@ strmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_22 + bgt .Lstrmm_kernel_L1_M1_22 -strmm_kernel_L1_M1_40: +.Lstrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M1_100 + ble .Lstrmm_kernel_L1_M1_100 -strmm_kernel_L1_M1_42: +.Lstrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_42 + bgt .Lstrmm_kernel_L1_M1_42 -strmm_kernel_L1_M1_100: +.Lstrmm_kernel_L1_M1_100: SAVE1x1 -strmm_kernel_L1_END: +.Lstrmm_kernel_L1_END: -strmm_kernel_L999: +.Lstrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 
16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S index eeb3e6e..5f7818c 100644 --- a/kernel/arm64/strmm_kernel_4x4.S +++ b/kernel/arm64/strmm_kernel_4x4.S @@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -strmm_kernel_begin: +.Lstrmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -539,11 +539,11 @@ strmm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble strmm_kernel_L2_BEGIN + ble .Lstrmm_kernel_L2_BEGIN /******************************************************************************/ -strmm_kernel_L4_BEGIN: +.Lstrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -strmm_kernel_L4_M4_BEGIN: +.Lstrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble strmm_kernel_L4_M2_BEGIN + ble .Lstrmm_kernel_L4_M2_BEGIN -strmm_kernel_L4_M4_20: +.Lstrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L4_M4_32 + blt .Lstrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M4_22a + ble .Lstrmm_kernel_L4_M4_22a .align 5 -strmm_kernel_L4_M4_22: +.Lstrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M4_22 + bgt .Lstrmm_kernel_L4_M4_22 -strmm_kernel_L4_M4_22a: +.Lstrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_32: +.Lstrmm_kernel_L4_M4_32: tst counterL, #1 - ble strmm_kernel_L4_M4_40 + ble .Lstrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_40: +.Lstrmm_kernel_L4_M4_40: INIT4x4 -strmm_kernel_L4_M4_44: +.Lstrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M4_100 + ble .Lstrmm_kernel_L4_M4_100 -strmm_kernel_L4_M4_46: +.Lstrmm_kernel_L4_M4_46: KERNEL4x4_SUB -strmm_kernel_L4_M4_100: +.Lstrmm_kernel_L4_M4_100: SAVE4x4 @@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L4_M4_END: +.Lstrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne strmm_kernel_L4_M4_20 + bne .Lstrmm_kernel_L4_M4_20 -strmm_kernel_L4_M2_BEGIN: +.Lstrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L4_M1_BEGIN + ble .Lstrmm_kernel_L4_M1_BEGIN -strmm_kernel_L4_M2_20: +.Lstrmm_kernel_L4_M2_20: INIT2x4 @@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M2_40 + ble .Lstrmm_kernel_L4_M2_40 -strmm_kernel_L4_M2_22: +.Lstrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_22 + bgt .Lstrmm_kernel_L4_M2_22 -strmm_kernel_L4_M2_40: +.Lstrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M2_100 + ble .Lstrmm_kernel_L4_M2_100 -strmm_kernel_L4_M2_42: +.Lstrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - 
bgt strmm_kernel_L4_M2_42 + bgt .Lstrmm_kernel_L4_M2_42 -strmm_kernel_L4_M2_100: +.Lstrmm_kernel_L4_M2_100: SAVE2x4 @@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100: #endif -strmm_kernel_L4_M2_END: +.Lstrmm_kernel_L4_M2_END: -strmm_kernel_L4_M1_BEGIN: +.Lstrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END -strmm_kernel_L4_M1_20: +.Lstrmm_kernel_L4_M1_20: INIT1x4 @@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M1_40 + ble .Lstrmm_kernel_L4_M1_40 -strmm_kernel_L4_M1_22: +.Lstrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_22 + bgt .Lstrmm_kernel_L4_M1_22 -strmm_kernel_L4_M1_40: +.Lstrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M1_100 + ble .Lstrmm_kernel_L4_M1_100 -strmm_kernel_L4_M1_42: +.Lstrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_42 + bgt .Lstrmm_kernel_L4_M1_42 -strmm_kernel_L4_M1_100: +.Lstrmm_kernel_L4_M1_100: SAVE1x4 @@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100: #endif -strmm_kernel_L4_END: +.Lstrmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) @@ -825,19 +825,19 @@ strmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt strmm_kernel_L4_BEGIN + bgt .Lstrmm_kernel_L4_BEGIN /******************************************************************************/ -strmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble strmm_kernel_L999 + ble .Lstrmm_kernel_L999 tst counterJ , #2 - ble strmm_kernel_L1_BEGIN + ble .Lstrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -strmm_kernel_L2_M4_BEGIN: +.Lstrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble strmm_kernel_L2_M2_BEGIN + ble .Lstrmm_kernel_L2_M2_BEGIN -strmm_kernel_L2_M4_20: +.Lstrmm_kernel_L2_M4_20: INIT4x2 @@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M4_40 + ble .Lstrmm_kernel_L2_M4_40 .align 5 -strmm_kernel_L2_M4_22: +.Lstrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_22 + bgt .Lstrmm_kernel_L2_M4_22 -strmm_kernel_L2_M4_40: +.Lstrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M4_100 + ble .Lstrmm_kernel_L2_M4_100 -strmm_kernel_L2_M4_42: +.Lstrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_42 + bgt .Lstrmm_kernel_L2_M4_42 -strmm_kernel_L2_M4_100: +.Lstrmm_kernel_L2_M4_100: SAVE4x2 @@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L2_M4_END: +.Lstrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt strmm_kernel_L2_M4_20 + bgt .Lstrmm_kernel_L2_M4_20 -strmm_kernel_L2_M2_BEGIN: +.Lstrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L2_M1_BEGIN + ble 
.Lstrmm_kernel_L2_M1_BEGIN -strmm_kernel_L2_M2_20: +.Lstrmm_kernel_L2_M2_20: INIT2x2 @@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M2_40 + ble .Lstrmm_kernel_L2_M2_40 -strmm_kernel_L2_M2_22: +.Lstrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_22 + bgt .Lstrmm_kernel_L2_M2_22 -strmm_kernel_L2_M2_40: +.Lstrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M2_100 + ble .Lstrmm_kernel_L2_M2_100 -strmm_kernel_L2_M2_42: +.Lstrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_42 + bgt .Lstrmm_kernel_L2_M2_42 -strmm_kernel_L2_M2_100: +.Lstrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -strmm_kernel_L2_M2_END: +.Lstrmm_kernel_L2_M2_END: -strmm_kernel_L2_M1_BEGIN: +.Lstrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END -strmm_kernel_L2_M1_20: +.Lstrmm_kernel_L2_M1_20: INIT1x2 @@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble strmm_kernel_L2_M1_40 + ble .Lstrmm_kernel_L2_M1_40 -strmm_kernel_L2_M1_22: +.Lstrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_22 + bgt .Lstrmm_kernel_L2_M1_22 -strmm_kernel_L2_M1_40: +.Lstrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M1_100 + ble .Lstrmm_kernel_L2_M1_100 -strmm_kernel_L2_M1_42: +.Lstrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_42 + bgt .Lstrmm_kernel_L2_M1_42 -strmm_kernel_L2_M1_100: +.Lstrmm_kernel_L2_M1_100: SAVE1x2 @@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -strmm_kernel_L2_END: +.Lstrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1107,11 +1107,11 @@ strmm_kernel_L2_END: /******************************************************************************/ -strmm_kernel_L1_BEGIN: +.Lstrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble strmm_kernel_L999 // done + ble .Lstrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN: mov pA, origPA // pA = A -strmm_kernel_L1_M4_BEGIN: +.Lstrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble strmm_kernel_L1_M2_BEGIN + ble .Lstrmm_kernel_L1_M2_BEGIN -strmm_kernel_L1_M4_20: +.Lstrmm_kernel_L1_M4_20: INIT4x1 @@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M4_40 + ble .Lstrmm_kernel_L1_M4_40 .align 5 -strmm_kernel_L1_M4_22: +.Lstrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_22 + bgt .Lstrmm_kernel_L1_M4_22 -strmm_kernel_L1_M4_40: +.Lstrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M4_100 + ble .Lstrmm_kernel_L1_M4_100 -strmm_kernel_L1_M4_42: +.Lstrmm_kernel_L1_M4_42: KERNEL4x1_SUB 
subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_42 + bgt .Lstrmm_kernel_L1_M4_42 -strmm_kernel_L1_M4_100: +.Lstrmm_kernel_L1_M4_100: SAVE4x1 @@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L1_M4_END: +.Lstrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt strmm_kernel_L1_M4_20 + bgt .Lstrmm_kernel_L1_M4_20 -strmm_kernel_L1_M2_BEGIN: +.Lstrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L1_M1_BEGIN + ble .Lstrmm_kernel_L1_M1_BEGIN -strmm_kernel_L1_M2_20: +.Lstrmm_kernel_L1_M2_20: INIT2x1 @@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M2_40 + ble .Lstrmm_kernel_L1_M2_40 -strmm_kernel_L1_M2_22: +.Lstrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_22 + bgt .Lstrmm_kernel_L1_M2_22 -strmm_kernel_L1_M2_40: +.Lstrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M2_100 + ble .Lstrmm_kernel_L1_M2_100 -strmm_kernel_L1_M2_42: +.Lstrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_42 + bgt .Lstrmm_kernel_L1_M2_42 -strmm_kernel_L1_M2_100: +.Lstrmm_kernel_L1_M2_100: SAVE2x1 @@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100: #endif -strmm_kernel_L1_M2_END: +.Lstrmm_kernel_L1_M2_END: -strmm_kernel_L1_M1_BEGIN: +.Lstrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END -strmm_kernel_L1_M1_20: +.Lstrmm_kernel_L1_M1_20: INIT1x1 @@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M1_40 + ble .Lstrmm_kernel_L1_M1_40 -strmm_kernel_L1_M1_22: +.Lstrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_22 + bgt .Lstrmm_kernel_L1_M1_22 -strmm_kernel_L1_M1_40: +.Lstrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M1_100 + ble .Lstrmm_kernel_L1_M1_100 -strmm_kernel_L1_M1_42: +.Lstrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_42 + bgt .Lstrmm_kernel_L1_M1_42 -strmm_kernel_L1_M1_100: +.Lstrmm_kernel_L1_M1_100: SAVE1x1 @@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100: #endif #endif -strmm_kernel_L1_END: +.Lstrmm_kernel_L1_END: #if 0 #if !defined(LEFT) @@ -1385,7 +1385,7 @@ strmm_kernel_L1_END: #endif #endif -strmm_kernel_L999: +.Lstrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S index 843f0c8..cd18e68 100644 --- a/kernel/arm64/strmm_kernel_8x8.S +++ b/kernel/arm64/strmm_kernel_8x8.S @@ -1257,7 +1257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE -strmm_kernel_begin: +.Lstrmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1288,12 +1288,12 @@ strmm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 - ble strmm_kernel_L4_BEGIN + ble .Lstrmm_kernel_L4_BEGIN /******************************************************************************/ /******************************************************************************/ -strmm_kernel_L8_BEGIN: +.Lstrmm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 @@ -1305,14 +1305,14 @@ strmm_kernel_L8_BEGIN: /******************************************************************************/ -strmm_kernel_L8_M8_BEGIN: +.Lstrmm_kernel_L8_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble strmm_kernel_L8_M4_BEGIN + ble .Lstrmm_kernel_L8_M4_BEGIN -strmm_kernel_L8_M8_20: +.Lstrmm_kernel_L8_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1333,54 +1333,54 @@ strmm_kernel_L8_M8_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L8_M8_32 + blt .Lstrmm_kernel_L8_M8_32 KERNEL8x8_I // do one in the K KERNEL8x8_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L8_M8_22a + ble .Lstrmm_kernel_L8_M8_22a .align 5 -strmm_kernel_L8_M8_22: +.Lstrmm_kernel_L8_M8_22: KERNEL8x8_M1 KERNEL8x8_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L8_M8_22 + bgt .Lstrmm_kernel_L8_M8_22 -strmm_kernel_L8_M8_22a: +.Lstrmm_kernel_L8_M8_22a: KERNEL8x8_M1 KERNEL8x8_E - b strmm_kernel_L8_M8_44 + b .Lstrmm_kernel_L8_M8_44 -strmm_kernel_L8_M8_32: +.Lstrmm_kernel_L8_M8_32: tst counterL, #1 - ble strmm_kernel_L8_M8_40 + ble .Lstrmm_kernel_L8_M8_40 KERNEL8x8_I KERNEL8x8_E - b strmm_kernel_L8_M8_44 + b .Lstrmm_kernel_L8_M8_44 -strmm_kernel_L8_M8_40: +.Lstrmm_kernel_L8_M8_40: INIT8x8 -strmm_kernel_L8_M8_44: +.Lstrmm_kernel_L8_M8_44: ands counterL , tempK, #1 - ble strmm_kernel_L8_M8_100 + ble .Lstrmm_kernel_L8_M8_100 -strmm_kernel_L8_M8_46: +.Lstrmm_kernel_L8_M8_46: KERNEL8x8_SUB -strmm_kernel_L8_M8_100: +.Lstrmm_kernel_L8_M8_100: SAVE8x8 @@ -1399,22 +1399,22 @@ strmm_kernel_L8_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L8_M8_END: +.Lstrmm_kernel_L8_M8_END: subs counterI, counterI, #1 - bne strmm_kernel_L8_M8_20 + bne .Lstrmm_kernel_L8_M8_20 /******************************************************************************/ -strmm_kernel_L8_M4_BEGIN: +.Lstrmm_kernel_L8_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L8_END + ble .Lstrmm_kernel_L8_END tst counterI, #4 - ble strmm_kernel_L8_M2_BEGIN + ble .Lstrmm_kernel_L8_M2_BEGIN -strmm_kernel_L8_M4_20: +.Lstrmm_kernel_L8_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1436,54 +1436,54 @@ strmm_kernel_L8_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? 
- blt strmm_kernel_L8_M4_32 + blt .Lstrmm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L8_M4_22a + ble .Lstrmm_kernel_L8_M4_22a .align 5 -strmm_kernel_L8_M4_22: +.Lstrmm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L8_M4_22 + bgt .Lstrmm_kernel_L8_M4_22 -strmm_kernel_L8_M4_22a: +.Lstrmm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E - b strmm_kernel_L8_M4_44 + b .Lstrmm_kernel_L8_M4_44 -strmm_kernel_L8_M4_32: +.Lstrmm_kernel_L8_M4_32: tst counterL, #1 - ble strmm_kernel_L8_M4_40 + ble .Lstrmm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E - b strmm_kernel_L8_M4_44 + b .Lstrmm_kernel_L8_M4_44 -strmm_kernel_L8_M4_40: +.Lstrmm_kernel_L8_M4_40: INIT4x8 -strmm_kernel_L8_M4_44: +.Lstrmm_kernel_L8_M4_44: ands counterL , tempK, #1 - ble strmm_kernel_L8_M4_100 + ble .Lstrmm_kernel_L8_M4_100 -strmm_kernel_L8_M4_46: +.Lstrmm_kernel_L8_M4_46: KERNEL4x8_SUB -strmm_kernel_L8_M4_100: +.Lstrmm_kernel_L8_M4_100: SAVE4x8 @@ -1503,20 +1503,20 @@ strmm_kernel_L8_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L8_M4_END: +.Lstrmm_kernel_L8_M4_END: /******************************************************************************/ -strmm_kernel_L8_M2_BEGIN: +.Lstrmm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L8_END + ble .Lstrmm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L8_M1_BEGIN + ble .Lstrmm_kernel_L8_M1_BEGIN -strmm_kernel_L8_M2_20: +.Lstrmm_kernel_L8_M2_20: INIT2x8 @@ -1540,9 +1540,9 @@ strmm_kernel_L8_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L8_M2_40 + ble .Lstrmm_kernel_L8_M2_40 -strmm_kernel_L8_M2_22: +.Lstrmm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB @@ -1555,22 +1555,22 @@ strmm_kernel_L8_M2_22: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L8_M2_22 + bgt .Lstrmm_kernel_L8_M2_22 -strmm_kernel_L8_M2_40: +.Lstrmm_kernel_L8_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L8_M2_100 + ble .Lstrmm_kernel_L8_M2_100 -strmm_kernel_L8_M2_42: +.Lstrmm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L8_M2_42 + bgt .Lstrmm_kernel_L8_M2_42 -strmm_kernel_L8_M2_100: +.Lstrmm_kernel_L8_M2_100: SAVE2x8 @@ -1590,16 +1590,16 @@ strmm_kernel_L8_M2_100: add tempOffset, tempOffset, #2 #endif -strmm_kernel_L8_M2_END: +.Lstrmm_kernel_L8_M2_END: /******************************************************************************/ -strmm_kernel_L8_M1_BEGIN: +.Lstrmm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L8_END + ble .Lstrmm_kernel_L8_END -strmm_kernel_L8_M1_20: +.Lstrmm_kernel_L8_M1_20: INIT1x8 @@ -1623,9 +1623,9 @@ strmm_kernel_L8_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L8_M1_40 + ble .Lstrmm_kernel_L8_M1_40 -strmm_kernel_L8_M1_22: +.Lstrmm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB @@ -1637,22 +1637,22 @@ strmm_kernel_L8_M1_22: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L8_M1_22 + bgt .Lstrmm_kernel_L8_M1_22 -strmm_kernel_L8_M1_40: +.Lstrmm_kernel_L8_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L8_M1_100 + ble .Lstrmm_kernel_L8_M1_100 -strmm_kernel_L8_M1_42: +.Lstrmm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L8_M1_42 + bgt .Lstrmm_kernel_L8_M1_42 -strmm_kernel_L8_M1_100: +.Lstrmm_kernel_L8_M1_100: 
 SAVE1x8

@@ -1672,7 +1672,7 @@ strmm_kernel_L8_M1_100:
 add tempOffset, tempOffset, #1
 #endif

-strmm_kernel_L8_END:
+.Lstrmm_kernel_L8_END:

 lsl temp, origK, #5 // B = B + K * 4 * 8
 add origPB, origPB, temp
@@ -1681,19 +1681,19 @@ strmm_kernel_L8_END:
 #endif

 subs counterJ, counterJ , #1 // j--
- bgt strmm_kernel_L8_BEGIN
+ bgt .Lstrmm_kernel_L8_BEGIN

/******************************************************************************/
/******************************************************************************/

-strmm_kernel_L4_BEGIN:
+.Lstrmm_kernel_L4_BEGIN:

 mov counterJ , origN
 tst counterJ , #7
- ble strmm_kernel_L999
+ ble .Lstrmm_kernel_L999

 tst counterJ , #4
- ble strmm_kernel_L2_BEGIN
+ ble .Lstrmm_kernel_L2_BEGIN

 mov pCRow0, pC // pCRow0 = pC
@@ -1707,14 +1707,14 @@ strmm_kernel_L4_BEGIN:

/******************************************************************************/

-strmm_kernel_L4_M8_BEGIN:
+.Lstrmm_kernel_L4_M8_BEGIN:

 mov counterI, origM
 asr counterI, counterI, #3 // counterI = counterI / 8
 cmp counterI, #0
- ble strmm_kernel_L4_M4_BEGIN
+ ble .Lstrmm_kernel_L4_M4_BEGIN

-strmm_kernel_L4_M8_20:
+.Lstrmm_kernel_L4_M8_20:

 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 mov pB, origPB
@@ -1736,54 +1736,54 @@ strmm_kernel_L4_M8_20:
 asr counterL , tempK, #1 // L = K / 2
 cmp counterL , #2 // is there at least 4 to do?
- blt strmm_kernel_L4_M8_32
+ blt .Lstrmm_kernel_L4_M8_32

 KERNEL8x4_I // do one in the K
 KERNEL8x4_M2 // do another in the K

 subs counterL, counterL, #2
- ble strmm_kernel_L4_M8_22a
+ ble .Lstrmm_kernel_L4_M8_22a

 .align 5

-strmm_kernel_L4_M8_22:
+.Lstrmm_kernel_L4_M8_22:

 KERNEL8x4_M1
 KERNEL8x4_M2

 subs counterL, counterL, #1
- bgt strmm_kernel_L4_M8_22
+ bgt .Lstrmm_kernel_L4_M8_22

-strmm_kernel_L4_M8_22a:
+.Lstrmm_kernel_L4_M8_22a:

 KERNEL8x4_M1
 KERNEL8x4_E

- b strmm_kernel_L4_M8_44
+ b .Lstrmm_kernel_L4_M8_44

-strmm_kernel_L4_M8_32:
+.Lstrmm_kernel_L4_M8_32:

 tst counterL, #1
- ble strmm_kernel_L4_M8_40
+ ble .Lstrmm_kernel_L4_M8_40

 KERNEL8x4_I
 KERNEL8x4_E

- b strmm_kernel_L4_M8_44
+ b .Lstrmm_kernel_L4_M8_44

-strmm_kernel_L4_M8_40:
+.Lstrmm_kernel_L4_M8_40:

 INIT8x4

-strmm_kernel_L4_M8_44:
+.Lstrmm_kernel_L4_M8_44:

 ands counterL , tempK, #1
- ble strmm_kernel_L4_M8_100
+ ble .Lstrmm_kernel_L4_M8_100

-strmm_kernel_L4_M8_46:
+.Lstrmm_kernel_L4_M8_46:

 KERNEL8x4_SUB

-strmm_kernel_L4_M8_100:
+.Lstrmm_kernel_L4_M8_100:

 SAVE8x4

@@ -1802,22 +1802,22 @@ strmm_kernel_L4_M8_100:
 #if defined(LEFT)
 add tempOffset, tempOffset, #8
 #endif

-strmm_kernel_L4_M8_END:
+.Lstrmm_kernel_L4_M8_END:

 subs counterI, counterI, #1
- bne strmm_kernel_L4_M8_20
+ bne .Lstrmm_kernel_L4_M8_20

/******************************************************************************/

-strmm_kernel_L4_M4_BEGIN:
+.Lstrmm_kernel_L4_M4_BEGIN:

 mov counterI, origM
 tst counterI , #7
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END

 tst counterI, #4
- ble strmm_kernel_L4_M2_BEGIN
+ ble .Lstrmm_kernel_L4_M2_BEGIN

-strmm_kernel_L4_M4_20:
+.Lstrmm_kernel_L4_M4_20:

 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 mov pB, origPB
@@ -1837,54 +1837,54 @@ strmm_kernel_L4_M4_20:
 #endif
 asr counterL , tempK, #1 // L = K / 2
 cmp counterL , #2 // is there at least 4 to do?
- blt strmm_kernel_L4_M4_32
+ blt .Lstrmm_kernel_L4_M4_32

 KERNEL4x4_I // do one in the K
 KERNEL4x4_M2 // do another in the K

 subs counterL, counterL, #2
- ble strmm_kernel_L4_M4_22a
+ ble .Lstrmm_kernel_L4_M4_22a

 .align 5

-strmm_kernel_L4_M4_22:
+.Lstrmm_kernel_L4_M4_22:

 KERNEL4x4_M1
 KERNEL4x4_M2

 subs counterL, counterL, #1
- bgt strmm_kernel_L4_M4_22
+ bgt .Lstrmm_kernel_L4_M4_22

-strmm_kernel_L4_M4_22a:
+.Lstrmm_kernel_L4_M4_22a:

 KERNEL4x4_M1
 KERNEL4x4_E

- b strmm_kernel_L4_M4_44
+ b .Lstrmm_kernel_L4_M4_44

-strmm_kernel_L4_M4_32:
+.Lstrmm_kernel_L4_M4_32:

 tst counterL, #1
- ble strmm_kernel_L4_M4_40
+ ble .Lstrmm_kernel_L4_M4_40

 KERNEL4x4_I
 KERNEL4x4_E

- b strmm_kernel_L4_M4_44
+ b .Lstrmm_kernel_L4_M4_44

-strmm_kernel_L4_M4_40:
+.Lstrmm_kernel_L4_M4_40:

 INIT4x4

-strmm_kernel_L4_M4_44:
+.Lstrmm_kernel_L4_M4_44:

 ands counterL , tempK, #1
- ble strmm_kernel_L4_M4_100
+ ble .Lstrmm_kernel_L4_M4_100

-strmm_kernel_L4_M4_46:
+.Lstrmm_kernel_L4_M4_46:

 KERNEL4x4_SUB

-strmm_kernel_L4_M4_100:
+.Lstrmm_kernel_L4_M4_100:

 SAVE4x4

@@ -1902,20 +1902,20 @@ strmm_kernel_L4_M4_100:
 #if defined(LEFT)
 add tempOffset, tempOffset, #4
 #endif

-strmm_kernel_L4_M4_END:
+.Lstrmm_kernel_L4_M4_END:

/******************************************************************************/

-strmm_kernel_L4_M2_BEGIN:
+.Lstrmm_kernel_L4_M2_BEGIN:

 mov counterI, origM
 tst counterI , #3
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END

 tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L4_M1_BEGIN
+ ble .Lstrmm_kernel_L4_M1_BEGIN

-strmm_kernel_L4_M2_20:
+.Lstrmm_kernel_L4_M2_20:

 INIT2x4

@@ -1938,9 +1938,9 @@ strmm_kernel_L4_M2_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble strmm_kernel_L4_M2_40
+ ble .Lstrmm_kernel_L4_M2_40

-strmm_kernel_L4_M2_22:
+.Lstrmm_kernel_L4_M2_22:

 KERNEL2x4_SUB
 KERNEL2x4_SUB
@@ -1953,22 +1953,22 @@ strmm_kernel_L4_M2_22:
 KERNEL2x4_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L4_M2_22
+ bgt .Lstrmm_kernel_L4_M2_22

-strmm_kernel_L4_M2_40:
+.Lstrmm_kernel_L4_M2_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L4_M2_100
+ ble .Lstrmm_kernel_L4_M2_100

-strmm_kernel_L4_M2_42:
+.Lstrmm_kernel_L4_M2_42:

 KERNEL2x4_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L4_M2_42
+ bgt .Lstrmm_kernel_L4_M2_42

-strmm_kernel_L4_M2_100:
+.Lstrmm_kernel_L4_M2_100:

 SAVE2x4

@@ -1987,16 +1987,16 @@ strmm_kernel_L4_M2_100:
 #if defined(LEFT)
 add tempOffset, tempOffset, #2
 #endif

-strmm_kernel_L4_M2_END:
+.Lstrmm_kernel_L4_M2_END:

/******************************************************************************/

-strmm_kernel_L4_M1_BEGIN:
+.Lstrmm_kernel_L4_M1_BEGIN:

 tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END

-strmm_kernel_L4_M1_20:
+.Lstrmm_kernel_L4_M1_20:

 INIT1x4

@@ -2019,9 +2019,9 @@ strmm_kernel_L4_M1_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble strmm_kernel_L4_M1_40
+ ble .Lstrmm_kernel_L4_M1_40

-strmm_kernel_L4_M1_22:
+.Lstrmm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 KERNEL1x4_SUB
 KERNEL1x4_SUB
@@ -2033,22 +2033,22 @@ strmm_kernel_L4_M1_22:
 KERNEL1x4_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L4_M1_22
+ bgt .Lstrmm_kernel_L4_M1_22

-strmm_kernel_L4_M1_40:
+.Lstrmm_kernel_L4_M1_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L4_M1_100
+ ble .Lstrmm_kernel_L4_M1_100

-strmm_kernel_L4_M1_42:
+.Lstrmm_kernel_L4_M1_42:

 KERNEL1x4_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L4_M1_42
+ bgt .Lstrmm_kernel_L4_M1_42
-strmm_kernel_L4_M1_100:
+.Lstrmm_kernel_L4_M1_100:

 SAVE1x4

@@ -2067,7 +2067,7 @@ strmm_kernel_L4_M1_100:
 add tempOffset, tempOffset, #1
 #endif

-strmm_kernel_L4_END:
+.Lstrmm_kernel_L4_END:
 add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
 #if !defined(LEFT)
 add tempOffset, tempOffset, #4
 #endif
@@ -2076,14 +2076,14 @@ strmm_kernel_L4_END:

/******************************************************************************/
/******************************************************************************/

-strmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction

 mov counterJ , origN
 tst counterJ , #3
- ble strmm_kernel_L999
+ ble .Lstrmm_kernel_L999

 tst counterJ , #2
- ble strmm_kernel_L1_BEGIN
+ ble .Lstrmm_kernel_L1_BEGIN

 mov pCRow0, pC // pCRow0 = pC
@@ -2096,14 +2096,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction

/******************************************************************************/

-strmm_kernel_L2_M8_BEGIN:
+.Lstrmm_kernel_L2_M8_BEGIN:

 mov counterI, origM
 asr counterI, counterI, #3 // counterI = counterI / 8
 cmp counterI,#0
- ble strmm_kernel_L2_M4_BEGIN
+ ble .Lstrmm_kernel_L2_M4_BEGIN

-strmm_kernel_L2_M8_20:
+.Lstrmm_kernel_L2_M8_20:

 INIT8x2

@@ -2126,10 +2126,10 @@ strmm_kernel_L2_M8_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble strmm_kernel_L2_M8_40
+ ble .Lstrmm_kernel_L2_M8_40

 .align 5
-strmm_kernel_L2_M8_22:
+.Lstrmm_kernel_L2_M8_22:
 KERNEL8x2_SUB
 KERNEL8x2_SUB
 KERNEL8x2_SUB
@@ -2141,22 +2141,22 @@ strmm_kernel_L2_M8_22:
 KERNEL8x2_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L2_M8_22
+ bgt .Lstrmm_kernel_L2_M8_22

-strmm_kernel_L2_M8_40:
+.Lstrmm_kernel_L2_M8_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M8_100
+ ble .Lstrmm_kernel_L2_M8_100

-strmm_kernel_L2_M8_42:
+.Lstrmm_kernel_L2_M8_42:

 KERNEL8x2_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L2_M8_42
+ bgt .Lstrmm_kernel_L2_M8_42

-strmm_kernel_L2_M8_100:
+.Lstrmm_kernel_L2_M8_100:

 SAVE8x2

@@ -2175,23 +2175,23 @@ strmm_kernel_L2_M8_100:
 #if defined(LEFT)
 add tempOffset, tempOffset, #8
 #endif

-strmm_kernel_L2_M8_END:
+.Lstrmm_kernel_L2_M8_END:

 subs counterI, counterI, #1
- bgt strmm_kernel_L2_M8_20
+ bgt .Lstrmm_kernel_L2_M8_20

/******************************************************************************/

-strmm_kernel_L2_M4_BEGIN:
+.Lstrmm_kernel_L2_M4_BEGIN:

 mov counterI, origM
 tst counterI , #7
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END

 tst counterI, #4
- ble strmm_kernel_L2_M2_BEGIN
+ ble .Lstrmm_kernel_L2_M2_BEGIN

-strmm_kernel_L2_M4_20:
+.Lstrmm_kernel_L2_M4_20:

 INIT4x2

@@ -2214,10 +2214,10 @@ strmm_kernel_L2_M4_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble strmm_kernel_L2_M4_40
+ ble .Lstrmm_kernel_L2_M4_40

 .align 5
-strmm_kernel_L2_M4_22:
+.Lstrmm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 KERNEL4x2_SUB
 KERNEL4x2_SUB
@@ -2229,22 +2229,22 @@ strmm_kernel_L2_M4_22:
 KERNEL4x2_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L2_M4_22
+ bgt .Lstrmm_kernel_L2_M4_22

-strmm_kernel_L2_M4_40:
+.Lstrmm_kernel_L2_M4_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M4_100
+ ble .Lstrmm_kernel_L2_M4_100

-strmm_kernel_L2_M4_42:
+.Lstrmm_kernel_L2_M4_42:

 KERNEL4x2_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L2_M4_42
+ bgt .Lstrmm_kernel_L2_M4_42

-strmm_kernel_L2_M4_100:
+.Lstrmm_kernel_L2_M4_100:

 SAVE4x2

@@ -2263,20 +2263,20 @@ strmm_kernel_L2_M4_100:
 #if defined(LEFT)
 add tempOffset, tempOffset, #4
 #endif

-strmm_kernel_L2_M4_END:
+.Lstrmm_kernel_L2_M4_END:

/******************************************************************************/

-strmm_kernel_L2_M2_BEGIN:
+.Lstrmm_kernel_L2_M2_BEGIN:

 mov counterI, origM
 tst counterI , #3
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END

 tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L2_M1_BEGIN
+ ble .Lstrmm_kernel_L2_M1_BEGIN

-strmm_kernel_L2_M2_20:
+.Lstrmm_kernel_L2_M2_20:

 INIT2x2

@@ -2299,9 +2299,9 @@ strmm_kernel_L2_M2_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble strmm_kernel_L2_M2_40
+ ble .Lstrmm_kernel_L2_M2_40

-strmm_kernel_L2_M2_22:
+.Lstrmm_kernel_L2_M2_22:

 KERNEL2x2_SUB
 KERNEL2x2_SUB
@@ -2314,22 +2314,22 @@ strmm_kernel_L2_M2_22:
 KERNEL2x2_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L2_M2_22
+ bgt .Lstrmm_kernel_L2_M2_22

-strmm_kernel_L2_M2_40:
+.Lstrmm_kernel_L2_M2_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M2_100
+ ble .Lstrmm_kernel_L2_M2_100

-strmm_kernel_L2_M2_42:
+.Lstrmm_kernel_L2_M2_42:

 KERNEL2x2_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L2_M2_42
+ bgt .Lstrmm_kernel_L2_M2_42

-strmm_kernel_L2_M2_100:
+.Lstrmm_kernel_L2_M2_100:

 SAVE2x2

 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -2348,16 +2348,16 @@ strmm_kernel_L2_M2_100:
 add tempOffset, tempOffset, #2
 #endif

-strmm_kernel_L2_M2_END:
+.Lstrmm_kernel_L2_M2_END:

/******************************************************************************/

-strmm_kernel_L2_M1_BEGIN:
+.Lstrmm_kernel_L2_M1_BEGIN:

 tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END

-strmm_kernel_L2_M1_20:
+.Lstrmm_kernel_L2_M1_20:

 INIT1x2

@@ -2380,9 +2380,9 @@ strmm_kernel_L2_M1_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL, #0
- ble strmm_kernel_L2_M1_40
+ ble .Lstrmm_kernel_L2_M1_40

-strmm_kernel_L2_M1_22:
+.Lstrmm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 KERNEL1x2_SUB
 KERNEL1x2_SUB
@@ -2394,22 +2394,22 @@ strmm_kernel_L2_M1_22:
 KERNEL1x2_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L2_M1_22
+ bgt .Lstrmm_kernel_L2_M1_22

-strmm_kernel_L2_M1_40:
+.Lstrmm_kernel_L2_M1_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M1_100
+ ble .Lstrmm_kernel_L2_M1_100

-strmm_kernel_L2_M1_42:
+.Lstrmm_kernel_L2_M1_42:

 KERNEL1x2_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L2_M1_42
+ bgt .Lstrmm_kernel_L2_M1_42

-strmm_kernel_L2_M1_100:
+.Lstrmm_kernel_L2_M1_100:

 SAVE1x2

@@ -2428,7 +2428,7 @@ strmm_kernel_L2_M1_100:
 #if defined(LEFT)
 add tempOffset, tempOffset, #1
 #endif

-strmm_kernel_L2_END:
+.Lstrmm_kernel_L2_END:
 #if !defined(LEFT)
 add tempOffset, tempOffset, #2
 #endif
@@ -2437,11 +2437,11 @@ strmm_kernel_L2_END:

/******************************************************************************/
/******************************************************************************/

-strmm_kernel_L1_BEGIN:
+.Lstrmm_kernel_L1_BEGIN:

 mov counterJ , origN
 tst counterJ , #1
- ble strmm_kernel_L999 // done
+ ble .Lstrmm_kernel_L999 // done

 mov pCRow0, pC // pCRow0 = C
@@ -2454,14 +2454,14 @@ strmm_kernel_L1_BEGIN:

/******************************************************************************/

-strmm_kernel_L1_M8_BEGIN:
+.Lstrmm_kernel_L1_M8_BEGIN:

 mov counterI, origM
 asr counterI, counterI, #3
 cmp counterI, #0
- ble strmm_kernel_L1_M4_BEGIN
+ ble .Lstrmm_kernel_L1_M4_BEGIN

-strmm_kernel_L1_M8_20:
+.Lstrmm_kernel_L1_M8_20:

 INIT8x1

@@ -2484,10 +2484,10 @@ strmm_kernel_L1_M8_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble strmm_kernel_L1_M8_40
+ ble .Lstrmm_kernel_L1_M8_40

 .align 5
-strmm_kernel_L1_M8_22:
+.Lstrmm_kernel_L1_M8_22:
 KERNEL8x1_SUB
 KERNEL8x1_SUB
 KERNEL8x1_SUB
@@ -2499,22 +2499,22 @@ strmm_kernel_L1_M8_22:
 KERNEL8x1_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L1_M8_22
+ bgt .Lstrmm_kernel_L1_M8_22

-strmm_kernel_L1_M8_40:
+.Lstrmm_kernel_L1_M8_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M8_100
+ ble .Lstrmm_kernel_L1_M8_100

-strmm_kernel_L1_M8_42:
+.Lstrmm_kernel_L1_M8_42:

 KERNEL8x1_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L1_M8_42
+ bgt .Lstrmm_kernel_L1_M8_42

-strmm_kernel_L1_M8_100:
+.Lstrmm_kernel_L1_M8_100:

 SAVE8x1

@@ -2533,23 +2533,23 @@ strmm_kernel_L1_M8_100:
 #if defined(LEFT)
 add tempOffset, tempOffset, #8
 #endif

-strmm_kernel_L1_M8_END:
+.Lstrmm_kernel_L1_M8_END:

 subs counterI, counterI, #1
- bgt strmm_kernel_L1_M8_20
+ bgt .Lstrmm_kernel_L1_M8_20

/******************************************************************************/

-strmm_kernel_L1_M4_BEGIN:
+.Lstrmm_kernel_L1_M4_BEGIN:

 mov counterI, origM
 tst counterI , #7
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END

 tst counterI, #4
- ble strmm_kernel_L1_M2_BEGIN
+ ble .Lstrmm_kernel_L1_M2_BEGIN

-strmm_kernel_L1_M4_20:
+.Lstrmm_kernel_L1_M4_20:

 INIT4x1

@@ -2572,10 +2572,10 @@ strmm_kernel_L1_M4_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble strmm_kernel_L1_M4_40
+ ble .Lstrmm_kernel_L1_M4_40

 .align 5
-strmm_kernel_L1_M4_22:
+.Lstrmm_kernel_L1_M4_22:
 KERNEL4x1_SUB
 KERNEL4x1_SUB
 KERNEL4x1_SUB
@@ -2587,22 +2587,22 @@ strmm_kernel_L1_M4_22:
 KERNEL4x1_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L1_M4_22
+ bgt .Lstrmm_kernel_L1_M4_22

-strmm_kernel_L1_M4_40:
+.Lstrmm_kernel_L1_M4_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M4_100
+ ble .Lstrmm_kernel_L1_M4_100

-strmm_kernel_L1_M4_42:
+.Lstrmm_kernel_L1_M4_42:

 KERNEL4x1_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L1_M4_42
+ bgt .Lstrmm_kernel_L1_M4_42

-strmm_kernel_L1_M4_100:
+.Lstrmm_kernel_L1_M4_100:

 SAVE4x1

@@ -2621,20 +2621,20 @@ strmm_kernel_L1_M4_100:
 #if defined(LEFT)
 add tempOffset, tempOffset, #4
 #endif

-strmm_kernel_L1_M4_END:
+.Lstrmm_kernel_L1_M4_END:

/******************************************************************************/

-strmm_kernel_L1_M2_BEGIN:
+.Lstrmm_kernel_L1_M2_BEGIN:

 mov counterI, origM
 tst counterI , #3
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END

 tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L1_M1_BEGIN
+ ble .Lstrmm_kernel_L1_M1_BEGIN

-strmm_kernel_L1_M2_20:
+.Lstrmm_kernel_L1_M2_20:

 INIT2x1

@@ -2657,9 +2657,9 @@ strmm_kernel_L1_M2_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble strmm_kernel_L1_M2_40
+ ble .Lstrmm_kernel_L1_M2_40

-strmm_kernel_L1_M2_22:
+.Lstrmm_kernel_L1_M2_22:

 KERNEL2x1_SUB
 KERNEL2x1_SUB
@@ -2672,22 +2672,22 @@ strmm_kernel_L1_M2_22:
 KERNEL2x1_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L1_M2_22
+ bgt .Lstrmm_kernel_L1_M2_22

-strmm_kernel_L1_M2_40:
+.Lstrmm_kernel_L1_M2_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M2_100
+ ble .Lstrmm_kernel_L1_M2_100

-strmm_kernel_L1_M2_42:
+.Lstrmm_kernel_L1_M2_42:

 KERNEL2x1_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L1_M2_42
+ bgt .Lstrmm_kernel_L1_M2_42

-strmm_kernel_L1_M2_100:
+.Lstrmm_kernel_L1_M2_100:

 SAVE2x1

@@ -2706,16 +2706,16 @@ strmm_kernel_L1_M2_100:
 #if defined(LEFT)
 add tempOffset, tempOffset, #2
 #endif

-strmm_kernel_L1_M2_END:
+.Lstrmm_kernel_L1_M2_END:

/******************************************************************************/

-strmm_kernel_L1_M1_BEGIN:
+.Lstrmm_kernel_L1_M1_BEGIN:

 tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END

-strmm_kernel_L1_M1_20:
+.Lstrmm_kernel_L1_M1_20:

 INIT1x1

@@ -2738,9 +2738,9 @@ strmm_kernel_L1_M1_20:
 #endif
 asr counterL , tempK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble strmm_kernel_L1_M1_40
+ ble .Lstrmm_kernel_L1_M1_40

-strmm_kernel_L1_M1_22:
+.Lstrmm_kernel_L1_M1_22:
 KERNEL1x1_SUB
 KERNEL1x1_SUB
 KERNEL1x1_SUB
@@ -2752,30 +2752,30 @@ strmm_kernel_L1_M1_22:
 KERNEL1x1_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L1_M1_22
+ bgt .Lstrmm_kernel_L1_M1_22

-strmm_kernel_L1_M1_40:
+.Lstrmm_kernel_L1_M1_40:

 ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M1_100
+ ble .Lstrmm_kernel_L1_M1_100

-strmm_kernel_L1_M1_42:
+.Lstrmm_kernel_L1_M1_42:

 KERNEL1x1_SUB

 subs counterL, counterL, #1
- bgt strmm_kernel_L1_M1_42
+ bgt .Lstrmm_kernel_L1_M1_42

-strmm_kernel_L1_M1_100:
+.Lstrmm_kernel_L1_M1_100:

 SAVE1x1

-strmm_kernel_L1_END:
+.Lstrmm_kernel_L1_END:

/******************************************************************************/

-strmm_kernel_L999:
+.Lstrmm_kernel_L999:
 mov x0, #0 // set return value
 ldp d8, d9, [sp, #(0 * 16)]
 ldp d10, d11, [sp, #(1 * 16)]
diff --git a/kernel/arm64/swap.S b/kernel/arm64/swap.S
index 37ed83f..184e02e 100644
--- a/kernel/arm64/swap.S
+++ b/kernel/arm64/swap.S
@@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 PROLOGUE

 cmp N, xzr
- ble swap_kernel_L999
+ ble .Lswap_kernel_L999

 cmp INC_X, #1
- bne swap_kernel_S_BEGIN
+ bne .Lswap_kernel_S_BEGIN

 cmp INC_Y, #1
- bne swap_kernel_S_BEGIN
+ bne .Lswap_kernel_S_BEGIN

-swap_kernel_F_BEGIN:
+.Lswap_kernel_F_BEGIN:

 asr I, N, #3
 cmp I, xzr
- beq swap_kernel_F1
+ beq .Lswap_kernel_F1

-swap_kernel_F8:
+.Lswap_kernel_F8:

 KERNEL_F8

 subs I, I, #1
- bne swap_kernel_F8
+ bne .Lswap_kernel_F8

-swap_kernel_F1:
+.Lswap_kernel_F1:

 ands I, N, #7
- ble swap_kernel_L999
+ ble .Lswap_kernel_L999

-swap_kernel_F10:
+.Lswap_kernel_F10:

 KERNEL_F1

 subs I, I, #1
- bne swap_kernel_F10
+ bne .Lswap_kernel_F10

- b swap_kernel_L999
+ b .Lswap_kernel_L999

-swap_kernel_S_BEGIN:
+.Lswap_kernel_S_BEGIN:

 INIT_S

 asr I, N, #2
 cmp I, xzr
- ble swap_kernel_S1
+ ble .Lswap_kernel_S1

-swap_kernel_S4:
+.Lswap_kernel_S4:

 KERNEL_S1
 KERNEL_S1
@@ -244,21 +244,21 @@ swap_kernel_S4:
 KERNEL_S1

 subs I, I, #1
- bne swap_kernel_S4
+ bne .Lswap_kernel_S4

-swap_kernel_S1:
+.Lswap_kernel_S1:

 ands I, N, #3
- ble swap_kernel_L999
+ ble .Lswap_kernel_L999

-swap_kernel_S10:
+.Lswap_kernel_S10:

 KERNEL_S1

 subs I, I, #1
- bne swap_kernel_S10
+ bne .Lswap_kernel_S10

-swap_kernel_L999:
+.Lswap_kernel_L999:

 mov w0, wzr
 ret

diff --git a/kernel/arm64/zamax.S b/kernel/arm64/zamax.S
index 7db339f..c2c0a53 100644
--- a/kernel/arm64/zamax.S
+++ b/kernel/arm64/zamax.S
@@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE

 cmp N, xzr
- ble amax_kernel_zero
+ ble .Lzamax_kernel_zero

 cmp INC_X, xzr
- ble amax_kernel_zero
+ ble .Lzamax_kernel_zero

 cmp INC_X, #1
- bne amax_kernel_S_BEGIN
+ bne .Lzamax_kernel_S_BEGIN

-amax_kernel_F_BEGIN:
+.Lzamax_kernel_F_BEGIN:

 asr I, N, #2
 cmp I, xzr
- beq amax_kernel_F1_INIT
+ beq .Lzamax_kernel_F1_INIT

 INIT_F4
 subs I, I, #1
- beq amax_kernel_F1
+ beq .Lzamax_kernel_F1

-amax_kernel_F4:
+.Lzamax_kernel_F4:

 KERNEL_F4

 subs I, I, #1
- bne amax_kernel_F4
+ bne .Lzamax_kernel_F4

-amax_kernel_F1:
+.Lzamax_kernel_F1:

 ands I, N, #3
- ble amax_kernel_L999
+ ble .Lzamax_kernel_L999

-amax_kernel_F10:
+.Lzamax_kernel_F10:

 KERNEL_F1

 subs I, I, #1
- bne amax_kernel_F10
+ bne .Lzamax_kernel_F10

 ret

-amax_kernel_F1_INIT:
+.Lzamax_kernel_F1_INIT:

 INIT_F1
 subs N, N, #1
- b amax_kernel_F1
+ b .Lzamax_kernel_F1

-amax_kernel_S_BEGIN:
+.Lzamax_kernel_S_BEGIN:

 INIT_S

 subs N, N, #1
- ble amax_kernel_L999
+ ble .Lzamax_kernel_L999

 asr I, N, #2
 cmp I, xzr
- ble amax_kernel_S1
+ ble .Lzamax_kernel_S1

-amax_kernel_S4:
+.Lzamax_kernel_S4:

 KERNEL_S1
 KERNEL_S1
@@ -247,25 +247,25 @@ amax_kernel_S4:
 KERNEL_S1

 subs I, I, #1
- bne amax_kernel_S4
+ bne .Lzamax_kernel_S4

-amax_kernel_S1:
+.Lzamax_kernel_S1:

 ands I, N, #3
- ble amax_kernel_L999
+ ble .Lzamax_kernel_L999

-amax_kernel_S10:
+.Lzamax_kernel_S10:

 KERNEL_S1

 subs I, I, #1
- bne amax_kernel_S10
+ bne .Lzamax_kernel_S10

-amax_kernel_L999:
+.Lzamax_kernel_L999:
 ret

-amax_kernel_zero:
+.Lzamax_kernel_zero:

 fmov MAXF, REG0
 ret

diff --git a/kernel/arm64/zasum.S b/kernel/arm64/zasum.S
index bf586d3..0d5ec95 100644
--- a/kernel/arm64/zasum.S
+++ b/kernel/arm64/zasum.S
@@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 fmov SUMF, REG0

 cmp N, xzr
- ble asum_kernel_L999
+ ble .Lzasum_kernel_L999

 cmp INC_X, xzr
- ble asum_kernel_L999
+ ble .Lzasum_kernel_L999

 cmp INC_X, #1
- bne asum_kernel_S_BEGIN
+ bne .Lzasum_kernel_S_BEGIN

-asum_kernel_F_BEGIN:
+.Lzasum_kernel_F_BEGIN:

 asr I, N, #2
 cmp I, xzr
- beq asum_kernel_F1
+ beq .Lzasum_kernel_F1

-asum_kernel_F4:
+.Lzasum_kernel_F4:

 KERNEL_F4

 subs I, I, #1
- bne asum_kernel_F4
+ bne .Lzasum_kernel_F4

 KERNEL_F4_FINALIZE

-asum_kernel_F1:
+.Lzasum_kernel_F1:

 ands I, N, #3
- ble asum_kernel_L999
+ ble .Lzasum_kernel_L999

-asum_kernel_F10:
+.Lzasum_kernel_F10:

 KERNEL_F1

 subs I, I, #1
- bne asum_kernel_F10
+ bne .Lzasum_kernel_F10

-asum_kernel_L999:
+.Lzasum_kernel_L999:
 ret

-asum_kernel_S_BEGIN:
+.Lzasum_kernel_S_BEGIN:

 INIT_S

 asr I, N, #2
 cmp I, xzr
- ble asum_kernel_S1
+ ble .Lzasum_kernel_S1

-asum_kernel_S4:
+.Lzasum_kernel_S4:

 KERNEL_S1
 KERNEL_S1
@@ -145,19 +145,19 @@ asum_kernel_S4:
 KERNEL_S1

 subs I, I, #1
- bne asum_kernel_S4
+ bne .Lzasum_kernel_S4

-asum_kernel_S1:
+.Lzasum_kernel_S1:

 ands I, N, #3
- ble asum_kernel_L999
+ ble .Lzasum_kernel_L999

-asum_kernel_S10:
+.Lzasum_kernel_S10:

 KERNEL_S1

 subs I, I, #1
- bne asum_kernel_S10
+ bne .Lzasum_kernel_S10

 ret

diff --git a/kernel/arm64/zaxpy.S b/kernel/arm64/zaxpy.S
index 70c2499..46d7b04 100644
--- a/kernel/arm64/zaxpy.S
+++ b/kernel/arm64/zaxpy.S
@@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 PROLOGUE

 cmp N, xzr
- ble zaxpy_kernel_L999
+ ble .Lzaxpy_kernel_L999

 mov Y_COPY, Y

 fcmp DA_R, #0.0
 bne .L1
 fcmp DA_I, #0.0
- beq zaxpy_kernel_L999
+ beq .Lzaxpy_kernel_L999

.L1:
 INIT

 cmp INC_X, #1
- bne zaxpy_kernel_S_BEGIN
+ bne .Lzaxpy_kernel_S_BEGIN

 cmp INC_Y, #1
- bne zaxpy_kernel_S_BEGIN
+ bne .Lzaxpy_kernel_S_BEGIN

-zaxpy_kernel_F_BEGIN:
+.Lzaxpy_kernel_F_BEGIN:

 asr I, N, #2
 cmp I, xzr
- beq zaxpy_kernel_F1
+ beq .Lzaxpy_kernel_F1

 KERNEL_INIT_F4

-zaxpy_kernel_F4:
+.Lzaxpy_kernel_F4:

 KERNEL_F4

 subs I, I, #1
- bne zaxpy_kernel_F4
+ bne .Lzaxpy_kernel_F4

-zaxpy_kernel_F1:
+.Lzaxpy_kernel_F1:

 ands I, N, #3
- ble zaxpy_kernel_L999
+ ble .Lzaxpy_kernel_L999

-zaxpy_kernel_F10:
+.Lzaxpy_kernel_F10:

 KERNEL_F1

 subs I, I, #1
- bne zaxpy_kernel_F10
+ bne .Lzaxpy_kernel_F10

 mov w0, wzr
 ret

-zaxpy_kernel_S_BEGIN:
+.Lzaxpy_kernel_S_BEGIN:

 INIT_S

 asr I, N, #2
 cmp I, xzr
- ble zaxpy_kernel_S1
+ ble .Lzaxpy_kernel_S1

-zaxpy_kernel_S4:
+.Lzaxpy_kernel_S4:

 KERNEL_S1
 KERNEL_S1
@@ -304,21 +304,21 @@ zaxpy_kernel_S4:
 KERNEL_S1

 subs I, I, #1
- bne zaxpy_kernel_S4
+ bne .Lzaxpy_kernel_S4

-zaxpy_kernel_S1:
+.Lzaxpy_kernel_S1:

 ands I, N, #3
- ble zaxpy_kernel_L999
+ ble .Lzaxpy_kernel_L999

-zaxpy_kernel_S10:
+.Lzaxpy_kernel_S10:

 KERNEL_S1

 subs I, I, #1
- bne zaxpy_kernel_S10
+ bne .Lzaxpy_kernel_S10

-zaxpy_kernel_L999:
+.Lzaxpy_kernel_L999:

 mov w0, wzr
 ret

diff --git a/kernel/arm64/zdot.S b/kernel/arm64/zdot.S
index 3e8e3d7..044ace3 100644
--- a/kernel/arm64/zdot.S
+++ b/kernel/arm64/zdot.S
@@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

 cmp N, xzr
- ble dot_kernel_L999
+ ble .Lzdot_kernel_L999

 cmp INC_X, #1
- bne dot_kernel_S_BEGIN
+ bne .Lzdot_kernel_S_BEGIN

 cmp INC_Y, #1
- bne dot_kernel_S_BEGIN
+ bne .Lzdot_kernel_S_BEGIN

-dot_kernel_F_BEGIN:
+.Lzdot_kernel_F_BEGIN:

 asr I, N, #2
 cmp I, xzr
- beq dot_kernel_F1
+ beq .Lzdot_kernel_F1

-dot_kernel_F4:
+.Lzdot_kernel_F4:

 KERNEL_F4

 subs I, I, #1
- bne dot_kernel_F4
+ bne .Lzdot_kernel_F4

 KERNEL_F4_FINALIZE

-dot_kernel_F1:
+.Lzdot_kernel_F1:

 ands I, N, #3
- ble dot_kernel_L999
+ ble .Lzdot_kernel_L999

-dot_kernel_F10:
+.Lzdot_kernel_F10:

 KERNEL_F1

 subs I, I, #1
- bne dot_kernel_F10
+ bne .Lzdot_kernel_F10

 ret

-dot_kernel_S_BEGIN:
+.Lzdot_kernel_S_BEGIN:

 INIT_S

 asr I, N, #2
 cmp I, xzr
- ble dot_kernel_S1
+ ble .Lzdot_kernel_S1

-dot_kernel_S4:
+.Lzdot_kernel_S4:

 KERNEL_S1
 KERNEL_S1
@@ -281,21 +281,21 @@ dot_kernel_S4:
 KERNEL_S1

 subs I, I, #1
- bne dot_kernel_S4
+ bne .Lzdot_kernel_S4

-dot_kernel_S1:
+.Lzdot_kernel_S1:

 ands I, N, #3
- ble dot_kernel_L999
+ ble .Lzdot_kernel_L999

-dot_kernel_S10:
+.Lzdot_kernel_S10:

 KERNEL_S1

 subs I, I, #1
- bne dot_kernel_S10
+ bne .Lzdot_kernel_S10

-dot_kernel_L999:
+.Lzdot_kernel_L999:

 ret

diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S
index 08a1531..f8e877f 100644
--- a/kernel/arm64/zgemm_kernel_4x4.S
+++ b/kernel/arm64/zgemm_kernel_4x4.S
@@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 mov counterJ, origN
 asr counterJ, counterJ, #2 // J = J / 4
 cmp counterJ, #0
- ble zgemm_kernel_L2_BEGIN
+ ble .Lzgemm_kernel_L2_BEGIN

-zgemm_kernel_L4_BEGIN:
+.Lzgemm_kernel_L4_BEGIN:
 mov pCRow0, pC
 add pCRow1, pCRow0, LDC
 add pCRow2, pCRow1, LDC
@@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN:

 mov pA, origPA // pA = start of A array

-zgemm_kernel_L4_M4_BEGIN:
+.Lzgemm_kernel_L4_M4_BEGIN:

 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI, #0
- ble zgemm_kernel_L4_M2_BEGIN
+ ble .Lzgemm_kernel_L4_M2_BEGIN

 .align 5
-zgemm_kernel_L4_M4_20:
+.Lzgemm_kernel_L4_M4_20:

 mov pB, origPB
 asr counterL , origK, #3
 cmp counterL , #2
- blt zgemm_kernel_L4_M4_32
+ blt .Lzgemm_kernel_L4_M4_32

 KERNEL4x4_I
 KERNEL4x4_M2
@@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20:
 KERNEL4x4_M2

 subs counterL, counterL, #2 // subtract 2
- ble zgemm_kernel_L4_M4_22a
+ ble .Lzgemm_kernel_L4_M4_22a

 .align 5
-zgemm_kernel_L4_M4_22:
+.Lzgemm_kernel_L4_M4_22:

 KERNEL4x4_M1
 KERNEL4x4_M2
@@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22:
 KERNEL4x4_M2

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M4_22
+ bgt .Lzgemm_kernel_L4_M4_22

 .align 5
-zgemm_kernel_L4_M4_22a:
+.Lzgemm_kernel_L4_M4_22a:

 KERNEL4x4_M1
 KERNEL4x4_M2
@@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a:
 KERNEL4x4_M1
 KERNEL4x4_E

- b zgemm_kernel_L4_M4_44
+ b .Lzgemm_kernel_L4_M4_44

 .align 5
-zgemm_kernel_L4_M4_32:
+.Lzgemm_kernel_L4_M4_32:

 tst counterL, #1
- ble zgemm_kernel_L4_M4_40
+ ble .Lzgemm_kernel_L4_M4_40

 KERNEL4x4_I
 KERNEL4x4_M2
@@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32:
 KERNEL4x4_M1
 KERNEL4x4_E

- b zgemm_kernel_L4_M4_44
+ b .Lzgemm_kernel_L4_M4_44

-zgemm_kernel_L4_M4_40:
+.Lzgemm_kernel_L4_M4_40:

 INIT4x4

-zgemm_kernel_L4_M4_44:
+.Lzgemm_kernel_L4_M4_44:

 ands counterL , origK, #7
- ble zgemm_kernel_L4_M4_100
+ ble .Lzgemm_kernel_L4_M4_100

 .align 5
-zgemm_kernel_L4_M4_46:
+.Lzgemm_kernel_L4_M4_46:

 KERNEL4x4_SUB

 subs counterL, counterL, #1
- bne zgemm_kernel_L4_M4_46
+ bne .Lzgemm_kernel_L4_M4_46

-zgemm_kernel_L4_M4_100:
+.Lzgemm_kernel_L4_M4_100:

 prfm PLDL1KEEP, [pA]
 prfm PLDL1KEEP, [pA, #64]
 prfm PLDL1KEEP, [origPB]

 SAVE4x4

-zgemm_kernel_L4_M4_END:
+.Lzgemm_kernel_L4_M4_END:

 subs counterI, counterI, #1
- bne zgemm_kernel_L4_M4_20
+ bne .Lzgemm_kernel_L4_M4_20

-zgemm_kernel_L4_M2_BEGIN:
+.Lzgemm_kernel_L4_M2_BEGIN:

 mov counterI, origM
 tst counterI , #3
- ble zgemm_kernel_L4_END
+ ble .Lzgemm_kernel_L4_END

 tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L4_M1_BEGIN
+ ble .Lzgemm_kernel_L4_M1_BEGIN

-zgemm_kernel_L4_M2_20:
+.Lzgemm_kernel_L4_M2_20:

 INIT2x4

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L4_M2_40
+ ble .Lzgemm_kernel_L4_M2_40

-zgemm_kernel_L4_M2_22:
+.Lzgemm_kernel_L4_M2_22:

 KERNEL2x4_SUB
 KERNEL2x4_SUB
@@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22:
 KERNEL2x4_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M2_22
+ bgt .Lzgemm_kernel_L4_M2_22

-zgemm_kernel_L4_M2_40:
+.Lzgemm_kernel_L4_M2_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L4_M2_100
+ ble .Lzgemm_kernel_L4_M2_100

-zgemm_kernel_L4_M2_42:
+.Lzgemm_kernel_L4_M2_42:

 KERNEL2x4_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M2_42
+ bgt .Lzgemm_kernel_L4_M2_42

-zgemm_kernel_L4_M2_100:
+.Lzgemm_kernel_L4_M2_100:

 SAVE2x4

-zgemm_kernel_L4_M2_END:
+.Lzgemm_kernel_L4_M2_END:

-zgemm_kernel_L4_M1_BEGIN:
+.Lzgemm_kernel_L4_M1_BEGIN:

 tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L4_END
+ ble .Lzgemm_kernel_L4_END

-zgemm_kernel_L4_M1_20:
+.Lzgemm_kernel_L4_M1_20:

 INIT1x4

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L4_M1_40
+ ble .Lzgemm_kernel_L4_M1_40

-zgemm_kernel_L4_M1_22:
+.Lzgemm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 KERNEL1x4_SUB
 KERNEL1x4_SUB
@@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22:
 KERNEL1x4_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M1_22
+ bgt .Lzgemm_kernel_L4_M1_22

-zgemm_kernel_L4_M1_40:
+.Lzgemm_kernel_L4_M1_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L4_M1_100
+ ble .Lzgemm_kernel_L4_M1_100

-zgemm_kernel_L4_M1_42:
+.Lzgemm_kernel_L4_M1_42:

 KERNEL1x4_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M1_42
+ bgt .Lzgemm_kernel_L4_M1_42

-zgemm_kernel_L4_M1_100:
+.Lzgemm_kernel_L4_M1_100:

 SAVE1x4

-zgemm_kernel_L4_END:
+.Lzgemm_kernel_L4_END:

 lsl temp, origK, #6
 add origPB, origPB, temp // B = B + K * 4 * 8 * 2

 subs counterJ, counterJ , #1 // j--
- bgt zgemm_kernel_L4_BEGIN
+ bgt .Lzgemm_kernel_L4_BEGIN

/******************************************************************************/

-zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction

 mov counterJ , origN
 tst counterJ , #3
- ble zgemm_kernel_L999
+ ble .Lzgemm_kernel_L999

 tst counterJ , #2
- ble zgemm_kernel_L1_BEGIN
+ ble .Lzgemm_kernel_L1_BEGIN

 mov pCRow0, pC // pCRow0 = pC
@@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction

-zgemm_kernel_L2_M4_BEGIN:
+.Lzgemm_kernel_L2_M4_BEGIN:

 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI,#0
- ble zgemm_kernel_L2_M2_BEGIN
+ ble .Lzgemm_kernel_L2_M2_BEGIN

-zgemm_kernel_L2_M4_20:
+.Lzgemm_kernel_L2_M4_20:

 INIT4x2

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble zgemm_kernel_L2_M4_40
+ ble .Lzgemm_kernel_L2_M4_40

 .align 5
-zgemm_kernel_L2_M4_22:
+.Lzgemm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 KERNEL4x2_SUB
 KERNEL4x2_SUB
@@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22:
 KERNEL4x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M4_22
+ bgt .Lzgemm_kernel_L2_M4_22

-zgemm_kernel_L2_M4_40:
+.Lzgemm_kernel_L2_M4_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M4_100
+ ble .Lzgemm_kernel_L2_M4_100

-zgemm_kernel_L2_M4_42:
+.Lzgemm_kernel_L2_M4_42:

 KERNEL4x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M4_42
+ bgt .Lzgemm_kernel_L2_M4_42

-zgemm_kernel_L2_M4_100:
+.Lzgemm_kernel_L2_M4_100:

 SAVE4x2

-zgemm_kernel_L2_M4_END:
+.Lzgemm_kernel_L2_M4_END:

 subs counterI, counterI, #1
- bgt zgemm_kernel_L2_M4_20
+ bgt .Lzgemm_kernel_L2_M4_20

-zgemm_kernel_L2_M2_BEGIN:
+.Lzgemm_kernel_L2_M2_BEGIN:

 mov counterI, origM
 tst counterI , #3
- ble zgemm_kernel_L2_END
+ ble .Lzgemm_kernel_L2_END

 tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L2_M1_BEGIN
+ ble .Lzgemm_kernel_L2_M1_BEGIN

-zgemm_kernel_L2_M2_20:
+.Lzgemm_kernel_L2_M2_20:

 INIT2x2

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble zgemm_kernel_L2_M2_40
+ ble .Lzgemm_kernel_L2_M2_40

-zgemm_kernel_L2_M2_22:
+.Lzgemm_kernel_L2_M2_22:

 KERNEL2x2_SUB
 KERNEL2x2_SUB
@@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22:
 KERNEL2x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M2_22
+ bgt .Lzgemm_kernel_L2_M2_22

-zgemm_kernel_L2_M2_40:
+.Lzgemm_kernel_L2_M2_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M2_100
+ ble .Lzgemm_kernel_L2_M2_100

-zgemm_kernel_L2_M2_42:
+.Lzgemm_kernel_L2_M2_42:

 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M2_42
+ bgt .Lzgemm_kernel_L2_M2_42

-zgemm_kernel_L2_M2_100:
+.Lzgemm_kernel_L2_M2_100:

 SAVE2x2

-zgemm_kernel_L2_M2_END:
+.Lzgemm_kernel_L2_M2_END:

-zgemm_kernel_L2_M1_BEGIN:
+.Lzgemm_kernel_L2_M1_BEGIN:

 tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L2_END
+ ble .Lzgemm_kernel_L2_END

-zgemm_kernel_L2_M1_20:
+.Lzgemm_kernel_L2_M1_20:

 INIT1x2

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL, #0
- ble zgemm_kernel_L2_M1_40
+ ble .Lzgemm_kernel_L2_M1_40

-zgemm_kernel_L2_M1_22:
+.Lzgemm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 KERNEL1x2_SUB
 KERNEL1x2_SUB
@@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22:
 KERNEL1x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M1_22
+ bgt .Lzgemm_kernel_L2_M1_22

-zgemm_kernel_L2_M1_40:
+.Lzgemm_kernel_L2_M1_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M1_100
+ ble .Lzgemm_kernel_L2_M1_100

-zgemm_kernel_L2_M1_42:
+.Lzgemm_kernel_L2_M1_42:

 KERNEL1x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M1_42
+ bgt .Lzgemm_kernel_L2_M1_42

-zgemm_kernel_L2_M1_100:
+.Lzgemm_kernel_L2_M1_100:

 SAVE1x2

-zgemm_kernel_L2_END:
+.Lzgemm_kernel_L2_END:

 lsl temp, origK, #5
 add origPB, origPB, temp // B = B + K * 2 * 8 * 2

/******************************************************************************/

-zgemm_kernel_L1_BEGIN:
+.Lzgemm_kernel_L1_BEGIN:

 mov counterJ , origN
 tst counterJ , #1
- ble zgemm_kernel_L999 // done
+ ble .Lzgemm_kernel_L999 // done

 mov pCRow0, pC // pCRow0 = C
@@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN:

-zgemm_kernel_L1_M4_BEGIN:
+.Lzgemm_kernel_L1_M4_BEGIN:

 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI, #0
- ble zgemm_kernel_L1_M2_BEGIN
+ ble .Lzgemm_kernel_L1_M2_BEGIN

-zgemm_kernel_L1_M4_20:
+.Lzgemm_kernel_L1_M4_20:

 INIT4x1

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L1_M4_40
+ ble .Lzgemm_kernel_L1_M4_40

 .align 5
-zgemm_kernel_L1_M4_22:
+.Lzgemm_kernel_L1_M4_22:
 KERNEL4x1_SUB
 KERNEL4x1_SUB
 KERNEL4x1_SUB
@@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22:
 KERNEL4x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M4_22
+ bgt .Lzgemm_kernel_L1_M4_22

-zgemm_kernel_L1_M4_40:
+.Lzgemm_kernel_L1_M4_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M4_100
+ ble .Lzgemm_kernel_L1_M4_100

-zgemm_kernel_L1_M4_42:
+.Lzgemm_kernel_L1_M4_42:

 KERNEL4x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M4_42
+ bgt .Lzgemm_kernel_L1_M4_42

-zgemm_kernel_L1_M4_100:
+.Lzgemm_kernel_L1_M4_100:

 SAVE4x1

-zgemm_kernel_L1_M4_END:
+.Lzgemm_kernel_L1_M4_END:

 subs counterI, counterI, #1
- bgt zgemm_kernel_L1_M4_20
+ bgt .Lzgemm_kernel_L1_M4_20

-zgemm_kernel_L1_M2_BEGIN:
+.Lzgemm_kernel_L1_M2_BEGIN:

 mov counterI, origM
 tst counterI , #3
- ble zgemm_kernel_L1_END
+ ble .Lzgemm_kernel_L1_END

 tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L1_M1_BEGIN
+ ble .Lzgemm_kernel_L1_M1_BEGIN

-zgemm_kernel_L1_M2_20:
+.Lzgemm_kernel_L1_M2_20:

 INIT2x1

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L1_M2_40
+ ble .Lzgemm_kernel_L1_M2_40

-zgemm_kernel_L1_M2_22:
+.Lzgemm_kernel_L1_M2_22:

 KERNEL2x1_SUB
 KERNEL2x1_SUB
@@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22:
 KERNEL2x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M2_22
+ bgt .Lzgemm_kernel_L1_M2_22

-zgemm_kernel_L1_M2_40:
+.Lzgemm_kernel_L1_M2_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M2_100
+ ble .Lzgemm_kernel_L1_M2_100

-zgemm_kernel_L1_M2_42:
+.Lzgemm_kernel_L1_M2_42:

 KERNEL2x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M2_42
+ bgt .Lzgemm_kernel_L1_M2_42

-zgemm_kernel_L1_M2_100:
+.Lzgemm_kernel_L1_M2_100:

 SAVE2x1

-zgemm_kernel_L1_M2_END:
+.Lzgemm_kernel_L1_M2_END:

-zgemm_kernel_L1_M1_BEGIN:
+.Lzgemm_kernel_L1_M1_BEGIN:

 tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L1_END
+ ble .Lzgemm_kernel_L1_END

-zgemm_kernel_L1_M1_20:
+.Lzgemm_kernel_L1_M1_20:

 INIT1x1

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L1_M1_40
+ ble .Lzgemm_kernel_L1_M1_40

-zgemm_kernel_L1_M1_22:
+.Lzgemm_kernel_L1_M1_22:
 KERNEL1x1_SUB
 KERNEL1x1_SUB
 KERNEL1x1_SUB
@@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22:
 KERNEL1x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M1_22
+ bgt .Lzgemm_kernel_L1_M1_22

-zgemm_kernel_L1_M1_40:
+.Lzgemm_kernel_L1_M1_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M1_100
+ ble .Lzgemm_kernel_L1_M1_100

-zgemm_kernel_L1_M1_42:
+.Lzgemm_kernel_L1_M1_42:

 KERNEL1x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M1_42
+ bgt .Lzgemm_kernel_L1_M1_42

-zgemm_kernel_L1_M1_100:
+.Lzgemm_kernel_L1_M1_100:

 SAVE1x1

-zgemm_kernel_L1_END:
+.Lzgemm_kernel_L1_END:

-zgemm_kernel_L999:
+.Lzgemm_kernel_L999:
 mov x0, #0 // set return value
 ldp d8, d9, [sp, #(0 * 16)]
 ldp d10, d11, [sp, #(1 * 16)]
diff --git a/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S b/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
index e5b4cba..8e6ff65 100644
--- a/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
+++ b/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
@@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 mov counterJ, origN
 asr counterJ, counterJ, #2 // J = J / 4
 cmp counterJ, #0
- ble zgemm_kernel_L2_BEGIN
+ ble .Lzgemm_kernel_L2_BEGIN

-zgemm_kernel_L4_BEGIN:
+.Lzgemm_kernel_L4_BEGIN:
 mov pCRow0, pC
 add pCRow1, pCRow0, LDC
 add pCRow2, pCRow1, LDC
@@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN:

 mov pA, origPA // pA = start of A array

-zgemm_kernel_L4_M4_BEGIN:
+.Lzgemm_kernel_L4_M4_BEGIN:

 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI, #0
- ble zgemm_kernel_L4_M2_BEGIN
+ ble .Lzgemm_kernel_L4_M2_BEGIN

 .align 5
-zgemm_kernel_L4_M4_20:
+.Lzgemm_kernel_L4_M4_20:

 mov pB, origPB
 asr counterL , origK, #3
 cmp counterL , #2
- blt zgemm_kernel_L4_M4_32
+ blt .Lzgemm_kernel_L4_M4_32

 KERNEL4x4_I
 KERNEL4x4_M2
@@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20:
 KERNEL4x4_M2

 subs counterL, counterL, #2 // subtract 2
- ble zgemm_kernel_L4_M4_22a
+ ble .Lzgemm_kernel_L4_M4_22a

 .align 5
-zgemm_kernel_L4_M4_22:
+.Lzgemm_kernel_L4_M4_22:

 KERNEL4x4_M1
 KERNEL4x4_M2
@@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22:
 KERNEL4x4_M2

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M4_22
+ bgt .Lzgemm_kernel_L4_M4_22

 .align 5
-zgemm_kernel_L4_M4_22a:
+.Lzgemm_kernel_L4_M4_22a:

 KERNEL4x4_M1
 KERNEL4x4_M2
@@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a:
 KERNEL4x4_M1
 KERNEL4x4_E

- b zgemm_kernel_L4_M4_44
+ b .Lzgemm_kernel_L4_M4_44

 .align 5
-zgemm_kernel_L4_M4_32:
+.Lzgemm_kernel_L4_M4_32:

 tst counterL, #1
- ble zgemm_kernel_L4_M4_40
+ ble .Lzgemm_kernel_L4_M4_40

 KERNEL4x4_I
 KERNEL4x4_M2
@@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32:
 KERNEL4x4_M1
 KERNEL4x4_E

- b zgemm_kernel_L4_M4_44
+ b .Lzgemm_kernel_L4_M4_44

-zgemm_kernel_L4_M4_40:
+.Lzgemm_kernel_L4_M4_40:

 INIT4x4

-zgemm_kernel_L4_M4_44:
+.Lzgemm_kernel_L4_M4_44:

 ands counterL , origK, #7
- ble zgemm_kernel_L4_M4_100
+ ble .Lzgemm_kernel_L4_M4_100

 .align 5
-zgemm_kernel_L4_M4_46:
+.Lzgemm_kernel_L4_M4_46:

 KERNEL4x4_SUB

 subs counterL, counterL, #1
- bne zgemm_kernel_L4_M4_46
+ bne .Lzgemm_kernel_L4_M4_46

-zgemm_kernel_L4_M4_100:
+.Lzgemm_kernel_L4_M4_100:

 prfm PLDL1KEEP, [pA]
 prfm PLDL1KEEP, [pA, #64]
 prfm PLDL1KEEP, [origPB]

 SAVE4x4

-zgemm_kernel_L4_M4_END:
+.Lzgemm_kernel_L4_M4_END:

 subs counterI, counterI, #1
- bne zgemm_kernel_L4_M4_20
+ bne .Lzgemm_kernel_L4_M4_20

-zgemm_kernel_L4_M2_BEGIN:
+.Lzgemm_kernel_L4_M2_BEGIN:

 mov counterI, origM
 tst counterI , #3
- ble zgemm_kernel_L4_END
+ ble .Lzgemm_kernel_L4_END

 tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L4_M1_BEGIN
+ ble .Lzgemm_kernel_L4_M1_BEGIN

-zgemm_kernel_L4_M2_20:
+.Lzgemm_kernel_L4_M2_20:

 INIT2x4

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L4_M2_40
+ ble .Lzgemm_kernel_L4_M2_40

-zgemm_kernel_L4_M2_22:
+.Lzgemm_kernel_L4_M2_22:

 KERNEL2x4_SUB
 KERNEL2x4_SUB
@@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22:
 KERNEL2x4_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M2_22
+ bgt .Lzgemm_kernel_L4_M2_22

-zgemm_kernel_L4_M2_40:
+.Lzgemm_kernel_L4_M2_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L4_M2_100
+ ble .Lzgemm_kernel_L4_M2_100

-zgemm_kernel_L4_M2_42:
+.Lzgemm_kernel_L4_M2_42:

 KERNEL2x4_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M2_42
+ bgt .Lzgemm_kernel_L4_M2_42

-zgemm_kernel_L4_M2_100:
+.Lzgemm_kernel_L4_M2_100:

 SAVE2x4

-zgemm_kernel_L4_M2_END:
+.Lzgemm_kernel_L4_M2_END:

-zgemm_kernel_L4_M1_BEGIN:
+.Lzgemm_kernel_L4_M1_BEGIN:

 tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L4_END
+ ble .Lzgemm_kernel_L4_END

-zgemm_kernel_L4_M1_20:
+.Lzgemm_kernel_L4_M1_20:

 INIT1x4

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L4_M1_40
+ ble .Lzgemm_kernel_L4_M1_40

-zgemm_kernel_L4_M1_22:
+.Lzgemm_kernel_L4_M1_22:
 KERNEL1x4_SUB
 KERNEL1x4_SUB
 KERNEL1x4_SUB
@@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22:
 KERNEL1x4_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M1_22
+ bgt .Lzgemm_kernel_L4_M1_22

-zgemm_kernel_L4_M1_40:
+.Lzgemm_kernel_L4_M1_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L4_M1_100
+ ble .Lzgemm_kernel_L4_M1_100

-zgemm_kernel_L4_M1_42:
+.Lzgemm_kernel_L4_M1_42:

 KERNEL1x4_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M1_42
+ bgt .Lzgemm_kernel_L4_M1_42

-zgemm_kernel_L4_M1_100:
+.Lzgemm_kernel_L4_M1_100:

 SAVE1x4

-zgemm_kernel_L4_END:
+.Lzgemm_kernel_L4_END:

 lsl temp, origK, #6
 add origPB, origPB, temp // B = B + K * 4 * 8 * 2

 subs counterJ, counterJ , #1 // j--
- bgt zgemm_kernel_L4_BEGIN
+ bgt .Lzgemm_kernel_L4_BEGIN

/******************************************************************************/

-zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction

 mov counterJ , origN
 tst counterJ , #3
- ble zgemm_kernel_L999
+ ble .Lzgemm_kernel_L999

 tst counterJ , #2
- ble zgemm_kernel_L1_BEGIN
+ ble .Lzgemm_kernel_L1_BEGIN

 mov pCRow0, pC // pCRow0 = pC
@@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction

-zgemm_kernel_L2_M4_BEGIN:
+.Lzgemm_kernel_L2_M4_BEGIN:

 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI,#0
- ble zgemm_kernel_L2_M2_BEGIN
+ ble .Lzgemm_kernel_L2_M2_BEGIN

-zgemm_kernel_L2_M4_20:
+.Lzgemm_kernel_L2_M4_20:

 INIT4x2

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble zgemm_kernel_L2_M4_40
+ ble .Lzgemm_kernel_L2_M4_40

 .align 5
-zgemm_kernel_L2_M4_22:
+.Lzgemm_kernel_L2_M4_22:
 KERNEL4x2_SUB
 KERNEL4x2_SUB
 KERNEL4x2_SUB
@@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22:
 KERNEL4x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M4_22
+ bgt .Lzgemm_kernel_L2_M4_22

-zgemm_kernel_L2_M4_40:
+.Lzgemm_kernel_L2_M4_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M4_100
+ ble .Lzgemm_kernel_L2_M4_100

-zgemm_kernel_L2_M4_42:
+.Lzgemm_kernel_L2_M4_42:

 KERNEL4x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M4_42
+ bgt .Lzgemm_kernel_L2_M4_42

-zgemm_kernel_L2_M4_100:
+.Lzgemm_kernel_L2_M4_100:

 SAVE4x2

-zgemm_kernel_L2_M4_END:
+.Lzgemm_kernel_L2_M4_END:

 subs counterI, counterI, #1
- bgt zgemm_kernel_L2_M4_20
+ bgt .Lzgemm_kernel_L2_M4_20

-zgemm_kernel_L2_M2_BEGIN:
+.Lzgemm_kernel_L2_M2_BEGIN:

 mov counterI, origM
 tst counterI , #3
- ble zgemm_kernel_L2_END
+ ble .Lzgemm_kernel_L2_END

 tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L2_M1_BEGIN
+ ble .Lzgemm_kernel_L2_M1_BEGIN

-zgemm_kernel_L2_M2_20:
+.Lzgemm_kernel_L2_M2_20:

 INIT2x2

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL,#0
- ble zgemm_kernel_L2_M2_40
+ ble .Lzgemm_kernel_L2_M2_40

-zgemm_kernel_L2_M2_22:
+.Lzgemm_kernel_L2_M2_22:

 KERNEL2x2_SUB
 KERNEL2x2_SUB
@@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22:
 KERNEL2x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M2_22
+ bgt .Lzgemm_kernel_L2_M2_22

-zgemm_kernel_L2_M2_40:
+.Lzgemm_kernel_L2_M2_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M2_100
+ ble .Lzgemm_kernel_L2_M2_100

-zgemm_kernel_L2_M2_42:
+.Lzgemm_kernel_L2_M2_42:

 KERNEL2x2_SUB
 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M2_42
+ bgt .Lzgemm_kernel_L2_M2_42

-zgemm_kernel_L2_M2_100:
+.Lzgemm_kernel_L2_M2_100:

 SAVE2x2

-zgemm_kernel_L2_M2_END:
+.Lzgemm_kernel_L2_M2_END:

-zgemm_kernel_L2_M1_BEGIN:
+.Lzgemm_kernel_L2_M1_BEGIN:

 tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L2_END
+ ble .Lzgemm_kernel_L2_END

-zgemm_kernel_L2_M1_20:
+.Lzgemm_kernel_L2_M1_20:

 INIT1x2

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL, #0
- ble zgemm_kernel_L2_M1_40
+ ble .Lzgemm_kernel_L2_M1_40

-zgemm_kernel_L2_M1_22:
+.Lzgemm_kernel_L2_M1_22:
 KERNEL1x2_SUB
 KERNEL1x2_SUB
 KERNEL1x2_SUB
@@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22:
 KERNEL1x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M1_22
+ bgt .Lzgemm_kernel_L2_M1_22

-zgemm_kernel_L2_M1_40:
+.Lzgemm_kernel_L2_M1_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M1_100
+ ble .Lzgemm_kernel_L2_M1_100

-zgemm_kernel_L2_M1_42:
+.Lzgemm_kernel_L2_M1_42:

 KERNEL1x2_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M1_42
+ bgt .Lzgemm_kernel_L2_M1_42

-zgemm_kernel_L2_M1_100:
+.Lzgemm_kernel_L2_M1_100:

 SAVE1x2

-zgemm_kernel_L2_END:
+.Lzgemm_kernel_L2_END:

 lsl temp, origK, #5
 add origPB, origPB, temp // B = B + K * 2 * 8 * 2

/******************************************************************************/

-zgemm_kernel_L1_BEGIN:
+.Lzgemm_kernel_L1_BEGIN:

 mov counterJ , origN
 tst counterJ , #1
- ble zgemm_kernel_L999 // done
+ ble .Lzgemm_kernel_L999 // done

 mov pCRow0, pC // pCRow0 = C
@@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN:

-zgemm_kernel_L1_M4_BEGIN:
+.Lzgemm_kernel_L1_M4_BEGIN:

 mov counterI, origM
 asr counterI, counterI, #2 // counterI = counterI / 4
 cmp counterI, #0
- ble zgemm_kernel_L1_M2_BEGIN
+ ble .Lzgemm_kernel_L1_M2_BEGIN

-zgemm_kernel_L1_M4_20:
+.Lzgemm_kernel_L1_M4_20:

 INIT4x1

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L1_M4_40
+ ble .Lzgemm_kernel_L1_M4_40

 .align 5
-zgemm_kernel_L1_M4_22:
+.Lzgemm_kernel_L1_M4_22:
 KERNEL4x1_SUB
 KERNEL4x1_SUB
 KERNEL4x1_SUB
@@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22:
 KERNEL4x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M4_22
+ bgt .Lzgemm_kernel_L1_M4_22

-zgemm_kernel_L1_M4_40:
+.Lzgemm_kernel_L1_M4_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M4_100
+ ble .Lzgemm_kernel_L1_M4_100

-zgemm_kernel_L1_M4_42:
+.Lzgemm_kernel_L1_M4_42:

 KERNEL4x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M4_42
+ bgt .Lzgemm_kernel_L1_M4_42

-zgemm_kernel_L1_M4_100:
+.Lzgemm_kernel_L1_M4_100:

 SAVE4x1

-zgemm_kernel_L1_M4_END:
+.Lzgemm_kernel_L1_M4_END:

 subs counterI, counterI, #1
- bgt zgemm_kernel_L1_M4_20
+ bgt .Lzgemm_kernel_L1_M4_20

-zgemm_kernel_L1_M2_BEGIN:
+.Lzgemm_kernel_L1_M2_BEGIN:

 mov counterI, origM
 tst counterI , #3
- ble zgemm_kernel_L1_END
+ ble .Lzgemm_kernel_L1_END

 tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L1_M1_BEGIN
+ ble .Lzgemm_kernel_L1_M1_BEGIN

-zgemm_kernel_L1_M2_20:
+.Lzgemm_kernel_L1_M2_20:

 INIT2x1

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L1_M2_40
+ ble .Lzgemm_kernel_L1_M2_40

-zgemm_kernel_L1_M2_22:
+.Lzgemm_kernel_L1_M2_22:

 KERNEL2x1_SUB
 KERNEL2x1_SUB
@@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22:
 KERNEL2x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M2_22
+ bgt .Lzgemm_kernel_L1_M2_22

-zgemm_kernel_L1_M2_40:
+.Lzgemm_kernel_L1_M2_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M2_100
+ ble .Lzgemm_kernel_L1_M2_100

-zgemm_kernel_L1_M2_42:
+.Lzgemm_kernel_L1_M2_42:

 KERNEL2x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M2_42
+ bgt .Lzgemm_kernel_L1_M2_42

-zgemm_kernel_L1_M2_100:
+.Lzgemm_kernel_L1_M2_100:

 SAVE2x1

-zgemm_kernel_L1_M2_END:
+.Lzgemm_kernel_L1_M2_END:

-zgemm_kernel_L1_M1_BEGIN:
+.Lzgemm_kernel_L1_M1_BEGIN:

 tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L1_END
+ ble .Lzgemm_kernel_L1_END

-zgemm_kernel_L1_M1_20:
+.Lzgemm_kernel_L1_M1_20:

 INIT1x1

 mov pB, origPB
 asr counterL , origK, #3 // counterL = counterL / 8
 cmp counterL , #0
- ble zgemm_kernel_L1_M1_40
+ ble .Lzgemm_kernel_L1_M1_40

-zgemm_kernel_L1_M1_22:
+.Lzgemm_kernel_L1_M1_22:
 KERNEL1x1_SUB
 KERNEL1x1_SUB
 KERNEL1x1_SUB
@@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22:
 KERNEL1x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M1_22
+ bgt .Lzgemm_kernel_L1_M1_22

-zgemm_kernel_L1_M1_40:
+.Lzgemm_kernel_L1_M1_40:

 ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M1_100
+ ble .Lzgemm_kernel_L1_M1_100

-zgemm_kernel_L1_M1_42:
+.Lzgemm_kernel_L1_M1_42:

 KERNEL1x1_SUB

 subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M1_42
+ bgt .Lzgemm_kernel_L1_M1_42

-zgemm_kernel_L1_M1_100:
+.Lzgemm_kernel_L1_M1_100:

 SAVE1x1

-zgemm_kernel_L1_END:
+.Lzgemm_kernel_L1_END:

-zgemm_kernel_L999:
+.Lzgemm_kernel_L999:
 mov x0, #0 // set return value
 ldp d8, d9, [sp, #(0 * 16)]
 ldp d10, d11, [sp, #(1 * 16)]
diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S
index a28d1b0..28afcad 100644
--- a/kernel/arm64/zgemv_n.S
+++ b/kernel/arm64/zgemv_n.S
@@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 SAVE_REGS

 cmp N, xzr
- ble zgemv_n_kernel_L999
+ ble .Lzgemv_n_kernel_L999
 cmp M, xzr
- ble zgemv_n_kernel_L999
+ ble .Lzgemv_n_kernel_L999

 lsl LDA, LDA, #SHZ
 lsl INC_X, INC_X, #SHZ
@@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 INIT

 cmp INC_Y, #1
- bne zgemv_n_kernel_S_BEGIN
+ bne .Lzgemv_n_kernel_S_BEGIN

-zgemv_n_kernel_F_LOOP:
+.Lzgemv_n_kernel_F_LOOP:
 mov A_PTR, A
 mov Y_IPTR, Y
 mov Y_OPTR, Y
@@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP:

 asr I, M, #2
 cmp I, xzr
- beq zgemv_n_kernel_F1
+ beq .Lzgemv_n_kernel_F1

-zgemv_n_kernel_F4:
+.Lzgemv_n_kernel_F4:

 KERNEL_F4

 subs I, I, #1
- bne zgemv_n_kernel_F4
+ bne .Lzgemv_n_kernel_F4

-zgemv_n_kernel_F1:
+.Lzgemv_n_kernel_F1:

 ands I, M, #3
- ble zgemv_n_kernel_F_END
+ ble .Lzgemv_n_kernel_F_END

-zgemv_n_kernel_F10:
+.Lzgemv_n_kernel_F10:

 KERNEL_F1

 subs I, I, #1
- bne zgemv_n_kernel_F10
+ bne .Lzgemv_n_kernel_F10

-zgemv_n_kernel_F_END:
+.Lzgemv_n_kernel_F_END:

 add A, A, LDA
 subs J, J, #1
- bne zgemv_n_kernel_F_LOOP
+ bne .Lzgemv_n_kernel_F_LOOP

- b zgemv_n_kernel_L999
+ b .Lzgemv_n_kernel_L999

-zgemv_n_kernel_S_BEGIN:
+.Lzgemv_n_kernel_S_BEGIN:

 INIT_S

-zgemv_n_kernel_S_LOOP:
+.Lzgemv_n_kernel_S_LOOP:
 mov A_PTR, A
 mov Y_IPTR, Y
 mov Y_OPTR, Y
@@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP:

 asr I, M, #2
 cmp I, xzr
- ble zgemv_n_kernel_S1
+ ble .Lzgemv_n_kernel_S1

-zgemv_n_kernel_S4:
+.Lzgemv_n_kernel_S4:

 KERNEL_S1
 KERNEL_S1
@@ -440,27 +440,27 @@ zgemv_n_kernel_S4:
 KERNEL_S1

 subs I, I, #1
- bne zgemv_n_kernel_S4
+ bne .Lzgemv_n_kernel_S4

-zgemv_n_kernel_S1:
+.Lzgemv_n_kernel_S1:

 ands I, M, #3
- ble zgemv_n_kernel_S_END
+ ble .Lzgemv_n_kernel_S_END

-zgemv_n_kernel_S10:
+.Lzgemv_n_kernel_S10:

 KERNEL_S1

 subs I, I, #1
- bne zgemv_n_kernel_S10
+ bne .Lzgemv_n_kernel_S10

-zgemv_n_kernel_S_END:
+.Lzgemv_n_kernel_S_END:

 add A, A, LDA
 subs J, J, #1
- bne zgemv_n_kernel_S_LOOP
+ bne .Lzgemv_n_kernel_S_LOOP

-zgemv_n_kernel_L999:
+.Lzgemv_n_kernel_L999:

 RESTORE_REGS

 mov w0, wzr
diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S
index 79ce9bc..0151029 100644
--- a/kernel/arm64/zgemv_t.S
+++ b/kernel/arm64/zgemv_t.S
@@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 SAVE_REGS

 cmp N, xzr
- ble zgemv_t_kernel_L999
+ ble .Lzgemv_t_kernel_L999
 cmp M, xzr
- ble zgemv_t_kernel_L999
+ ble .Lzgemv_t_kernel_L999

 lsl LDA, LDA, #SHZ
 lsl INC_Y, INC_Y, #SHZ
@@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 INIT

 cmp INC_X, #1
- bne zgemv_t_kernel_S_BEGIN
+ bne .Lzgemv_t_kernel_S_BEGIN

-zgemv_t_kernel_F_LOOP:
+.Lzgemv_t_kernel_F_LOOP:
 mov A_PTR, A
 mov X_PTR, X
@@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP:

 asr I, M, #2
 cmp I, xzr
- beq zgemv_t_kernel_F1
+ beq .Lzgemv_t_kernel_F1

-zgemv_t_kernel_F4:
+.Lzgemv_t_kernel_F4:

 KERNEL_F4

 subs I, I, #1
- bne zgemv_t_kernel_F4
+ bne .Lzgemv_t_kernel_F4

 KERNEL_F4_FINALIZE

-zgemv_t_kernel_F1:
+.Lzgemv_t_kernel_F1:

 ands I, M, #3
- ble zgemv_t_kernel_F_END
+ ble .Lzgemv_t_kernel_F_END

-zgemv_t_kernel_F10:
+.Lzgemv_t_kernel_F10:

 KERNEL_F1

 subs I, I, #1
- bne zgemv_t_kernel_F10
+ bne .Lzgemv_t_kernel_F10

-zgemv_t_kernel_F_END:
+.Lzgemv_t_kernel_F_END:

 #if !defined(DOUBLE)
 ld1 {v4.2s}, [Y]
@@ -355,15 +355,15 @@ zgemv_t_kernel_F_END:
 add A, A, LDA
 subs J, J, #1
- bne zgemv_t_kernel_F_LOOP
+ bne .Lzgemv_t_kernel_F_LOOP

- b zgemv_t_kernel_L999
+ b .Lzgemv_t_kernel_L999

-zgemv_t_kernel_S_BEGIN:
+.Lzgemv_t_kernel_S_BEGIN:

 INIT_S

-zgemv_t_kernel_S_LOOP:
+.Lzgemv_t_kernel_S_LOOP:
 mov A_PTR, A
 mov X_PTR, X
@@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP:

 asr I, M, #2
 cmp I, xzr
- ble zgemv_t_kernel_S1
+ ble .Lzgemv_t_kernel_S1

-zgemv_t_kernel_S4:
+.Lzgemv_t_kernel_S4:

 KERNEL_S1
 KERNEL_S1
@@ -381,21 +381,21 @@ zgemv_t_kernel_S4:
 KERNEL_S1

 subs I, I, #1
- bne zgemv_t_kernel_S4
+ bne .Lzgemv_t_kernel_S4

-zgemv_t_kernel_S1:
+.Lzgemv_t_kernel_S1:

 ands I, M, #3
- ble zgemv_t_kernel_S_END
+ ble .Lzgemv_t_kernel_S_END

-zgemv_t_kernel_S10:
+.Lzgemv_t_kernel_S10:

 KERNEL_S1

 subs I, I, #1
- bne zgemv_t_kernel_S10
+ bne .Lzgemv_t_kernel_S10

-zgemv_t_kernel_S_END:
+.Lzgemv_t_kernel_S_END:

 #if !defined(DOUBLE)
 ld1 {v4.2s}, [Y]
@@ -413,9 +413,9 @@ zgemv_t_kernel_S_END:
 add A, A, LDA
 subs J, J, #1
- bne zgemv_t_kernel_S_LOOP
+ bne .Lzgemv_t_kernel_S_LOOP

-zgemv_t_kernel_L999:
+.Lzgemv_t_kernel_L999:

 RESTORE_REGS
 mov w0, wzr
 ret
diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S
index 1360dc9..1c89685 100644
--- a/kernel/arm64/znrm2.S
+++ b/kernel/arm64/znrm2.S
@@ -226,43 +226,43 @@ KERNEL_S1_END_\@:

 INIT

 cmp N, #0
- ble nrm2_kernel_L999
+ ble .Lznrm2_kernel_L999

 cmp INC_X, #0
- beq nrm2_kernel_L999
+ beq .Lznrm2_kernel_L999

 cmp INC_X, #1
- bne nrm2_kernel_S_BEGIN
+ bne .Lznrm2_kernel_S_BEGIN

-nrm2_kernel_F_BEGIN:
+.Lznrm2_kernel_F_BEGIN:

 asr I, N, #3 // I = N / 8
 cmp I, xzr
- ble nrm2_kernel_F1
+ ble .Lznrm2_kernel_F1

-nrm2_kernel_F8:
+.Lznrm2_kernel_F8:

 KERNEL_F8

 subs I, I, #1
- bne nrm2_kernel_F8
+ bne .Lznrm2_kernel_F8

-nrm2_kernel_F1:
+.Lznrm2_kernel_F1:

 ands I, N, #7
- ble nrm2_kernel_L999
+ ble .Lznrm2_kernel_L999

-nrm2_kernel_F10:
+.Lznrm2_kernel_F10:

 KERNEL_F1

 subs I, I, #1
- bne nrm2_kernel_F10
+ bne .Lznrm2_kernel_F10

- b nrm2_kernel_L999
+ b .Lznrm2_kernel_L999

-nrm2_kernel_S_BEGIN:
+.Lznrm2_kernel_S_BEGIN:

 INIT_S

@@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN:

 .align 5

-nrm2_kernel_S10:
+.Lznrm2_kernel_S10:

 KERNEL_S1

 subs I, I, #1
- bne nrm2_kernel_S10
+ bne .Lznrm2_kernel_S10

-nrm2_kernel_L999:
+.Lznrm2_kernel_L999:

 fsqrt SSQ, SSQ
 fmul SSQ, SCALE, SSQ

diff --git a/kernel/arm64/zrot.S b/kernel/arm64/zrot.S
index 90f138a..b5e510e 100644
--- a/kernel/arm64/zrot.S
+++ b/kernel/arm64/zrot.S
@@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	PROLOGUE

 	cmp	N, xzr
-	ble	rot_kernel_L999
+	ble	.Lzrot_kernel_L999

 	INIT

 	cmp	INC_X, #1
-	bne	rot_kernel_S_BEGIN
+	bne	.Lzrot_kernel_S_BEGIN
 	cmp	INC_Y, #1
-	bne	rot_kernel_S_BEGIN
+	bne	.Lzrot_kernel_S_BEGIN

-rot_kernel_F_BEGIN:
+.Lzrot_kernel_F_BEGIN:

 	asr	I, N, #2
 	cmp	I, xzr
-	beq	rot_kernel_F1
+	beq	.Lzrot_kernel_F1

 	KERNEL_INIT_F4

-rot_kernel_F4:
+.Lzrot_kernel_F4:

 	KERNEL_F4

 	subs	I, I, #1
-	bne	rot_kernel_F4
+	bne	.Lzrot_kernel_F4

-rot_kernel_F1:
+.Lzrot_kernel_F1:

 	ands	I, N, #3
-	ble	rot_kernel_L999
+	ble	.Lzrot_kernel_L999

-rot_kernel_F10:
+.Lzrot_kernel_F10:

 	KERNEL_F1

 	subs	I, I, #1
-	bne	rot_kernel_F10
+	bne	.Lzrot_kernel_F10

 	mov	w0, wzr
 	ret

-rot_kernel_S_BEGIN:
+.Lzrot_kernel_S_BEGIN:

 	INIT_S

 	asr	I, N, #2
 	cmp	I, xzr
-	ble	rot_kernel_S1
+	ble	.Lzrot_kernel_S1

-rot_kernel_S4:
+.Lzrot_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
@@ -236,21 +236,21 @@ rot_kernel_S4:
 	KERNEL_S1

 	subs	I, I, #1
-	bne	rot_kernel_S4
+	bne	.Lzrot_kernel_S4

-rot_kernel_S1:
+.Lzrot_kernel_S1:

 	ands	I, N, #3
-	ble	rot_kernel_L999
+	ble	.Lzrot_kernel_L999

-rot_kernel_S10:
+.Lzrot_kernel_S10:

 	KERNEL_S1

 	subs	I, I, #1
-	bne	rot_kernel_S10
+	bne	.Lzrot_kernel_S10

-rot_kernel_L999:
+.Lzrot_kernel_L999:

 	mov	w0, wzr
 	ret
diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S
index daaa55e..9294559 100644
--- a/kernel/arm64/zscal.S
+++ b/kernel/arm64/zscal.S
@@ -215,71 +215,71 @@ zscal_begin:
 	mov	X_COPY, X

 	cmp	N, xzr
-	ble	zscal_kernel_L999
+	ble	.Lzscal_kernel_L999

 	fcmp	DA_R, #0.0
-	bne	zscal_kernel_R_non_zero
+	bne	.Lzscal_kernel_R_non_zero

 	fcmp	DA_I, #0.0
-	beq	zscal_kernel_RI_zero
+	beq	.Lzscal_kernel_RI_zero

-	b	zscal_kernel_R_zero
+	b	.Lzscal_kernel_R_zero

-zscal_kernel_R_non_zero:
+.Lzscal_kernel_R_non_zero:

 	fcmp	DA_I, #0.0
-	beq	zscal_kernel_I_zero
+	beq	.Lzscal_kernel_I_zero

 /*******************************************************************************
 * A_R != 0 && A_I != 0
 *******************************************************************************/

-zscal_kernel_RI_non_zero:
+.Lzscal_kernel_RI_non_zero:

 	INIT

 	cmp	INC_X, #1
-	bne	zscal_kernel_S_BEGIN
+	bne	.Lzscal_kernel_S_BEGIN

-zscal_kernel_F_BEGIN:
+.Lzscal_kernel_F_BEGIN:

 	asr	I, N, #2
 	cmp	I, xzr
-	beq	zscal_kernel_F1
+	beq	.Lzscal_kernel_F1

 	KERNEL_INIT_F4

-zscal_kernel_F4:
+.Lzscal_kernel_F4:

 	KERNEL_F4

 	subs	I, I, #1
-	bne	zscal_kernel_F4
+	bne	.Lzscal_kernel_F4

-zscal_kernel_F1:
+.Lzscal_kernel_F1:

 	ands	I, N, #3
-	ble	zscal_kernel_L999
+	ble	.Lzscal_kernel_L999

-zscal_kernel_F10:
+.Lzscal_kernel_F10:

 	KERNEL_F1

 	subs	I, I, #1
-	bne	zscal_kernel_F10
+	bne	.Lzscal_kernel_F10

 	mov	w0, wzr
 	ret

-zscal_kernel_S_BEGIN:
+.Lzscal_kernel_S_BEGIN:

 	INIT_S

 	asr	I, N, #2
 	cmp	I, xzr
-	ble	zscal_kernel_S1
+	ble	.Lzscal_kernel_S1

-zscal_kernel_S4:
+.Lzscal_kernel_S4:

 	KERNEL_S1
 	KERNEL_S1
@@ -287,21 +287,21 @@ zscal_kernel_S4:
 	KERNEL_S1

 	subs	I, I, #1
-	bne	zscal_kernel_S4
+	bne	.Lzscal_kernel_S4

-zscal_kernel_S1:
+.Lzscal_kernel_S1:

 	ands	I, N, #3
-	ble	zscal_kernel_L999
+	ble	.Lzscal_kernel_L999

-zscal_kernel_S10:
+.Lzscal_kernel_S10:

 	KERNEL_S1

 	subs	I, I, #1
-	bne	zscal_kernel_S10
+	bne	.Lzscal_kernel_S10

-zscal_kernel_L999:
+.Lzscal_kernel_L999:

 	mov	w0, wzr
 	ret

@@ -310,7 +310,7 @@ zscal_kernel_L999:
 * A_R == 0 && A_I != 0
 *******************************************************************************/

-zscal_kernel_R_zero:
+.Lzscal_kernel_R_zero:
 	INIT_S

 #if !defined(DOUBLE)
@@ -323,7 +323,7 @@ zscal_kernel_R_zero:
 	ins	v1.d[1], v2.d[0]		// v1 = -DA_I, DA_I
 #endif

-zscal_kernel_R_zero_1:
+.Lzscal_kernel_R_zero_1:
 #if !defined(DOUBLE)
 	ld1	{v2.2s}, [X]			// X1, X0
 	fmul	v2.2s, v2.2s, v1.2s		// -DA_I*X1, DA_I*X0
@@ -337,7 +337,7 @@ zscal_kernel_R_zero_1:
 #endif
 	add	X, X, INC_X
 	subs	N, N, #1
-	bne	zscal_kernel_R_zero_1
+	bne	.Lzscal_kernel_R_zero_1

 	mov	w0, wzr
 	ret

@@ -346,7 +346,7 @@ zscal_kernel_R_zero_1:
 * A_R != 0 && A_I == 0
 *******************************************************************************/

-zscal_kernel_I_zero:
+.Lzscal_kernel_I_zero:
 	INIT_S
 #if !defined(DOUBLE)
 	ins	v0.s[1], v0.s[0]		// v0 = DA_R, DA_R
@@ -354,7 +354,7 @@ zscal_kernel_I_zero:
 	ins	v0.d[1], v0.d[0]		// v0 = DA_R, DA_R
 #endif

-zscal_kernel_I_zero_1:
+.Lzscal_kernel_I_zero_1:
 #if !defined(DOUBLE)
 	ld1	{v2.2s}, [X]			// X1, X0
 	fmul	v2.2s, v2.2s, v0.2s		// DA_R*X1, DA_R*X0
@@ -366,7 +366,7 @@ zscal_kernel_I_zero_1:
 #endif
 	add	X, X, INC_X
 	subs	N, N, #1
-	bne	zscal_kernel_I_zero_1
+	bne	.Lzscal_kernel_I_zero_1

 	mov	w0, wzr
 	ret

@@ -375,16 +375,16 @@ zscal_kernel_I_zero_1:
 * A_R == 0 && A_I == 0
 *******************************************************************************/

-zscal_kernel_RI_zero:
+.Lzscal_kernel_RI_zero:

 	INIT_S

-zscal_kernel_RI_zero_1:
+.Lzscal_kernel_RI_zero_1:

 	stp	DA_R, DA_I, [X]
 	add	X, X, INC_X
 	subs	N, N, #1
-	bne	zscal_kernel_RI_zero_1
+	bne	.Lzscal_kernel_RI_zero_1

 	mov	w0, wzr
 	ret
diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S
index 77a7857..462acfe 100644
--- a/kernel/arm64/ztrmm_kernel_4x4.S
+++ b/kernel/arm64/ztrmm_kernel_4x4.S
@@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	mov	counterJ, origN
 	asr	counterJ, counterJ, #2		// J = J / 4
 	cmp	counterJ, #0
-	ble	ztrmm_kernel_L2_BEGIN
+	ble	.Lztrmm_kernel_L2_BEGIN

-ztrmm_kernel_L4_BEGIN:
+.Lztrmm_kernel_L4_BEGIN:

 	mov	pCRow0, pC
 	add	pCRow1, pCRow0, LDC
 	add	pCRow2, pCRow1, LDC
@@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN:
 #endif

 	mov	pA, origPA			// pA = start of A array

-ztrmm_kernel_L4_M4_BEGIN:
+.Lztrmm_kernel_L4_M4_BEGIN:

 	mov	counterI, origM
 	asr	counterI, counterI, #2		// counterI = counterI / 4
 	cmp	counterI, #0
-	ble	ztrmm_kernel_L4_M2_BEGIN
+	ble	.Lztrmm_kernel_L4_M2_BEGIN

 	.align 5

-ztrmm_kernel_L4_M4_20:
+.Lztrmm_kernel_L4_M4_20:

 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 	mov	pB, origPB
@@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20:

 	asr	counterL , tempK, #3
 	cmp	counterL , #2
-	blt	ztrmm_kernel_L4_M4_32
+	blt	.Lztrmm_kernel_L4_M4_32

 	KERNEL4x4_I
 	KERNEL4x4_M2
@@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20:
 	KERNEL4x4_M2

 	subs	counterL, counterL, #2
-	ble	ztrmm_kernel_L4_M4_22a
+	ble	.Lztrmm_kernel_L4_M4_22a

 	.align 5

-ztrmm_kernel_L4_M4_22:
+.Lztrmm_kernel_L4_M4_22:

 	KERNEL4x4_M1
 	KERNEL4x4_M2
@@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22:
 	KERNEL4x4_M2

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L4_M4_22
+	bgt	.Lztrmm_kernel_L4_M4_22

 	.align 5

-ztrmm_kernel_L4_M4_22a:
+.Lztrmm_kernel_L4_M4_22a:

 	KERNEL4x4_M1
 	KERNEL4x4_M2
@@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a:
 	KERNEL4x4_M1
 	KERNEL4x4_E

-	b	ztrmm_kernel_L4_M4_44
+	b	.Lztrmm_kernel_L4_M4_44

 	.align 5

-ztrmm_kernel_L4_M4_32:
+.Lztrmm_kernel_L4_M4_32:

 	tst	counterL, #1
-	ble	ztrmm_kernel_L4_M4_40
+	ble	.Lztrmm_kernel_L4_M4_40

 	KERNEL4x4_I
 	KERNEL4x4_M2
@@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32:
 	KERNEL4x4_M1
 	KERNEL4x4_E

-	b	ztrmm_kernel_L4_M4_44
+	b	.Lztrmm_kernel_L4_M4_44

-ztrmm_kernel_L4_M4_40:
+.Lztrmm_kernel_L4_M4_40:

 	INIT4x4

-ztrmm_kernel_L4_M4_44:
+.Lztrmm_kernel_L4_M4_44:

 	ands	counterL , tempK, #7
-	ble	ztrmm_kernel_L4_M4_100
+	ble	.Lztrmm_kernel_L4_M4_100

 	.align 5

-ztrmm_kernel_L4_M4_46:
+.Lztrmm_kernel_L4_M4_46:

 	KERNEL4x4_SUB

 	subs	counterL, counterL, #1
-	bne	ztrmm_kernel_L4_M4_46
+	bne	.Lztrmm_kernel_L4_M4_46

-ztrmm_kernel_L4_M4_100:
+.Lztrmm_kernel_L4_M4_100:

 	SAVE4x4
@@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100:
 	prfm	PLDL1KEEP, [pA, #64]
 	prfm	PLDL1KEEP, [origPB]

-ztrmm_kernel_L4_M4_END:
+.Lztrmm_kernel_L4_M4_END:

 	subs	counterI, counterI, #1
-	bne	ztrmm_kernel_L4_M4_20
+	bne	.Lztrmm_kernel_L4_M4_20

-ztrmm_kernel_L4_M2_BEGIN:
+.Lztrmm_kernel_L4_M2_BEGIN:

 	mov	counterI, origM
 	tst	counterI , #3
-	ble	ztrmm_kernel_L4_END
+	ble	.Lztrmm_kernel_L4_END

 	tst	counterI, #2			// counterI = counterI / 2
-	ble	ztrmm_kernel_L4_M1_BEGIN
+	ble	.Lztrmm_kernel_L4_M1_BEGIN

-ztrmm_kernel_L4_M2_20:
+.Lztrmm_kernel_L4_M2_20:

 	INIT2x4
@@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20:

 	asr	counterL , tempK, #3		// counterL = counterL / 8
 	cmp	counterL , #0
-	ble	ztrmm_kernel_L4_M2_40
+	ble	.Lztrmm_kernel_L4_M2_40

-ztrmm_kernel_L4_M2_22:
+.Lztrmm_kernel_L4_M2_22:

 	KERNEL2x4_SUB
 	KERNEL2x4_SUB
@@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22:
 	KERNEL2x4_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L4_M2_22
+	bgt	.Lztrmm_kernel_L4_M2_22

-ztrmm_kernel_L4_M2_40:
+.Lztrmm_kernel_L4_M2_40:

 	ands	counterL , tempK, #7		// counterL = counterL % 8
-	ble	ztrmm_kernel_L4_M2_100
+	ble	.Lztrmm_kernel_L4_M2_100

-ztrmm_kernel_L4_M2_42:
+.Lztrmm_kernel_L4_M2_42:

 	KERNEL2x4_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L4_M2_42
+	bgt	.Lztrmm_kernel_L4_M2_42

-ztrmm_kernel_L4_M2_100:
+.Lztrmm_kernel_L4_M2_100:

 	SAVE2x4
@@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100:
 	add	tempOffset, tempOffset, #2
 #endif

-ztrmm_kernel_L4_M2_END:
+.Lztrmm_kernel_L4_M2_END:

-ztrmm_kernel_L4_M1_BEGIN:
+.Lztrmm_kernel_L4_M1_BEGIN:

 	tst	counterI, #1			// counterI = counterI % 2
-	ble	ztrmm_kernel_L4_END
+	ble	.Lztrmm_kernel_L4_END

-ztrmm_kernel_L4_M1_20:
+.Lztrmm_kernel_L4_M1_20:

 	INIT1x4
@@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20:

 	asr	counterL , tempK, #3		// counterL = counterL / 8
 	cmp	counterL , #0
-	ble	ztrmm_kernel_L4_M1_40
+	ble	.Lztrmm_kernel_L4_M1_40

-ztrmm_kernel_L4_M1_22:
+.Lztrmm_kernel_L4_M1_22:

 	KERNEL1x4_SUB
 	KERNEL1x4_SUB
 	KERNEL1x4_SUB
@@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22:
 	KERNEL1x4_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L4_M1_22
+	bgt	.Lztrmm_kernel_L4_M1_22

-ztrmm_kernel_L4_M1_40:
+.Lztrmm_kernel_L4_M1_40:

 	ands	counterL , tempK, #7		// counterL = counterL % 8
-	ble	ztrmm_kernel_L4_M1_100
+	ble	.Lztrmm_kernel_L4_M1_100

-ztrmm_kernel_L4_M1_42:
+.Lztrmm_kernel_L4_M1_42:

 	KERNEL1x4_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L4_M1_42
+	bgt	.Lztrmm_kernel_L4_M1_42

-ztrmm_kernel_L4_M1_100:
+.Lztrmm_kernel_L4_M1_100:

 	SAVE1x4
@@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100:
 #endif

-ztrmm_kernel_L4_END:
+.Lztrmm_kernel_L4_END:

 	lsl	temp, origK, #6
 	add	origPB, origPB, temp		// B = B + K * 4 * 8 * 2
@@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END:
 #endif

 	subs	counterJ, counterJ , #1		// j--
-	bgt	ztrmm_kernel_L4_BEGIN
+	bgt	.Lztrmm_kernel_L4_BEGIN

 /******************************************************************************/

-ztrmm_kernel_L2_BEGIN:   // less than 2 left in N direction
+.Lztrmm_kernel_L2_BEGIN:   // less than 2 left in N direction

 	mov	counterJ , origN
 	tst	counterJ , #3
-	ble	ztrmm_kernel_L999   // error, N was less than 4?
+	ble	.Lztrmm_kernel_L999   // error, N was less than 4?
 	tst	counterJ , #2
-	ble	ztrmm_kernel_L1_BEGIN
+	ble	.Lztrmm_kernel_L1_BEGIN

 	mov	pCRow0, pC			// pCRow0 = pC
@@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN:   // less than 2 left in N direction

 	mov	pA, origPA			// pA = A

-ztrmm_kernel_L2_M4_BEGIN:
+.Lztrmm_kernel_L2_M4_BEGIN:

 	mov	counterI, origM
 	asr	counterI, counterI, #2		// counterI = counterI / 4
 	cmp	counterI,#0
-	ble	ztrmm_kernel_L2_M2_BEGIN
+	ble	.Lztrmm_kernel_L2_M2_BEGIN

-ztrmm_kernel_L2_M4_20:
+.Lztrmm_kernel_L2_M4_20:

 	INIT4x2
@@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20:

 	asr	counterL , tempK, #3		// counterL = counterL / 8
 	cmp	counterL,#0
-	ble	ztrmm_kernel_L2_M4_40
+	ble	.Lztrmm_kernel_L2_M4_40

 	.align 5

-ztrmm_kernel_L2_M4_22:
+.Lztrmm_kernel_L2_M4_22:

 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
 	KERNEL4x2_SUB
@@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22:
 	KERNEL4x2_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L2_M4_22
+	bgt	.Lztrmm_kernel_L2_M4_22

-ztrmm_kernel_L2_M4_40:
+.Lztrmm_kernel_L2_M4_40:

 	ands	counterL , tempK, #7		// counterL = counterL % 8
-	ble	ztrmm_kernel_L2_M4_100
+	ble	.Lztrmm_kernel_L2_M4_100

-ztrmm_kernel_L2_M4_42:
+.Lztrmm_kernel_L2_M4_42:

 	KERNEL4x2_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L2_M4_42
+	bgt	.Lztrmm_kernel_L2_M4_42

-ztrmm_kernel_L2_M4_100:
+.Lztrmm_kernel_L2_M4_100:

 	SAVE4x2
@@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100:
 	add	tempOffset, tempOffset, #4
 #endif

-ztrmm_kernel_L2_M4_END:
+.Lztrmm_kernel_L2_M4_END:

 	subs	counterI, counterI, #1
-	bgt	ztrmm_kernel_L2_M4_20
+	bgt	.Lztrmm_kernel_L2_M4_20

-ztrmm_kernel_L2_M2_BEGIN:
+.Lztrmm_kernel_L2_M2_BEGIN:

 	mov	counterI, origM
 	tst	counterI , #3
-	ble	ztrmm_kernel_L2_END
+	ble	.Lztrmm_kernel_L2_END

 	tst	counterI, #2			// counterI = counterI / 2
-	ble	ztrmm_kernel_L2_M1_BEGIN
+	ble	.Lztrmm_kernel_L2_M1_BEGIN

-ztrmm_kernel_L2_M2_20:
+.Lztrmm_kernel_L2_M2_20:

 	INIT2x2
@@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20:

 	asr	counterL , tempK, #3		// counterL = counterL / 8
 	cmp	counterL,#0
-	ble	ztrmm_kernel_L2_M2_40
+	ble	.Lztrmm_kernel_L2_M2_40

-ztrmm_kernel_L2_M2_22:
+.Lztrmm_kernel_L2_M2_22:

 	KERNEL2x2_SUB
 	KERNEL2x2_SUB
@@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22:
 	KERNEL2x2_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L2_M2_22
+	bgt	.Lztrmm_kernel_L2_M2_22

-ztrmm_kernel_L2_M2_40:
+.Lztrmm_kernel_L2_M2_40:

 	ands	counterL , tempK, #7		// counterL = counterL % 8
-	ble	ztrmm_kernel_L2_M2_100
+	ble	.Lztrmm_kernel_L2_M2_100

-ztrmm_kernel_L2_M2_42:
+.Lztrmm_kernel_L2_M2_42:

 	KERNEL2x2_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L2_M2_42
+	bgt	.Lztrmm_kernel_L2_M2_42

-ztrmm_kernel_L2_M2_100:
+.Lztrmm_kernel_L2_M2_100:

 	SAVE2x2
@@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100:
 	add	tempOffset, tempOffset, #2
 #endif

-ztrmm_kernel_L2_M2_END:
+.Lztrmm_kernel_L2_M2_END:

-ztrmm_kernel_L2_M1_BEGIN:
+.Lztrmm_kernel_L2_M1_BEGIN:

 	tst	counterI, #1			// counterI = counterI % 2
-	ble	ztrmm_kernel_L2_END
+	ble	.Lztrmm_kernel_L2_END

-ztrmm_kernel_L2_M1_20:
+.Lztrmm_kernel_L2_M1_20:

 	INIT1x2
@@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20:

 	asr	counterL , tempK, #3		// counterL = counterL / 8
 	cmp	counterL, #0
-	ble	ztrmm_kernel_L2_M1_40
+	ble	.Lztrmm_kernel_L2_M1_40

-ztrmm_kernel_L2_M1_22:
+.Lztrmm_kernel_L2_M1_22:

 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
 	KERNEL1x2_SUB
@@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22:
 	KERNEL1x2_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L2_M1_22
+	bgt	.Lztrmm_kernel_L2_M1_22

-ztrmm_kernel_L2_M1_40:
+.Lztrmm_kernel_L2_M1_40:

 	ands	counterL , tempK, #7		// counterL = counterL % 8
-	ble	ztrmm_kernel_L2_M1_100
+	ble	.Lztrmm_kernel_L2_M1_100

-ztrmm_kernel_L2_M1_42:
+.Lztrmm_kernel_L2_M1_42:

 	KERNEL1x2_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L2_M1_42
+	bgt	.Lztrmm_kernel_L2_M1_42

-ztrmm_kernel_L2_M1_100:
+.Lztrmm_kernel_L2_M1_100:

 	SAVE1x2
@@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100:
 #endif

-ztrmm_kernel_L2_END:
+.Lztrmm_kernel_L2_END:
 #if !defined(LEFT)
 	add	tempOffset, tempOffset, #2
 #endif
@@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END:

 /******************************************************************************/

-ztrmm_kernel_L1_BEGIN:
+.Lztrmm_kernel_L1_BEGIN:

 	mov	counterJ , origN
 	tst	counterJ , #1
-	ble	ztrmm_kernel_L999 // done
+	ble	.Lztrmm_kernel_L999 // done

 	mov	pCRow0, pC			// pCRow0 = C
@@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN:

-ztrmm_kernel_L1_M4_BEGIN:
+.Lztrmm_kernel_L1_M4_BEGIN:

 	mov	counterI, origM
 	asr	counterI, counterI, #2		// counterI = counterI / 4
 	cmp	counterI, #0
-	ble	ztrmm_kernel_L1_M2_BEGIN
+	ble	.Lztrmm_kernel_L1_M2_BEGIN

-ztrmm_kernel_L1_M4_20:
+.Lztrmm_kernel_L1_M4_20:

 	INIT4x1
@@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20:

 	asr	counterL , tempK, #3		// counterL = counterL / 8
 	cmp	counterL , #0
-	ble	ztrmm_kernel_L1_M4_40
+	ble	.Lztrmm_kernel_L1_M4_40

 	.align 5

-ztrmm_kernel_L1_M4_22:
+.Lztrmm_kernel_L1_M4_22:

 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
 	KERNEL4x1_SUB
@@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22:
 	KERNEL4x1_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L1_M4_22
+	bgt	.Lztrmm_kernel_L1_M4_22

-ztrmm_kernel_L1_M4_40:
+.Lztrmm_kernel_L1_M4_40:

 	ands	counterL , tempK, #7		// counterL = counterL % 8
-	ble	ztrmm_kernel_L1_M4_100
+	ble	.Lztrmm_kernel_L1_M4_100

-ztrmm_kernel_L1_M4_42:
+.Lztrmm_kernel_L1_M4_42:

 	KERNEL4x1_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L1_M4_42
+	bgt	.Lztrmm_kernel_L1_M4_42

-ztrmm_kernel_L1_M4_100:
+.Lztrmm_kernel_L1_M4_100:

 	SAVE4x1
@@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100:
 	add	tempOffset, tempOffset, #4
 #endif

-ztrmm_kernel_L1_M4_END:
+.Lztrmm_kernel_L1_M4_END:

 	subs	counterI, counterI, #1
-	bgt	ztrmm_kernel_L1_M4_20
+	bgt	.Lztrmm_kernel_L1_M4_20

-ztrmm_kernel_L1_M2_BEGIN:
+.Lztrmm_kernel_L1_M2_BEGIN:

 	mov	counterI, origM
 	tst	counterI , #3
-	ble	ztrmm_kernel_L1_END
+	ble	.Lztrmm_kernel_L1_END

 	tst	counterI, #2			// counterI = counterI / 2
-	ble	ztrmm_kernel_L1_M1_BEGIN
+	ble	.Lztrmm_kernel_L1_M1_BEGIN

-ztrmm_kernel_L1_M2_20:
+.Lztrmm_kernel_L1_M2_20:

 	INIT2x1
@@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20:

 	asr	counterL , tempK, #3		// counterL = counterL / 8
 	cmp	counterL , #0
-	ble	ztrmm_kernel_L1_M2_40
+	ble	.Lztrmm_kernel_L1_M2_40

-ztrmm_kernel_L1_M2_22:
+.Lztrmm_kernel_L1_M2_22:

 	KERNEL2x1_SUB
 	KERNEL2x1_SUB
@@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22:
 	KERNEL2x1_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L1_M2_22
+	bgt	.Lztrmm_kernel_L1_M2_22

-ztrmm_kernel_L1_M2_40:
+.Lztrmm_kernel_L1_M2_40:

 	ands	counterL , tempK, #7		// counterL = counterL % 8
-	ble	ztrmm_kernel_L1_M2_100
+	ble	.Lztrmm_kernel_L1_M2_100

-ztrmm_kernel_L1_M2_42:
+.Lztrmm_kernel_L1_M2_42:

 	KERNEL2x1_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L1_M2_42
+	bgt	.Lztrmm_kernel_L1_M2_42

-ztrmm_kernel_L1_M2_100:
+.Lztrmm_kernel_L1_M2_100:

 	SAVE2x1
@@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100:
 	add	tempOffset, tempOffset, #2
 #endif

-ztrmm_kernel_L1_M2_END:
+.Lztrmm_kernel_L1_M2_END:

-ztrmm_kernel_L1_M1_BEGIN:
+.Lztrmm_kernel_L1_M1_BEGIN:

 	tst	counterI, #1			// counterI = counterI % 2
-	ble	ztrmm_kernel_L1_END
+	ble	.Lztrmm_kernel_L1_END

-ztrmm_kernel_L1_M1_20:
+.Lztrmm_kernel_L1_M1_20:

 	INIT1x1
@@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20:

 	asr	counterL , tempK, #3		// counterL = counterL / 8
 	cmp	counterL , #0
-	ble	ztrmm_kernel_L1_M1_40
+	ble	.Lztrmm_kernel_L1_M1_40

-ztrmm_kernel_L1_M1_22:
+.Lztrmm_kernel_L1_M1_22:

 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
 	KERNEL1x1_SUB
@@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22:
 	KERNEL1x1_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L1_M1_22
+	bgt	.Lztrmm_kernel_L1_M1_22

-ztrmm_kernel_L1_M1_40:
+.Lztrmm_kernel_L1_M1_40:

 	ands	counterL , tempK, #7		// counterL = counterL % 8
-	ble	ztrmm_kernel_L1_M1_100
+	ble	.Lztrmm_kernel_L1_M1_100

-ztrmm_kernel_L1_M1_42:
+.Lztrmm_kernel_L1_M1_42:

 	KERNEL1x1_SUB

 	subs	counterL, counterL, #1
-	bgt	ztrmm_kernel_L1_M1_42
+	bgt	.Lztrmm_kernel_L1_M1_42

-ztrmm_kernel_L1_M1_100:
+.Lztrmm_kernel_L1_M1_100:

 	SAVE1x1

-ztrmm_kernel_L1_END:
+.Lztrmm_kernel_L1_END:

-ztrmm_kernel_L999:
+.Lztrmm_kernel_L999:

 	mov	x0, #0				// set return value
 	ldp	d8, d9, [sp, #(0 * 16)]
 	ldp	d10, d11, [sp, #(1 * 16)]
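---

For reference, the label convention applied throughout this patch relies on a
documented GNU assembler behavior: on ELF targets, labels beginning with ".L"
are treated as assembler-local symbols and are not emitted into the object
file's symbol table, so profiling tools attribute samples inside a loop to the
enclosing global symbol rather than to each label. A minimal standalone sketch
(the function and label names below are illustrative, not taken from OpenBLAS):

	// demo.S -- assemble with "gcc -c demo.S"; "nm demo.o" then lists
	// only demo_kernel, while the .L loop label stays out of the
	// symbol table and therefore out of profile reports.
		.text
		.globl	demo_kernel
		.type	demo_kernel, %function
	demo_kernel:
		mov	x1, #16			// loop 16 times
	.Ldemo_kernel_loop:			// local label, invisible to nm/perf
		subs	x1, x1, #1
		bne	.Ldemo_kernel_loop
		ret
		.size	demo_kernel, .-demo_kernel

A plain "demo_kernel_loop:" label in the same place would become its own
symbol-table entry, which is how the loop bodies above were previously being
reported as separate functions.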