PROLOGUE
/* amax kernel body: find max |x[i]| over N elements.
 * In:  N (count), X (ptr, implied by macros), INC_X (stride in elements)
 * Out: MAXF (result register, set by KERNEL_*/INIT_* macros defined above)
 * Resolved leftover diff markers: local labels use the GNU-as `.L` prefix
 * so they are not emitted into the symbol table. */
    cmp     N, xzr
    ble     .Lamax_kernel_zero              // N <= 0 -> return 0
    cmp     INC_X, xzr
    ble     .Lamax_kernel_zero              // non-positive stride -> return 0
    cmp     INC_X, #1
    bne     .Lamax_kernel_S_BEGIN           // strided path

.Lamax_kernel_F_BEGIN:                      // contiguous (INC_X == 1) path
    asr     I, N, #2                        // I = N / 4 (4x-unrolled iterations)
    cmp     I, xzr
    beq     .Lamax_kernel_F1_INIT           // fewer than 4 elements
    INIT_F4
    subs    I, I, #1
    beq     .Lamax_kernel_F1

.Lamax_kernel_F4:                           // main 4x-unrolled loop
    KERNEL_F4
    subs    I, I, #1
    bne     .Lamax_kernel_F4

.Lamax_kernel_F1:
    ands    I, N, #3                        // remainder: N % 4 elements
    ble     .Lamax_kernel_L999

.Lamax_kernel_F10:                          // tail loop, one element at a time
    KERNEL_F1
    subs    I, I, #1
    bne     .Lamax_kernel_F10
    ret

.Lamax_kernel_F1_INIT:                      // N in [1,3]: seed from first element
    INIT_F1
    subs    N, N, #1
    b       .Lamax_kernel_F1

.Lamax_kernel_S_BEGIN:                      // strided path (INC_X != 1)
    INIT_S
    subs    N, N, #1                        // first element consumed by INIT_S
    ble     .Lamax_kernel_L999
    asr     I, N, #2                        // I = remaining / 4
    cmp     I, xzr
    ble     .Lamax_kernel_S1

.Lamax_kernel_S4:
    // NOTE(review): counter assumes 4 elements/iteration but only 3
    // KERNEL_S1 are visible here — verify against upstream source.
    KERNEL_S1
    KERNEL_S1
    KERNEL_S1
    subs    I, I, #1
    bne     .Lamax_kernel_S4

.Lamax_kernel_S1:
    ands    I, N, #3                        // strided remainder
    ble     .Lamax_kernel_L999

.Lamax_kernel_S10:
    KERNEL_S1
    subs    I, I, #1
    bne     .Lamax_kernel_S10

.Lamax_kernel_L999:
    ret

.Lamax_kernel_zero:                         // degenerate input: return 0.0
    fmov    MAXF, REG0
    ret

#endif
cmp     N, xzr                              // asum body: sum |x[i]|; N <= 0 -> 0
    ble     .Lasum_kernel_L999
    cmp     INC_X, xzr
    ble     .Lasum_kernel_L999              // non-positive stride -> return
    cmp     INC_X, #1
    bne     .Lasum_kernel_S_BEGIN           // strided path

.Lasum_kernel_F_BEGIN:                      // contiguous path
    asr     I, N, #3                        // I = N / 8 (8x-unrolled iterations)
    cmp     I, xzr
    beq     .Lasum_kernel_F1

.Lasum_kernel_F8:                           // main 8x-unrolled loop
    KERNEL_F8
    subs    I, I, #1
    bne     .Lasum_kernel_F8
    KERNEL_F8_FINALIZE                      // horizontal reduce of vector partials

.Lasum_kernel_F1:
    ands    I, N, #7                        // remainder: N % 8 elements
    ble     .Lasum_kernel_L999

.Lasum_kernel_F10:                          // scalar tail loop
    KERNEL_F1
    subs    I, I, #1
    bne     .Lasum_kernel_F10

.Lasum_kernel_L999:
    ret

.Lasum_kernel_S_BEGIN:                      // strided path (INC_X != 1)
    INIT_S
    asr     I, N, #2                        // I = N / 4
    cmp     I, xzr
    ble     .Lasum_kernel_S1

.Lasum_kernel_S4:
    // NOTE(review): counter assumes 4 elements/iteration but only 3
    // KERNEL_S1 are visible here — verify against upstream source.
    KERNEL_S1
    KERNEL_S1
    KERNEL_S1
    subs    I, I, #1
    bne     .Lasum_kernel_S4

.Lasum_kernel_S1:
    ands    I, N, #3                        // strided remainder
    ble     .Lasum_kernel_L999

.Lasum_kernel_S10:
    KERNEL_S1
    subs    I, I, #1
    bne     .Lasum_kernel_S10
    ret
PROLOGUE
/* axpy kernel body: y[i] += DA * x[i] for N elements.
 * Returns 0 in w0. Early-out when N <= 0 or DA == 0 (no-op).
 * Resolved leftover diff markers: `.L` local-label form retained. */
    cmp     N, xzr
    ble     .Laxpy_kernel_L999              // nothing to do
    fcmp    DA, #0.0
    beq     .Laxpy_kernel_L999              // alpha == 0 -> y unchanged
    cmp     INC_X, #1
    bne     .Laxpy_kernel_S_BEGIN           // any non-unit stride -> strided path
    cmp     INC_Y, #1
    bne     .Laxpy_kernel_S_BEGIN

.Laxpy_kernel_F_BEGIN:                      // both vectors contiguous
    asr     I, N, #3                        // I = N / 8
    cmp     I, xzr
    beq     .Laxpy_kernel_F1

.Laxpy_kernel_F8:                           // main 8x-unrolled loop
    KERNEL_F8
    subs    I, I, #1
    bne     .Laxpy_kernel_F8

.Laxpy_kernel_F1:
    ands    I, N, #7                        // remainder: N % 8
    ble     .Laxpy_kernel_L999

.Laxpy_kernel_F10:                          // scalar tail loop
    KERNEL_F1
    subs    I, I, #1
    bne     .Laxpy_kernel_F10
    mov     w0, wzr                         // return 0
    ret

.Laxpy_kernel_S_BEGIN:                      // strided path
    INIT_S
    asr     I, N, #2                        // I = N / 4
    cmp     I, xzr
    ble     .Laxpy_kernel_S1

.Laxpy_kernel_S4:
    // NOTE(review): counter assumes 4 elements/iteration but only 3
    // KERNEL_S1 are visible here — verify against upstream source.
    KERNEL_S1
    KERNEL_S1
    KERNEL_S1
    subs    I, I, #1
    bne     .Laxpy_kernel_S4

.Laxpy_kernel_S1:
    ands    I, N, #3                        // strided remainder
    ble     .Laxpy_kernel_L999

.Laxpy_kernel_S10:
    KERNEL_S1
    subs    I, I, #1
    bne     .Laxpy_kernel_S10

.Laxpy_kernel_L999:
    mov     w0, wzr                         // return 0
    ret
fmov    s1, SUMF
/* casum kernel body: sum |Re| + |Im| over N complex elements.
 * Diff markers resolved; the new labels carry a `c` prefix
 * (.Lcasum_*) so they cannot collide with the real asum kernel's
 * labels when both sources end up in one translation unit. */
    cmp     N, xzr
    ble     .Lcasum_kernel_L999             // N <= 0 -> done
    cmp     INC_X, xzr
    ble     .Lcasum_kernel_L999             // non-positive stride -> done
    cmp     INC_X, #1
    bne     .Lcasum_kernel_S_BEGIN          // strided path

.Lcasum_kernel_F_BEGIN:                     // contiguous path
    asr     I, N, #3                        // I = N / 8
    cmp     I, xzr
    beq     .Lcasum_kernel_F1

.Lcasum_kernel_F8:                          // main 8x-unrolled loop
    KERNEL_F8
    subs    I, I, #1
    bne     .Lcasum_kernel_F8
    KERNEL_F8_FINALIZE                      // reduce vector partial sums

.Lcasum_kernel_F1:
    ands    I, N, #7                        // remainder: N % 8
    ble     .Lcasum_kernel_L999

.Lcasum_kernel_F10:                         // scalar tail loop
    KERNEL_F1
    subs    I, I, #1
    bne     .Lcasum_kernel_F10

.Lcasum_kernel_L999:
    ret

.Lcasum_kernel_S_BEGIN:                     // strided path (INC_X != 1)
    INIT_S
    asr     I, N, #2                        // I = N / 4
    cmp     I, xzr
    ble     .Lcasum_kernel_S1

.Lcasum_kernel_S4:
    // NOTE(review): counter assumes 4 elements/iteration but only 3
    // KERNEL_S1 are visible here — verify against upstream source.
    KERNEL_S1
    KERNEL_S1
    KERNEL_S1
    subs    I, I, #1
    bne     .Lcasum_kernel_S4

.Lcasum_kernel_S1:
    ands    I, N, #3                        // strided remainder
    ble     .Lcasum_kernel_L999

.Lcasum_kernel_S10:
    KERNEL_S1
    subs    I, I, #1
    bne     .Lcasum_kernel_S10
    ret
mov     counterJ, origN
/* cgemm kernel (two-pointer pA/ppA variant), tiled over N (4/2/1 columns)
 * and M (8/4/2/1 rows). Diff markers resolved to `.L` local labels.
 * NOTE(review): some K-loops below show fewer KERNEL_* invocations than
 * their counter shift implies (e.g. a K/8 loop body with 3 SUBs); verify
 * against upstream — lines may have been lost in this excerpt. */
    asr     counterJ, counterJ, #2          // J = J / 4
    cmp     counterJ, #0
    ble     .Lcgemm_kernel_L2_BEGIN

/******************************************************************************/

.Lcgemm_kernel_L4_BEGIN:                    // process 4 columns of C per pass
    mov     pCRow0, pC                      // pCRow0 = C
    add     pC, pC, LDC, lsl #2
    mov     pA, origPA                      // pA = start of A array
    add     ppA, temp, pA

.Lcgemm_kernel_L4_M8_BEGIN:
    mov     counterI, origM
    asr     counterI, counterI, #3          // counterI = counterI / 8
    cmp     counterI, #0
    ble     .Lcgemm_kernel_L4_M4_BEGIN

.Lcgemm_kernel_L4_M8_20:                    // 8x4 tile, software-pipelined K loop
    mov     pB, origPB
    asr     counterL , origK, #1            // L = K / 2
    cmp     counterL , #2                   // is there at least 4 to do?
    blt     .Lcgemm_kernel_L4_M8_32
    KERNEL8x4_I                             // do one in the K
    KERNEL8x4_M2                            // do another in the K
    subs    counterL, counterL, #2          // subtract 2
    ble     .Lcgemm_kernel_L4_M8_22a
    .align 5

.Lcgemm_kernel_L4_M8_22:                    // steady-state pipeline body
    KERNEL8x4_M1
    KERNEL8x4_M2
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M8_22

.Lcgemm_kernel_L4_M8_22a:                   // drain the pipeline
    KERNEL8x4_M1
    KERNEL8x4_E
    b       .Lcgemm_kernel_L4_M8_44

.Lcgemm_kernel_L4_M8_32:                    // short K: 0 or 1 pipelined pair
    tst     counterL, #1
    ble     .Lcgemm_kernel_L4_M8_40
    KERNEL8x4_I
    KERNEL8x4_E
    b       .Lcgemm_kernel_L4_M8_44

.Lcgemm_kernel_L4_M8_40:
    INIT8x4                                 // K too small to pipeline: zero accumulators

.Lcgemm_kernel_L4_M8_44:
    ands    counterL , origK, #1            // K remainder (K % 2)
    ble     .Lcgemm_kernel_L4_M8_100

.Lcgemm_kernel_L4_M8_46:
    KERNEL8x4_SUB

.Lcgemm_kernel_L4_M8_100:
    SAVE8x4                                 // write back 8x4 tile of C

.Lcgemm_kernel_L4_M8_END:
    lsl     temp, origK, #5                 // k * 4 * 8
    add     pA, pA, temp
    add     ppA, ppA, temp
    subs    counterI, counterI, #1
    bne     .Lcgemm_kernel_L4_M8_20

.Lcgemm_kernel_L4_M4_BEGIN:                 // M remainder: at most one 4-row tile
    mov     counterI, origM
    tst     counterI , #7
    ble     .Lcgemm_kernel_L4_END
    tst     counterI, #4
    ble     .Lcgemm_kernel_L4_M2_BEGIN

.Lcgemm_kernel_L4_M4_20:
    INIT4x4
    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = counterL / 8
    cmp     counterL, #0
    ble     .Lcgemm_kernel_L4_M4_40

.Lcgemm_kernel_L4_M4_22:
    KERNEL4x4_SUB
    KERNEL4x4_SUB
    KERNEL4x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M4_22

.Lcgemm_kernel_L4_M4_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L4_M4_100

.Lcgemm_kernel_L4_M4_42:
    KERNEL4x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M4_42

.Lcgemm_kernel_L4_M4_100:
    SAVE4x4

.Lcgemm_kernel_L4_M4_END:

.Lcgemm_kernel_L4_M2_BEGIN:                 // M remainder: at most one 2-row tile
    mov     counterI, origM
    tst     counterI , #3
    ble     .Lcgemm_kernel_L4_END
    tst     counterI, #2                    // counterI = counterI / 2
    ble     .Lcgemm_kernel_L4_M1_BEGIN

.Lcgemm_kernel_L4_M2_20:
    INIT2x4
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L4_M2_40

.Lcgemm_kernel_L4_M2_22:
    KERNEL2x4_SUB
    KERNEL2x4_SUB
    KERNEL2x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M2_22

.Lcgemm_kernel_L4_M2_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L4_M2_100

.Lcgemm_kernel_L4_M2_42:
    KERNEL2x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M2_42

.Lcgemm_kernel_L4_M2_100:
    SAVE2x4

.Lcgemm_kernel_L4_M2_END:

.Lcgemm_kernel_L4_M1_BEGIN:                 // M remainder: final single row
    tst     counterI, #1                    // counterI = counterI % 2
    ble     .Lcgemm_kernel_L4_END

.Lcgemm_kernel_L4_M1_20:
    INIT1x4
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L4_M1_40

.Lcgemm_kernel_L4_M1_22:
    KERNEL1x4_SUB
    KERNEL1x4_SUB
    KERNEL1x4_SUB
    KERNEL1x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M1_22

.Lcgemm_kernel_L4_M1_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L4_M1_100

.Lcgemm_kernel_L4_M1_42:
    KERNEL1x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M1_42

.Lcgemm_kernel_L4_M1_100:
    SAVE1x4

.Lcgemm_kernel_L4_END:
    lsl     temp, origK, #5
    add     origPB, origPB, temp            // B = B + K * 4 * 8
    subs    counterJ, counterJ , #1         // j--
    bgt     .Lcgemm_kernel_L4_BEGIN

/******************************************************************************/

.Lcgemm_kernel_L2_BEGIN:                    // less than 2 left in N direction
    mov     counterJ , origN
    tst     counterJ , #3
    ble     .Lcgemm_kernel_L999             // error, N was less than 4?
    tst     counterJ , #2
    ble     .Lcgemm_kernel_L1_BEGIN
    mov     pCRow0, pC                      // pCRow0 = pC

.Lcgemm_kernel_L2_M4_BEGIN:
    mov     counterI, origM
    asr     counterI, counterI, #2          // counterI = counterI / 4
    cmp     counterI,#0
    ble     .Lcgemm_kernel_L2_M2_BEGIN

.Lcgemm_kernel_L2_M4_20:
    INIT4x2
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL,#0
    ble     .Lcgemm_kernel_L2_M4_40
    .align 5

.Lcgemm_kernel_L2_M4_22:
    KERNEL4x2_SUB
    KERNEL4x2_SUB
    KERNEL4x2_SUB
    KERNEL4x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M4_22

.Lcgemm_kernel_L2_M4_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L2_M4_100

.Lcgemm_kernel_L2_M4_42:
    KERNEL4x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M4_42

.Lcgemm_kernel_L2_M4_100:
    SAVE4x2

.Lcgemm_kernel_L2_M4_END:
    subs    counterI, counterI, #1
    bgt     .Lcgemm_kernel_L2_M4_20

.Lcgemm_kernel_L2_M2_BEGIN:
    mov     counterI, origM
    tst     counterI , #3
    ble     .Lcgemm_kernel_L2_END
    tst     counterI, #2                    // counterI = counterI / 2
    ble     .Lcgemm_kernel_L2_M1_BEGIN

.Lcgemm_kernel_L2_M2_20:
    INIT2x2
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL,#0
    ble     .Lcgemm_kernel_L2_M2_40

.Lcgemm_kernel_L2_M2_22:
    KERNEL2x2_SUB
    KERNEL2x2_SUB
    KERNEL2x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M2_22

.Lcgemm_kernel_L2_M2_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L2_M2_100

.Lcgemm_kernel_L2_M2_42:
    KERNEL2x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M2_42

.Lcgemm_kernel_L2_M2_100:
    SAVE2x2

.Lcgemm_kernel_L2_M2_END:

.Lcgemm_kernel_L2_M1_BEGIN:
    tst     counterI, #1                    // counterI = counterI % 2
    ble     .Lcgemm_kernel_L2_END

.Lcgemm_kernel_L2_M1_20:
    INIT1x2
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL, #0
    ble     .Lcgemm_kernel_L2_M1_40

.Lcgemm_kernel_L2_M1_22:
    KERNEL1x2_SUB
    KERNEL1x2_SUB
    KERNEL1x2_SUB
    KERNEL1x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M1_22

.Lcgemm_kernel_L2_M1_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L2_M1_100

.Lcgemm_kernel_L2_M1_42:
    KERNEL1x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M1_42

.Lcgemm_kernel_L2_M1_100:
    SAVE1x2

.Lcgemm_kernel_L2_END:
    add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8

/******************************************************************************/

.Lcgemm_kernel_L1_BEGIN:                    // final single column of C
    mov     counterJ , origN
    tst     counterJ , #1
    ble     .Lcgemm_kernel_L999             // done
    mov     pCRow0, pC                      // pCRow0 = C

.Lcgemm_kernel_L1_M4_BEGIN:
    mov     counterI, origM
    asr     counterI, counterI, #2          // counterI = counterI / 4
    cmp     counterI, #0
    ble     .Lcgemm_kernel_L1_M2_BEGIN

.Lcgemm_kernel_L1_M4_20:
    INIT4x1
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L1_M4_40
    .align 5

.Lcgemm_kernel_L1_M4_22:
    KERNEL4x1_SUB
    KERNEL4x1_SUB
    KERNEL4x1_SUB
    KERNEL4x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M4_22

.Lcgemm_kernel_L1_M4_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L1_M4_100

.Lcgemm_kernel_L1_M4_42:
    KERNEL4x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M4_42

.Lcgemm_kernel_L1_M4_100:
    SAVE4x1

.Lcgemm_kernel_L1_M4_END:
    subs    counterI, counterI, #1
    bgt     .Lcgemm_kernel_L1_M4_20

.Lcgemm_kernel_L1_M2_BEGIN:
    mov     counterI, origM
    tst     counterI , #3
    ble     .Lcgemm_kernel_L1_END
    tst     counterI, #2                    // counterI = counterI / 2
    ble     .Lcgemm_kernel_L1_M1_BEGIN

.Lcgemm_kernel_L1_M2_20:
    INIT2x1
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L1_M2_40

.Lcgemm_kernel_L1_M2_22:
    KERNEL2x1_SUB
    KERNEL2x1_SUB
    KERNEL2x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M2_22

.Lcgemm_kernel_L1_M2_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L1_M2_100

.Lcgemm_kernel_L1_M2_42:
    KERNEL2x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M2_42

.Lcgemm_kernel_L1_M2_100:
    SAVE2x1

.Lcgemm_kernel_L1_M2_END:

.Lcgemm_kernel_L1_M1_BEGIN:
    tst     counterI, #1                    // counterI = counterI % 2
    ble     .Lcgemm_kernel_L1_END

.Lcgemm_kernel_L1_M1_20:
    INIT1x1
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L1_M1_40

.Lcgemm_kernel_L1_M1_22:
    KERNEL1x1_SUB
    KERNEL1x1_SUB
    KERNEL1x1_SUB
    KERNEL1x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M1_22

.Lcgemm_kernel_L1_M1_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L1_M1_100

.Lcgemm_kernel_L1_M1_42:
    KERNEL1x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M1_42

.Lcgemm_kernel_L1_M1_100:
    SAVE1x1

.Lcgemm_kernel_L1_END:

.Lcgemm_kernel_L999:                        // epilogue: restore callee-saved FP regs
    mov     x0, #0                          // set return value
    ldp     d8, d9, [sp, #(0 * 16)]
    ldp     d10, d11, [sp, #(1 * 16)]
mov     counterJ, origN
/* cgemm kernel (prefetching variant, 8x2/8x1 M-tiles added), tiled over
 * N (4/2/1 columns) and M (8/4/2/1 rows). Diff markers resolved to `.L`
 * local labels. NOTE(review): some K-loops below show fewer KERNEL_*
 * invocations than their counter shift implies; verify against upstream —
 * lines may have been lost in this excerpt. */
    asr     counterJ, counterJ, #2          // J = J / 4
    cmp     counterJ, #0
    ble     .Lcgemm_kernel_L2_BEGIN

/******************************************************************************/

.Lcgemm_kernel_L4_BEGIN:                    // process 4 columns of C per pass
    mov     pCRow0, pC
    add     pCRow1, pCRow0, LDC
    add     pCRow2, pCRow1, LDC
    mov     pA, origPA                      // pA = start of A array

.Lcgemm_kernel_L4_M8_BEGIN:
    mov     counterI, origM
    asr     counterI, counterI, #3          // counterI = counterI / 8
    cmp     counterI, #0
    ble     .Lcgemm_kernel_L4_M4_BEGIN
    .align 5

.Lcgemm_kernel_L4_M8_20:                    // 8x4 tile, software-pipelined K loop
    mov     pB, origPB
    asr     counterL , origK, #3
    cmp     counterL , #2
    blt     .Lcgemm_kernel_L4_M8_32
    KERNEL8x4_I                             // prime the pipeline
    KERNEL8x4_M2
    KERNEL8x4_M2
    subs    counterL, counterL, #2          // subtract 2
    ble     .Lcgemm_kernel_L4_M8_22a
    .align 5

.Lcgemm_kernel_L4_M8_22:                    // steady-state pipeline body
    KERNEL8x4_M1
    KERNEL8x4_M2
    KERNEL8x4_M2
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M8_22
    .align 5

.Lcgemm_kernel_L4_M8_22a:                   // drain the pipeline
    KERNEL8x4_M1
    KERNEL8x4_M2
    KERNEL8x4_M1
    KERNEL8x4_E
    b       .Lcgemm_kernel_L4_M8_44
    .align 5

.Lcgemm_kernel_L4_M8_32:                    // short K: 0 or 1 pipelined pass
    tst     counterL, #1
    ble     .Lcgemm_kernel_L4_M8_40
    KERNEL8x4_I
    KERNEL8x4_M2
    KERNEL8x4_M1
    KERNEL8x4_E
    b       .Lcgemm_kernel_L4_M8_44

.Lcgemm_kernel_L4_M8_40:
    INIT8x4                                 // K too small: zero accumulators

.Lcgemm_kernel_L4_M8_44:
    ands    counterL , origK, #7            // K remainder
    ble     .Lcgemm_kernel_L4_M8_100
    .align 5

.Lcgemm_kernel_L4_M8_46:
    KERNEL8x4_SUB
    subs    counterL, counterL, #1
    bne     .Lcgemm_kernel_L4_M8_46

.Lcgemm_kernel_L4_M8_100:
    prfm    PLDL1KEEP, [pA]                 // prefetch next A/B panels before store
    prfm    PLDL1KEEP, [pA, #64]
    prfm    PLDL1KEEP, [origPB]
    SAVE8x4

.Lcgemm_kernel_L4_M8_END:
    subs    counterI, counterI, #1
    bne     .Lcgemm_kernel_L4_M8_20

.Lcgemm_kernel_L4_M4_BEGIN:                 // M remainder: at most one 4-row tile
    mov     counterI, origM
    tst     counterI , #7
    ble     .Lcgemm_kernel_L4_END
    tst     counterI, #4
    ble     .Lcgemm_kernel_L4_M2_BEGIN

.Lcgemm_kernel_L4_M4_20:
    mov     pB, origPB
    asr     counterL , origK, #1            // L = K / 2
    cmp     counterL , #2                   // is there at least 4 to do?
    blt     .Lcgemm_kernel_L4_M4_32
    KERNEL4x4_I                             // do one in the K
    KERNEL4x4_M2                            // do another in the K
    subs    counterL, counterL, #2
    ble     .Lcgemm_kernel_L4_M4_22a
    .align 5

.Lcgemm_kernel_L4_M4_22:
    KERNEL4x4_M1
    KERNEL4x4_M2
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M4_22

.Lcgemm_kernel_L4_M4_22a:
    KERNEL4x4_M1
    KERNEL4x4_E
    b       .Lcgemm_kernel_L4_M4_44

.Lcgemm_kernel_L4_M4_32:
    tst     counterL, #1
    ble     .Lcgemm_kernel_L4_M4_40
    KERNEL4x4_I
    KERNEL4x4_E
    b       .Lcgemm_kernel_L4_M4_44

.Lcgemm_kernel_L4_M4_40:
    INIT4x4

.Lcgemm_kernel_L4_M4_44:
    ands    counterL , origK, #1            // K remainder (K % 2)
    ble     .Lcgemm_kernel_L4_M4_100

.Lcgemm_kernel_L4_M4_46:
    KERNEL4x4_SUB

.Lcgemm_kernel_L4_M4_100:
    SAVE4x4

.Lcgemm_kernel_L4_M4_END:

.Lcgemm_kernel_L4_M2_BEGIN:                 // M remainder: at most one 2-row tile
    mov     counterI, origM
    tst     counterI , #3
    ble     .Lcgemm_kernel_L4_END
    tst     counterI, #2                    // counterI = counterI / 2
    ble     .Lcgemm_kernel_L4_M1_BEGIN

.Lcgemm_kernel_L4_M2_20:
    INIT2x4
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L4_M2_40

.Lcgemm_kernel_L4_M2_22:
    KERNEL2x4_SUB
    KERNEL2x4_SUB
    KERNEL2x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M2_22

.Lcgemm_kernel_L4_M2_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L4_M2_100

.Lcgemm_kernel_L4_M2_42:
    KERNEL2x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M2_42

.Lcgemm_kernel_L4_M2_100:
    SAVE2x4

.Lcgemm_kernel_L4_M2_END:

.Lcgemm_kernel_L4_M1_BEGIN:                 // M remainder: final single row
    tst     counterI, #1                    // counterI = counterI % 2
    ble     .Lcgemm_kernel_L4_END

.Lcgemm_kernel_L4_M1_20:
    INIT1x4
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L4_M1_40

.Lcgemm_kernel_L4_M1_22:
    KERNEL1x4_SUB
    KERNEL1x4_SUB
    KERNEL1x4_SUB
    KERNEL1x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M1_22

.Lcgemm_kernel_L4_M1_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L4_M1_100

.Lcgemm_kernel_L4_M1_42:
    KERNEL1x4_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L4_M1_42

.Lcgemm_kernel_L4_M1_100:
    SAVE1x4

.Lcgemm_kernel_L4_END:
    lsl     temp, origK, #5
    add     origPB, origPB, temp            // B = B + K * 4 * 8
    subs    counterJ, counterJ , #1         // j--
    bgt     .Lcgemm_kernel_L4_BEGIN

/******************************************************************************/

.Lcgemm_kernel_L2_BEGIN:                    // less than 2 left in N direction
    mov     counterJ , origN
    tst     counterJ , #3
    ble     .Lcgemm_kernel_L999             // error, N was less than 4?
    tst     counterJ , #2
    ble     .Lcgemm_kernel_L1_BEGIN
    mov     pCRow0, pC                      // pCRow0 = pC
    mov     pA, origPA                      // pA = A

.Lcgemm_kernel_L2_M8_BEGIN:
    mov     counterI, origM
    asr     counterI, counterI, #3          // counterI = counterI / 8
    cmp     counterI, #0
    ble     .Lcgemm_kernel_L2_M4_BEGIN

.Lcgemm_kernel_L2_M8_20:
    INIT8x2
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL,#0
    ble     .Lcgemm_kernel_L2_M8_40
    .align 5

.Lcgemm_kernel_L2_M8_22:
    KERNEL8x2_SUB
    KERNEL8x2_SUB
    KERNEL8x2_SUB
    KERNEL8x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M8_22

.Lcgemm_kernel_L2_M8_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L2_M8_100

.Lcgemm_kernel_L2_M8_42:
    KERNEL8x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M8_42

.Lcgemm_kernel_L2_M8_100:
    SAVE8x2

.Lcgemm_kernel_L2_M8_END:
    subs    counterI, counterI, #1
    bgt     .Lcgemm_kernel_L2_M8_20

.Lcgemm_kernel_L2_M4_BEGIN:
    mov     counterI, origM
    tst     counterI , #7
    ble     .Lcgemm_kernel_L2_END
    tst     counterI, #4                    // counterI = counterI / 2
    ble     .Lcgemm_kernel_L2_M2_BEGIN

.Lcgemm_kernel_L2_M4_20:
    INIT4x2
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL,#0
    ble     .Lcgemm_kernel_L2_M4_40
    .align 5

.Lcgemm_kernel_L2_M4_22:
    KERNEL4x2_SUB
    KERNEL4x2_SUB
    KERNEL4x2_SUB
    KERNEL4x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M4_22

.Lcgemm_kernel_L2_M4_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L2_M4_100

.Lcgemm_kernel_L2_M4_42:
    KERNEL4x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M4_42

.Lcgemm_kernel_L2_M4_100:
    SAVE4x2

.Lcgemm_kernel_L2_M4_END:

.Lcgemm_kernel_L2_M2_BEGIN:
    mov     counterI, origM
    tst     counterI , #3
    ble     .Lcgemm_kernel_L2_END
    tst     counterI, #2                    // counterI = counterI / 2
    ble     .Lcgemm_kernel_L2_M1_BEGIN

.Lcgemm_kernel_L2_M2_20:
    INIT2x2
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL,#0
    ble     .Lcgemm_kernel_L2_M2_40

.Lcgemm_kernel_L2_M2_22:
    KERNEL2x2_SUB
    KERNEL2x2_SUB
    KERNEL2x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M2_22

.Lcgemm_kernel_L2_M2_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L2_M2_100

.Lcgemm_kernel_L2_M2_42:
    KERNEL2x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M2_42

.Lcgemm_kernel_L2_M2_100:
    SAVE2x2

.Lcgemm_kernel_L2_M2_END:

.Lcgemm_kernel_L2_M1_BEGIN:
    tst     counterI, #1                    // counterI = counterI % 2
    ble     .Lcgemm_kernel_L2_END

.Lcgemm_kernel_L2_M1_20:
    INIT1x2
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL, #0
    ble     .Lcgemm_kernel_L2_M1_40

.Lcgemm_kernel_L2_M1_22:
    KERNEL1x2_SUB
    KERNEL1x2_SUB
    KERNEL1x2_SUB
    KERNEL1x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M1_22

.Lcgemm_kernel_L2_M1_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L2_M1_100

.Lcgemm_kernel_L2_M1_42:
    KERNEL1x2_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L2_M1_42

.Lcgemm_kernel_L2_M1_100:
    SAVE1x2

.Lcgemm_kernel_L2_END:
    add     origPB, origPB, origK, lsl #4   // B = B + K * 2 * 8

/******************************************************************************/

.Lcgemm_kernel_L1_BEGIN:                    // final single column of C
    mov     counterJ , origN
    tst     counterJ , #1
    ble     .Lcgemm_kernel_L999             // done
    mov     pCRow0, pC                      // pCRow0 = C
    mov     pA, origPA                      // pA = A

.Lcgemm_kernel_L1_M8_BEGIN:
    mov     counterI, origM
    asr     counterI, counterI, #3          // counterI = counterI / 8
    cmp     counterI, #0
    ble     .Lcgemm_kernel_L1_M4_BEGIN

.Lcgemm_kernel_L1_M8_20:
    INIT8x1
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L1_M8_40
    .align 5

.Lcgemm_kernel_L1_M8_22:
    KERNEL8x1_SUB
    KERNEL8x1_SUB
    KERNEL8x1_SUB
    KERNEL8x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M8_22

.Lcgemm_kernel_L1_M8_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L1_M8_100

.Lcgemm_kernel_L1_M8_42:
    KERNEL8x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M8_42

.Lcgemm_kernel_L1_M8_100:
    SAVE8x1

.Lcgemm_kernel_L1_M8_END:
    subs    counterI, counterI, #1
    bgt     .Lcgemm_kernel_L1_M8_20

.Lcgemm_kernel_L1_M4_BEGIN:
    mov     counterI, origM
    tst     counterI , #7
    ble     .Lcgemm_kernel_L1_END
    tst     counterI, #4                    // counterI = counterI / 2
    ble     .Lcgemm_kernel_L1_M2_BEGIN

.Lcgemm_kernel_L1_M4_20:
    INIT4x1
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L1_M4_40
    .align 5

.Lcgemm_kernel_L1_M4_22:
    KERNEL4x1_SUB
    KERNEL4x1_SUB
    KERNEL4x1_SUB
    KERNEL4x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M4_22

.Lcgemm_kernel_L1_M4_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L1_M4_100

.Lcgemm_kernel_L1_M4_42:
    KERNEL4x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M4_42

.Lcgemm_kernel_L1_M4_100:
    SAVE4x1

.Lcgemm_kernel_L1_M4_END:

.Lcgemm_kernel_L1_M2_BEGIN:
    mov     counterI, origM
    tst     counterI , #3
    ble     .Lcgemm_kernel_L1_END
    tst     counterI, #2                    // counterI = counterI / 2
    ble     .Lcgemm_kernel_L1_M1_BEGIN

.Lcgemm_kernel_L1_M2_20:
    INIT2x1
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L1_M2_40

.Lcgemm_kernel_L1_M2_22:
    KERNEL2x1_SUB
    KERNEL2x1_SUB
    KERNEL2x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M2_22

.Lcgemm_kernel_L1_M2_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L1_M2_100

.Lcgemm_kernel_L1_M2_42:
    KERNEL2x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M2_42

.Lcgemm_kernel_L1_M2_100:
    SAVE2x1

.Lcgemm_kernel_L1_M2_END:

.Lcgemm_kernel_L1_M1_BEGIN:
    tst     counterI, #1                    // counterI = counterI % 2
    ble     .Lcgemm_kernel_L1_END

.Lcgemm_kernel_L1_M1_20:
    INIT1x1
    mov     pB, origPB
    asr     counterL , origK, #3            // counterL = counterL / 8
    cmp     counterL , #0
    ble     .Lcgemm_kernel_L1_M1_40

.Lcgemm_kernel_L1_M1_22:
    KERNEL1x1_SUB
    KERNEL1x1_SUB
    KERNEL1x1_SUB
    KERNEL1x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M1_22

.Lcgemm_kernel_L1_M1_40:
    ands    counterL , origK, #7            // counterL = counterL % 8
    ble     .Lcgemm_kernel_L1_M1_100

.Lcgemm_kernel_L1_M1_42:
    KERNEL1x1_SUB
    subs    counterL, counterL, #1
    bgt     .Lcgemm_kernel_L1_M1_42

.Lcgemm_kernel_L1_M1_100:
    SAVE1x1

.Lcgemm_kernel_L1_END:

.Lcgemm_kernel_L999:                        // epilogue: restore callee-saved FP regs
    mov     x0, #0                          // set return value
    ldp     d8, d9, [sp, #(0 * 16)]
    ldp     d10, d11, [sp, #(1 * 16)]
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble cgemm_kernel_L2_BEGIN
+ ble .Lcgemm_kernel_L2_BEGIN
/******************************************************************************/
-cgemm_kernel_L4_BEGIN:
+.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
mov pA, origPA // pA = start of A array
-cgemm_kernel_L4_M8_BEGIN:
+.Lcgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble cgemm_kernel_L4_M4_BEGIN
+ ble .Lcgemm_kernel_L4_M4_BEGIN
.align 5
-cgemm_kernel_L4_M8_20:
+.Lcgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #5 // origK / 32
cmp counterL , #2
- blt cgemm_kernel_L4_M8_32
+ blt .Lcgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1_M2_x8
subs counterL, counterL, #2 // subtract 2
- ble cgemm_kernel_L4_M8_22a
+ ble .Lcgemm_kernel_L4_M8_22a
.align 5
-cgemm_kernel_L4_M8_22:
+.Lcgemm_kernel_L4_M8_22:
KERNEL8x4_M1_M2_x16
subs counterL, counterL, #1
- bgt cgemm_kernel_L4_M8_22
+ bgt .Lcgemm_kernel_L4_M8_22
.align 5
-cgemm_kernel_L4_M8_22a:
+.Lcgemm_kernel_L4_M8_22a:
KERNEL8x4_M1_M2_x8
KERNEL8x4_M1_M2_x4
KERNEL8x4_M1
KERNEL8x4_E
- b cgemm_kernel_L4_M8_44
+ b .Lcgemm_kernel_L4_M8_44
.align 5
-cgemm_kernel_L4_M8_32:
+.Lcgemm_kernel_L4_M8_32:
tst counterL, #1
- ble cgemm_kernel_L4_M8_40
+ ble .Lcgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b cgemm_kernel_L4_M8_44
+ b .Lcgemm_kernel_L4_M8_44
-cgemm_kernel_L4_M8_40:
+.Lcgemm_kernel_L4_M8_40:
INIT8x4
-cgemm_kernel_L4_M8_44:
+.Lcgemm_kernel_L4_M8_44:
ands counterL , origK, #31
- ble cgemm_kernel_L4_M8_100
+ ble .Lcgemm_kernel_L4_M8_100
.align 5
-cgemm_kernel_L4_M8_46:
+.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
- bne cgemm_kernel_L4_M8_46
+ bne .Lcgemm_kernel_L4_M8_46
-cgemm_kernel_L4_M8_100:
+.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
-cgemm_kernel_L4_M8_END:
+.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
- bne cgemm_kernel_L4_M8_20
+ bne .Lcgemm_kernel_L4_M8_20
-cgemm_kernel_L4_M4_BEGIN:
+.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble cgemm_kernel_L4_END
+ ble .Lcgemm_kernel_L4_END
tst counterI, #4
- ble cgemm_kernel_L4_M2_BEGIN
+ ble .Lcgemm_kernel_L4_M2_BEGIN
-cgemm_kernel_L4_M4_20:
+.Lcgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt cgemm_kernel_L4_M4_32
+ blt .Lcgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble cgemm_kernel_L4_M4_22a
+ ble .Lcgemm_kernel_L4_M4_22a
.align 5
-cgemm_kernel_L4_M4_22:
+.Lcgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt cgemm_kernel_L4_M4_22
+ bgt .Lcgemm_kernel_L4_M4_22
-cgemm_kernel_L4_M4_22a:
+.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b cgemm_kernel_L4_M4_44
-cgemm_kernel_L4_M4_32:
+ b .Lcgemm_kernel_L4_M4_44
+.Lcgemm_kernel_L4_M4_32:
tst counterL, #1
- ble cgemm_kernel_L4_M4_40
+ ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b cgemm_kernel_L4_M4_44
-cgemm_kernel_L4_M4_40:
+ b .Lcgemm_kernel_L4_M4_44
+.Lcgemm_kernel_L4_M4_40:
INIT4x4
-cgemm_kernel_L4_M4_44:
+.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1
- ble cgemm_kernel_L4_M4_100
+ ble .Lcgemm_kernel_L4_M4_100
-cgemm_kernel_L4_M4_46:
+.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
-cgemm_kernel_L4_M4_100:
+.Lcgemm_kernel_L4_M4_100:
SAVE4x4
-cgemm_kernel_L4_M4_END:
+.Lcgemm_kernel_L4_M4_END:
-cgemm_kernel_L4_M2_BEGIN:
+.Lcgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble cgemm_kernel_L4_END
+ ble .Lcgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble cgemm_kernel_L4_M1_BEGIN
+ ble .Lcgemm_kernel_L4_M1_BEGIN
-cgemm_kernel_L4_M2_20:
+.Lcgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble cgemm_kernel_L4_M2_40
+ ble .Lcgemm_kernel_L4_M2_40
-cgemm_kernel_L4_M2_22:
+.Lcgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L4_M2_22
+ bgt .Lcgemm_kernel_L4_M2_22
-cgemm_kernel_L4_M2_40:
+.Lcgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L4_M2_100
+ ble .Lcgemm_kernel_L4_M2_100
-cgemm_kernel_L4_M2_42:
+.Lcgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L4_M2_42
+ bgt .Lcgemm_kernel_L4_M2_42
-cgemm_kernel_L4_M2_100:
+.Lcgemm_kernel_L4_M2_100:
SAVE2x4
-cgemm_kernel_L4_M2_END:
+.Lcgemm_kernel_L4_M2_END:
-cgemm_kernel_L4_M1_BEGIN:
+.Lcgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble cgemm_kernel_L4_END
+ ble .Lcgemm_kernel_L4_END
-cgemm_kernel_L4_M1_20:
+.Lcgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble cgemm_kernel_L4_M1_40
+ ble .Lcgemm_kernel_L4_M1_40
-cgemm_kernel_L4_M1_22:
+.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L4_M1_22
+ bgt .Lcgemm_kernel_L4_M1_22
-cgemm_kernel_L4_M1_40:
+.Lcgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L4_M1_100
+ ble .Lcgemm_kernel_L4_M1_100
-cgemm_kernel_L4_M1_42:
+.Lcgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L4_M1_42
+ bgt .Lcgemm_kernel_L4_M1_42
-cgemm_kernel_L4_M1_100:
+.Lcgemm_kernel_L4_M1_100:
SAVE1x4
-cgemm_kernel_L4_END:
+.Lcgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
- bgt cgemm_kernel_L4_BEGIN
+ bgt .Lcgemm_kernel_L4_BEGIN
/******************************************************************************/
-cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble cgemm_kernel_L999 // error, N was less than 4?
+ ble .Lcgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble cgemm_kernel_L1_BEGIN
+ ble .Lcgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pA, origPA // pA = A
-cgemm_kernel_L2_M8_BEGIN:
+.Lcgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble cgemm_kernel_L2_M4_BEGIN
+ ble .Lcgemm_kernel_L2_M4_BEGIN
-cgemm_kernel_L2_M8_20:
+.Lcgemm_kernel_L2_M8_20:
INIT8x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble cgemm_kernel_L2_M8_40
+ ble .Lcgemm_kernel_L2_M8_40
.align 5
-cgemm_kernel_L2_M8_22:
+.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L2_M8_22
+ bgt .Lcgemm_kernel_L2_M8_22
-cgemm_kernel_L2_M8_40:
+.Lcgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L2_M8_100
+ ble .Lcgemm_kernel_L2_M8_100
-cgemm_kernel_L2_M8_42:
+.Lcgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L2_M8_42
+ bgt .Lcgemm_kernel_L2_M8_42
-cgemm_kernel_L2_M8_100:
+.Lcgemm_kernel_L2_M8_100:
SAVE8x2
-cgemm_kernel_L2_M8_END:
+.Lcgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
- bgt cgemm_kernel_L2_M8_20
+ bgt .Lcgemm_kernel_L2_M8_20
-cgemm_kernel_L2_M4_BEGIN:
+.Lcgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble cgemm_kernel_L2_END
+ ble .Lcgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
- ble cgemm_kernel_L2_M2_BEGIN
+ ble .Lcgemm_kernel_L2_M2_BEGIN
-cgemm_kernel_L2_M4_20:
+.Lcgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble cgemm_kernel_L2_M4_40
+ ble .Lcgemm_kernel_L2_M4_40
.align 5
-cgemm_kernel_L2_M4_22:
+.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L2_M4_22
+ bgt .Lcgemm_kernel_L2_M4_22
-cgemm_kernel_L2_M4_40:
+.Lcgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L2_M4_100
+ ble .Lcgemm_kernel_L2_M4_100
-cgemm_kernel_L2_M4_42:
+.Lcgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L2_M4_42
+ bgt .Lcgemm_kernel_L2_M4_42
-cgemm_kernel_L2_M4_100:
+.Lcgemm_kernel_L2_M4_100:
SAVE4x2
-cgemm_kernel_L2_M4_END:
+.Lcgemm_kernel_L2_M4_END:
-cgemm_kernel_L2_M2_BEGIN:
+.Lcgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble cgemm_kernel_L2_END
+ ble .Lcgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble cgemm_kernel_L2_M1_BEGIN
+ ble .Lcgemm_kernel_L2_M1_BEGIN
-cgemm_kernel_L2_M2_20:
+.Lcgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble cgemm_kernel_L2_M2_40
+ ble .Lcgemm_kernel_L2_M2_40
-cgemm_kernel_L2_M2_22:
+.Lcgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L2_M2_22
+ bgt .Lcgemm_kernel_L2_M2_22
-cgemm_kernel_L2_M2_40:
+.Lcgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L2_M2_100
+ ble .Lcgemm_kernel_L2_M2_100
-cgemm_kernel_L2_M2_42:
+.Lcgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L2_M2_42
+ bgt .Lcgemm_kernel_L2_M2_42
-cgemm_kernel_L2_M2_100:
+.Lcgemm_kernel_L2_M2_100:
SAVE2x2
-cgemm_kernel_L2_M2_END:
+.Lcgemm_kernel_L2_M2_END:
-cgemm_kernel_L2_M1_BEGIN:
+.Lcgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble cgemm_kernel_L2_END
+ ble .Lcgemm_kernel_L2_END
-cgemm_kernel_L2_M1_20:
+.Lcgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble cgemm_kernel_L2_M1_40
+ ble .Lcgemm_kernel_L2_M1_40
-cgemm_kernel_L2_M1_22:
+.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L2_M1_22
+ bgt .Lcgemm_kernel_L2_M1_22
-cgemm_kernel_L2_M1_40:
+.Lcgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L2_M1_100
+ ble .Lcgemm_kernel_L2_M1_100
-cgemm_kernel_L2_M1_42:
+.Lcgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L2_M1_42
+ bgt .Lcgemm_kernel_L2_M1_42
-cgemm_kernel_L2_M1_100:
+.Lcgemm_kernel_L2_M1_100:
SAVE1x2
-cgemm_kernel_L2_END:
+.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
-cgemm_kernel_L1_BEGIN:
+.Lcgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble cgemm_kernel_L999 // done
+ ble .Lcgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
mov pA, origPA // pA = A
-cgemm_kernel_L1_M8_BEGIN:
+.Lcgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble cgemm_kernel_L1_M4_BEGIN
+ ble .Lcgemm_kernel_L1_M4_BEGIN
-cgemm_kernel_L1_M8_20:
+.Lcgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble cgemm_kernel_L1_M8_40
+ ble .Lcgemm_kernel_L1_M8_40
.align 5
-cgemm_kernel_L1_M8_22:
+.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L1_M8_22
+ bgt .Lcgemm_kernel_L1_M8_22
-cgemm_kernel_L1_M8_40:
+.Lcgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L1_M8_100
+ ble .Lcgemm_kernel_L1_M8_100
-cgemm_kernel_L1_M8_42:
+.Lcgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L1_M8_42
+ bgt .Lcgemm_kernel_L1_M8_42
-cgemm_kernel_L1_M8_100:
+.Lcgemm_kernel_L1_M8_100:
SAVE8x1
-cgemm_kernel_L1_M8_END:
+.Lcgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
- bgt cgemm_kernel_L1_M8_20
+ bgt .Lcgemm_kernel_L1_M8_20
-cgemm_kernel_L1_M4_BEGIN:
+.Lcgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble cgemm_kernel_L1_END
+ ble .Lcgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
- ble cgemm_kernel_L1_M2_BEGIN
+ ble .Lcgemm_kernel_L1_M2_BEGIN
-cgemm_kernel_L1_M4_20:
+.Lcgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble cgemm_kernel_L1_M4_40
+ ble .Lcgemm_kernel_L1_M4_40
.align 5
-cgemm_kernel_L1_M4_22:
+.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L1_M4_22
+ bgt .Lcgemm_kernel_L1_M4_22
-cgemm_kernel_L1_M4_40:
+.Lcgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L1_M4_100
+ ble .Lcgemm_kernel_L1_M4_100
-cgemm_kernel_L1_M4_42:
+.Lcgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L1_M4_42
+ bgt .Lcgemm_kernel_L1_M4_42
-cgemm_kernel_L1_M4_100:
+.Lcgemm_kernel_L1_M4_100:
SAVE4x1
-cgemm_kernel_L1_M4_END:
+.Lcgemm_kernel_L1_M4_END:
-cgemm_kernel_L1_M2_BEGIN:
+.Lcgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble cgemm_kernel_L1_END
+ ble .Lcgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble cgemm_kernel_L1_M1_BEGIN
+ ble .Lcgemm_kernel_L1_M1_BEGIN
-cgemm_kernel_L1_M2_20:
+.Lcgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble cgemm_kernel_L1_M2_40
+ ble .Lcgemm_kernel_L1_M2_40
-cgemm_kernel_L1_M2_22:
+.Lcgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L1_M2_22
+ bgt .Lcgemm_kernel_L1_M2_22
-cgemm_kernel_L1_M2_40:
+.Lcgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L1_M2_100
+ ble .Lcgemm_kernel_L1_M2_100
-cgemm_kernel_L1_M2_42:
+.Lcgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L1_M2_42
+ bgt .Lcgemm_kernel_L1_M2_42
-cgemm_kernel_L1_M2_100:
+.Lcgemm_kernel_L1_M2_100:
SAVE2x1
-cgemm_kernel_L1_M2_END:
+.Lcgemm_kernel_L1_M2_END:
-cgemm_kernel_L1_M1_BEGIN:
+.Lcgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble cgemm_kernel_L1_END
+ ble .Lcgemm_kernel_L1_END
-cgemm_kernel_L1_M1_20:
+.Lcgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble cgemm_kernel_L1_M1_40
+ ble .Lcgemm_kernel_L1_M1_40
-cgemm_kernel_L1_M1_22:
+.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L1_M1_22
+ bgt .Lcgemm_kernel_L1_M1_22
-cgemm_kernel_L1_M1_40:
+.Lcgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble cgemm_kernel_L1_M1_100
+ ble .Lcgemm_kernel_L1_M1_100
-cgemm_kernel_L1_M1_42:
+.Lcgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt cgemm_kernel_L1_M1_42
+ bgt .Lcgemm_kernel_L1_M1_42
-cgemm_kernel_L1_M1_100:
+.Lcgemm_kernel_L1_M1_100:
SAVE1x1
-cgemm_kernel_L1_END:
+.Lcgemm_kernel_L1_END:
-cgemm_kernel_L999:
+.Lcgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
PROLOGUE
cmp N, xzr
- ble copy_kernel_L999
+ ble .Lcopy_kernel_L999
cmp INC_X, #1
- bne copy_kernel_S_BEGIN
+ bne .Lcopy_kernel_S_BEGIN
cmp INC_Y, #1
- bne copy_kernel_S_BEGIN
+ bne .Lcopy_kernel_S_BEGIN
-copy_kernel_F_BEGIN:
+.Lcopy_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
- beq copy_kernel_F1
+ beq .Lcopy_kernel_F1
-copy_kernel_F4:
+.Lcopy_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne copy_kernel_F4
+ bne .Lcopy_kernel_F4
-copy_kernel_F1:
+.Lcopy_kernel_F1:
ands I, N, #3
- ble copy_kernel_L999
+ ble .Lcopy_kernel_L999
-copy_kernel_F10:
+.Lcopy_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne copy_kernel_F10
+ bne .Lcopy_kernel_F10
mov w0, wzr
ret
-copy_kernel_S_BEGIN:
+.Lcopy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
- ble copy_kernel_S1
+ ble .Lcopy_kernel_S1
-copy_kernel_S4:
+.Lcopy_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne copy_kernel_S4
+ bne .Lcopy_kernel_S4
-copy_kernel_S1:
+.Lcopy_kernel_S1:
ands I, N, #3
- ble copy_kernel_L999
+ ble .Lcopy_kernel_L999
-copy_kernel_S10:
+.Lcopy_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne copy_kernel_S10
+ bne .Lcopy_kernel_S10
-copy_kernel_L999:
+.Lcopy_kernel_L999:
mov w0, wzr
ret
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble ctrmm_kernel_L2_BEGIN
+ ble .Lctrmm_kernel_L2_BEGIN
/******************************************************************************/
-ctrmm_kernel_L4_BEGIN:
+.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
#endif
mov pA, origPA // pA = start of A array
-ctrmm_kernel_L4_M4_BEGIN:
+.Lctrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble ctrmm_kernel_L4_M2_BEGIN
+ ble .Lctrmm_kernel_L4_M2_BEGIN
-ctrmm_kernel_L4_M4_20:
+.Lctrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt ctrmm_kernel_L4_M4_32
+ blt .Lctrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble ctrmm_kernel_L4_M4_22a
+ ble .Lctrmm_kernel_L4_M4_22a
.align 5
-ctrmm_kernel_L4_M4_22:
+.Lctrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M4_22
+ bgt .Lctrmm_kernel_L4_M4_22
-ctrmm_kernel_L4_M4_22a:
+.Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b ctrmm_kernel_L4_M4_44
+ b .Lctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_32:
+.Lctrmm_kernel_L4_M4_32:
tst counterL, #1
- ble ctrmm_kernel_L4_M4_40
+ ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b ctrmm_kernel_L4_M4_44
+ b .Lctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_40:
+.Lctrmm_kernel_L4_M4_40:
INIT4x4
-ctrmm_kernel_L4_M4_44:
+.Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
- ble ctrmm_kernel_L4_M4_100
+ ble .Lctrmm_kernel_L4_M4_100
-ctrmm_kernel_L4_M4_46:
+.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
-ctrmm_kernel_L4_M4_100:
+.Lctrmm_kernel_L4_M4_100:
SAVE4x4
add tempOffset, tempOffset, #4
#endif
-ctrmm_kernel_L4_M4_END:
+.Lctrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
- bne ctrmm_kernel_L4_M4_20
+ bne .Lctrmm_kernel_L4_M4_20
-ctrmm_kernel_L4_M2_BEGIN:
+.Lctrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L4_M1_BEGIN
+ ble .Lctrmm_kernel_L4_M1_BEGIN
-ctrmm_kernel_L4_M2_20:
+.Lctrmm_kernel_L4_M2_20:
INIT2x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L4_M2_40
+ ble .Lctrmm_kernel_L4_M2_40
-ctrmm_kernel_L4_M2_22:
+.Lctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M2_22
+ bgt .Lctrmm_kernel_L4_M2_22
-ctrmm_kernel_L4_M2_40:
+.Lctrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L4_M2_100
+ ble .Lctrmm_kernel_L4_M2_100
-ctrmm_kernel_L4_M2_42:
+.Lctrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M2_42
+ bgt .Lctrmm_kernel_L4_M2_42
-ctrmm_kernel_L4_M2_100:
+.Lctrmm_kernel_L4_M2_100:
SAVE2x4
add tempOffset, tempOffset, #2
#endif
-ctrmm_kernel_L4_M2_END:
+.Lctrmm_kernel_L4_M2_END:
-ctrmm_kernel_L4_M1_BEGIN:
+.Lctrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
-ctrmm_kernel_L4_M1_20:
+.Lctrmm_kernel_L4_M1_20:
INIT1x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L4_M1_40
+ ble .Lctrmm_kernel_L4_M1_40
-ctrmm_kernel_L4_M1_22:
+.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M1_22
+ bgt .Lctrmm_kernel_L4_M1_22
-ctrmm_kernel_L4_M1_40:
+.Lctrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L4_M1_100
+ ble .Lctrmm_kernel_L4_M1_100
-ctrmm_kernel_L4_M1_42:
+.Lctrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M1_42
+ bgt .Lctrmm_kernel_L4_M1_42
-ctrmm_kernel_L4_M1_100:
+.Lctrmm_kernel_L4_M1_100:
SAVE1x4
add tempOffset, tempOffset, #1
#endif
-ctrmm_kernel_L4_END:
+.Lctrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
#endif
subs counterJ, counterJ , #1 // j--
- bgt ctrmm_kernel_L4_BEGIN
+ bgt .Lctrmm_kernel_L4_BEGIN
/******************************************************************************/
-ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble ctrmm_kernel_L999 // error, N was less than 4?
+ ble .Lctrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble ctrmm_kernel_L1_BEGIN
+ ble .Lctrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pA, origPA // pA = A
-ctrmm_kernel_L2_M4_BEGIN:
+.Lctrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble ctrmm_kernel_L2_M2_BEGIN
+ ble .Lctrmm_kernel_L2_M2_BEGIN
-ctrmm_kernel_L2_M4_20:
+.Lctrmm_kernel_L2_M4_20:
INIT4x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble ctrmm_kernel_L2_M4_40
+ ble .Lctrmm_kernel_L2_M4_40
.align 5
-ctrmm_kernel_L2_M4_22:
+.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M4_22
+ bgt .Lctrmm_kernel_L2_M4_22
-ctrmm_kernel_L2_M4_40:
+.Lctrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M4_100
+ ble .Lctrmm_kernel_L2_M4_100
-ctrmm_kernel_L2_M4_42:
+.Lctrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M4_42
+ bgt .Lctrmm_kernel_L2_M4_42
-ctrmm_kernel_L2_M4_100:
+.Lctrmm_kernel_L2_M4_100:
SAVE4x2
add tempOffset, tempOffset, #4
#endif
-ctrmm_kernel_L2_M4_END:
+.Lctrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt ctrmm_kernel_L2_M4_20
+ bgt .Lctrmm_kernel_L2_M4_20
-ctrmm_kernel_L2_M2_BEGIN:
+.Lctrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L2_M1_BEGIN
+ ble .Lctrmm_kernel_L2_M1_BEGIN
-ctrmm_kernel_L2_M2_20:
+.Lctrmm_kernel_L2_M2_20:
INIT2x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble ctrmm_kernel_L2_M2_40
+ ble .Lctrmm_kernel_L2_M2_40
-ctrmm_kernel_L2_M2_22:
+.Lctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M2_22
+ bgt .Lctrmm_kernel_L2_M2_22
-ctrmm_kernel_L2_M2_40:
+.Lctrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M2_100
+ ble .Lctrmm_kernel_L2_M2_100
-ctrmm_kernel_L2_M2_42:
+.Lctrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M2_42
+ bgt .Lctrmm_kernel_L2_M2_42
-ctrmm_kernel_L2_M2_100:
+.Lctrmm_kernel_L2_M2_100:
SAVE2x2
add tempOffset, tempOffset, #2
#endif
-ctrmm_kernel_L2_M2_END:
+.Lctrmm_kernel_L2_M2_END:
-ctrmm_kernel_L2_M1_BEGIN:
+.Lctrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
-ctrmm_kernel_L2_M1_20:
+.Lctrmm_kernel_L2_M1_20:
INIT1x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble ctrmm_kernel_L2_M1_40
+ ble .Lctrmm_kernel_L2_M1_40
-ctrmm_kernel_L2_M1_22:
+.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M1_22
+ bgt .Lctrmm_kernel_L2_M1_22
-ctrmm_kernel_L2_M1_40:
+.Lctrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M1_100
+ ble .Lctrmm_kernel_L2_M1_100
-ctrmm_kernel_L2_M1_42:
+.Lctrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M1_42
+ bgt .Lctrmm_kernel_L2_M1_42
-ctrmm_kernel_L2_M1_100:
+.Lctrmm_kernel_L2_M1_100:
SAVE1x2
add tempOffset, tempOffset, #1
#endif
-ctrmm_kernel_L2_END:
+.Lctrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
/******************************************************************************/
-ctrmm_kernel_L1_BEGIN:
+.Lctrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble ctrmm_kernel_L999 // done
+ ble .Lctrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
mov pA, origPA // pA = A
-ctrmm_kernel_L1_M4_BEGIN:
+.Lctrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble ctrmm_kernel_L1_M2_BEGIN
+ ble .Lctrmm_kernel_L1_M2_BEGIN
-ctrmm_kernel_L1_M4_20:
+.Lctrmm_kernel_L1_M4_20:
INIT4x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L1_M4_40
+ ble .Lctrmm_kernel_L1_M4_40
.align 5
-ctrmm_kernel_L1_M4_22:
+.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M4_22
+ bgt .Lctrmm_kernel_L1_M4_22
-ctrmm_kernel_L1_M4_40:
+.Lctrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M4_100
+ ble .Lctrmm_kernel_L1_M4_100
-ctrmm_kernel_L1_M4_42:
+.Lctrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M4_42
+ bgt .Lctrmm_kernel_L1_M4_42
-ctrmm_kernel_L1_M4_100:
+.Lctrmm_kernel_L1_M4_100:
SAVE4x1
add tempOffset, tempOffset, #4
#endif
-ctrmm_kernel_L1_M4_END:
+.Lctrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt ctrmm_kernel_L1_M4_20
+ bgt .Lctrmm_kernel_L1_M4_20
-ctrmm_kernel_L1_M2_BEGIN:
+.Lctrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L1_M1_BEGIN
+ ble .Lctrmm_kernel_L1_M1_BEGIN
-ctrmm_kernel_L1_M2_20:
+.Lctrmm_kernel_L1_M2_20:
INIT2x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L1_M2_40
+ ble .Lctrmm_kernel_L1_M2_40
-ctrmm_kernel_L1_M2_22:
+.Lctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M2_22
+ bgt .Lctrmm_kernel_L1_M2_22
-ctrmm_kernel_L1_M2_40:
+.Lctrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M2_100
+ ble .Lctrmm_kernel_L1_M2_100
-ctrmm_kernel_L1_M2_42:
+.Lctrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M2_42
+ bgt .Lctrmm_kernel_L1_M2_42
-ctrmm_kernel_L1_M2_100:
+.Lctrmm_kernel_L1_M2_100:
SAVE2x1
add tempOffset, tempOffset, #2
#endif
-ctrmm_kernel_L1_M2_END:
+.Lctrmm_kernel_L1_M2_END:
-ctrmm_kernel_L1_M1_BEGIN:
+.Lctrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
-ctrmm_kernel_L1_M1_20:
+.Lctrmm_kernel_L1_M1_20:
INIT1x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L1_M1_40
+ ble .Lctrmm_kernel_L1_M1_40
-ctrmm_kernel_L1_M1_22:
+.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M1_22
+ bgt .Lctrmm_kernel_L1_M1_22
-ctrmm_kernel_L1_M1_40:
+.Lctrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M1_100
+ ble .Lctrmm_kernel_L1_M1_100
-ctrmm_kernel_L1_M1_42:
+.Lctrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M1_42
+ bgt .Lctrmm_kernel_L1_M1_42
-ctrmm_kernel_L1_M1_100:
+.Lctrmm_kernel_L1_M1_100:
SAVE1x1
-ctrmm_kernel_L1_END:
+.Lctrmm_kernel_L1_END:
-ctrmm_kernel_L999:
+.Lctrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble ctrmm_kernel_L2_BEGIN
+ ble .Lctrmm_kernel_L2_BEGIN
/******************************************************************************/
-ctrmm_kernel_L4_BEGIN:
+.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
#endif
mov pA, origPA // pA = start of A array
-ctrmm_kernel_L4_M8_BEGIN:
+.Lctrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble ctrmm_kernel_L4_M4_BEGIN
+ ble .Lctrmm_kernel_L4_M4_BEGIN
-ctrmm_kernel_L4_M8_20:
+.Lctrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #3
cmp counterL , #2
- blt ctrmm_kernel_L4_M8_32
+ blt .Lctrmm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
- ble ctrmm_kernel_L4_M8_22a
+ ble .Lctrmm_kernel_L4_M8_22a
.align 5
-ctrmm_kernel_L4_M8_22:
+.Lctrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M2
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M8_22
+ bgt .Lctrmm_kernel_L4_M8_22
.align 5
-ctrmm_kernel_L4_M8_22a:
+.Lctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b ctrmm_kernel_L4_M8_44
+ b .Lctrmm_kernel_L4_M8_44
.align 5
-ctrmm_kernel_L4_M8_32:
+.Lctrmm_kernel_L4_M8_32:
tst counterL, #1
- ble ctrmm_kernel_L4_M8_40
+ ble .Lctrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b ctrmm_kernel_L4_M8_44
+ b .Lctrmm_kernel_L4_M8_44
-ctrmm_kernel_L4_M8_40:
+.Lctrmm_kernel_L4_M8_40:
INIT8x4
-ctrmm_kernel_L4_M8_44:
+.Lctrmm_kernel_L4_M8_44:
ands counterL , tempK, #7
- ble ctrmm_kernel_L4_M8_100
+ ble .Lctrmm_kernel_L4_M8_100
.align 5
-ctrmm_kernel_L4_M8_46:
+.Lctrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
- bne ctrmm_kernel_L4_M8_46
+ bne .Lctrmm_kernel_L4_M8_46
-ctrmm_kernel_L4_M8_100:
+.Lctrmm_kernel_L4_M8_100:
SAVE8x4
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
-ctrmm_kernel_L4_M8_END:
+.Lctrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
- bne ctrmm_kernel_L4_M8_20
+ bne .Lctrmm_kernel_L4_M8_20
-ctrmm_kernel_L4_M4_BEGIN:
+.Lctrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
tst counterI, #4
- ble ctrmm_kernel_L4_M2_BEGIN
+ ble .Lctrmm_kernel_L4_M2_BEGIN
-ctrmm_kernel_L4_M4_20:
+.Lctrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt ctrmm_kernel_L4_M4_32
+ blt .Lctrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble ctrmm_kernel_L4_M4_22a
+ ble .Lctrmm_kernel_L4_M4_22a
.align 5
-ctrmm_kernel_L4_M4_22:
+.Lctrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M4_22
+ bgt .Lctrmm_kernel_L4_M4_22
-ctrmm_kernel_L4_M4_22a:
+.Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b ctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_32:
+ b .Lctrmm_kernel_L4_M4_44
+.Lctrmm_kernel_L4_M4_32:
tst counterL, #1
- ble ctrmm_kernel_L4_M4_40
+ ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b ctrmm_kernel_L4_M4_44
-ctrmm_kernel_L4_M4_40:
+ b .Lctrmm_kernel_L4_M4_44
+.Lctrmm_kernel_L4_M4_40:
INIT4x4
-ctrmm_kernel_L4_M4_44:
+.Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
- ble ctrmm_kernel_L4_M4_100
+ ble .Lctrmm_kernel_L4_M4_100
-ctrmm_kernel_L4_M4_46:
+.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
-ctrmm_kernel_L4_M4_100:
+.Lctrmm_kernel_L4_M4_100:
SAVE4x4
add tempOffset, tempOffset, #4
#endif
-ctrmm_kernel_L4_M4_END:
+.Lctrmm_kernel_L4_M4_END:
-ctrmm_kernel_L4_M2_BEGIN:
+.Lctrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L4_M1_BEGIN
+ ble .Lctrmm_kernel_L4_M1_BEGIN
-ctrmm_kernel_L4_M2_20:
+.Lctrmm_kernel_L4_M2_20:
INIT2x4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L4_M2_40
+ ble .Lctrmm_kernel_L4_M2_40
-ctrmm_kernel_L4_M2_22:
+.Lctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M2_22
+ bgt .Lctrmm_kernel_L4_M2_22
-ctrmm_kernel_L4_M2_40:
+.Lctrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L4_M2_100
+ ble .Lctrmm_kernel_L4_M2_100
-ctrmm_kernel_L4_M2_42:
+.Lctrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M2_42
+ bgt .Lctrmm_kernel_L4_M2_42
-ctrmm_kernel_L4_M2_100:
+.Lctrmm_kernel_L4_M2_100:
SAVE2x4
add tempOffset, tempOffset, #2
#endif
-ctrmm_kernel_L4_M2_END:
+.Lctrmm_kernel_L4_M2_END:
-ctrmm_kernel_L4_M1_BEGIN:
+.Lctrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L4_END
+ ble .Lctrmm_kernel_L4_END
-ctrmm_kernel_L4_M1_20:
+.Lctrmm_kernel_L4_M1_20:
INIT1x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L4_M1_40
+ ble .Lctrmm_kernel_L4_M1_40
-ctrmm_kernel_L4_M1_22:
+.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M1_22
+ bgt .Lctrmm_kernel_L4_M1_22
-ctrmm_kernel_L4_M1_40:
+.Lctrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L4_M1_100
+ ble .Lctrmm_kernel_L4_M1_100
-ctrmm_kernel_L4_M1_42:
+.Lctrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L4_M1_42
+ bgt .Lctrmm_kernel_L4_M1_42
-ctrmm_kernel_L4_M1_100:
+.Lctrmm_kernel_L4_M1_100:
SAVE1x4
add tempOffset, tempOffset, #1
#endif
-ctrmm_kernel_L4_END:
+.Lctrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
#endif
subs counterJ, counterJ , #1 // j--
- bgt ctrmm_kernel_L4_BEGIN
+ bgt .Lctrmm_kernel_L4_BEGIN
/******************************************************************************/
-ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble ctrmm_kernel_L999 // error, N was less than 4?
+ ble .Lctrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble ctrmm_kernel_L1_BEGIN
+ ble .Lctrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
#endif
mov pA, origPA // pA = A
-ctrmm_kernel_L2_M8_BEGIN:
+.Lctrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble ctrmm_kernel_L2_M4_BEGIN
+ ble .Lctrmm_kernel_L2_M4_BEGIN
-ctrmm_kernel_L2_M8_20:
+.Lctrmm_kernel_L2_M8_20:
INIT8x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble ctrmm_kernel_L2_M8_40
+ ble .Lctrmm_kernel_L2_M8_40
.align 5
-ctrmm_kernel_L2_M8_22:
+.Lctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M8_22
+ bgt .Lctrmm_kernel_L2_M8_22
-ctrmm_kernel_L2_M8_40:
+.Lctrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M8_100
+ ble .Lctrmm_kernel_L2_M8_100
-ctrmm_kernel_L2_M8_42:
+.Lctrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M8_42
+ bgt .Lctrmm_kernel_L2_M8_42
-ctrmm_kernel_L2_M8_100:
+.Lctrmm_kernel_L2_M8_100:
SAVE8x2
add tempOffset, tempOffset, #8
#endif
-ctrmm_kernel_L2_M8_END:
+.Lctrmm_kernel_L2_M8_END:
subs counterI, counterI, #1
- bgt ctrmm_kernel_L2_M8_20
+ bgt .Lctrmm_kernel_L2_M8_20
-ctrmm_kernel_L2_M4_BEGIN:
+.Lctrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
- ble ctrmm_kernel_L2_M2_BEGIN
+ ble .Lctrmm_kernel_L2_M2_BEGIN
-ctrmm_kernel_L2_M4_20:
+.Lctrmm_kernel_L2_M4_20:
INIT4x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble ctrmm_kernel_L2_M4_40
+ ble .Lctrmm_kernel_L2_M4_40
.align 5
-ctrmm_kernel_L2_M4_22:
+.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M4_22
+ bgt .Lctrmm_kernel_L2_M4_22
-ctrmm_kernel_L2_M4_40:
+.Lctrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M4_100
+ ble .Lctrmm_kernel_L2_M4_100
-ctrmm_kernel_L2_M4_42:
+.Lctrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M4_42
+ bgt .Lctrmm_kernel_L2_M4_42
-ctrmm_kernel_L2_M4_100:
+.Lctrmm_kernel_L2_M4_100:
SAVE4x2
add tempOffset, tempOffset, #4
#endif
-ctrmm_kernel_L2_M4_END:
+.Lctrmm_kernel_L2_M4_END:
-ctrmm_kernel_L2_M2_BEGIN:
+.Lctrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L2_M1_BEGIN
+ ble .Lctrmm_kernel_L2_M1_BEGIN
-ctrmm_kernel_L2_M2_20:
+.Lctrmm_kernel_L2_M2_20:
INIT2x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble ctrmm_kernel_L2_M2_40
+ ble .Lctrmm_kernel_L2_M2_40
-ctrmm_kernel_L2_M2_22:
+.Lctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M2_22
+ bgt .Lctrmm_kernel_L2_M2_22
-ctrmm_kernel_L2_M2_40:
+.Lctrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M2_100
+ ble .Lctrmm_kernel_L2_M2_100
-ctrmm_kernel_L2_M2_42:
+.Lctrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M2_42
+ bgt .Lctrmm_kernel_L2_M2_42
-ctrmm_kernel_L2_M2_100:
+.Lctrmm_kernel_L2_M2_100:
SAVE2x2
add tempOffset, tempOffset, #2
#endif
-ctrmm_kernel_L2_M2_END:
+.Lctrmm_kernel_L2_M2_END:
-ctrmm_kernel_L2_M1_BEGIN:
+.Lctrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L2_END
+ ble .Lctrmm_kernel_L2_END
-ctrmm_kernel_L2_M1_20:
+.Lctrmm_kernel_L2_M1_20:
INIT1x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble ctrmm_kernel_L2_M1_40
+ ble .Lctrmm_kernel_L2_M1_40
-ctrmm_kernel_L2_M1_22:
+.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M1_22
+ bgt .Lctrmm_kernel_L2_M1_22
-ctrmm_kernel_L2_M1_40:
+.Lctrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L2_M1_100
+ ble .Lctrmm_kernel_L2_M1_100
-ctrmm_kernel_L2_M1_42:
+.Lctrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L2_M1_42
+ bgt .Lctrmm_kernel_L2_M1_42
-ctrmm_kernel_L2_M1_100:
+.Lctrmm_kernel_L2_M1_100:
SAVE1x2
add tempOffset, tempOffset, #1
#endif
-ctrmm_kernel_L2_END:
+.Lctrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
/******************************************************************************/
-ctrmm_kernel_L1_BEGIN:
+.Lctrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble ctrmm_kernel_L999 // done
+ ble .Lctrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
#endif
mov pA, origPA // pA = A
-ctrmm_kernel_L1_M8_BEGIN:
+.Lctrmm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble ctrmm_kernel_L1_M4_BEGIN
+ ble .Lctrmm_kernel_L1_M4_BEGIN
-ctrmm_kernel_L1_M8_20:
+.Lctrmm_kernel_L1_M8_20:
INIT8x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L1_M8_40
+ ble .Lctrmm_kernel_L1_M8_40
.align 5
-ctrmm_kernel_L1_M8_22:
+.Lctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M8_22
+ bgt .Lctrmm_kernel_L1_M8_22
-ctrmm_kernel_L1_M8_40:
+.Lctrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M8_100
+ ble .Lctrmm_kernel_L1_M8_100
-ctrmm_kernel_L1_M8_42:
+.Lctrmm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M8_42
+ bgt .Lctrmm_kernel_L1_M8_42
-ctrmm_kernel_L1_M8_100:
+.Lctrmm_kernel_L1_M8_100:
SAVE8x1
add tempOffset, tempOffset, #8
#endif
-ctrmm_kernel_L1_M8_END:
+.Lctrmm_kernel_L1_M8_END:
subs counterI, counterI, #1
- bgt ctrmm_kernel_L1_M8_20
+ bgt .Lctrmm_kernel_L1_M8_20
-ctrmm_kernel_L1_M4_BEGIN:
+.Lctrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
- ble ctrmm_kernel_L1_M2_BEGIN
+ ble .Lctrmm_kernel_L1_M2_BEGIN
-ctrmm_kernel_L1_M4_20:
+.Lctrmm_kernel_L1_M4_20:
INIT4x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L1_M4_40
+ ble .Lctrmm_kernel_L1_M4_40
.align 5
-ctrmm_kernel_L1_M4_22:
+.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M4_22
+ bgt .Lctrmm_kernel_L1_M4_22
-ctrmm_kernel_L1_M4_40:
+.Lctrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M4_100
+ ble .Lctrmm_kernel_L1_M4_100
-ctrmm_kernel_L1_M4_42:
+.Lctrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M4_42
+ bgt .Lctrmm_kernel_L1_M4_42
-ctrmm_kernel_L1_M4_100:
+.Lctrmm_kernel_L1_M4_100:
SAVE4x1
add tempOffset, tempOffset, #4
#endif
-ctrmm_kernel_L1_M4_END:
+.Lctrmm_kernel_L1_M4_END:
-ctrmm_kernel_L1_M2_BEGIN:
+.Lctrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble ctrmm_kernel_L1_M1_BEGIN
+ ble .Lctrmm_kernel_L1_M1_BEGIN
-ctrmm_kernel_L1_M2_20:
+.Lctrmm_kernel_L1_M2_20:
INIT2x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L1_M2_40
+ ble .Lctrmm_kernel_L1_M2_40
-ctrmm_kernel_L1_M2_22:
+.Lctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M2_22
+ bgt .Lctrmm_kernel_L1_M2_22
-ctrmm_kernel_L1_M2_40:
+.Lctrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M2_100
+ ble .Lctrmm_kernel_L1_M2_100
-ctrmm_kernel_L1_M2_42:
+.Lctrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M2_42
+ bgt .Lctrmm_kernel_L1_M2_42
-ctrmm_kernel_L1_M2_100:
+.Lctrmm_kernel_L1_M2_100:
SAVE2x1
add tempOffset, tempOffset, #2
#endif
-ctrmm_kernel_L1_M2_END:
+.Lctrmm_kernel_L1_M2_END:
-ctrmm_kernel_L1_M1_BEGIN:
+.Lctrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble ctrmm_kernel_L1_END
+ ble .Lctrmm_kernel_L1_END
-ctrmm_kernel_L1_M1_20:
+.Lctrmm_kernel_L1_M1_20:
INIT1x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ctrmm_kernel_L1_M1_40
+ ble .Lctrmm_kernel_L1_M1_40
-ctrmm_kernel_L1_M1_22:
+.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M1_22
+ bgt .Lctrmm_kernel_L1_M1_22
-ctrmm_kernel_L1_M1_40:
+.Lctrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ctrmm_kernel_L1_M1_100
+ ble .Lctrmm_kernel_L1_M1_100
-ctrmm_kernel_L1_M1_42:
+.Lctrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt ctrmm_kernel_L1_M1_42
+ bgt .Lctrmm_kernel_L1_M1_42
-ctrmm_kernel_L1_M1_100:
+.Lctrmm_kernel_L1_M1_100:
SAVE1x1
-ctrmm_kernel_L1_END:
+.Lctrmm_kernel_L1_END:
-ctrmm_kernel_L999:
+.Lctrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
PROLOGUE
cmp N, xzr
- ble axpy_kernel_L999
+ ble .Ldaxpy_kernel_L999
fcmp DA, #0.0
- beq axpy_kernel_L999
+ beq .Ldaxpy_kernel_L999
cmp INC_X, #1
- bne axpy_kernel_S_BEGIN
+ bne .Ldaxpy_kernel_S_BEGIN
cmp INC_Y, #1
- bne axpy_kernel_S_BEGIN
+ bne .Ldaxpy_kernel_S_BEGIN
-axpy_kernel_F_BEGIN:
+.Ldaxpy_kernel_F_BEGIN:
asr I, N, #5
cmp I, xzr
- beq axpy_kernel_F1
+ beq .Ldaxpy_kernel_F1
.align 5
-axpy_kernel_F32:
+.Ldaxpy_kernel_F32:
KERNEL_F32
subs I, I, #1
- bne axpy_kernel_F32
+ bne .Ldaxpy_kernel_F32
-axpy_kernel_F1:
+.Ldaxpy_kernel_F1:
ands I, N, #31
- ble axpy_kernel_L999
+ ble .Ldaxpy_kernel_L999
-axpy_kernel_F10:
+.Ldaxpy_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne axpy_kernel_F10
+ bne .Ldaxpy_kernel_F10
- b axpy_kernel_L999
+ b .Ldaxpy_kernel_L999
-axpy_kernel_S_BEGIN:
+.Ldaxpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
- ble axpy_kernel_S1
+ ble .Ldaxpy_kernel_S1
-axpy_kernel_S4:
+.Ldaxpy_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne axpy_kernel_S4
+ bne .Ldaxpy_kernel_S4
-axpy_kernel_S1:
+.Ldaxpy_kernel_S1:
ands I, N, #3
- ble axpy_kernel_L999
+ ble .Ldaxpy_kernel_L999
-axpy_kernel_S10:
+.Ldaxpy_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne axpy_kernel_S10
+ bne .Ldaxpy_kernel_S10
-axpy_kernel_L999:
+.Ldaxpy_kernel_L999:
mov w0, wzr
ret
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble dgemm_kernel_L2_BEGIN
+ ble .Ldgemm_kernel_L2_BEGIN
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
//------------------------------------------------------------------------------
-dgemm_kernel_L4_M8_BEGIN:
+.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dgemm_kernel_L4_M4_BEGIN
+ ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
-dgemm_kernel_L4_M8_20:
+.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #2 // L = K / 4
cmp counterL , #2
- blt dgemm_kernel_L4_M8_32
+ blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
- ble dgemm_kernel_L4_M8_22a
+ ble .Ldgemm_kernel_L4_M8_22a
.align 5
-dgemm_kernel_L4_M8_22:
+.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M8_22
+ bgt .Ldgemm_kernel_L4_M8_22
.align 5
-dgemm_kernel_L4_M8_22a:
+.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b dgemm_kernel_L4_M8_44
+ b .Ldgemm_kernel_L4_M8_44
.align 5
-dgemm_kernel_L4_M8_32:
+.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
- ble dgemm_kernel_L4_M8_40
+ ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b dgemm_kernel_L4_M8_44
+ b .Ldgemm_kernel_L4_M8_44
-dgemm_kernel_L4_M8_40:
+.Ldgemm_kernel_L4_M8_40:
INIT8x4
-dgemm_kernel_L4_M8_44:
+.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #3
- ble dgemm_kernel_L4_M8_100
+ ble .Ldgemm_kernel_L4_M8_100
.align 5
-dgemm_kernel_L4_M8_46:
+.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
- bne dgemm_kernel_L4_M8_46
+ bne .Ldgemm_kernel_L4_M8_46
-dgemm_kernel_L4_M8_100:
+.Ldgemm_kernel_L4_M8_100:
lsl temp, origK, #5
prfm PLDL1KEEP, [pA, temp]
prfm PLDL1KEEP, [ppA, temp]
SAVE8x4
-dgemm_kernel_L4_M8_END:
+.Ldgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp
add ppA, ppA, temp
subs counterI, counterI, #1
- bne dgemm_kernel_L4_M8_20
+ bne .Ldgemm_kernel_L4_M8_20
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
tst counterI, #4
- ble dgemm_kernel_L4_M2_BEGIN
+ ble .Ldgemm_kernel_L4_M2_BEGIN
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
INIT4x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble dgemm_kernel_L4_M4_40
+ ble .Ldgemm_kernel_L4_M4_40
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_22
+ bgt .Ldgemm_kernel_L4_M4_22
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M4_100
+ ble .Ldgemm_kernel_L4_M4_100
-dgemm_kernel_L4_M4_42:
+.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_42
+ bgt .Ldgemm_kernel_L4_M4_42
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
SAVE4x4
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L4_M1_BEGIN
+ ble .Ldgemm_kernel_L4_M1_BEGIN
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M2_40
+ ble .Ldgemm_kernel_L4_M2_40
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_22
+ bgt .Ldgemm_kernel_L4_M2_22
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M2_100
+ ble .Ldgemm_kernel_L4_M2_100
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_42
+ bgt .Ldgemm_kernel_L4_M2_42
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
SAVE2x4
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M1_40
+ ble .Ldgemm_kernel_L4_M1_40
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_22
+ bgt .Ldgemm_kernel_L4_M1_22
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M1_100
+ ble .Ldgemm_kernel_L4_M1_100
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_42
+ bgt .Ldgemm_kernel_L4_M1_42
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
SAVE1x4
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
- bgt dgemm_kernel_L4_BEGIN
+ bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
-dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble dgemm_kernel_L999 // error, N was less than 4?
+ ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble dgemm_kernel_L1_BEGIN
+ ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble dgemm_kernel_L2_M2_BEGIN
+ ble .Ldgemm_kernel_L2_M2_BEGIN
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M4_40
+ ble .Ldgemm_kernel_L2_M4_40
.align 5
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_22
+ bgt .Ldgemm_kernel_L2_M4_22
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M4_100
+ ble .Ldgemm_kernel_L2_M4_100
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_42
+ bgt .Ldgemm_kernel_L2_M4_42
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
SAVE4x2
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt dgemm_kernel_L2_M4_20
+ bgt .Ldgemm_kernel_L2_M4_20
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L2_M1_BEGIN
+ ble .Ldgemm_kernel_L2_M1_BEGIN
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M2_40
+ ble .Ldgemm_kernel_L2_M2_40
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_22
+ bgt .Ldgemm_kernel_L2_M2_22
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M2_100
+ ble .Ldgemm_kernel_L2_M2_100
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_42
+ bgt .Ldgemm_kernel_L2_M2_42
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
SAVE2x2
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble dgemm_kernel_L2_M1_40
+ ble .Ldgemm_kernel_L2_M1_40
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_22
+ bgt .Ldgemm_kernel_L2_M1_22
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M1_100
+ ble .Ldgemm_kernel_L2_M1_100
-dgemm_kernel_L2_M1_42:
+.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_42
+ bgt .Ldgemm_kernel_L2_M1_42
-dgemm_kernel_L2_M1_100:
+.Ldgemm_kernel_L2_M1_100:
SAVE1x2
-dgemm_kernel_L2_END:
+.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
-dgemm_kernel_L1_BEGIN:
+.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble dgemm_kernel_L999 // done
+ ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
-dgemm_kernel_L1_M4_BEGIN:
+.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble dgemm_kernel_L1_M2_BEGIN
+ ble .Ldgemm_kernel_L1_M2_BEGIN
-dgemm_kernel_L1_M4_20:
+.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M4_40
+ ble .Ldgemm_kernel_L1_M4_40
.align 5
-dgemm_kernel_L1_M4_22:
+.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_22
+ bgt .Ldgemm_kernel_L1_M4_22
-dgemm_kernel_L1_M4_40:
+.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M4_100
+ ble .Ldgemm_kernel_L1_M4_100
-dgemm_kernel_L1_M4_42:
+.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_42
+ bgt .Ldgemm_kernel_L1_M4_42
-dgemm_kernel_L1_M4_100:
+.Ldgemm_kernel_L1_M4_100:
SAVE4x1
-dgemm_kernel_L1_M4_END:
+.Ldgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt dgemm_kernel_L1_M4_20
+ bgt .Ldgemm_kernel_L1_M4_20
-dgemm_kernel_L1_M2_BEGIN:
+.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L1_M1_BEGIN
+ ble .Ldgemm_kernel_L1_M1_BEGIN
-dgemm_kernel_L1_M2_20:
+.Ldgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M2_40
+ ble .Ldgemm_kernel_L1_M2_40
-dgemm_kernel_L1_M2_22:
+.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_22
+ bgt .Ldgemm_kernel_L1_M2_22
-dgemm_kernel_L1_M2_40:
+.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M2_100
+ ble .Ldgemm_kernel_L1_M2_100
-dgemm_kernel_L1_M2_42:
+.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_42
+ bgt .Ldgemm_kernel_L1_M2_42
-dgemm_kernel_L1_M2_100:
+.Ldgemm_kernel_L1_M2_100:
SAVE2x1
-dgemm_kernel_L1_M2_END:
+.Ldgemm_kernel_L1_M2_END:
-dgemm_kernel_L1_M1_BEGIN:
+.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
-dgemm_kernel_L1_M1_20:
+.Ldgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M1_40
+ ble .Ldgemm_kernel_L1_M1_40
-dgemm_kernel_L1_M1_22:
+.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_22
+ bgt .Ldgemm_kernel_L1_M1_22
-dgemm_kernel_L1_M1_40:
+.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M1_100
+ ble .Ldgemm_kernel_L1_M1_100
-dgemm_kernel_L1_M1_42:
+.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_42
+ bgt .Ldgemm_kernel_L1_M1_42
-dgemm_kernel_L1_M1_100:
+.Ldgemm_kernel_L1_M1_100:
SAVE1x1
-dgemm_kernel_L1_END:
+.Ldgemm_kernel_L1_END:
-dgemm_kernel_L999:
+.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
- ble dgemm_kernel_L4_BEGIN
+ ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
-dgemm_kernel_L8_BEGIN:
+.Ldgemm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3
mov pA, origPA // pA = start of A array
-dgemm_kernel_L8_M4_BEGIN:
+.Ldgemm_kernel_L8_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble dgemm_kernel_L8_M2_BEGIN
+ ble .Ldgemm_kernel_L8_M2_BEGIN
-dgemm_kernel_L8_M4_20:
+.Ldgemm_kernel_L8_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt dgemm_kernel_L8_M4_32
+ blt .Ldgemm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2
- ble dgemm_kernel_L8_M4_22a
+ ble .Ldgemm_kernel_L8_M4_22a
.align 5
-dgemm_kernel_L8_M4_22:
+.Ldgemm_kernel_L8_M4_22:
KERNEL4x8_M1
KERNEL4x8_M2
subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M4_22
+ bgt .Ldgemm_kernel_L8_M4_22
-dgemm_kernel_L8_M4_22a:
+.Ldgemm_kernel_L8_M4_22a:
KERNEL4x8_M1
KERNEL4x8_E
- b dgemm_kernel_L8_M4_44
+ b .Ldgemm_kernel_L8_M4_44
-dgemm_kernel_L8_M4_32:
+.Ldgemm_kernel_L8_M4_32:
tst counterL, #1
- ble dgemm_kernel_L8_M4_40
+ ble .Ldgemm_kernel_L8_M4_40
KERNEL4x8_I
KERNEL4x8_E
- b dgemm_kernel_L8_M4_44
+ b .Ldgemm_kernel_L8_M4_44
-dgemm_kernel_L8_M4_40:
+.Ldgemm_kernel_L8_M4_40:
INIT4x8
-dgemm_kernel_L8_M4_44:
+.Ldgemm_kernel_L8_M4_44:
ands counterL , origK, #1
- ble dgemm_kernel_L8_M4_100
+ ble .Ldgemm_kernel_L8_M4_100
-dgemm_kernel_L8_M4_46:
+.Ldgemm_kernel_L8_M4_46:
KERNEL4x8_SUB
-dgemm_kernel_L8_M4_100:
+.Ldgemm_kernel_L8_M4_100:
SAVE4x8
-dgemm_kernel_L8_M4_END:
+.Ldgemm_kernel_L8_M4_END:
subs counterI, counterI, #1
- bne dgemm_kernel_L8_M4_20
+ bne .Ldgemm_kernel_L8_M4_20
-dgemm_kernel_L8_M2_BEGIN:
+.Ldgemm_kernel_L8_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L8_END
+ ble .Ldgemm_kernel_L8_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L8_M1_BEGIN
+ ble .Ldgemm_kernel_L8_M1_BEGIN
-dgemm_kernel_L8_M2_20:
+.Ldgemm_kernel_L8_M2_20:
INIT2x8
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L8_M2_40
+ ble .Ldgemm_kernel_L8_M2_40
-dgemm_kernel_L8_M2_22:
+.Ldgemm_kernel_L8_M2_22:
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M2_22
+ bgt .Ldgemm_kernel_L8_M2_22
-dgemm_kernel_L8_M2_40:
+.Ldgemm_kernel_L8_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L8_M2_100
+ ble .Ldgemm_kernel_L8_M2_100
-dgemm_kernel_L8_M2_42:
+.Ldgemm_kernel_L8_M2_42:
KERNEL2x8_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M2_42
+ bgt .Ldgemm_kernel_L8_M2_42
-dgemm_kernel_L8_M2_100:
+.Ldgemm_kernel_L8_M2_100:
SAVE2x8
-dgemm_kernel_L8_M2_END:
+.Ldgemm_kernel_L8_M2_END:
-dgemm_kernel_L8_M1_BEGIN:
+.Ldgemm_kernel_L8_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L8_END
+ ble .Ldgemm_kernel_L8_END
-dgemm_kernel_L8_M1_20:
+.Ldgemm_kernel_L8_M1_20:
INIT1x8
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L8_M1_40
+ ble .Ldgemm_kernel_L8_M1_40
-dgemm_kernel_L8_M1_22:
+.Ldgemm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M1_22
+ bgt .Ldgemm_kernel_L8_M1_22
-dgemm_kernel_L8_M1_40:
+.Ldgemm_kernel_L8_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L8_M1_100
+ ble .Ldgemm_kernel_L8_M1_100
-dgemm_kernel_L8_M1_42:
+.Ldgemm_kernel_L8_M1_42:
KERNEL1x8_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L8_M1_42
+ bgt .Ldgemm_kernel_L8_M1_42
-dgemm_kernel_L8_M1_100:
+.Ldgemm_kernel_L8_M1_100:
SAVE1x8
-dgemm_kernel_L8_END:
+.Ldgemm_kernel_L8_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8
subs counterJ, counterJ , #1 // j--
- bgt dgemm_kernel_L8_BEGIN
+ bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #7
- ble dgemm_kernel_L999
+ ble .Ldgemm_kernel_L999
tst counterJ , #4
- ble dgemm_kernel_L2_BEGIN
+ ble .Ldgemm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pA, origPA // pA = start of A array
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble dgemm_kernel_L4_M2_BEGIN
+ ble .Ldgemm_kernel_L4_M2_BEGIN
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt dgemm_kernel_L4_M4_32
+ blt .Ldgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble dgemm_kernel_L4_M4_22a
+ ble .Ldgemm_kernel_L4_M4_22a
.align 5
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_22
+ bgt .Ldgemm_kernel_L4_M4_22
-dgemm_kernel_L4_M4_22a:
+.Ldgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b dgemm_kernel_L4_M4_44
+ b .Ldgemm_kernel_L4_M4_44
-dgemm_kernel_L4_M4_32:
+.Ldgemm_kernel_L4_M4_32:
tst counterL, #1
- ble dgemm_kernel_L4_M4_40
+ ble .Ldgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b dgemm_kernel_L4_M4_44
+ b .Ldgemm_kernel_L4_M4_44
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
INIT4x4
-dgemm_kernel_L4_M4_44:
+.Ldgemm_kernel_L4_M4_44:
ands counterL , origK, #1
- ble dgemm_kernel_L4_M4_100
+ ble .Ldgemm_kernel_L4_M4_100
-dgemm_kernel_L4_M4_46:
+.Ldgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
SAVE4x4
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
- bne dgemm_kernel_L4_M4_20
+ bne .Ldgemm_kernel_L4_M4_20
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L4_M1_BEGIN
+ ble .Ldgemm_kernel_L4_M1_BEGIN
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
INIT2x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M2_40
+ ble .Ldgemm_kernel_L4_M2_40
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_22
+ bgt .Ldgemm_kernel_L4_M2_22
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M2_100
+ ble .Ldgemm_kernel_L4_M2_100
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_42
+ bgt .Ldgemm_kernel_L4_M2_42
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
SAVE2x4
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
INIT1x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M1_40
+ ble .Ldgemm_kernel_L4_M1_40
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_22
+ bgt .Ldgemm_kernel_L4_M1_22
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M1_100
+ ble .Ldgemm_kernel_L4_M1_100
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_42
+ bgt .Ldgemm_kernel_L4_M1_42
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
SAVE1x4
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/
-dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble dgemm_kernel_L999 // error, N was less than 4?
+ ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble dgemm_kernel_L1_BEGIN
+ ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pA, origPA // pA = A
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble dgemm_kernel_L2_M2_BEGIN
+ ble .Ldgemm_kernel_L2_M2_BEGIN
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
INIT4x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M4_40
+ ble .Ldgemm_kernel_L2_M4_40
.align 5
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_22
+ bgt .Ldgemm_kernel_L2_M4_22
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M4_100
+ ble .Ldgemm_kernel_L2_M4_100
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_42
+ bgt .Ldgemm_kernel_L2_M4_42
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
SAVE4x2
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt dgemm_kernel_L2_M4_20
+ bgt .Ldgemm_kernel_L2_M4_20
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L2_M1_BEGIN
+ ble .Ldgemm_kernel_L2_M1_BEGIN
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
INIT2x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M2_40
+ ble .Ldgemm_kernel_L2_M2_40
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_22
+ bgt .Ldgemm_kernel_L2_M2_22
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M2_100
+ ble .Ldgemm_kernel_L2_M2_100
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_42
+ bgt .Ldgemm_kernel_L2_M2_42
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
SAVE2x2
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
INIT1x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble dgemm_kernel_L2_M1_40
+ ble .Ldgemm_kernel_L2_M1_40
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_22
+ bgt .Ldgemm_kernel_L2_M1_22
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M1_100
+ ble .Ldgemm_kernel_L2_M1_100
-dgemm_kernel_L2_M1_42:
+.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_42
+ bgt .Ldgemm_kernel_L2_M1_42
-dgemm_kernel_L2_M1_100:
+.Ldgemm_kernel_L2_M1_100:
SAVE1x2
-dgemm_kernel_L2_END:
+.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
-dgemm_kernel_L1_BEGIN:
+.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble dgemm_kernel_L999 // done
+ ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
mov pA, origPA // pA = A
-dgemm_kernel_L1_M4_BEGIN:
+.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble dgemm_kernel_L1_M2_BEGIN
+ ble .Ldgemm_kernel_L1_M2_BEGIN
-dgemm_kernel_L1_M4_20:
+.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M4_40
+ ble .Ldgemm_kernel_L1_M4_40
.align 5
-dgemm_kernel_L1_M4_22:
+.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_22
+ bgt .Ldgemm_kernel_L1_M4_22
-dgemm_kernel_L1_M4_40:
+.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M4_100
+ ble .Ldgemm_kernel_L1_M4_100
-dgemm_kernel_L1_M4_42:
+.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_42
+ bgt .Ldgemm_kernel_L1_M4_42
-dgemm_kernel_L1_M4_100:
+.Ldgemm_kernel_L1_M4_100:
SAVE4x1
-dgemm_kernel_L1_M4_END:
+.Ldgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt dgemm_kernel_L1_M4_20
+ bgt .Ldgemm_kernel_L1_M4_20
-dgemm_kernel_L1_M2_BEGIN:
+.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L1_M1_BEGIN
+ ble .Ldgemm_kernel_L1_M1_BEGIN
-dgemm_kernel_L1_M2_20:
+.Ldgemm_kernel_L1_M2_20:
INIT2x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M2_40
+ ble .Ldgemm_kernel_L1_M2_40
-dgemm_kernel_L1_M2_22:
+.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_22
+ bgt .Ldgemm_kernel_L1_M2_22
-dgemm_kernel_L1_M2_40:
+.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M2_100
+ ble .Ldgemm_kernel_L1_M2_100
-dgemm_kernel_L1_M2_42:
+.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_42
+ bgt .Ldgemm_kernel_L1_M2_42
-dgemm_kernel_L1_M2_100:
+.Ldgemm_kernel_L1_M2_100:
SAVE2x1
-dgemm_kernel_L1_M2_END:
+.Ldgemm_kernel_L1_M2_END:
-dgemm_kernel_L1_M1_BEGIN:
+.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
-dgemm_kernel_L1_M1_20:
+.Ldgemm_kernel_L1_M1_20:
INIT1x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M1_40
+ ble .Ldgemm_kernel_L1_M1_40
-dgemm_kernel_L1_M1_22:
+.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_22
+ bgt .Ldgemm_kernel_L1_M1_22
-dgemm_kernel_L1_M1_40:
+.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M1_100
+ ble .Ldgemm_kernel_L1_M1_100
-dgemm_kernel_L1_M1_42:
+.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_42
+ bgt .Ldgemm_kernel_L1_M1_42
-dgemm_kernel_L1_M1_100:
+.Ldgemm_kernel_L1_M1_100:
SAVE1x1
-dgemm_kernel_L1_END:
+.Ldgemm_kernel_L1_END:
-dgemm_kernel_L999:
+.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble dgemm_kernel_L2_BEGIN
+ ble .Ldgemm_kernel_L2_BEGIN
/******************************************************************************/
.align 5
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
mov pA, origPA // pA = start of A array
-dgemm_kernel_L4_M8_BEGIN:
+.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dgemm_kernel_L4_M4_BEGIN
+ ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
-dgemm_kernel_L4_M8_20:
+.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
- blt dgemm_kernel_L4_M8_32
+ blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
- ble dgemm_kernel_L4_M8_22a
+ ble .Ldgemm_kernel_L4_M8_22a
.align 5
-dgemm_kernel_L4_M8_22:
+.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M2
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M8_22
+ bgt .Ldgemm_kernel_L4_M8_22
.align 5
-dgemm_kernel_L4_M8_22a:
+.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b dgemm_kernel_L4_M8_44
+ b .Ldgemm_kernel_L4_M8_44
.align 5
-dgemm_kernel_L4_M8_32:
+.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
- ble dgemm_kernel_L4_M8_40
+ ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b dgemm_kernel_L4_M8_44
+ b .Ldgemm_kernel_L4_M8_44
-dgemm_kernel_L4_M8_40:
+.Ldgemm_kernel_L4_M8_40:
INIT8x4
-dgemm_kernel_L4_M8_44:
+.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #7
- ble dgemm_kernel_L4_M8_100
+ ble .Ldgemm_kernel_L4_M8_100
.align 5
-dgemm_kernel_L4_M8_46:
+.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
- bne dgemm_kernel_L4_M8_46
+ bne .Ldgemm_kernel_L4_M8_46
-dgemm_kernel_L4_M8_100:
+.Ldgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE8x4
-dgemm_kernel_L4_M8_END:
+.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
- bne dgemm_kernel_L4_M8_20
+ bne .Ldgemm_kernel_L4_M8_20
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
tst counterI, #4
- ble dgemm_kernel_L4_M2_BEGIN
+ ble .Ldgemm_kernel_L4_M2_BEGIN
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
INIT4x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M4_40
+ ble .Ldgemm_kernel_L4_M4_40
.align 5
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_22
+ bgt .Ldgemm_kernel_L4_M4_22
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M4_100
+ ble .Ldgemm_kernel_L4_M4_100
-dgemm_kernel_L4_M4_42:
+.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_42
+ bgt .Ldgemm_kernel_L4_M4_42
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
SAVE4x4
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L4_M1_BEGIN
+ ble .Ldgemm_kernel_L4_M1_BEGIN
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
INIT2x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M2_40
+ ble .Ldgemm_kernel_L4_M2_40
.align 5
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_22
+ bgt .Ldgemm_kernel_L4_M2_22
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M2_100
+ ble .Ldgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_42
+ bgt .Ldgemm_kernel_L4_M2_42
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
SAVE2x4
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
INIT1x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M1_40
+ ble .Ldgemm_kernel_L4_M1_40
.align 5
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_22
+ bgt .Ldgemm_kernel_L4_M1_22
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M1_100
+ ble .Ldgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_42
+ bgt .Ldgemm_kernel_L4_M1_42
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
SAVE1x4
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
- bgt dgemm_kernel_L4_BEGIN
+ bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
-dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble dgemm_kernel_L999 // error, N was less than 4?
+ ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble dgemm_kernel_L1_BEGIN
+ ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pCRow1, pCRow0, LDC
mov pA, origPA // pA = A
-dgemm_kernel_L2_M8_BEGIN:
+.Ldgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dgemm_kernel_L2_M4_BEGIN
+ ble .Ldgemm_kernel_L2_M4_BEGIN
.align 5
-dgemm_kernel_L2_M8_20:
+.Ldgemm_kernel_L2_M8_20:
INIT8x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M8_40
+ ble .Ldgemm_kernel_L2_M8_40
.align 5
-dgemm_kernel_L2_M8_22:
+.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M8_22
+ bgt .Ldgemm_kernel_L2_M8_22
-dgemm_kernel_L2_M8_40:
+.Ldgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M8_100
+ ble .Ldgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-dgemm_kernel_L2_M8_42:
+.Ldgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M8_42
+ bgt .Ldgemm_kernel_L2_M8_42
-dgemm_kernel_L2_M8_100:
+.Ldgemm_kernel_L2_M8_100:
SAVE8x2
-dgemm_kernel_L2_M8_END:
+.Ldgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
- bgt dgemm_kernel_L2_M8_20
+ bgt .Ldgemm_kernel_L2_M8_20
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
- ble dgemm_kernel_L2_M2_BEGIN
+ ble .Ldgemm_kernel_L2_M2_BEGIN
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
INIT4x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M4_40
+ ble .Ldgemm_kernel_L2_M4_40
.align 5
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_22
+ bgt .Ldgemm_kernel_L2_M4_22
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M4_100
+ ble .Ldgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_42
+ bgt .Ldgemm_kernel_L2_M4_42
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
SAVE4x2
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L2_M1_BEGIN
+ ble .Ldgemm_kernel_L2_M1_BEGIN
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
INIT2x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M2_40
+ ble .Ldgemm_kernel_L2_M2_40
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_22
+ bgt .Ldgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M2_100
+ ble .Ldgemm_kernel_L2_M2_100
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_42
+ bgt .Ldgemm_kernel_L2_M2_42
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
SAVE2x2
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
INIT1x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble dgemm_kernel_L2_M1_40
+ ble .Ldgemm_kernel_L2_M1_40
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_22
+ bgt .Ldgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M1_100
+ ble .Ldgemm_kernel_L2_M1_100
-dgemm_kernel_L2_M1_42:
+.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_42
+ bgt .Ldgemm_kernel_L2_M1_42
-dgemm_kernel_L2_M1_100:
+.Ldgemm_kernel_L2_M1_100:
SAVE1x2
-dgemm_kernel_L2_END:
+.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
-dgemm_kernel_L1_BEGIN:
+.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble dgemm_kernel_L999 // done
+ ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
mov pA, origPA // pA = A
-dgemm_kernel_L1_M8_BEGIN:
+.Ldgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dgemm_kernel_L1_M4_BEGIN
+ ble .Ldgemm_kernel_L1_M4_BEGIN
.align 5
-dgemm_kernel_L1_M8_20:
+.Ldgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M8_40
+ ble .Ldgemm_kernel_L1_M8_40
.align 5
-dgemm_kernel_L1_M8_22:
+.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M8_22
+ bgt .Ldgemm_kernel_L1_M8_22
-dgemm_kernel_L1_M8_40:
+.Ldgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M8_100
+ ble .Ldgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
-dgemm_kernel_L1_M8_42:
+.Ldgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M8_42
+ bgt .Ldgemm_kernel_L1_M8_42
-dgemm_kernel_L1_M8_100:
+.Ldgemm_kernel_L1_M8_100:
SAVE8x1
-dgemm_kernel_L1_M8_END:
+.Ldgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
- bgt dgemm_kernel_L1_M8_20
+ bgt .Ldgemm_kernel_L1_M8_20
-dgemm_kernel_L1_M4_BEGIN:
+.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
- ble dgemm_kernel_L1_M2_BEGIN
+ ble .Ldgemm_kernel_L1_M2_BEGIN
-dgemm_kernel_L1_M4_20:
+.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M4_40
+ ble .Ldgemm_kernel_L1_M4_40
.align 5
-dgemm_kernel_L1_M4_22:
+.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_22
+ bgt .Ldgemm_kernel_L1_M4_22
-dgemm_kernel_L1_M4_40:
+.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M4_100
+ ble .Ldgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
-dgemm_kernel_L1_M4_42:
+.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_42
+ bgt .Ldgemm_kernel_L1_M4_42
-dgemm_kernel_L1_M4_100:
+.Ldgemm_kernel_L1_M4_100:
SAVE4x1
-dgemm_kernel_L1_M4_END:
+.Ldgemm_kernel_L1_M4_END:
-dgemm_kernel_L1_M2_BEGIN:
+.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L1_M1_BEGIN
+ ble .Ldgemm_kernel_L1_M1_BEGIN
-dgemm_kernel_L1_M2_20:
+.Ldgemm_kernel_L1_M2_20:
INIT2x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M2_40
+ ble .Ldgemm_kernel_L1_M2_40
-dgemm_kernel_L1_M2_22:
+.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_22
+ bgt .Ldgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
-dgemm_kernel_L1_M2_40:
+.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M2_100
+ ble .Ldgemm_kernel_L1_M2_100
-dgemm_kernel_L1_M2_42:
+.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_42
+ bgt .Ldgemm_kernel_L1_M2_42
-dgemm_kernel_L1_M2_100:
+.Ldgemm_kernel_L1_M2_100:
SAVE2x1
-dgemm_kernel_L1_M2_END:
+.Ldgemm_kernel_L1_M2_END:
-dgemm_kernel_L1_M1_BEGIN:
+.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
-dgemm_kernel_L1_M1_20:
+.Ldgemm_kernel_L1_M1_20:
INIT1x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M1_40
+ ble .Ldgemm_kernel_L1_M1_40
-dgemm_kernel_L1_M1_22:
+.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_22
+ bgt .Ldgemm_kernel_L1_M1_22
-dgemm_kernel_L1_M1_40:
+.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M1_100
+ ble .Ldgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
-dgemm_kernel_L1_M1_42:
+.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_42
+ bgt .Ldgemm_kernel_L1_M1_42
-dgemm_kernel_L1_M1_100:
+.Ldgemm_kernel_L1_M1_100:
SAVE1x1
-dgemm_kernel_L1_END:
+.Ldgemm_kernel_L1_END:
-dgemm_kernel_L999:
+.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble dgemm_kernel_L2_BEGIN
+ ble .Ldgemm_kernel_L2_BEGIN
/******************************************************************************/
.align 5
-dgemm_kernel_L4_BEGIN:
+.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
mov pA, origPA // pA = start of A array
-dgemm_kernel_L4_M8_BEGIN:
+.Ldgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dgemm_kernel_L4_M4_BEGIN
+ ble .Ldgemm_kernel_L4_M4_BEGIN
.align 5
-dgemm_kernel_L4_M8_20:
+.Ldgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #7 // L = K / 128
cmp counterL , #2 // is there at least 4 to do?
- blt dgemm_kernel_L4_M8_32
+ blt .Ldgemm_kernel_L4_M8_32
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1_M2_x1
subs counterL, counterL, #2 // subtract 2
- ble dgemm_kernel_L4_M8_22a
+ ble .Ldgemm_kernel_L4_M8_22a
.align 5
-dgemm_kernel_L4_M8_22:
+.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1_M2_x64
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M8_22
+ bgt .Ldgemm_kernel_L4_M8_22
.align 5
-dgemm_kernel_L4_M8_22a:
+.Ldgemm_kernel_L4_M8_22a:
KERNEL8x4_M1_M2_x32
KERNEL8x4_M1_M2_x16
KERNEL8x4_M1
KERNEL8x4_E
- b dgemm_kernel_L4_M8_44
+ b .Ldgemm_kernel_L4_M8_44
.align 5
-dgemm_kernel_L4_M8_32:
+.Ldgemm_kernel_L4_M8_32:
tst counterL, #1
- ble dgemm_kernel_L4_M8_40
+ ble .Ldgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b dgemm_kernel_L4_M8_44
+ b .Ldgemm_kernel_L4_M8_44
-dgemm_kernel_L4_M8_40:
+.Ldgemm_kernel_L4_M8_40:
INIT8x4
-dgemm_kernel_L4_M8_44:
+.Ldgemm_kernel_L4_M8_44:
ands counterL , origK, #127
- ble dgemm_kernel_L4_M8_100
+ ble .Ldgemm_kernel_L4_M8_100
.align 5
-dgemm_kernel_L4_M8_46:
+.Ldgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
- bne dgemm_kernel_L4_M8_46
+ bne .Ldgemm_kernel_L4_M8_46
-dgemm_kernel_L4_M8_100:
+.Ldgemm_kernel_L4_M8_100:
prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
SAVE8x4
-dgemm_kernel_L4_M8_END:
+.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
- bne dgemm_kernel_L4_M8_20
+ bne .Ldgemm_kernel_L4_M8_20
-dgemm_kernel_L4_M4_BEGIN:
+.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
tst counterI, #4
- ble dgemm_kernel_L4_M2_BEGIN
+ ble .Ldgemm_kernel_L4_M2_BEGIN
-dgemm_kernel_L4_M4_20:
+.Ldgemm_kernel_L4_M4_20:
INIT4x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M4_40
+ ble .Ldgemm_kernel_L4_M4_40
.align 5
-dgemm_kernel_L4_M4_22:
+.Ldgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_22
+ bgt .Ldgemm_kernel_L4_M4_22
-dgemm_kernel_L4_M4_40:
+.Ldgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M4_100
+ ble .Ldgemm_kernel_L4_M4_100
-dgemm_kernel_L4_M4_42:
+.Ldgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M4_42
+ bgt .Ldgemm_kernel_L4_M4_42
-dgemm_kernel_L4_M4_100:
+.Ldgemm_kernel_L4_M4_100:
SAVE4x4
-dgemm_kernel_L4_M4_END:
+.Ldgemm_kernel_L4_M4_END:
-dgemm_kernel_L4_M2_BEGIN:
+.Ldgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L4_M1_BEGIN
+ ble .Ldgemm_kernel_L4_M1_BEGIN
-dgemm_kernel_L4_M2_20:
+.Ldgemm_kernel_L4_M2_20:
INIT2x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M2_40
+ ble .Ldgemm_kernel_L4_M2_40
.align 5
-dgemm_kernel_L4_M2_22:
+.Ldgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_22
+ bgt .Ldgemm_kernel_L4_M2_22
-dgemm_kernel_L4_M2_40:
+.Ldgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M2_100
+ ble .Ldgemm_kernel_L4_M2_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
-dgemm_kernel_L4_M2_42:
+.Ldgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M2_42
+ bgt .Ldgemm_kernel_L4_M2_42
-dgemm_kernel_L4_M2_100:
+.Ldgemm_kernel_L4_M2_100:
SAVE2x4
-dgemm_kernel_L4_M2_END:
+.Ldgemm_kernel_L4_M2_END:
-dgemm_kernel_L4_M1_BEGIN:
+.Ldgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L4_END
+ ble .Ldgemm_kernel_L4_END
-dgemm_kernel_L4_M1_20:
+.Ldgemm_kernel_L4_M1_20:
INIT1x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L4_M1_40
+ ble .Ldgemm_kernel_L4_M1_40
.align 5
-dgemm_kernel_L4_M1_22:
+.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_22
+ bgt .Ldgemm_kernel_L4_M1_22
-dgemm_kernel_L4_M1_40:
+.Ldgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L4_M1_100
+ ble .Ldgemm_kernel_L4_M1_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
-dgemm_kernel_L4_M1_42:
+.Ldgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L4_M1_42
+ bgt .Ldgemm_kernel_L4_M1_42
-dgemm_kernel_L4_M1_100:
+.Ldgemm_kernel_L4_M1_100:
SAVE1x4
-dgemm_kernel_L4_END:
+.Ldgemm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
subs counterJ, counterJ , #1 // j--
- bgt dgemm_kernel_L4_BEGIN
+ bgt .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
-dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble dgemm_kernel_L999 // error, N was less than 4?
+ ble .Ldgemm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble dgemm_kernel_L1_BEGIN
+ ble .Ldgemm_kernel_L1_BEGIN
mov pCRow0, pC
add pCRow1, pCRow0, LDC
mov pA, origPA // pA = A
-dgemm_kernel_L2_M8_BEGIN:
+.Ldgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dgemm_kernel_L2_M4_BEGIN
+ ble .Ldgemm_kernel_L2_M4_BEGIN
.align 5
-dgemm_kernel_L2_M8_20:
+.Ldgemm_kernel_L2_M8_20:
INIT8x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M8_40
+ ble .Ldgemm_kernel_L2_M8_40
.align 5
-dgemm_kernel_L2_M8_22:
+.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M8_22
+ bgt .Ldgemm_kernel_L2_M8_22
-dgemm_kernel_L2_M8_40:
+.Ldgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M8_100
+ ble .Ldgemm_kernel_L2_M8_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
-dgemm_kernel_L2_M8_42:
+.Ldgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M8_42
+ bgt .Ldgemm_kernel_L2_M8_42
-dgemm_kernel_L2_M8_100:
+.Ldgemm_kernel_L2_M8_100:
SAVE8x2
-dgemm_kernel_L2_M8_END:
+.Ldgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
- bgt dgemm_kernel_L2_M8_20
+ bgt .Ldgemm_kernel_L2_M8_20
-dgemm_kernel_L2_M4_BEGIN:
+.Ldgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
- ble dgemm_kernel_L2_M2_BEGIN
+ ble .Ldgemm_kernel_L2_M2_BEGIN
-dgemm_kernel_L2_M4_20:
+.Ldgemm_kernel_L2_M4_20:
INIT4x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M4_40
+ ble .Ldgemm_kernel_L2_M4_40
.align 5
-dgemm_kernel_L2_M4_22:
+.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_22
+ bgt .Ldgemm_kernel_L2_M4_22
-dgemm_kernel_L2_M4_40:
+.Ldgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M4_100
+ ble .Ldgemm_kernel_L2_M4_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
-dgemm_kernel_L2_M4_42:
+.Ldgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M4_42
+ bgt .Ldgemm_kernel_L2_M4_42
-dgemm_kernel_L2_M4_100:
+.Ldgemm_kernel_L2_M4_100:
SAVE4x2
-dgemm_kernel_L2_M4_END:
+.Ldgemm_kernel_L2_M4_END:
-dgemm_kernel_L2_M2_BEGIN:
+.Ldgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L2_M1_BEGIN
+ ble .Ldgemm_kernel_L2_M1_BEGIN
-dgemm_kernel_L2_M2_20:
+.Ldgemm_kernel_L2_M2_20:
INIT2x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dgemm_kernel_L2_M2_40
+ ble .Ldgemm_kernel_L2_M2_40
-dgemm_kernel_L2_M2_22:
+.Ldgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_22
+ bgt .Ldgemm_kernel_L2_M2_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
-dgemm_kernel_L2_M2_40:
+.Ldgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M2_100
+ ble .Ldgemm_kernel_L2_M2_100
-dgemm_kernel_L2_M2_42:
+.Ldgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M2_42
+ bgt .Ldgemm_kernel_L2_M2_42
-dgemm_kernel_L2_M2_100:
+.Ldgemm_kernel_L2_M2_100:
SAVE2x2
-dgemm_kernel_L2_M2_END:
+.Ldgemm_kernel_L2_M2_END:
-dgemm_kernel_L2_M1_BEGIN:
+.Ldgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L2_END
+ ble .Ldgemm_kernel_L2_END
-dgemm_kernel_L2_M1_20:
+.Ldgemm_kernel_L2_M1_20:
INIT1x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble dgemm_kernel_L2_M1_40
+ ble .Ldgemm_kernel_L2_M1_40
-dgemm_kernel_L2_M1_22:
+.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_22
+ bgt .Ldgemm_kernel_L2_M1_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
-dgemm_kernel_L2_M1_40:
+.Ldgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L2_M1_100
+ ble .Ldgemm_kernel_L2_M1_100
-dgemm_kernel_L2_M1_42:
+.Ldgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L2_M1_42
+ bgt .Ldgemm_kernel_L2_M1_42
-dgemm_kernel_L2_M1_100:
+.Ldgemm_kernel_L2_M1_100:
SAVE1x2
-dgemm_kernel_L2_END:
+.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
-dgemm_kernel_L1_BEGIN:
+.Ldgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble dgemm_kernel_L999 // done
+ ble .Ldgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
mov pA, origPA // pA = A
-dgemm_kernel_L1_M8_BEGIN:
+.Ldgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dgemm_kernel_L1_M4_BEGIN
+ ble .Ldgemm_kernel_L1_M4_BEGIN
.align 5
-dgemm_kernel_L1_M8_20:
+.Ldgemm_kernel_L1_M8_20:
INIT8x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M8_40
+ ble .Ldgemm_kernel_L1_M8_40
.align 5
-dgemm_kernel_L1_M8_22:
+.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M8_22
+ bgt .Ldgemm_kernel_L1_M8_22
-dgemm_kernel_L1_M8_40:
+.Ldgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M8_100
+ ble .Ldgemm_kernel_L1_M8_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
-dgemm_kernel_L1_M8_42:
+.Ldgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M8_42
+ bgt .Ldgemm_kernel_L1_M8_42
-dgemm_kernel_L1_M8_100:
+.Ldgemm_kernel_L1_M8_100:
SAVE8x1
-dgemm_kernel_L1_M8_END:
+.Ldgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
- bgt dgemm_kernel_L1_M8_20
+ bgt .Ldgemm_kernel_L1_M8_20
-dgemm_kernel_L1_M4_BEGIN:
+.Ldgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
- ble dgemm_kernel_L1_M2_BEGIN
+ ble .Ldgemm_kernel_L1_M2_BEGIN
-dgemm_kernel_L1_M4_20:
+.Ldgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M4_40
+ ble .Ldgemm_kernel_L1_M4_40
.align 5
-dgemm_kernel_L1_M4_22:
+.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_22
+ bgt .Ldgemm_kernel_L1_M4_22
-dgemm_kernel_L1_M4_40:
+.Ldgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M4_100
+ ble .Ldgemm_kernel_L1_M4_100
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
-dgemm_kernel_L1_M4_42:
+.Ldgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M4_42
+ bgt .Ldgemm_kernel_L1_M4_42
-dgemm_kernel_L1_M4_100:
+.Ldgemm_kernel_L1_M4_100:
SAVE4x1
-dgemm_kernel_L1_M4_END:
+.Ldgemm_kernel_L1_M4_END:
-dgemm_kernel_L1_M2_BEGIN:
+.Ldgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble dgemm_kernel_L1_M1_BEGIN
+ ble .Ldgemm_kernel_L1_M1_BEGIN
-dgemm_kernel_L1_M2_20:
+.Ldgemm_kernel_L1_M2_20:
INIT2x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M2_40
+ ble .Ldgemm_kernel_L1_M2_40
-dgemm_kernel_L1_M2_22:
+.Ldgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_22
+ bgt .Ldgemm_kernel_L1_M2_22
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
-dgemm_kernel_L1_M2_40:
+.Ldgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M2_100
+ ble .Ldgemm_kernel_L1_M2_100
-dgemm_kernel_L1_M2_42:
+.Ldgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M2_42
+ bgt .Ldgemm_kernel_L1_M2_42
-dgemm_kernel_L1_M2_100:
+.Ldgemm_kernel_L1_M2_100:
SAVE2x1
-dgemm_kernel_L1_M2_END:
+.Ldgemm_kernel_L1_M2_END:
-dgemm_kernel_L1_M1_BEGIN:
+.Ldgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dgemm_kernel_L1_END
+ ble .Ldgemm_kernel_L1_END
-dgemm_kernel_L1_M1_20:
+.Ldgemm_kernel_L1_M1_20:
INIT1x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dgemm_kernel_L1_M1_40
+ ble .Ldgemm_kernel_L1_M1_40
-dgemm_kernel_L1_M1_22:
+.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_22
+ bgt .Ldgemm_kernel_L1_M1_22
-dgemm_kernel_L1_M1_40:
+.Ldgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble dgemm_kernel_L1_M1_100
+ ble .Ldgemm_kernel_L1_M1_100
prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE]
-dgemm_kernel_L1_M1_42:
+.Ldgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dgemm_kernel_L1_M1_42
+ bgt .Ldgemm_kernel_L1_M1_42
-dgemm_kernel_L1_M1_100:
+.Ldgemm_kernel_L1_M1_100:
SAVE1x1
-dgemm_kernel_L1_END:
+.Ldgemm_kernel_L1_END:
-dgemm_kernel_L999:
+.Ldgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
lsl LDA, LDA, #3 // LDA = LDA * SIZE
-dgemm_ncopy_L4_BEGIN:
+.Ldgemm_ncopy_L4_BEGIN:
asr J, N, #2 // J = N / 4
cmp J, #0
- ble dgemm_ncopy_L2_BEGIN
+ ble .Ldgemm_ncopy_L2_BEGIN
.align 5
-dgemm_ncopy_L4_M4_BEGIN:
+.Ldgemm_ncopy_L4_M4_BEGIN:
mov A01, A00
add A02, A01, LDA
asr I, M, #2 // I = M / 4
cmp I, #0
- ble dgemm_ncopy_L4_M4_40
+ ble .Ldgemm_ncopy_L4_M4_40
.align 5
-dgemm_ncopy_L4_M4_20:
+.Ldgemm_ncopy_L4_M4_20:
COPY4x4
subs I , I , #1
- bne dgemm_ncopy_L4_M4_20
+ bne .Ldgemm_ncopy_L4_M4_20
-dgemm_ncopy_L4_M4_40:
+.Ldgemm_ncopy_L4_M4_40:
and I, M , #3
cmp I, #0
- ble dgemm_ncopy_L4_M4_END
+ ble .Ldgemm_ncopy_L4_M4_END
.align 5
-dgemm_ncopy_L4_M4_60:
+.Ldgemm_ncopy_L4_M4_60:
COPY1x4
subs I , I , #1
- bne dgemm_ncopy_L4_M4_60
+ bne .Ldgemm_ncopy_L4_M4_60
-dgemm_ncopy_L4_M4_END:
+.Ldgemm_ncopy_L4_M4_END:
subs J , J, #1 // j--
- bne dgemm_ncopy_L4_M4_BEGIN
+ bne .Ldgemm_ncopy_L4_M4_BEGIN
/*********************************************************************************************/
-dgemm_ncopy_L2_BEGIN:
+.Ldgemm_ncopy_L2_BEGIN:
tst N, #3
- ble dgemm_ncopy_L999
+ ble .Ldgemm_ncopy_L999
tst N, #2
- ble dgemm_ncopy_L1_BEGIN
+ ble .Ldgemm_ncopy_L1_BEGIN
-dgemm_ncopy_L2_M4_BEGIN:
+.Ldgemm_ncopy_L2_M4_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA
asr I, M, #2 // I = M / 4
cmp I, #0
- ble dgemm_ncopy_L2_M4_40
+ ble .Ldgemm_ncopy_L2_M4_40
.align 5
-dgemm_ncopy_L2_M4_20:
+.Ldgemm_ncopy_L2_M4_20:
COPY4x2
subs I , I , #1
- bne dgemm_ncopy_L2_M4_20
+ bne .Ldgemm_ncopy_L2_M4_20
-dgemm_ncopy_L2_M4_40:
+.Ldgemm_ncopy_L2_M4_40:
and I, M , #3
cmp I, #0
- ble dgemm_ncopy_L2_M4_END
+ ble .Ldgemm_ncopy_L2_M4_END
.align 5
-dgemm_ncopy_L2_M4_60:
+.Ldgemm_ncopy_L2_M4_60:
COPY1x2
subs I , I , #1
- bne dgemm_ncopy_L2_M4_60
+ bne .Ldgemm_ncopy_L2_M4_60
-dgemm_ncopy_L2_M4_END:
+.Ldgemm_ncopy_L2_M4_END:
/*********************************************************************************************/
-dgemm_ncopy_L1_BEGIN:
+.Ldgemm_ncopy_L1_BEGIN:
tst N, #1
- ble dgemm_ncopy_L999
+ ble .Ldgemm_ncopy_L999
-dgemm_ncopy_L1_M4_BEGIN:
+.Ldgemm_ncopy_L1_M4_BEGIN:
mov A01, A00
asr I, M, #2 // I = M / 4
cmp I, #0
- ble dgemm_ncopy_L1_M4_40
+ ble .Ldgemm_ncopy_L1_M4_40
.align 5
-dgemm_ncopy_L1_M4_20:
+.Ldgemm_ncopy_L1_M4_20:
COPY4x1
subs I , I , #1
- bne dgemm_ncopy_L1_M4_20
+ bne .Ldgemm_ncopy_L1_M4_20
-dgemm_ncopy_L1_M4_40:
+.Ldgemm_ncopy_L1_M4_40:
and I, M , #3
cmp I, #0
- ble dgemm_ncopy_L1_M4_END
+ ble .Ldgemm_ncopy_L1_M4_END
.align 5
-dgemm_ncopy_L1_M4_60:
+.Ldgemm_ncopy_L1_M4_60:
COPY1x1
subs I , I , #1
- bne dgemm_ncopy_L1_M4_60
+ bne .Ldgemm_ncopy_L1_M4_60
-dgemm_ncopy_L1_M4_END:
+.Ldgemm_ncopy_L1_M4_END:
-dgemm_ncopy_L999:
+.Ldgemm_ncopy_L999:
mov x0, #0
RESTORE_REGS
lsl LDA, LDA, #3 // LDA = LDA * SIZE
-dgemm_ncopy_L8_BEGIN:
+.Ldgemm_ncopy_L8_BEGIN:
asr J, N, #3 // J = N / 8
cmp J, #0
- ble dgemm_ncopy_L4_BEGIN
+ ble .Ldgemm_ncopy_L4_BEGIN
-dgemm_ncopy_L8_M8_BEGIN:
+.Ldgemm_ncopy_L8_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
asr I, M, #3 // I = M / 8
cmp I, #0
- ble dgemm_ncopy_L8_M8_40
+ ble .Ldgemm_ncopy_L8_M8_40
-dgemm_ncopy_L8_M8_20:
+.Ldgemm_ncopy_L8_M8_20:
COPY8x8
subs I , I , #1
- bne dgemm_ncopy_L8_M8_20
+ bne .Ldgemm_ncopy_L8_M8_20
-dgemm_ncopy_L8_M8_40:
+.Ldgemm_ncopy_L8_M8_40:
and I, M , #7
cmp I, #0
- ble dgemm_ncopy_L8_M8_END
+ ble .Ldgemm_ncopy_L8_M8_END
-dgemm_ncopy_L8_M8_60:
+.Ldgemm_ncopy_L8_M8_60:
COPY1x8
subs I , I , #1
- bne dgemm_ncopy_L8_M8_60
+ bne .Ldgemm_ncopy_L8_M8_60
-dgemm_ncopy_L8_M8_END:
+.Ldgemm_ncopy_L8_M8_END:
subs J , J, #1 // j--
- bne dgemm_ncopy_L8_M8_BEGIN
+ bne .Ldgemm_ncopy_L8_M8_BEGIN
/*********************************************************************************************/
-dgemm_ncopy_L4_BEGIN:
+.Ldgemm_ncopy_L4_BEGIN:
tst N, #7
- ble dgemm_ncopy_L999
+ ble .Ldgemm_ncopy_L999
tst N, #4
- ble dgemm_ncopy_L2_BEGIN
+ ble .Ldgemm_ncopy_L2_BEGIN
-dgemm_ncopy_L4_M8_BEGIN:
+.Ldgemm_ncopy_L4_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
asr I, M, #3 // I = M / 8
cmp I, #0
- ble dgemm_ncopy_L4_M8_40
+ ble .Ldgemm_ncopy_L4_M8_40
-dgemm_ncopy_L4_M8_20:
+.Ldgemm_ncopy_L4_M8_20:
COPY8x4
subs I , I , #1
- bne dgemm_ncopy_L4_M8_20
+ bne .Ldgemm_ncopy_L4_M8_20
-dgemm_ncopy_L4_M8_40:
+.Ldgemm_ncopy_L4_M8_40:
and I, M , #7
cmp I, #0
- ble dgemm_ncopy_L4_M8_END
+ ble .Ldgemm_ncopy_L4_M8_END
-dgemm_ncopy_L4_M8_60:
+.Ldgemm_ncopy_L4_M8_60:
COPY1x4
subs I , I , #1
- bne dgemm_ncopy_L4_M8_60
+ bne .Ldgemm_ncopy_L4_M8_60
-dgemm_ncopy_L4_M8_END:
+.Ldgemm_ncopy_L4_M8_END:
/*********************************************************************************************/
-dgemm_ncopy_L2_BEGIN:
+.Ldgemm_ncopy_L2_BEGIN:
tst N, #3
- ble dgemm_ncopy_L999
+ ble .Ldgemm_ncopy_L999
tst N, #2
- ble dgemm_ncopy_L1_BEGIN
+ ble .Ldgemm_ncopy_L1_BEGIN
-dgemm_ncopy_L2_M8_BEGIN:
+.Ldgemm_ncopy_L2_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA
asr I, M, #3 // I = M / 8
cmp I, #0
- ble dgemm_ncopy_L2_M8_40
+ ble .Ldgemm_ncopy_L2_M8_40
-dgemm_ncopy_L2_M8_20:
+.Ldgemm_ncopy_L2_M8_20:
COPY8x2
subs I , I , #1
- bne dgemm_ncopy_L2_M8_20
+ bne .Ldgemm_ncopy_L2_M8_20
-dgemm_ncopy_L2_M8_40:
+.Ldgemm_ncopy_L2_M8_40:
and I, M , #7
cmp I, #0
- ble dgemm_ncopy_L2_M8_END
+ ble .Ldgemm_ncopy_L2_M8_END
-dgemm_ncopy_L2_M8_60:
+.Ldgemm_ncopy_L2_M8_60:
COPY1x2
subs I , I , #1
- bne dgemm_ncopy_L2_M8_60
+ bne .Ldgemm_ncopy_L2_M8_60
-dgemm_ncopy_L2_M8_END:
+.Ldgemm_ncopy_L2_M8_END:
/*********************************************************************************************/
-dgemm_ncopy_L1_BEGIN:
+.Ldgemm_ncopy_L1_BEGIN:
tst N, #1
- ble dgemm_ncopy_L999
+ ble .Ldgemm_ncopy_L999
-dgemm_ncopy_L1_M8_BEGIN:
+.Ldgemm_ncopy_L1_M8_BEGIN:
mov A01, A00
asr I, M, #3 // I = M / 8
cmp I, #0
- ble dgemm_ncopy_L1_M8_40
+ ble .Ldgemm_ncopy_L1_M8_40
-dgemm_ncopy_L1_M8_20:
+.Ldgemm_ncopy_L1_M8_20:
COPY8x1
subs I , I , #1
- bne dgemm_ncopy_L1_M8_20
+ bne .Ldgemm_ncopy_L1_M8_20
-dgemm_ncopy_L1_M8_40:
+.Ldgemm_ncopy_L1_M8_40:
and I, M , #7
cmp I, #0
- ble dgemm_ncopy_L1_M8_END
+ ble .Ldgemm_ncopy_L1_M8_END
-dgemm_ncopy_L1_M8_60:
+.Ldgemm_ncopy_L1_M8_60:
COPY1x1
subs I , I , #1
- bne dgemm_ncopy_L1_M8_60
+ bne .Ldgemm_ncopy_L1_M8_60
-dgemm_ncopy_L1_M8_END:
+.Ldgemm_ncopy_L1_M8_END:
-dgemm_ncopy_L999:
+.Ldgemm_ncopy_L999:
mov x0, #0
RESTORE_REGS
lsl M4, M, #5 // M4 = M * 4 * SIZE
-dgemm_tcopy_L4_BEGIN:
+.Ldgemm_tcopy_L4_BEGIN:
asr J, M, #2 // J = M / 4
cmp J, #0
- ble dgemm_tcopy_L2_BEGIN
+ ble .Ldgemm_tcopy_L2_BEGIN
.align 5
-dgemm_tcopy_L4_M4_BEGIN:
+.Ldgemm_tcopy_L4_M4_BEGIN:
mov A01, A
add A02, A01, LDA
asr I, N, #2 // I = N / 4
cmp I, #0
- ble dgemm_tcopy_L4_M4_40
+ ble .Ldgemm_tcopy_L4_M4_40
.align 5
-dgemm_tcopy_L4_M4_20:
+.Ldgemm_tcopy_L4_M4_20:
COPY4x4
subs I , I , #1
- bne dgemm_tcopy_L4_M4_20
+ bne .Ldgemm_tcopy_L4_M4_20
-dgemm_tcopy_L4_M4_40:
+.Ldgemm_tcopy_L4_M4_40:
tst N , #2
- ble dgemm_tcopy_L4_M4_60
+ ble .Ldgemm_tcopy_L4_M4_60
COPY2x4
-dgemm_tcopy_L4_M4_60:
+.Ldgemm_tcopy_L4_M4_60:
tst N, #1
- ble dgemm_tcopy_L4_M4_END
+ ble .Ldgemm_tcopy_L4_M4_END
COPY1x4
-dgemm_tcopy_L4_M4_END:
+.Ldgemm_tcopy_L4_M4_END:
subs J , J, #1 // j--
- bne dgemm_tcopy_L4_M4_BEGIN
+ bne .Ldgemm_tcopy_L4_M4_BEGIN
/*********************************************************************************************/
-dgemm_tcopy_L2_BEGIN:
+.Ldgemm_tcopy_L2_BEGIN:
tst M, #3
- ble dgemm_tcopy_L999
+ ble .Ldgemm_tcopy_L999
tst M, #2
- ble dgemm_tcopy_L1_BEGIN
+ ble .Ldgemm_tcopy_L1_BEGIN
-dgemm_tcopy_L2_M4_BEGIN:
+.Ldgemm_tcopy_L2_M4_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
asr I, N, #2 // I = N / 4
cmp I, #0
- ble dgemm_tcopy_L2_M4_40
+ ble .Ldgemm_tcopy_L2_M4_40
.align 5
-dgemm_tcopy_L2_M4_20:
+.Ldgemm_tcopy_L2_M4_20:
COPY4x2
subs I , I , #1
- bne dgemm_tcopy_L2_M4_20
+ bne .Ldgemm_tcopy_L2_M4_20
-dgemm_tcopy_L2_M4_40:
+.Ldgemm_tcopy_L2_M4_40:
tst N , #2
- ble dgemm_tcopy_L2_M4_60
+ ble .Ldgemm_tcopy_L2_M4_60
COPY2x2
-dgemm_tcopy_L2_M4_60:
+.Ldgemm_tcopy_L2_M4_60:
tst N , #1
- ble dgemm_tcopy_L2_M4_END
+ ble .Ldgemm_tcopy_L2_M4_END
COPY1x2
-dgemm_tcopy_L2_M4_END:
+.Ldgemm_tcopy_L2_M4_END:
/*********************************************************************************************/
-dgemm_tcopy_L1_BEGIN:
+.Ldgemm_tcopy_L1_BEGIN:
tst M, #1
- ble dgemm_tcopy_L999
+ ble .Ldgemm_tcopy_L999
-dgemm_tcopy_L1_M4_BEGIN:
+.Ldgemm_tcopy_L1_M4_BEGIN:
mov A01, A // A01 = A
mov B01, B
asr I, N, #2 // I = N / 4
cmp I, #0
- ble dgemm_tcopy_L1_M4_40
+ ble .Ldgemm_tcopy_L1_M4_40
.align 5
-dgemm_tcopy_L1_M4_20:
+.Ldgemm_tcopy_L1_M4_20:
COPY4x1
subs I , I , #1
- bne dgemm_tcopy_L1_M4_20
+ bne .Ldgemm_tcopy_L1_M4_20
-dgemm_tcopy_L1_M4_40:
+.Ldgemm_tcopy_L1_M4_40:
tst N , #2
- ble dgemm_tcopy_L1_M4_60
+ ble .Ldgemm_tcopy_L1_M4_60
COPY2x1
-dgemm_tcopy_L1_M4_60:
+.Ldgemm_tcopy_L1_M4_60:
tst N , #1
- ble dgemm_tcopy_L1_M4_END
+ ble .Ldgemm_tcopy_L1_M4_END
COPY1x1
-dgemm_tcopy_L1_M4_END:
+.Ldgemm_tcopy_L1_M4_END:
-dgemm_tcopy_L999:
+.Ldgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret
lsl M8, M, #6 // M8 = M * 8 * SIZE
-dgemm_tcopy_L8_BEGIN:
+.Ldgemm_tcopy_L8_BEGIN:
asr J, M, #3 // J = M / 8
cmp J, #0
- ble dgemm_tcopy_L4_BEGIN
+ ble .Ldgemm_tcopy_L4_BEGIN
.align 5
-dgemm_tcopy_L8_M8_BEGIN:
+.Ldgemm_tcopy_L8_M8_BEGIN:
mov A01, A
add A02, A01, LDA
asr I, N, #3 // I = N / 8
cmp I, #0
- ble dgemm_tcopy_L8_M8_40
+ ble .Ldgemm_tcopy_L8_M8_40
.align 5
-dgemm_tcopy_L8_M8_20:
+.Ldgemm_tcopy_L8_M8_20:
COPY8x8
subs I , I , #1
- bne dgemm_tcopy_L8_M8_20
+ bne .Ldgemm_tcopy_L8_M8_20
-dgemm_tcopy_L8_M8_40:
+.Ldgemm_tcopy_L8_M8_40:
tst N , #4
- ble dgemm_tcopy_L8_M8_60
+ ble .Ldgemm_tcopy_L8_M8_60
COPY4x8
-dgemm_tcopy_L8_M8_60:
+.Ldgemm_tcopy_L8_M8_60:
tst N , #2
- ble dgemm_tcopy_L8_M8_80
+ ble .Ldgemm_tcopy_L8_M8_80
COPY2x8
-dgemm_tcopy_L8_M8_80:
+.Ldgemm_tcopy_L8_M8_80:
tst N, #1
- ble dgemm_tcopy_L8_M8_END
+ ble .Ldgemm_tcopy_L8_M8_END
COPY1x8
-dgemm_tcopy_L8_M8_END:
+.Ldgemm_tcopy_L8_M8_END:
subs J , J, #1 // j--
- bne dgemm_tcopy_L8_M8_BEGIN
+ bne .Ldgemm_tcopy_L8_M8_BEGIN
/*********************************************************************************************/
-dgemm_tcopy_L4_BEGIN:
+.Ldgemm_tcopy_L4_BEGIN:
tst M, #7
- ble dgemm_tcopy_L999
+ ble .Ldgemm_tcopy_L999
tst M, #4
- ble dgemm_tcopy_L2_BEGIN
+ ble .Ldgemm_tcopy_L2_BEGIN
-dgemm_tcopy_L4_M8_BEGIN:
+.Ldgemm_tcopy_L4_M8_BEGIN:
mov A01, A
add A02, A01, LDA
asr I, N, #3 // I = N / 8
cmp I, #0
- ble dgemm_tcopy_L4_M8_40
+ ble .Ldgemm_tcopy_L4_M8_40
.align 5
-dgemm_tcopy_L4_M8_20:
+.Ldgemm_tcopy_L4_M8_20:
COPY8x4
subs I , I , #1
- bne dgemm_tcopy_L4_M8_20
+ bne .Ldgemm_tcopy_L4_M8_20
-dgemm_tcopy_L4_M8_40:
+.Ldgemm_tcopy_L4_M8_40:
tst N , #4
- ble dgemm_tcopy_L4_M8_60
+ ble .Ldgemm_tcopy_L4_M8_60
COPY4x4
-dgemm_tcopy_L4_M8_60:
+.Ldgemm_tcopy_L4_M8_60:
tst N , #2
- ble dgemm_tcopy_L4_M8_80
+ ble .Ldgemm_tcopy_L4_M8_80
COPY2x4
-dgemm_tcopy_L4_M8_80:
+.Ldgemm_tcopy_L4_M8_80:
tst N, #1
- ble dgemm_tcopy_L4_M8_END
+ ble .Ldgemm_tcopy_L4_M8_END
COPY1x4
-dgemm_tcopy_L4_M8_END:
+.Ldgemm_tcopy_L4_M8_END:
/*********************************************************************************************/
-dgemm_tcopy_L2_BEGIN:
+.Ldgemm_tcopy_L2_BEGIN:
tst M, #3
- ble dgemm_tcopy_L999
+ ble .Ldgemm_tcopy_L999
tst M, #2
- ble dgemm_tcopy_L1_BEGIN
+ ble .Ldgemm_tcopy_L1_BEGIN
-dgemm_tcopy_L2_M8_BEGIN:
+.Ldgemm_tcopy_L2_M8_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
asr I, N, #3 // I = N / 8
cmp I, #0
- ble dgemm_tcopy_L2_M8_40
+ ble .Ldgemm_tcopy_L2_M8_40
.align 5
-dgemm_tcopy_L2_M8_20:
+.Ldgemm_tcopy_L2_M8_20:
COPY8x2
subs I , I , #1
- bne dgemm_tcopy_L2_M8_20
+ bne .Ldgemm_tcopy_L2_M8_20
-dgemm_tcopy_L2_M8_40:
+.Ldgemm_tcopy_L2_M8_40:
tst N , #4
- ble dgemm_tcopy_L2_M8_60
+ ble .Ldgemm_tcopy_L2_M8_60
COPY4x2
-dgemm_tcopy_L2_M8_60:
+.Ldgemm_tcopy_L2_M8_60:
tst N , #2
- ble dgemm_tcopy_L2_M8_80
+ ble .Ldgemm_tcopy_L2_M8_80
COPY2x2
-dgemm_tcopy_L2_M8_80:
+.Ldgemm_tcopy_L2_M8_80:
tst N , #1
- ble dgemm_tcopy_L2_M8_END
+ ble .Ldgemm_tcopy_L2_M8_END
COPY1x2
-dgemm_tcopy_L2_M8_END:
+.Ldgemm_tcopy_L2_M8_END:
/*********************************************************************************************/
-dgemm_tcopy_L1_BEGIN:
+.Ldgemm_tcopy_L1_BEGIN:
tst M, #1
- ble dgemm_tcopy_L999
+ ble .Ldgemm_tcopy_L999
-dgemm_tcopy_L1_M8_BEGIN:
+.Ldgemm_tcopy_L1_M8_BEGIN:
mov A01, A // A01 = A
mov B01, B
asr I, N, #3 // I = N / 8
cmp I, #0
- ble dgemm_tcopy_L1_M8_40
+ ble .Ldgemm_tcopy_L1_M8_40
.align 5
-dgemm_tcopy_L1_M8_20:
+.Ldgemm_tcopy_L1_M8_20:
COPY8x1
subs I , I , #1
- bne dgemm_tcopy_L1_M8_20
+ bne .Ldgemm_tcopy_L1_M8_20
-dgemm_tcopy_L1_M8_40:
+.Ldgemm_tcopy_L1_M8_40:
tst N , #4
- ble dgemm_tcopy_L1_M8_60
+ ble .Ldgemm_tcopy_L1_M8_60
COPY4x1
-dgemm_tcopy_L1_M8_60:
+.Ldgemm_tcopy_L1_M8_60:
tst N , #2
- ble dgemm_tcopy_L1_M8_80
+ ble .Ldgemm_tcopy_L1_M8_80
COPY2x1
-dgemm_tcopy_L1_M8_80:
+.Ldgemm_tcopy_L1_M8_80:
tst N , #1
- ble dgemm_tcopy_L1_M8_END
+ ble .Ldgemm_tcopy_L1_M8_END
COPY1x1
-dgemm_tcopy_L1_M8_END:
+.Ldgemm_tcopy_L1_M8_END:
-dgemm_tcopy_L999:
+.Ldgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret
#endif
cmp N, xzr
- ble dot_kernel_L999
+ ble .Ldot_kernel_L999
cmp INC_X, #1
- bne dot_kernel_S_BEGIN
+ bne .Ldot_kernel_S_BEGIN
cmp INC_Y, #1
- bne dot_kernel_S_BEGIN
+ bne .Ldot_kernel_S_BEGIN
-dot_kernel_F_BEGIN:
+.Ldot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
- beq dot_kernel_F1
+ beq .Ldot_kernel_F1
-dot_kernel_F4:
+.Ldot_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne dot_kernel_F4
+ bne .Ldot_kernel_F4
KERNEL_F4_FINALIZE
-dot_kernel_F1:
+.Ldot_kernel_F1:
ands I, N, #3
- ble dot_kernel_L999
+ ble .Ldot_kernel_L999
-dot_kernel_F10:
+.Ldot_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne dot_kernel_F10
+ bne .Ldot_kernel_F10
ret
-dot_kernel_S_BEGIN:
+.Ldot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
- ble dot_kernel_S1
+ ble .Ldot_kernel_S1
-dot_kernel_S4:
+.Ldot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne dot_kernel_S4
+ bne .Ldot_kernel_S4
-dot_kernel_S1:
+.Ldot_kernel_S1:
ands I, N, #3
- ble dot_kernel_L999
+ ble .Ldot_kernel_L999
-dot_kernel_S10:
+.Ldot_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne dot_kernel_S10
+ bne .Ldot_kernel_S10
-dot_kernel_L999:
+.Ldot_kernel_L999:
ret
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble dtrmm_kernel_L2_BEGIN
+ ble .Ldtrmm_kernel_L2_BEGIN
/******************************************************************************/
-dtrmm_kernel_L4_BEGIN:
+.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pA, origPA // pA = start of A array
-dtrmm_kernel_L4_M4_BEGIN:
+.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble dtrmm_kernel_L4_M2_BEGIN
+ ble .Ldtrmm_kernel_L4_M2_BEGIN
-dtrmm_kernel_L4_M4_20:
+.Ldtrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt dtrmm_kernel_L4_M4_32
+ blt .Ldtrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble dtrmm_kernel_L4_M4_22a
+ ble .Ldtrmm_kernel_L4_M4_22a
.align 5
-dtrmm_kernel_L4_M4_22:
+.Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M4_22
+ bgt .Ldtrmm_kernel_L4_M4_22
-dtrmm_kernel_L4_M4_22a:
+.Ldtrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b dtrmm_kernel_L4_M4_44
+ b .Ldtrmm_kernel_L4_M4_44
-dtrmm_kernel_L4_M4_32:
+.Ldtrmm_kernel_L4_M4_32:
tst counterL, #1
- ble dtrmm_kernel_L4_M4_40
+ ble .Ldtrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b dtrmm_kernel_L4_M4_44
+ b .Ldtrmm_kernel_L4_M4_44
-dtrmm_kernel_L4_M4_40:
+.Ldtrmm_kernel_L4_M4_40:
INIT4x4
-dtrmm_kernel_L4_M4_44:
+.Ldtrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
- ble dtrmm_kernel_L4_M4_100
+ ble .Ldtrmm_kernel_L4_M4_100
-dtrmm_kernel_L4_M4_46:
+.Ldtrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
-dtrmm_kernel_L4_M4_100:
+.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L4_M4_END:
+.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
- bne dtrmm_kernel_L4_M4_20
+ bne .Ldtrmm_kernel_L4_M4_20
-dtrmm_kernel_L4_M2_BEGIN:
+.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L4_END
+ ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L4_M1_BEGIN
+ ble .Ldtrmm_kernel_L4_M1_BEGIN
-dtrmm_kernel_L4_M2_20:
+.Ldtrmm_kernel_L4_M2_20:
INIT2x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L4_M2_40
+ ble .Ldtrmm_kernel_L4_M2_40
-dtrmm_kernel_L4_M2_22:
+.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M2_22
+ bgt .Ldtrmm_kernel_L4_M2_22
-dtrmm_kernel_L4_M2_40:
+.Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L4_M2_100
+ ble .Ldtrmm_kernel_L4_M2_100
-dtrmm_kernel_L4_M2_42:
+.Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M2_42
+ bgt .Ldtrmm_kernel_L4_M2_42
-dtrmm_kernel_L4_M2_100:
+.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L4_M2_END:
+.Ldtrmm_kernel_L4_M2_END:
-dtrmm_kernel_L4_M1_BEGIN:
+.Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L4_END
+ ble .Ldtrmm_kernel_L4_END
-dtrmm_kernel_L4_M1_20:
+.Ldtrmm_kernel_L4_M1_20:
INIT1x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L4_M1_40
+ ble .Ldtrmm_kernel_L4_M1_40
-dtrmm_kernel_L4_M1_22:
+.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M1_22
+ bgt .Ldtrmm_kernel_L4_M1_22
-dtrmm_kernel_L4_M1_40:
+.Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L4_M1_100
+ ble .Ldtrmm_kernel_L4_M1_100
-dtrmm_kernel_L4_M1_42:
+.Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M1_42
+ bgt .Ldtrmm_kernel_L4_M1_42
-dtrmm_kernel_L4_M1_100:
+.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
add tempOffset, tempOffset, #1
#endif
-dtrmm_kernel_L4_END:
+.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
#endif
subs counterJ, counterJ , #1 // j--
- bgt dtrmm_kernel_L4_BEGIN
+ bgt .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
-dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble dtrmm_kernel_L999 // error, N was less than 4?
+ ble .Ldtrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble dtrmm_kernel_L1_BEGIN
+ ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pA, origPA // pA = A
-dtrmm_kernel_L2_M4_BEGIN:
+.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble dtrmm_kernel_L2_M2_BEGIN
+ ble .Ldtrmm_kernel_L2_M2_BEGIN
-dtrmm_kernel_L2_M4_20:
+.Ldtrmm_kernel_L2_M4_20:
INIT4x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dtrmm_kernel_L2_M4_40
+ ble .Ldtrmm_kernel_L2_M4_40
.align 5
-dtrmm_kernel_L2_M4_22:
+.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M4_22
+ bgt .Ldtrmm_kernel_L2_M4_22
-dtrmm_kernel_L2_M4_40:
+.Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M4_100
+ ble .Ldtrmm_kernel_L2_M4_100
-dtrmm_kernel_L2_M4_42:
+.Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M4_42
+ bgt .Ldtrmm_kernel_L2_M4_42
-dtrmm_kernel_L2_M4_100:
+.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L2_M4_END:
+.Ldtrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt dtrmm_kernel_L2_M4_20
+ bgt .Ldtrmm_kernel_L2_M4_20
-dtrmm_kernel_L2_M2_BEGIN:
+.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L2_END
+ ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L2_M1_BEGIN
+ ble .Ldtrmm_kernel_L2_M1_BEGIN
-dtrmm_kernel_L2_M2_20:
+.Ldtrmm_kernel_L2_M2_20:
INIT2x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dtrmm_kernel_L2_M2_40
+ ble .Ldtrmm_kernel_L2_M2_40
-dtrmm_kernel_L2_M2_22:
+.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M2_22
+ bgt .Ldtrmm_kernel_L2_M2_22
-dtrmm_kernel_L2_M2_40:
+.Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M2_100
+ ble .Ldtrmm_kernel_L2_M2_100
-dtrmm_kernel_L2_M2_42:
+.Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M2_42
+ bgt .Ldtrmm_kernel_L2_M2_42
-dtrmm_kernel_L2_M2_100:
+.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L2_M2_END:
+.Ldtrmm_kernel_L2_M2_END:
-dtrmm_kernel_L2_M1_BEGIN:
+.Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L2_END
+ ble .Ldtrmm_kernel_L2_END
-dtrmm_kernel_L2_M1_20:
+.Ldtrmm_kernel_L2_M1_20:
INIT1x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble dtrmm_kernel_L2_M1_40
+ ble .Ldtrmm_kernel_L2_M1_40
-dtrmm_kernel_L2_M1_22:
+.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M1_22
+ bgt .Ldtrmm_kernel_L2_M1_22
-dtrmm_kernel_L2_M1_40:
+.Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M1_100
+ ble .Ldtrmm_kernel_L2_M1_100
-dtrmm_kernel_L2_M1_42:
+.Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M1_42
+ bgt .Ldtrmm_kernel_L2_M1_42
-dtrmm_kernel_L2_M1_100:
+.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
add tempOffset, tempOffset, #1
#endif
-dtrmm_kernel_L2_END:
+.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
/******************************************************************************/
-dtrmm_kernel_L1_BEGIN:
+.Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble dtrmm_kernel_L999 // done
+ ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
mov pA, origPA // pA = A
-dtrmm_kernel_L1_M4_BEGIN:
+.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble dtrmm_kernel_L1_M2_BEGIN
+ ble .Ldtrmm_kernel_L1_M2_BEGIN
-dtrmm_kernel_L1_M4_20:
+.Ldtrmm_kernel_L1_M4_20:
INIT4x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M4_40
+ ble .Ldtrmm_kernel_L1_M4_40
.align 5
-dtrmm_kernel_L1_M4_22:
+.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M4_22
+ bgt .Ldtrmm_kernel_L1_M4_22
-dtrmm_kernel_L1_M4_40:
+.Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M4_100
+ ble .Ldtrmm_kernel_L1_M4_100
-dtrmm_kernel_L1_M4_42:
+.Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M4_42
+ bgt .Ldtrmm_kernel_L1_M4_42
-dtrmm_kernel_L1_M4_100:
+.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L1_M4_END:
+.Ldtrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt dtrmm_kernel_L1_M4_20
+ bgt .Ldtrmm_kernel_L1_M4_20
-dtrmm_kernel_L1_M2_BEGIN:
+.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L1_END
+ ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L1_M1_BEGIN
+ ble .Ldtrmm_kernel_L1_M1_BEGIN
-dtrmm_kernel_L1_M2_20:
+.Ldtrmm_kernel_L1_M2_20:
INIT2x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M2_40
+ ble .Ldtrmm_kernel_L1_M2_40
-dtrmm_kernel_L1_M2_22:
+.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M2_22
+ bgt .Ldtrmm_kernel_L1_M2_22
-dtrmm_kernel_L1_M2_40:
+.Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M2_100
+ ble .Ldtrmm_kernel_L1_M2_100
-dtrmm_kernel_L1_M2_42:
+.Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M2_42
+ bgt .Ldtrmm_kernel_L1_M2_42
-dtrmm_kernel_L1_M2_100:
+.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L1_M2_END:
+.Ldtrmm_kernel_L1_M2_END:
-dtrmm_kernel_L1_M1_BEGIN:
+.Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L1_END
+ ble .Ldtrmm_kernel_L1_END
-dtrmm_kernel_L1_M1_20:
+.Ldtrmm_kernel_L1_M1_20:
INIT1x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M1_40
+ ble .Ldtrmm_kernel_L1_M1_40
-dtrmm_kernel_L1_M1_22:
+.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M1_22
+ bgt .Ldtrmm_kernel_L1_M1_22
-dtrmm_kernel_L1_M1_40:
+.Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M1_100
+ ble .Ldtrmm_kernel_L1_M1_100
-dtrmm_kernel_L1_M1_42:
+.Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M1_42
+ bgt .Ldtrmm_kernel_L1_M1_42
-dtrmm_kernel_L1_M1_100:
+.Ldtrmm_kernel_L1_M1_100:
SAVE1x1
-dtrmm_kernel_L1_END:
+.Ldtrmm_kernel_L1_END:
-dtrmm_kernel_L999:
+.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
- ble dtrmm_kernel_L4_BEGIN
+ ble .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
-dtrmm_kernel_L8_BEGIN:
+.Ldtrmm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3
mov pA, origPA // pA = start of A array
-dtrmm_kernel_L8_M4_BEGIN:
+.Ldtrmm_kernel_L8_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble dtrmm_kernel_L8_M2_BEGIN
+ ble .Ldtrmm_kernel_L8_M2_BEGIN
-dtrmm_kernel_L8_M4_20:
+.Ldtrmm_kernel_L8_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt dtrmm_kernel_L8_M4_32
+ blt .Ldtrmm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2
- ble dtrmm_kernel_L8_M4_22a
+ ble .Ldtrmm_kernel_L8_M4_22a
.align 5
-dtrmm_kernel_L8_M4_22:
+.Ldtrmm_kernel_L8_M4_22:
KERNEL4x8_M1
KERNEL4x8_M2
subs counterL, counterL, #1
- bgt dtrmm_kernel_L8_M4_22
+ bgt .Ldtrmm_kernel_L8_M4_22
-dtrmm_kernel_L8_M4_22a:
+.Ldtrmm_kernel_L8_M4_22a:
KERNEL4x8_M1
KERNEL4x8_E
- b dtrmm_kernel_L8_M4_44
+ b .Ldtrmm_kernel_L8_M4_44
-dtrmm_kernel_L8_M4_32:
+.Ldtrmm_kernel_L8_M4_32:
tst counterL, #1
- ble dtrmm_kernel_L8_M4_40
+ ble .Ldtrmm_kernel_L8_M4_40
KERNEL4x8_I
KERNEL4x8_E
- b dtrmm_kernel_L8_M4_44
+ b .Ldtrmm_kernel_L8_M4_44
-dtrmm_kernel_L8_M4_40:
+.Ldtrmm_kernel_L8_M4_40:
INIT4x8
-dtrmm_kernel_L8_M4_44:
+.Ldtrmm_kernel_L8_M4_44:
ands counterL, tempK, #1
- ble dtrmm_kernel_L8_M4_100
+ ble .Ldtrmm_kernel_L8_M4_100
-dtrmm_kernel_L8_M4_46:
+.Ldtrmm_kernel_L8_M4_46:
KERNEL4x8_SUB
-dtrmm_kernel_L8_M4_100:
+.Ldtrmm_kernel_L8_M4_100:
SAVE4x8
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L8_M4_END:
+.Ldtrmm_kernel_L8_M4_END:
subs counterI, counterI, #1
- bne dtrmm_kernel_L8_M4_20
+ bne .Ldtrmm_kernel_L8_M4_20
-dtrmm_kernel_L8_M2_BEGIN:
+.Ldtrmm_kernel_L8_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L8_END
+ ble .Ldtrmm_kernel_L8_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L8_M1_BEGIN
+ ble .Ldtrmm_kernel_L8_M1_BEGIN
-dtrmm_kernel_L8_M2_20:
+.Ldtrmm_kernel_L8_M2_20:
INIT2x8
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L8_M2_40
+ ble .Ldtrmm_kernel_L8_M2_40
-dtrmm_kernel_L8_M2_22:
+.Ldtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L8_M2_22
+ bgt .Ldtrmm_kernel_L8_M2_22
-dtrmm_kernel_L8_M2_40:
+.Ldtrmm_kernel_L8_M2_40:
ands counterL, tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L8_M2_100
+ ble .Ldtrmm_kernel_L8_M2_100
-dtrmm_kernel_L8_M2_42:
+.Ldtrmm_kernel_L8_M2_42:
KERNEL2x8_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L8_M2_42
+ bgt .Ldtrmm_kernel_L8_M2_42
-dtrmm_kernel_L8_M2_100:
+.Ldtrmm_kernel_L8_M2_100:
SAVE2x8
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L8_M2_END:
+.Ldtrmm_kernel_L8_M2_END:
-dtrmm_kernel_L8_M1_BEGIN:
+.Ldtrmm_kernel_L8_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L8_END
+ ble .Ldtrmm_kernel_L8_END
-dtrmm_kernel_L8_M1_20:
+.Ldtrmm_kernel_L8_M1_20:
INIT1x8
asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L8_M1_40
+ ble .Ldtrmm_kernel_L8_M1_40
-dtrmm_kernel_L8_M1_22:
+.Ldtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L8_M1_22
+ bgt .Ldtrmm_kernel_L8_M1_22
-dtrmm_kernel_L8_M1_40:
+.Ldtrmm_kernel_L8_M1_40:
ands counterL, tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L8_M1_100
+ ble .Ldtrmm_kernel_L8_M1_100
-dtrmm_kernel_L8_M1_42:
+.Ldtrmm_kernel_L8_M1_42:
KERNEL1x8_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L8_M1_42
+ bgt .Ldtrmm_kernel_L8_M1_42
-dtrmm_kernel_L8_M1_100:
+.Ldtrmm_kernel_L8_M1_100:
SAVE1x8
add tempOffset, tempOffset, #1
#endif
-dtrmm_kernel_L8_END:
+.Ldtrmm_kernel_L8_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8
#endif
subs counterJ, counterJ , #1 // j--
- bgt dtrmm_kernel_L8_BEGIN
+ bgt .Ldtrmm_kernel_L8_BEGIN
/******************************************************************************/
-dtrmm_kernel_L4_BEGIN:
+.Ldtrmm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #7
- ble dtrmm_kernel_L999
+ ble .Ldtrmm_kernel_L999
tst counterJ , #4
- ble dtrmm_kernel_L2_BEGIN
+ ble .Ldtrmm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pA, origPA // pA = start of A array
-dtrmm_kernel_L4_M4_BEGIN:
+.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble dtrmm_kernel_L4_M2_BEGIN
+ ble .Ldtrmm_kernel_L4_M2_BEGIN
-dtrmm_kernel_L4_M4_20:
+.Ldtrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt dtrmm_kernel_L4_M4_32
+ blt .Ldtrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble dtrmm_kernel_L4_M4_22a
+ ble .Ldtrmm_kernel_L4_M4_22a
.align 5
-dtrmm_kernel_L4_M4_22:
+.Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M4_22
+ bgt .Ldtrmm_kernel_L4_M4_22
-dtrmm_kernel_L4_M4_22a:
+.Ldtrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b dtrmm_kernel_L4_M4_44
+ b .Ldtrmm_kernel_L4_M4_44
-dtrmm_kernel_L4_M4_32:
+.Ldtrmm_kernel_L4_M4_32:
tst counterL, #1
- ble dtrmm_kernel_L4_M4_40
+ ble .Ldtrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b dtrmm_kernel_L4_M4_44
+ b .Ldtrmm_kernel_L4_M4_44
-dtrmm_kernel_L4_M4_40:
+.Ldtrmm_kernel_L4_M4_40:
INIT4x4
-dtrmm_kernel_L4_M4_44:
+.Ldtrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
- ble dtrmm_kernel_L4_M4_100
+ ble .Ldtrmm_kernel_L4_M4_100
-dtrmm_kernel_L4_M4_46:
+.Ldtrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
-dtrmm_kernel_L4_M4_100:
+.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L4_M4_END:
+.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
- bne dtrmm_kernel_L4_M4_20
+ bne .Ldtrmm_kernel_L4_M4_20
-dtrmm_kernel_L4_M2_BEGIN:
+.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L4_END
+ ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L4_M1_BEGIN
+ ble .Ldtrmm_kernel_L4_M1_BEGIN
-dtrmm_kernel_L4_M2_20:
+.Ldtrmm_kernel_L4_M2_20:
INIT2x4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L4_M2_40
+ ble .Ldtrmm_kernel_L4_M2_40
-dtrmm_kernel_L4_M2_22:
+.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M2_22
+ bgt .Ldtrmm_kernel_L4_M2_22
-dtrmm_kernel_L4_M2_40:
+.Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L4_M2_100
+ ble .Ldtrmm_kernel_L4_M2_100
-dtrmm_kernel_L4_M2_42:
+.Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M2_42
+ bgt .Ldtrmm_kernel_L4_M2_42
-dtrmm_kernel_L4_M2_100:
+.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L4_M2_END:
+.Ldtrmm_kernel_L4_M2_END:
-dtrmm_kernel_L4_M1_BEGIN:
+.Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L4_END
+ ble .Ldtrmm_kernel_L4_END
-dtrmm_kernel_L4_M1_20:
+.Ldtrmm_kernel_L4_M1_20:
INIT1x4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L4_M1_40
+ ble .Ldtrmm_kernel_L4_M1_40
-dtrmm_kernel_L4_M1_22:
+.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M1_22
+ bgt .Ldtrmm_kernel_L4_M1_22
-dtrmm_kernel_L4_M1_40:
+.Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L4_M1_100
+ ble .Ldtrmm_kernel_L4_M1_100
-dtrmm_kernel_L4_M1_42:
+.Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M1_42
+ bgt .Ldtrmm_kernel_L4_M1_42
-dtrmm_kernel_L4_M1_100:
+.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
-dtrmm_kernel_L4_END:
+.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/
-dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble dtrmm_kernel_L999 // error, N was less than 4?
+ ble .Ldtrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble dtrmm_kernel_L1_BEGIN
+ ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pA, origPA // pA = A
-dtrmm_kernel_L2_M4_BEGIN:
+.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble dtrmm_kernel_L2_M2_BEGIN
+ ble .Ldtrmm_kernel_L2_M2_BEGIN
-dtrmm_kernel_L2_M4_20:
+.Ldtrmm_kernel_L2_M4_20:
INIT4x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dtrmm_kernel_L2_M4_40
+ ble .Ldtrmm_kernel_L2_M4_40
.align 5
-dtrmm_kernel_L2_M4_22:
+.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M4_22
+ bgt .Ldtrmm_kernel_L2_M4_22
-dtrmm_kernel_L2_M4_40:
+.Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M4_100
+ ble .Ldtrmm_kernel_L2_M4_100
-dtrmm_kernel_L2_M4_42:
+.Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M4_42
+ bgt .Ldtrmm_kernel_L2_M4_42
-dtrmm_kernel_L2_M4_100:
+.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L2_M4_END:
+.Ldtrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt dtrmm_kernel_L2_M4_20
+ bgt .Ldtrmm_kernel_L2_M4_20
-dtrmm_kernel_L2_M2_BEGIN:
+.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L2_END
+ ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L2_M1_BEGIN
+ ble .Ldtrmm_kernel_L2_M1_BEGIN
-dtrmm_kernel_L2_M2_20:
+.Ldtrmm_kernel_L2_M2_20:
INIT2x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dtrmm_kernel_L2_M2_40
+ ble .Ldtrmm_kernel_L2_M2_40
-dtrmm_kernel_L2_M2_22:
+.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M2_22
+ bgt .Ldtrmm_kernel_L2_M2_22
-dtrmm_kernel_L2_M2_40:
+.Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M2_100
+ ble .Ldtrmm_kernel_L2_M2_100
-dtrmm_kernel_L2_M2_42:
+.Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M2_42
+ bgt .Ldtrmm_kernel_L2_M2_42
-dtrmm_kernel_L2_M2_100:
+.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L2_M2_END:
+.Ldtrmm_kernel_L2_M2_END:
-dtrmm_kernel_L2_M1_BEGIN:
+.Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L2_END
+ ble .Ldtrmm_kernel_L2_END
-dtrmm_kernel_L2_M1_20:
+.Ldtrmm_kernel_L2_M1_20:
INIT1x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble dtrmm_kernel_L2_M1_40
+ ble .Ldtrmm_kernel_L2_M1_40
-dtrmm_kernel_L2_M1_22:
+.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M1_22
+ bgt .Ldtrmm_kernel_L2_M1_22
-dtrmm_kernel_L2_M1_40:
+.Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M1_100
+ ble .Ldtrmm_kernel_L2_M1_100
-dtrmm_kernel_L2_M1_42:
+.Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M1_42
+ bgt .Ldtrmm_kernel_L2_M1_42
-dtrmm_kernel_L2_M1_100:
+.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
-dtrmm_kernel_L2_END:
+.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
/******************************************************************************/
-dtrmm_kernel_L1_BEGIN:
+.Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble dtrmm_kernel_L999 // done
+ ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
#endif
mov pA, origPA // pA = A
-dtrmm_kernel_L1_M4_BEGIN:
+.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble dtrmm_kernel_L1_M2_BEGIN
+ ble .Ldtrmm_kernel_L1_M2_BEGIN
-dtrmm_kernel_L1_M4_20:
+.Ldtrmm_kernel_L1_M4_20:
INIT4x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M4_40
+ ble .Ldtrmm_kernel_L1_M4_40
.align 5
-dtrmm_kernel_L1_M4_22:
+.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M4_22
+ bgt .Ldtrmm_kernel_L1_M4_22
-dtrmm_kernel_L1_M4_40:
+.Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M4_100
+ ble .Ldtrmm_kernel_L1_M4_100
-dtrmm_kernel_L1_M4_42:
+.Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M4_42
+ bgt .Ldtrmm_kernel_L1_M4_42
-dtrmm_kernel_L1_M4_100:
+.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L1_M4_END:
+.Ldtrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt dtrmm_kernel_L1_M4_20
+ bgt .Ldtrmm_kernel_L1_M4_20
-dtrmm_kernel_L1_M2_BEGIN:
+.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L1_END
+ ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L1_M1_BEGIN
+ ble .Ldtrmm_kernel_L1_M1_BEGIN
-dtrmm_kernel_L1_M2_20:
+.Ldtrmm_kernel_L1_M2_20:
INIT2x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M2_40
+ ble .Ldtrmm_kernel_L1_M2_40
-dtrmm_kernel_L1_M2_22:
+.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M2_22
+ bgt .Ldtrmm_kernel_L1_M2_22
-dtrmm_kernel_L1_M2_40:
+.Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M2_100
+ ble .Ldtrmm_kernel_L1_M2_100
-dtrmm_kernel_L1_M2_42:
+.Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M2_42
+ bgt .Ldtrmm_kernel_L1_M2_42
-dtrmm_kernel_L1_M2_100:
+.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L1_M2_END:
+.Ldtrmm_kernel_L1_M2_END:
-dtrmm_kernel_L1_M1_BEGIN:
+.Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L1_END
+ ble .Ldtrmm_kernel_L1_END
-dtrmm_kernel_L1_M1_20:
+.Ldtrmm_kernel_L1_M1_20:
INIT1x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M1_40
+ ble .Ldtrmm_kernel_L1_M1_40
-dtrmm_kernel_L1_M1_22:
+.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M1_22
+ bgt .Ldtrmm_kernel_L1_M1_22
-dtrmm_kernel_L1_M1_40:
+.Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M1_100
+ ble .Ldtrmm_kernel_L1_M1_100
-dtrmm_kernel_L1_M1_42:
+.Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M1_42
+ bgt .Ldtrmm_kernel_L1_M1_42
-dtrmm_kernel_L1_M1_100:
+.Ldtrmm_kernel_L1_M1_100:
SAVE1x1
-dtrmm_kernel_L1_END:
+.Ldtrmm_kernel_L1_END:
-dtrmm_kernel_L999:
+.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble dtrmm_kernel_L2_BEGIN
+ ble .Ldtrmm_kernel_L2_BEGIN
/******************************************************************************/
-dtrmm_kernel_L4_BEGIN:
+.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
#endif
mov pA, origPA // pA = start of A array
-dtrmm_kernel_L4_M8_BEGIN:
+.Ldtrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dtrmm_kernel_L4_M4_BEGIN
+ ble .Ldtrmm_kernel_L4_M4_BEGIN
.align 5
-dtrmm_kernel_L4_M8_20:
+.Ldtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
- blt dtrmm_kernel_L4_M8_32
+ blt .Ldtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
- ble dtrmm_kernel_L4_M8_22a
+ ble .Ldtrmm_kernel_L4_M8_22a
.align 5
-dtrmm_kernel_L4_M8_22:
+.Ldtrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M2
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M8_22
+ bgt .Ldtrmm_kernel_L4_M8_22
.align 5
-dtrmm_kernel_L4_M8_22a:
+.Ldtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b dtrmm_kernel_L4_M8_44
+ b .Ldtrmm_kernel_L4_M8_44
.align 5
-dtrmm_kernel_L4_M8_32:
+.Ldtrmm_kernel_L4_M8_32:
tst counterL, #1
- ble dtrmm_kernel_L4_M8_40
+ ble .Ldtrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
- b dtrmm_kernel_L4_M8_44
+ b .Ldtrmm_kernel_L4_M8_44
-dtrmm_kernel_L4_M8_40:
+.Ldtrmm_kernel_L4_M8_40:
INIT8x4
-dtrmm_kernel_L4_M8_44:
+.Ldtrmm_kernel_L4_M8_44:
ands counterL , tempK, #7
- ble dtrmm_kernel_L4_M8_100
+ ble .Ldtrmm_kernel_L4_M8_100
.align 5
-dtrmm_kernel_L4_M8_46:
+.Ldtrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
- bne dtrmm_kernel_L4_M8_46
+ bne .Ldtrmm_kernel_L4_M8_46
-dtrmm_kernel_L4_M8_100:
+.Ldtrmm_kernel_L4_M8_100:
SAVE8x4
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
-dtrmm_kernel_L4_M8_END:
+.Ldtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
- bne dtrmm_kernel_L4_M8_20
+ bne .Ldtrmm_kernel_L4_M8_20
-dtrmm_kernel_L4_M4_BEGIN:
+.Ldtrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dtrmm_kernel_L4_END
+ ble .Ldtrmm_kernel_L4_END
tst counterI, #4
- ble dtrmm_kernel_L4_M2_BEGIN
+ ble .Ldtrmm_kernel_L4_M2_BEGIN
-dtrmm_kernel_L4_M4_20:
+.Ldtrmm_kernel_L4_M4_20:
INIT4x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L4_M4_40
+ ble .Ldtrmm_kernel_L4_M4_40
-dtrmm_kernel_L4_M4_22:
+.Ldtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M4_22
+ bgt .Ldtrmm_kernel_L4_M4_22
-dtrmm_kernel_L4_M4_40:
+.Ldtrmm_kernel_L4_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L4_M4_100
+ ble .Ldtrmm_kernel_L4_M4_100
-dtrmm_kernel_L4_M4_42:
+.Ldtrmm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M4_42
+ bgt .Ldtrmm_kernel_L4_M4_42
-dtrmm_kernel_L4_M4_100:
+.Ldtrmm_kernel_L4_M4_100:
SAVE4x4
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L4_M4_END:
+.Ldtrmm_kernel_L4_M4_END:
-dtrmm_kernel_L4_M2_BEGIN:
+.Ldtrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L4_END
+ ble .Ldtrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L4_M1_BEGIN
+ ble .Ldtrmm_kernel_L4_M1_BEGIN
-dtrmm_kernel_L4_M2_20:
+.Ldtrmm_kernel_L4_M2_20:
INIT2x4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L4_M2_40
+ ble .Ldtrmm_kernel_L4_M2_40
-dtrmm_kernel_L4_M2_22:
+.Ldtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M2_22
+ bgt .Ldtrmm_kernel_L4_M2_22
-dtrmm_kernel_L4_M2_40:
+.Ldtrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L4_M2_100
+ ble .Ldtrmm_kernel_L4_M2_100
-dtrmm_kernel_L4_M2_42:
+.Ldtrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M2_42
+ bgt .Ldtrmm_kernel_L4_M2_42
-dtrmm_kernel_L4_M2_100:
+.Ldtrmm_kernel_L4_M2_100:
SAVE2x4
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L4_M2_END:
+.Ldtrmm_kernel_L4_M2_END:
-dtrmm_kernel_L4_M1_BEGIN:
+.Ldtrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L4_END
+ ble .Ldtrmm_kernel_L4_END
-dtrmm_kernel_L4_M1_20:
+.Ldtrmm_kernel_L4_M1_20:
INIT1x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L4_M1_40
+ ble .Ldtrmm_kernel_L4_M1_40
-dtrmm_kernel_L4_M1_22:
+.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M1_22
+ bgt .Ldtrmm_kernel_L4_M1_22
-dtrmm_kernel_L4_M1_40:
+.Ldtrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L4_M1_100
+ ble .Ldtrmm_kernel_L4_M1_100
-dtrmm_kernel_L4_M1_42:
+.Ldtrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L4_M1_42
+ bgt .Ldtrmm_kernel_L4_M1_42
-dtrmm_kernel_L4_M1_100:
+.Ldtrmm_kernel_L4_M1_100:
SAVE1x4
add tempOffset, tempOffset, #1
#endif
-dtrmm_kernel_L4_END:
+.Ldtrmm_kernel_L4_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8
#endif
subs counterJ, counterJ , #1 // j--
- bgt dtrmm_kernel_L4_BEGIN
+ bgt .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
-dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble dtrmm_kernel_L999 // error, N was less than 4?
+ ble .Ldtrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble dtrmm_kernel_L1_BEGIN
+ ble .Ldtrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
#endif
mov pA, origPA // pA = A
-dtrmm_kernel_L2_M8_BEGIN:
+.Ldtrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dtrmm_kernel_L2_M4_BEGIN
+ ble .Ldtrmm_kernel_L2_M4_BEGIN
-dtrmm_kernel_L2_M8_20:
+.Ldtrmm_kernel_L2_M8_20:
INIT8x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dtrmm_kernel_L2_M8_40
+ ble .Ldtrmm_kernel_L2_M8_40
.align 5
-dtrmm_kernel_L2_M8_22:
+.Ldtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M8_22
+ bgt .Ldtrmm_kernel_L2_M8_22
-dtrmm_kernel_L2_M8_40:
+.Ldtrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M8_100
+ ble .Ldtrmm_kernel_L2_M8_100
-dtrmm_kernel_L2_M8_42:
+.Ldtrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M8_42
+ bgt .Ldtrmm_kernel_L2_M8_42
-dtrmm_kernel_L2_M8_100:
+.Ldtrmm_kernel_L2_M8_100:
SAVE8x2
add tempOffset, tempOffset, #8
#endif
-dtrmm_kernel_L2_M8_END:
+.Ldtrmm_kernel_L2_M8_END:
subs counterI, counterI, #1
- bgt dtrmm_kernel_L2_M8_20
+ bgt .Ldtrmm_kernel_L2_M8_20
-dtrmm_kernel_L2_M4_BEGIN:
+.Ldtrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dtrmm_kernel_L2_END
+ ble .Ldtrmm_kernel_L2_END
tst counterI, #4 // counterI = counterI / 2
- ble dtrmm_kernel_L2_M2_BEGIN
+ ble .Ldtrmm_kernel_L2_M2_BEGIN
-dtrmm_kernel_L2_M4_20:
+.Ldtrmm_kernel_L2_M4_20:
INIT4x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dtrmm_kernel_L2_M4_40
+ ble .Ldtrmm_kernel_L2_M4_40
.align 5
-dtrmm_kernel_L2_M4_22:
+.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M4_22
+ bgt .Ldtrmm_kernel_L2_M4_22
-dtrmm_kernel_L2_M4_40:
+.Ldtrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M4_100
+ ble .Ldtrmm_kernel_L2_M4_100
-dtrmm_kernel_L2_M4_42:
+.Ldtrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M4_42
+ bgt .Ldtrmm_kernel_L2_M4_42
-dtrmm_kernel_L2_M4_100:
+.Ldtrmm_kernel_L2_M4_100:
SAVE4x2
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L2_M4_END:
+.Ldtrmm_kernel_L2_M4_END:
-dtrmm_kernel_L2_M2_BEGIN:
+.Ldtrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L2_END
+ ble .Ldtrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L2_M1_BEGIN
+ ble .Ldtrmm_kernel_L2_M1_BEGIN
-dtrmm_kernel_L2_M2_20:
+.Ldtrmm_kernel_L2_M2_20:
INIT2x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble dtrmm_kernel_L2_M2_40
+ ble .Ldtrmm_kernel_L2_M2_40
-dtrmm_kernel_L2_M2_22:
+.Ldtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M2_22
+ bgt .Ldtrmm_kernel_L2_M2_22
-dtrmm_kernel_L2_M2_40:
+.Ldtrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M2_100
+ ble .Ldtrmm_kernel_L2_M2_100
-dtrmm_kernel_L2_M2_42:
+.Ldtrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M2_42
+ bgt .Ldtrmm_kernel_L2_M2_42
-dtrmm_kernel_L2_M2_100:
+.Ldtrmm_kernel_L2_M2_100:
SAVE2x2
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L2_M2_END:
+.Ldtrmm_kernel_L2_M2_END:
-dtrmm_kernel_L2_M1_BEGIN:
+.Ldtrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L2_END
+ ble .Ldtrmm_kernel_L2_END
-dtrmm_kernel_L2_M1_20:
+.Ldtrmm_kernel_L2_M1_20:
INIT1x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble dtrmm_kernel_L2_M1_40
+ ble .Ldtrmm_kernel_L2_M1_40
-dtrmm_kernel_L2_M1_22:
+.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M1_22
+ bgt .Ldtrmm_kernel_L2_M1_22
-dtrmm_kernel_L2_M1_40:
+.Ldtrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L2_M1_100
+ ble .Ldtrmm_kernel_L2_M1_100
-dtrmm_kernel_L2_M1_42:
+.Ldtrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L2_M1_42
+ bgt .Ldtrmm_kernel_L2_M1_42
-dtrmm_kernel_L2_M1_100:
+.Ldtrmm_kernel_L2_M1_100:
SAVE1x2
add tempOffset, tempOffset, #1
#endif
-dtrmm_kernel_L2_END:
+.Ldtrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
/******************************************************************************/
-dtrmm_kernel_L1_BEGIN:
+.Ldtrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble dtrmm_kernel_L999 // done
+ ble .Ldtrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next
#endif
mov pA, origPA // pA = A
-dtrmm_kernel_L1_M8_BEGIN:
+.Ldtrmm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble dtrmm_kernel_L1_M4_BEGIN
+ ble .Ldtrmm_kernel_L1_M4_BEGIN
-dtrmm_kernel_L1_M8_20:
+.Ldtrmm_kernel_L1_M8_20:
INIT8x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M8_40
+ ble .Ldtrmm_kernel_L1_M8_40
.align 5
-dtrmm_kernel_L1_M8_22:
+.Ldtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M8_22
+ bgt .Ldtrmm_kernel_L1_M8_22
-dtrmm_kernel_L1_M8_40:
+.Ldtrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M8_100
+ ble .Ldtrmm_kernel_L1_M8_100
-dtrmm_kernel_L1_M8_42:
+.Ldtrmm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M8_42
+ bgt .Ldtrmm_kernel_L1_M8_42
-dtrmm_kernel_L1_M8_100:
+.Ldtrmm_kernel_L1_M8_100:
SAVE8x1
add tempOffset, tempOffset, #8
#endif
-dtrmm_kernel_L1_M8_END:
+.Ldtrmm_kernel_L1_M8_END:
subs counterI, counterI, #1
- bgt dtrmm_kernel_L1_M8_20
+ bgt .Ldtrmm_kernel_L1_M8_20
-dtrmm_kernel_L1_M4_BEGIN:
+.Ldtrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble dtrmm_kernel_L1_END
+ ble .Ldtrmm_kernel_L1_END
tst counterI, #4 // counterI = counterI / 2
- ble dtrmm_kernel_L1_M2_BEGIN
+ ble .Ldtrmm_kernel_L1_M2_BEGIN
-dtrmm_kernel_L1_M4_20:
+.Ldtrmm_kernel_L1_M4_20:
INIT4x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M4_40
+ ble .Ldtrmm_kernel_L1_M4_40
.align 5
-dtrmm_kernel_L1_M4_22:
+.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M4_22
+ bgt .Ldtrmm_kernel_L1_M4_22
-dtrmm_kernel_L1_M4_40:
+.Ldtrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M4_100
+ ble .Ldtrmm_kernel_L1_M4_100
-dtrmm_kernel_L1_M4_42:
+.Ldtrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M4_42
+ bgt .Ldtrmm_kernel_L1_M4_42
-dtrmm_kernel_L1_M4_100:
+.Ldtrmm_kernel_L1_M4_100:
SAVE4x1
add tempOffset, tempOffset, #4
#endif
-dtrmm_kernel_L1_M4_END:
+.Ldtrmm_kernel_L1_M4_END:
-dtrmm_kernel_L1_M2_BEGIN:
+.Ldtrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble dtrmm_kernel_L1_END
+ ble .Ldtrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble dtrmm_kernel_L1_M1_BEGIN
+ ble .Ldtrmm_kernel_L1_M1_BEGIN
-dtrmm_kernel_L1_M2_20:
+.Ldtrmm_kernel_L1_M2_20:
INIT2x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M2_40
+ ble .Ldtrmm_kernel_L1_M2_40
-dtrmm_kernel_L1_M2_22:
+.Ldtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M2_22
+ bgt .Ldtrmm_kernel_L1_M2_22
-dtrmm_kernel_L1_M2_40:
+.Ldtrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M2_100
+ ble .Ldtrmm_kernel_L1_M2_100
-dtrmm_kernel_L1_M2_42:
+.Ldtrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M2_42
+ bgt .Ldtrmm_kernel_L1_M2_42
-dtrmm_kernel_L1_M2_100:
+.Ldtrmm_kernel_L1_M2_100:
SAVE2x1
add tempOffset, tempOffset, #2
#endif
-dtrmm_kernel_L1_M2_END:
+.Ldtrmm_kernel_L1_M2_END:
-dtrmm_kernel_L1_M1_BEGIN:
+.Ldtrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble dtrmm_kernel_L1_END
+ ble .Ldtrmm_kernel_L1_END
-dtrmm_kernel_L1_M1_20:
+.Ldtrmm_kernel_L1_M1_20:
INIT1x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble dtrmm_kernel_L1_M1_40
+ ble .Ldtrmm_kernel_L1_M1_40
-dtrmm_kernel_L1_M1_22:
+.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M1_22
+ bgt .Ldtrmm_kernel_L1_M1_22
-dtrmm_kernel_L1_M1_40:
+.Ldtrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble dtrmm_kernel_L1_M1_100
+ ble .Ldtrmm_kernel_L1_M1_100
-dtrmm_kernel_L1_M1_42:
+.Ldtrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt dtrmm_kernel_L1_M1_42
+ bgt .Ldtrmm_kernel_L1_M1_42
-dtrmm_kernel_L1_M1_100:
+.Ldtrmm_kernel_L1_M1_100:
SAVE1x1
-dtrmm_kernel_L1_END:
+.Ldtrmm_kernel_L1_END:
-dtrmm_kernel_L999:
+.Ldtrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
SAVE_REGS
cmp N, xzr
- ble gemv_n_kernel_L999
+ ble .Lgemv_n_kernel_L999
cmp M, xzr
- ble gemv_n_kernel_L999
+ ble .Lgemv_n_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
mov J, N
cmp INC_Y, #1
- bne gemv_n_kernel_S_BEGIN
+ bne .Lgemv_n_kernel_S_BEGIN
-gemv_n_kernel_F_LOOP:
+.Lgemv_n_kernel_F_LOOP:
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
mov Y_IPTR, Y
mov Y_OPTR, Y
-gemv_n_kernel_F32:
+.Lgemv_n_kernel_F32:
asr I, M, #5
cmp I, xzr
- beq gemv_n_kernel_F4
+ beq .Lgemv_n_kernel_F4
-gemv_n_kernel_F320:
+.Lgemv_n_kernel_F320:
KERNEL_F16
KERNEL_F16
subs I, I, #1
- bne gemv_n_kernel_F320
+ bne .Lgemv_n_kernel_F320
-gemv_n_kernel_F4:
+.Lgemv_n_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
- beq gemv_n_kernel_F1
+ beq .Lgemv_n_kernel_F1
-gemv_n_kernel_F40:
+.Lgemv_n_kernel_F40:
KERNEL_F4
subs I, I, #1
- bne gemv_n_kernel_F40
+ bne .Lgemv_n_kernel_F40
-gemv_n_kernel_F1:
+.Lgemv_n_kernel_F1:
ands I, M, #3
- ble gemv_n_kernel_F_END
+ ble .Lgemv_n_kernel_F_END
-gemv_n_kernel_F10:
+.Lgemv_n_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne gemv_n_kernel_F10
+ bne .Lgemv_n_kernel_F10
-gemv_n_kernel_F_END:
+.Lgemv_n_kernel_F_END:
add A, A, LDA
subs J, J, #1
- bne gemv_n_kernel_F_LOOP
+ bne .Lgemv_n_kernel_F_LOOP
- b gemv_n_kernel_L999
+ b .Lgemv_n_kernel_L999
-gemv_n_kernel_S_BEGIN:
+.Lgemv_n_kernel_S_BEGIN:
INIT_S
-gemv_n_kernel_S_LOOP:
+.Lgemv_n_kernel_S_LOOP:
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
asr I, M, #2
cmp I, xzr
- ble gemv_n_kernel_S1
+ ble .Lgemv_n_kernel_S1
-gemv_n_kernel_S4:
+.Lgemv_n_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne gemv_n_kernel_S4
+ bne .Lgemv_n_kernel_S4
-gemv_n_kernel_S1:
+.Lgemv_n_kernel_S1:
ands I, M, #3
- ble gemv_n_kernel_S_END
+ ble .Lgemv_n_kernel_S_END
-gemv_n_kernel_S10:
+.Lgemv_n_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne gemv_n_kernel_S10
+ bne .Lgemv_n_kernel_S10
-gemv_n_kernel_S_END:
+.Lgemv_n_kernel_S_END:
add A, A, LDA
subs J, J, #1
- bne gemv_n_kernel_S_LOOP
+ bne .Lgemv_n_kernel_S_LOOP
-gemv_n_kernel_L999:
+.Lgemv_n_kernel_L999:
mov w0, wzr
SAVE_REGS
cmp N, xzr
- ble gemv_t_kernel_L999
+ ble .Lgemv_t_kernel_L999
cmp M, xzr
- ble gemv_t_kernel_L999
+ ble .Lgemv_t_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
mov J, N
cmp INC_X, #1
- bne gemv_t_kernel_S_BEGIN
+ bne .Lgemv_t_kernel_S_BEGIN
-gemv_t_kernel_F_LOOP:
+.Lgemv_t_kernel_F_LOOP:
fmov TEMP, REG0
fmov TEMP1, REG0
mov A_PTR, A
mov X_PTR, X
-gemv_t_kernel_F32:
+.Lgemv_t_kernel_F32:
asr I, M, #5
cmp I, xzr
- beq gemv_t_kernel_F4
+ beq .Lgemv_t_kernel_F4
-gemv_t_kernel_F320:
+.Lgemv_t_kernel_F320:
KERNEL_F32
subs I, I, #1
- bne gemv_t_kernel_F320
+ bne .Lgemv_t_kernel_F320
KERNEL_F32_FINALIZE
-gemv_t_kernel_F4:
+.Lgemv_t_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
- beq gemv_t_kernel_F1
+ beq .Lgemv_t_kernel_F1
-gemv_t_kernel_F40:
+.Lgemv_t_kernel_F40:
KERNEL_F4
subs I, I, #1
- bne gemv_t_kernel_F40
+ bne .Lgemv_t_kernel_F40
-gemv_t_kernel_F1:
+.Lgemv_t_kernel_F1:
KERNEL_F4_FINALIZE
ands I, M, #3
- ble gemv_t_kernel_F_END
+ ble .Lgemv_t_kernel_F_END
-gemv_t_kernel_F10:
+.Lgemv_t_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne gemv_t_kernel_F10
+ bne .Lgemv_t_kernel_F10
-gemv_t_kernel_F_END:
+.Lgemv_t_kernel_F_END:
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
- bne gemv_t_kernel_F_LOOP
+ bne .Lgemv_t_kernel_F_LOOP
- b gemv_t_kernel_L999
+ b .Lgemv_t_kernel_L999
-gemv_t_kernel_S_BEGIN:
+.Lgemv_t_kernel_S_BEGIN:
INIT_S
-gemv_t_kernel_S_LOOP:
+.Lgemv_t_kernel_S_LOOP:
fmov TEMP, REG0
mov A_PTR, A
asr I, M, #2
cmp I, xzr
- ble gemv_t_kernel_S1
+ ble .Lgemv_t_kernel_S1
-gemv_t_kernel_S4:
+.Lgemv_t_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne gemv_t_kernel_S4
+ bne .Lgemv_t_kernel_S4
-gemv_t_kernel_S1:
+.Lgemv_t_kernel_S1:
ands I, M, #3
- ble gemv_t_kernel_S_END
+ ble .Lgemv_t_kernel_S_END
-gemv_t_kernel_S10:
+.Lgemv_t_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne gemv_t_kernel_S10
+ bne .Lgemv_t_kernel_S10
-gemv_t_kernel_S_END:
+.Lgemv_t_kernel_S_END:
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
- bne gemv_t_kernel_S_LOOP
+ bne .Lgemv_t_kernel_S_LOOP
-gemv_t_kernel_L999:
+.Lgemv_t_kernel_L999:
RESTORE_REGS
PROLOGUE
cmp N, xzr
- ble iamax_kernel_zero
+ ble .Liamax_kernel_zero
cmp INC_X, xzr
- ble iamax_kernel_zero
+ ble .Liamax_kernel_zero
cmp INC_X, #1
- bne iamax_kernel_S_BEGIN
+ bne .Liamax_kernel_S_BEGIN
mov x7, X
-iamax_kernel_F_BEGIN:
+.Liamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
- ble iamax_kernel_L999
+ ble .Liamax_kernel_L999
asr I, N, #3
cmp I, xzr
- beq iamax_kernel_F1
+ beq .Liamax_kernel_F1
add Z, Z, #1
-iamax_kernel_F8:
+.Liamax_kernel_F8:
KERNEL_F8
subs I, I, #1
- bne iamax_kernel_F8
+ bne .Liamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
-iamax_kernel_F1:
+.Liamax_kernel_F1:
ands I, N, #7
- ble iamax_kernel_L999
+ ble .Liamax_kernel_L999
-iamax_kernel_F10:
+.Liamax_kernel_F10:
KERNEL_S1
subs I, I, #1
- bne iamax_kernel_F10
+ bne .Liamax_kernel_F10
- b iamax_kernel_L999
+ b .Liamax_kernel_L999
-iamax_kernel_S_BEGIN:
+.Liamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
- ble iamax_kernel_L999
+ ble .Liamax_kernel_L999
asr I, N, #2
cmp I, xzr
- ble iamax_kernel_S1
+ ble .Liamax_kernel_S1
-iamax_kernel_S4:
+.Liamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne iamax_kernel_S4
+ bne .Liamax_kernel_S4
-iamax_kernel_S1:
+.Liamax_kernel_S1:
ands I, N, #3
- ble iamax_kernel_L999
+ ble .Liamax_kernel_L999
-iamax_kernel_S10:
+.Liamax_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne iamax_kernel_S10
+ bne .Liamax_kernel_S10
-iamax_kernel_L999:
+.Liamax_kernel_L999:
mov x0, INDEX
ret
-iamax_kernel_zero:
+.Liamax_kernel_zero:
mov x0, xzr
ret
PROLOGUE
cmp N, xzr
- ble iamax_kernel_zero
+ ble .Lizamax_kernel_zero
cmp INC_X, xzr
- ble iamax_kernel_zero
+ ble .Lizamax_kernel_zero
cmp INC_X, #1
- bne iamax_kernel_S_BEGIN
+ bne .Lizamax_kernel_S_BEGIN
mov x7, X
-iamax_kernel_F_BEGIN:
+.Lizamax_kernel_F_BEGIN:
INIT_S
subs N, N, #1
- ble iamax_kernel_L999
+ ble .Lizamax_kernel_L999
asr I, N, #3
cmp I, xzr
- ble iamax_kernel_F1
+ ble .Lizamax_kernel_F1
add Z, Z, #1
-iamax_kernel_F8:
+.Lizamax_kernel_F8:
KERNEL_F8
subs I, I, #1
- bne iamax_kernel_F8
+ bne .Lizamax_kernel_F8
KERNEL_F8_FINALIZE
sub Z, Z, #1
-iamax_kernel_F1:
+.Lizamax_kernel_F1:
ands I, N, #7
- ble iamax_kernel_L999
+ ble .Lizamax_kernel_L999
-iamax_kernel_F10:
+.Lizamax_kernel_F10:
KERNEL_S1
subs I, I, #1
- bne iamax_kernel_F10
+ bne .Lizamax_kernel_F10
- b iamax_kernel_L999
+ b .Lizamax_kernel_L999
-iamax_kernel_S_BEGIN:
+.Lizamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
- ble iamax_kernel_L999
+ ble .Lizamax_kernel_L999
asr I, N, #2
cmp I, xzr
- ble iamax_kernel_S1
+ ble .Lizamax_kernel_S1
-iamax_kernel_S4:
+.Lizamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne iamax_kernel_S4
+ bne .Lizamax_kernel_S4
-iamax_kernel_S1:
+.Lizamax_kernel_S1:
ands I, N, #3
- ble iamax_kernel_L999
+ ble .Lizamax_kernel_L999
-iamax_kernel_S10:
+.Lizamax_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne iamax_kernel_S10
+ bne .Lizamax_kernel_S10
-iamax_kernel_L999:
+.Lizamax_kernel_L999:
mov x0, INDEX
ret
-iamax_kernel_zero:
+.Lizamax_kernel_zero:
mov x0, xzr
ret
INIT
cmp N, #0
- ble nrm2_kernel_L999
+ ble .Lnrm2_kernel_L999
cmp INC_X, #0
- beq nrm2_kernel_L999
+ beq .Lnrm2_kernel_L999
cmp INC_X, #1
- bne nrm2_kernel_S_BEGIN
+ bne .Lnrm2_kernel_S_BEGIN
-nrm2_kernel_F_BEGIN:
+.Lnrm2_kernel_F_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, xzr
- ble nrm2_kernel_F1
+ ble .Lnrm2_kernel_F1
-nrm2_kernel_F8:
+.Lnrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
- bne nrm2_kernel_F8
+ bne .Lnrm2_kernel_F8
-nrm2_kernel_F1:
+.Lnrm2_kernel_F1:
ands I, N, #7
- ble nrm2_kernel_L999
+ ble .Lnrm2_kernel_L999
-nrm2_kernel_F10:
+.Lnrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne nrm2_kernel_F10
+ bne .Lnrm2_kernel_F10
- b nrm2_kernel_L999
+ b .Lnrm2_kernel_L999
-nrm2_kernel_S_BEGIN:
+.Lnrm2_kernel_S_BEGIN:
INIT_S
.align 5
-nrm2_kernel_S10:
+.Lnrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne nrm2_kernel_S10
+ bne .Lnrm2_kernel_S10
-nrm2_kernel_L999:
+.Lnrm2_kernel_L999:
fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ
PROLOGUE
cmp N, xzr
- ble rot_kernel_L999
+ ble .Lrot_kernel_L999
INIT
cmp INC_X, #1
- bne rot_kernel_S_BEGIN
+ bne .Lrot_kernel_S_BEGIN
cmp INC_Y, #1
- bne rot_kernel_S_BEGIN
+ bne .Lrot_kernel_S_BEGIN
-rot_kernel_F_BEGIN:
+.Lrot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
- beq rot_kernel_F1
+ beq .Lrot_kernel_F1
KERNEL_INIT_F4
-rot_kernel_F4:
+.Lrot_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne rot_kernel_F4
+ bne .Lrot_kernel_F4
-rot_kernel_F1:
+.Lrot_kernel_F1:
ands I, N, #3
- ble rot_kernel_L999
+ ble .Lrot_kernel_L999
INIT_F1
-rot_kernel_F10:
+.Lrot_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne rot_kernel_F10
+ bne .Lrot_kernel_F10
mov w0, wzr
ret
-rot_kernel_S_BEGIN:
+.Lrot_kernel_S_BEGIN:
INIT_S
INIT_F1
asr I, N, #2
cmp I, xzr
- ble rot_kernel_S1
+ ble .Lrot_kernel_S1
-rot_kernel_S4:
+.Lrot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne rot_kernel_S4
+ bne .Lrot_kernel_S4
-rot_kernel_S1:
+.Lrot_kernel_S1:
ands I, N, #3
- ble rot_kernel_L999
+ ble .Lrot_kernel_L999
-rot_kernel_S10:
+.Lrot_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne rot_kernel_S10
+ bne .Lrot_kernel_S10
-rot_kernel_L999:
+.Lrot_kernel_L999:
mov w0, wzr
ret
PROLOGUE
cmp N, xzr
- ble scal_kernel_L999
+ ble .Lscal_kernel_L999
fcmp DA, #0.0
- beq scal_kernel_zero
+ beq .Lscal_kernel_zero
cmp INC_X, #1
- bne scal_kernel_S_BEGIN
+ bne .Lscal_kernel_S_BEGIN
-scal_kernel_F_BEGIN:
+.Lscal_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
- beq scal_kernel_F1
+ beq .Lscal_kernel_F1
KERNEL_INIT_F8
-scal_kernel_F8:
+.Lscal_kernel_F8:
KERNEL_F8
subs I, I, #1
- bne scal_kernel_F8
+ bne .Lscal_kernel_F8
-scal_kernel_F1:
+.Lscal_kernel_F1:
ands I, N, #7
- ble scal_kernel_L999
+ ble .Lscal_kernel_L999
-scal_kernel_F10:
+.Lscal_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne scal_kernel_F10
+ bne .Lscal_kernel_F10
mov w0, wzr
ret
-scal_kernel_S_BEGIN:
+.Lscal_kernel_S_BEGIN:
INIT_S
mov X_COPY, X
asr I, N, #2
cmp I, xzr
- ble scal_kernel_S1
+ ble .Lscal_kernel_S1
-scal_kernel_S4:
+.Lscal_kernel_S4:
KERNEL_S4
subs I, I, #1
- bne scal_kernel_S4
+ bne .Lscal_kernel_S4
-scal_kernel_S1:
+.Lscal_kernel_S1:
ands I, N, #3
- ble scal_kernel_L999
+ ble .Lscal_kernel_L999
-scal_kernel_S10:
+.Lscal_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne scal_kernel_S10
+ bne .Lscal_kernel_S10
-scal_kernel_L999:
+.Lscal_kernel_L999:
mov w0, wzr
ret
-scal_kernel_zero:
+.Lscal_kernel_zero:
INIT_S
-scal_kernel_Z1:
+.Lscal_kernel_Z1:
st1 DAV, [X], INC_X
subs N, N, #1
- bne scal_kernel_Z1
+ bne .Lscal_kernel_Z1
mov w0, wzr
ret
PROLOGUE
-sgemm_kernel_begin:
+.Lsgemm_kernel_begin:
.align 5
add sp, sp, #-(11 * 16)
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble sgemm_kernel_L2_BEGIN
+ ble .Lsgemm_kernel_L2_BEGIN
/******************************************************************************/
-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
mov pA, origPA // pA = start of A array
-sgemm_kernel_L4_M16_BEGIN:
+.Lsgemm_kernel_L4_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
- ble sgemm_kernel_L4_M8_BEGIN
+ ble .Lsgemm_kernel_L4_M8_BEGIN
.align 5
-sgemm_kernel_L4_M16_20:
+.Lsgemm_kernel_L4_M16_20:
mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
- blt sgemm_kernel_L4_M16_32
+ blt .Lsgemm_kernel_L4_M16_32
KERNEL16x4_I
KERNEL16x4_M2
KERNEL16x4_M2
subs counterL, counterL, #2
- ble sgemm_kernel_L4_M16_22a
+ ble .Lsgemm_kernel_L4_M16_22a
.align 5
-sgemm_kernel_L4_M16_22:
+.Lsgemm_kernel_L4_M16_22:
KERNEL16x4_M1
KERNEL16x4_M2
KERNEL16x4_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M16_22
+ bgt .Lsgemm_kernel_L4_M16_22
.align 5
-sgemm_kernel_L4_M16_22a:
+.Lsgemm_kernel_L4_M16_22a:
KERNEL16x4_M1
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_E
- b sgemm_kernel_L4_M16_44
+ b .Lsgemm_kernel_L4_M16_44
.align 5
-sgemm_kernel_L4_M16_32:
+.Lsgemm_kernel_L4_M16_32:
tst counterL, #1
- ble sgemm_kernel_L4_M16_40
+ ble .Lsgemm_kernel_L4_M16_40
KERNEL16x4_I
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_E
- b sgemm_kernel_L4_M16_44
+ b .Lsgemm_kernel_L4_M16_44
-sgemm_kernel_L4_M16_40:
+.Lsgemm_kernel_L4_M16_40:
INIT16x4
-sgemm_kernel_L4_M16_44:
+.Lsgemm_kernel_L4_M16_44:
ands counterL , origK, #7
- ble sgemm_kernel_L4_M16_100
+ ble .Lsgemm_kernel_L4_M16_100
.align 5
-sgemm_kernel_L4_M16_46:
+.Lsgemm_kernel_L4_M16_46:
KERNEL16x4_SUB
subs counterL, counterL, #1
- bne sgemm_kernel_L4_M16_46
+ bne .Lsgemm_kernel_L4_M16_46
-sgemm_kernel_L4_M16_100:
+.Lsgemm_kernel_L4_M16_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE16x4
-sgemm_kernel_L4_M16_END:
+.Lsgemm_kernel_L4_M16_END:
subs counterI, counterI, #1
- bne sgemm_kernel_L4_M16_20
+ bne .Lsgemm_kernel_L4_M16_20
//------------------------------------------------------------------------------
-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #8
- ble sgemm_kernel_L4_M4_BEGIN
+ ble .Lsgemm_kernel_L4_M4_BEGIN
-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt sgemm_kernel_L4_M8_32
+ blt .Lsgemm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
subs counterL, counterL, #2
- ble sgemm_kernel_L4_M8_22a
+ ble .Lsgemm_kernel_L4_M8_22a
.align 5
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M8_22
+ bgt .Lsgemm_kernel_L4_M8_22
-sgemm_kernel_L4_M8_22a:
+.Lsgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
- b sgemm_kernel_L4_M8_44
+ b .Lsgemm_kernel_L4_M8_44
-sgemm_kernel_L4_M8_32:
+.Lsgemm_kernel_L4_M8_32:
tst counterL, #1
- ble sgemm_kernel_L4_M8_40
+ ble .Lsgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_E
- b sgemm_kernel_L4_M8_44
+ b .Lsgemm_kernel_L4_M8_44
-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
INIT8x4
-sgemm_kernel_L4_M8_44:
+.Lsgemm_kernel_L4_M8_44:
ands counterL , origK, #1
- ble sgemm_kernel_L4_M8_100
+ ble .Lsgemm_kernel_L4_M8_100
-sgemm_kernel_L4_M8_46:
+.Lsgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
SAVE8x4
-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #4
- ble sgemm_kernel_L4_M2_BEGIN
+ ble .Lsgemm_kernel_L4_M2_BEGIN
-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt sgemm_kernel_L4_M4_32
+ blt .Lsgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble sgemm_kernel_L4_M4_22a
+ ble .Lsgemm_kernel_L4_M4_22a
.align 5
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M4_22
+ bgt .Lsgemm_kernel_L4_M4_22
-sgemm_kernel_L4_M4_22a:
+.Lsgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b sgemm_kernel_L4_M4_44
+ b .Lsgemm_kernel_L4_M4_44
-sgemm_kernel_L4_M4_32:
+.Lsgemm_kernel_L4_M4_32:
tst counterL, #1
- ble sgemm_kernel_L4_M4_40
+ ble .Lsgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b sgemm_kernel_L4_M4_44
+ b .Lsgemm_kernel_L4_M4_44
-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
INIT4x4
-sgemm_kernel_L4_M4_44:
+.Lsgemm_kernel_L4_M4_44:
ands counterL , origK, #1
- ble sgemm_kernel_L4_M4_100
+ ble .Lsgemm_kernel_L4_M4_100
-sgemm_kernel_L4_M4_46:
+.Lsgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
SAVE4x4
-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L4_M1_BEGIN
+ ble .Lsgemm_kernel_L4_M1_BEGIN
-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
INIT2x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L4_M2_40
+ ble .Lsgemm_kernel_L4_M2_40
-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M2_22
+ bgt .Lsgemm_kernel_L4_M2_22
-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M2_100
+ ble .Lsgemm_kernel_L4_M2_100
-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M2_42
+ bgt .Lsgemm_kernel_L4_M2_42
-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
SAVE2x4
-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:
-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
INIT1x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L4_M1_40
+ ble .Lsgemm_kernel_L4_M1_40
-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M1_22
+ bgt .Lsgemm_kernel_L4_M1_22
-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M1_100
+ ble .Lsgemm_kernel_L4_M1_100
-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M1_42
+ bgt .Lsgemm_kernel_L4_M1_42
-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
SAVE1x4
-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
subs counterJ, counterJ , #1 // j--
- bgt sgemm_kernel_L4_BEGIN
+ bgt .Lsgemm_kernel_L4_BEGIN
/******************************************************************************/
-sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble sgemm_kernel_L999
+ ble .Lsgemm_kernel_L999
tst counterJ , #2
- ble sgemm_kernel_L1_BEGIN
+ ble .Lsgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pA, origPA // pA = A
-sgemm_kernel_L2_M16_BEGIN:
+.Lsgemm_kernel_L2_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI,#0
- ble sgemm_kernel_L2_M8_BEGIN
+ ble .Lsgemm_kernel_L2_M8_BEGIN
-sgemm_kernel_L2_M16_20:
+.Lsgemm_kernel_L2_M16_20:
INIT16x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M16_40
+ ble .Lsgemm_kernel_L2_M16_40
.align 5
-sgemm_kernel_L2_M16_22:
+.Lsgemm_kernel_L2_M16_22:
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M16_22
+ bgt .Lsgemm_kernel_L2_M16_22
-sgemm_kernel_L2_M16_40:
+.Lsgemm_kernel_L2_M16_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M16_100
+ ble .Lsgemm_kernel_L2_M16_100
-sgemm_kernel_L2_M16_42:
+.Lsgemm_kernel_L2_M16_42:
KERNEL16x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M16_42
+ bgt .Lsgemm_kernel_L2_M16_42
-sgemm_kernel_L2_M16_100:
+.Lsgemm_kernel_L2_M16_100:
SAVE16x2
-sgemm_kernel_L2_M16_END:
+.Lsgemm_kernel_L2_M16_END:
subs counterI, counterI, #1
- bgt sgemm_kernel_L2_M16_20
+ bgt .Lsgemm_kernel_L2_M16_20
//------------------------------------------------------------------------------
-sgemm_kernel_L2_M8_BEGIN:
+.Lsgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
tst counterI, #8
- ble sgemm_kernel_L2_M4_BEGIN
+ ble .Lsgemm_kernel_L2_M4_BEGIN
-sgemm_kernel_L2_M8_20:
+.Lsgemm_kernel_L2_M8_20:
INIT8x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M8_40
+ ble .Lsgemm_kernel_L2_M8_40
.align 5
-sgemm_kernel_L2_M8_22:
+.Lsgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M8_22
+ bgt .Lsgemm_kernel_L2_M8_22
-sgemm_kernel_L2_M8_40:
+.Lsgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M8_100
+ ble .Lsgemm_kernel_L2_M8_100
-sgemm_kernel_L2_M8_42:
+.Lsgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M8_42
+ bgt .Lsgemm_kernel_L2_M8_42
-sgemm_kernel_L2_M8_100:
+.Lsgemm_kernel_L2_M8_100:
SAVE8x2
-sgemm_kernel_L2_M8_END:
+.Lsgemm_kernel_L2_M8_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
tst counterI, #4
- ble sgemm_kernel_L2_M2_BEGIN
+ ble .Lsgemm_kernel_L2_M2_BEGIN
-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
INIT4x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M4_40
+ ble .Lsgemm_kernel_L2_M4_40
.align 5
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M4_22
+ bgt .Lsgemm_kernel_L2_M4_22
-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M4_100
+ ble .Lsgemm_kernel_L2_M4_100
-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M4_42
+ bgt .Lsgemm_kernel_L2_M4_42
-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
SAVE4x2
-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L2_M1_BEGIN
+ ble .Lsgemm_kernel_L2_M1_BEGIN
-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
INIT2x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M2_40
+ ble .Lsgemm_kernel_L2_M2_40
-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M2_22
+ bgt .Lsgemm_kernel_L2_M2_22
-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M2_100
+ ble .Lsgemm_kernel_L2_M2_100
-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M2_42
+ bgt .Lsgemm_kernel_L2_M2_42
-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
SAVE2x2
-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:
-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
INIT1x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble sgemm_kernel_L2_M1_40
+ ble .Lsgemm_kernel_L2_M1_40
-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M1_22
+ bgt .Lsgemm_kernel_L2_M1_22
-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M1_100
+ ble .Lsgemm_kernel_L2_M1_100
-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M1_42
+ bgt .Lsgemm_kernel_L2_M1_42
-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
SAVE1x2
-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
/******************************************************************************/
-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble sgemm_kernel_L999 // done
+ ble .Lsgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
mov pA, origPA // pA = A
-sgemm_kernel_L1_M16_BEGIN:
+.Lsgemm_kernel_L1_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
- ble sgemm_kernel_L1_M8_BEGIN
+ ble .Lsgemm_kernel_L1_M8_BEGIN
-sgemm_kernel_L1_M16_20:
+.Lsgemm_kernel_L1_M16_20:
INIT16x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M16_40
+ ble .Lsgemm_kernel_L1_M16_40
.align 5
-sgemm_kernel_L1_M16_22:
+.Lsgemm_kernel_L1_M16_22:
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M16_22
+ bgt .Lsgemm_kernel_L1_M16_22
-sgemm_kernel_L1_M16_40:
+.Lsgemm_kernel_L1_M16_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M16_100
+ ble .Lsgemm_kernel_L1_M16_100
-sgemm_kernel_L1_M16_42:
+.Lsgemm_kernel_L1_M16_42:
KERNEL16x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M16_42
+ bgt .Lsgemm_kernel_L1_M16_42
-sgemm_kernel_L1_M16_100:
+.Lsgemm_kernel_L1_M16_100:
SAVE16x1
-sgemm_kernel_L1_M16_END:
+.Lsgemm_kernel_L1_M16_END:
subs counterI, counterI, #1
- bgt sgemm_kernel_L1_M16_20
+ bgt .Lsgemm_kernel_L1_M16_20
//------------------------------------------------------------------------------
-sgemm_kernel_L1_M8_BEGIN:
+.Lsgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
tst counterI, #8
- ble sgemm_kernel_L1_M4_BEGIN
+ ble .Lsgemm_kernel_L1_M4_BEGIN
-sgemm_kernel_L1_M8_20:
+.Lsgemm_kernel_L1_M8_20:
INIT8x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M8_40
+ ble .Lsgemm_kernel_L1_M8_40
.align 5
-sgemm_kernel_L1_M8_22:
+.Lsgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M8_22
+ bgt .Lsgemm_kernel_L1_M8_22
-sgemm_kernel_L1_M8_40:
+.Lsgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M8_100
+ ble .Lsgemm_kernel_L1_M8_100
-sgemm_kernel_L1_M8_42:
+.Lsgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M8_42
+ bgt .Lsgemm_kernel_L1_M8_42
-sgemm_kernel_L1_M8_100:
+.Lsgemm_kernel_L1_M8_100:
SAVE8x1
-sgemm_kernel_L1_M8_END:
+.Lsgemm_kernel_L1_M8_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
tst counterI, #4
- ble sgemm_kernel_L1_M2_BEGIN
+ ble .Lsgemm_kernel_L1_M2_BEGIN
-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
INIT4x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M4_40
+ ble .Lsgemm_kernel_L1_M4_40
.align 5
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M4_22
+ bgt .Lsgemm_kernel_L1_M4_22
-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M4_100
+ ble .Lsgemm_kernel_L1_M4_100
-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M4_42
+ bgt .Lsgemm_kernel_L1_M4_42
-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
SAVE4x1
-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L1_M1_BEGIN
+ ble .Lsgemm_kernel_L1_M1_BEGIN
-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
INIT2x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M2_40
+ ble .Lsgemm_kernel_L1_M2_40
-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M2_22
+ bgt .Lsgemm_kernel_L1_M2_22
-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M2_100
+ ble .Lsgemm_kernel_L1_M2_100
-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M2_42
+ bgt .Lsgemm_kernel_L1_M2_42
-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
SAVE2x1
-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:
-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
INIT1x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M1_40
+ ble .Lsgemm_kernel_L1_M1_40
-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M1_22
+ bgt .Lsgemm_kernel_L1_M1_22
-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M1_100
+ ble .Lsgemm_kernel_L1_M1_100
-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M1_42
+ bgt .Lsgemm_kernel_L1_M1_42
-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
SAVE1x1
-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:
-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
PROLOGUE
-sgemm_kernel_begin:
+.Lsgemm_kernel_begin:
.align 5
add sp, sp, #-(11 * 16)
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble sgemm_kernel_L2_BEGIN
+ ble .Lsgemm_kernel_L2_BEGIN
/******************************************************************************/
-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
mov pA, origPA // pA = start of A array
-sgemm_kernel_L4_M16_BEGIN:
+.Lsgemm_kernel_L4_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
- ble sgemm_kernel_L4_M8_BEGIN
+ ble .Lsgemm_kernel_L4_M8_BEGIN
.align 5
-sgemm_kernel_L4_M16_20:
+.Lsgemm_kernel_L4_M16_20:
mov pB, origPB
asr counterL , origK, #4 // L = K / 16
cmp counterL , #2
- blt sgemm_kernel_L4_M16_32
+ blt .Lsgemm_kernel_L4_M16_32
KERNEL16x4_I
KERNEL16x4_M2
KERNEL16x4_M1_M2_x1
subs counterL, counterL, #2
- ble sgemm_kernel_L4_M16_22a
+ ble .Lsgemm_kernel_L4_M16_22a
.align 5
-sgemm_kernel_L4_M16_22:
+.Lsgemm_kernel_L4_M16_22:
KERNEL16x4_M1_M2_x8
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M16_22
+ bgt .Lsgemm_kernel_L4_M16_22
.align 5
-sgemm_kernel_L4_M16_22a:
+.Lsgemm_kernel_L4_M16_22a:
KERNEL16x4_M1_M2_x4
KERNEL16x4_M1_M2_x2
KERNEL16x4_M1
KERNEL16x4_E
- b sgemm_kernel_L4_M16_44
+ b .Lsgemm_kernel_L4_M16_44
.align 5
-sgemm_kernel_L4_M16_32:
+.Lsgemm_kernel_L4_M16_32:
tst counterL, #1
- ble sgemm_kernel_L4_M16_40
+ ble .Lsgemm_kernel_L4_M16_40
KERNEL16x4_I
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_E
- b sgemm_kernel_L4_M16_44
+ b .Lsgemm_kernel_L4_M16_44
-sgemm_kernel_L4_M16_40:
+.Lsgemm_kernel_L4_M16_40:
INIT16x4
-sgemm_kernel_L4_M16_44:
+.Lsgemm_kernel_L4_M16_44:
ands counterL , origK, #15
- ble sgemm_kernel_L4_M16_100
+ ble .Lsgemm_kernel_L4_M16_100
.align 5
-sgemm_kernel_L4_M16_46:
+.Lsgemm_kernel_L4_M16_46:
KERNEL16x4_SUB
subs counterL, counterL, #1
- bne sgemm_kernel_L4_M16_46
+ bne .Lsgemm_kernel_L4_M16_46
-sgemm_kernel_L4_M16_100:
+.Lsgemm_kernel_L4_M16_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE16x4
-sgemm_kernel_L4_M16_END:
+.Lsgemm_kernel_L4_M16_END:
subs counterI, counterI, #1
- bne sgemm_kernel_L4_M16_20
+ bne .Lsgemm_kernel_L4_M16_20
//------------------------------------------------------------------------------
-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #8
- ble sgemm_kernel_L4_M4_BEGIN
+ ble .Lsgemm_kernel_L4_M4_BEGIN
-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt sgemm_kernel_L4_M8_32
+ blt .Lsgemm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
subs counterL, counterL, #2
- ble sgemm_kernel_L4_M8_22a
+ ble .Lsgemm_kernel_L4_M8_22a
.align 5
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M8_22
+ bgt .Lsgemm_kernel_L4_M8_22
-sgemm_kernel_L4_M8_22a:
+.Lsgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
- b sgemm_kernel_L4_M8_44
+ b .Lsgemm_kernel_L4_M8_44
-sgemm_kernel_L4_M8_32:
+.Lsgemm_kernel_L4_M8_32:
tst counterL, #1
- ble sgemm_kernel_L4_M8_40
+ ble .Lsgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_E
- b sgemm_kernel_L4_M8_44
+ b .Lsgemm_kernel_L4_M8_44
-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
INIT8x4
-sgemm_kernel_L4_M8_44:
+.Lsgemm_kernel_L4_M8_44:
ands counterL , origK, #1
- ble sgemm_kernel_L4_M8_100
+ ble .Lsgemm_kernel_L4_M8_100
-sgemm_kernel_L4_M8_46:
+.Lsgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
SAVE8x4
-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #4
- ble sgemm_kernel_L4_M2_BEGIN
+ ble .Lsgemm_kernel_L4_M2_BEGIN
-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt sgemm_kernel_L4_M4_32
+ blt .Lsgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble sgemm_kernel_L4_M4_22a
+ ble .Lsgemm_kernel_L4_M4_22a
.align 5
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M4_22
+ bgt .Lsgemm_kernel_L4_M4_22
-sgemm_kernel_L4_M4_22a:
+.Lsgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b sgemm_kernel_L4_M4_44
+ b .Lsgemm_kernel_L4_M4_44
-sgemm_kernel_L4_M4_32:
+.Lsgemm_kernel_L4_M4_32:
tst counterL, #1
- ble sgemm_kernel_L4_M4_40
+ ble .Lsgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b sgemm_kernel_L4_M4_44
+ b .Lsgemm_kernel_L4_M4_44
-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
INIT4x4
-sgemm_kernel_L4_M4_44:
+.Lsgemm_kernel_L4_M4_44:
ands counterL , origK, #1
- ble sgemm_kernel_L4_M4_100
+ ble .Lsgemm_kernel_L4_M4_100
-sgemm_kernel_L4_M4_46:
+.Lsgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
SAVE4x4
-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L4_M1_BEGIN
+ ble .Lsgemm_kernel_L4_M1_BEGIN
-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
INIT2x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L4_M2_40
+ ble .Lsgemm_kernel_L4_M2_40
-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M2_22
+ bgt .Lsgemm_kernel_L4_M2_22
-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M2_100
+ ble .Lsgemm_kernel_L4_M2_100
-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M2_42
+ bgt .Lsgemm_kernel_L4_M2_42
-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
SAVE2x4
-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:
-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
INIT1x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L4_M1_40
+ ble .Lsgemm_kernel_L4_M1_40
-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M1_22
+ bgt .Lsgemm_kernel_L4_M1_22
-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M1_100
+ ble .Lsgemm_kernel_L4_M1_100
-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M1_42
+ bgt .Lsgemm_kernel_L4_M1_42
-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
SAVE1x4
-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
subs counterJ, counterJ , #1 // j--
- bgt sgemm_kernel_L4_BEGIN
+ bgt .Lsgemm_kernel_L4_BEGIN
/******************************************************************************/
-sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble sgemm_kernel_L999
+ ble .Lsgemm_kernel_L999
tst counterJ , #2
- ble sgemm_kernel_L1_BEGIN
+ ble .Lsgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pA, origPA // pA = A
-sgemm_kernel_L2_M16_BEGIN:
+.Lsgemm_kernel_L2_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI,#0
- ble sgemm_kernel_L2_M8_BEGIN
+ ble .Lsgemm_kernel_L2_M8_BEGIN
-sgemm_kernel_L2_M16_20:
+.Lsgemm_kernel_L2_M16_20:
INIT16x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M16_40
+ ble .Lsgemm_kernel_L2_M16_40
.align 5
-sgemm_kernel_L2_M16_22:
+.Lsgemm_kernel_L2_M16_22:
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M16_22
+ bgt .Lsgemm_kernel_L2_M16_22
-sgemm_kernel_L2_M16_40:
+.Lsgemm_kernel_L2_M16_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M16_100
+ ble .Lsgemm_kernel_L2_M16_100
-sgemm_kernel_L2_M16_42:
+.Lsgemm_kernel_L2_M16_42:
KERNEL16x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M16_42
+ bgt .Lsgemm_kernel_L2_M16_42
-sgemm_kernel_L2_M16_100:
+.Lsgemm_kernel_L2_M16_100:
SAVE16x2
-sgemm_kernel_L2_M16_END:
+.Lsgemm_kernel_L2_M16_END:
subs counterI, counterI, #1
- bgt sgemm_kernel_L2_M16_20
+ bgt .Lsgemm_kernel_L2_M16_20
//------------------------------------------------------------------------------
-sgemm_kernel_L2_M8_BEGIN:
+.Lsgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
tst counterI, #8
- ble sgemm_kernel_L2_M4_BEGIN
+ ble .Lsgemm_kernel_L2_M4_BEGIN
-sgemm_kernel_L2_M8_20:
+.Lsgemm_kernel_L2_M8_20:
INIT8x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M8_40
+ ble .Lsgemm_kernel_L2_M8_40
.align 5
-sgemm_kernel_L2_M8_22:
+.Lsgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M8_22
+ bgt .Lsgemm_kernel_L2_M8_22
-sgemm_kernel_L2_M8_40:
+.Lsgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M8_100
+ ble .Lsgemm_kernel_L2_M8_100
-sgemm_kernel_L2_M8_42:
+.Lsgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M8_42
+ bgt .Lsgemm_kernel_L2_M8_42
-sgemm_kernel_L2_M8_100:
+.Lsgemm_kernel_L2_M8_100:
SAVE8x2
-sgemm_kernel_L2_M8_END:
+.Lsgemm_kernel_L2_M8_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
tst counterI, #4
- ble sgemm_kernel_L2_M2_BEGIN
+ ble .Lsgemm_kernel_L2_M2_BEGIN
-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
INIT4x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M4_40
+ ble .Lsgemm_kernel_L2_M4_40
.align 5
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M4_22
+ bgt .Lsgemm_kernel_L2_M4_22
-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M4_100
+ ble .Lsgemm_kernel_L2_M4_100
-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M4_42
+ bgt .Lsgemm_kernel_L2_M4_42
-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
SAVE4x2
-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L2_M1_BEGIN
+ ble .Lsgemm_kernel_L2_M1_BEGIN
-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
INIT2x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M2_40
+ ble .Lsgemm_kernel_L2_M2_40
-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M2_22
+ bgt .Lsgemm_kernel_L2_M2_22
-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M2_100
+ ble .Lsgemm_kernel_L2_M2_100
-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M2_42
+ bgt .Lsgemm_kernel_L2_M2_42
-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
SAVE2x2
-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:
-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
INIT1x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble sgemm_kernel_L2_M1_40
+ ble .Lsgemm_kernel_L2_M1_40
-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M1_22
+ bgt .Lsgemm_kernel_L2_M1_22
-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M1_100
+ ble .Lsgemm_kernel_L2_M1_100
-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M1_42
+ bgt .Lsgemm_kernel_L2_M1_42
-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
SAVE1x2
-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
/******************************************************************************/
-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble sgemm_kernel_L999 // done
+ ble .Lsgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
mov pA, origPA // pA = A
-sgemm_kernel_L1_M16_BEGIN:
+.Lsgemm_kernel_L1_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
- ble sgemm_kernel_L1_M8_BEGIN
+ ble .Lsgemm_kernel_L1_M8_BEGIN
-sgemm_kernel_L1_M16_20:
+.Lsgemm_kernel_L1_M16_20:
INIT16x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M16_40
+ ble .Lsgemm_kernel_L1_M16_40
.align 5
-sgemm_kernel_L1_M16_22:
+.Lsgemm_kernel_L1_M16_22:
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M16_22
+ bgt .Lsgemm_kernel_L1_M16_22
-sgemm_kernel_L1_M16_40:
+.Lsgemm_kernel_L1_M16_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M16_100
+ ble .Lsgemm_kernel_L1_M16_100
-sgemm_kernel_L1_M16_42:
+.Lsgemm_kernel_L1_M16_42:
KERNEL16x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M16_42
+ bgt .Lsgemm_kernel_L1_M16_42
-sgemm_kernel_L1_M16_100:
+.Lsgemm_kernel_L1_M16_100:
SAVE16x1
-sgemm_kernel_L1_M16_END:
+.Lsgemm_kernel_L1_M16_END:
subs counterI, counterI, #1
- bgt sgemm_kernel_L1_M16_20
+ bgt .Lsgemm_kernel_L1_M16_20
//------------------------------------------------------------------------------
-sgemm_kernel_L1_M8_BEGIN:
+.Lsgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
tst counterI, #8
- ble sgemm_kernel_L1_M4_BEGIN
+ ble .Lsgemm_kernel_L1_M4_BEGIN
-sgemm_kernel_L1_M8_20:
+.Lsgemm_kernel_L1_M8_20:
INIT8x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M8_40
+ ble .Lsgemm_kernel_L1_M8_40
.align 5
-sgemm_kernel_L1_M8_22:
+.Lsgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M8_22
+ bgt .Lsgemm_kernel_L1_M8_22
-sgemm_kernel_L1_M8_40:
+.Lsgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M8_100
+ ble .Lsgemm_kernel_L1_M8_100
-sgemm_kernel_L1_M8_42:
+.Lsgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M8_42
+ bgt .Lsgemm_kernel_L1_M8_42
-sgemm_kernel_L1_M8_100:
+.Lsgemm_kernel_L1_M8_100:
SAVE8x1
-sgemm_kernel_L1_M8_END:
+.Lsgemm_kernel_L1_M8_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
tst counterI, #4
- ble sgemm_kernel_L1_M2_BEGIN
+ ble .Lsgemm_kernel_L1_M2_BEGIN
-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
INIT4x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M4_40
+ ble .Lsgemm_kernel_L1_M4_40
.align 5
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M4_22
+ bgt .Lsgemm_kernel_L1_M4_22
-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M4_100
+ ble .Lsgemm_kernel_L1_M4_100
-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M4_42
+ bgt .Lsgemm_kernel_L1_M4_42
-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
SAVE4x1
-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:
//------------------------------------------------------------------------------
-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L1_M1_BEGIN
+ ble .Lsgemm_kernel_L1_M1_BEGIN
-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
INIT2x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M2_40
+ ble .Lsgemm_kernel_L1_M2_40
-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M2_22
+ bgt .Lsgemm_kernel_L1_M2_22
-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M2_100
+ ble .Lsgemm_kernel_L1_M2_100
-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M2_42
+ bgt .Lsgemm_kernel_L1_M2_42
-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
SAVE2x1
-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:
-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
INIT1x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M1_40
+ ble .Lsgemm_kernel_L1_M1_40
-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M1_22
+ bgt .Lsgemm_kernel_L1_M1_22
-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M1_100
+ ble .Lsgemm_kernel_L1_M1_100
-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M1_42
+ bgt .Lsgemm_kernel_L1_M1_42
-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
SAVE1x1
-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:
-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble sgemm_kernel_L2_BEGIN
+ ble .Lsgemm_kernel_L2_BEGIN
/******************************************************************************/
-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
add pA_2, temp, pA_1
add pA_3, temp, pA_2
-sgemm_kernel_L4_M16_BEGIN:
+.Lsgemm_kernel_L4_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
- ble sgemm_kernel_L4_M8_BEGIN
+ ble .Lsgemm_kernel_L4_M8_BEGIN
-sgemm_kernel_L4_M16_20:
+.Lsgemm_kernel_L4_M16_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt sgemm_kernel_L4_M16_32
+ blt .Lsgemm_kernel_L4_M16_32
KERNEL16x4_I // do one in the K
KERNEL16x4_M2 // do another in the K
subs counterL, counterL, #2
- ble sgemm_kernel_L4_M16_22a
+ ble .Lsgemm_kernel_L4_M16_22a
.align 5
-sgemm_kernel_L4_M16_22:
+.Lsgemm_kernel_L4_M16_22:
KERNEL16x4_M1
KERNEL16x4_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M16_22
+ bgt .Lsgemm_kernel_L4_M16_22
-sgemm_kernel_L4_M16_22a:
+.Lsgemm_kernel_L4_M16_22a:
KERNEL16x4_M1
KERNEL16x4_E
- b sgemm_kernel_L4_M16_44
+ b .Lsgemm_kernel_L4_M16_44
-sgemm_kernel_L4_M16_32:
+.Lsgemm_kernel_L4_M16_32:
tst counterL, #1
- ble sgemm_kernel_L4_M16_40
+ ble .Lsgemm_kernel_L4_M16_40
KERNEL16x4_I
KERNEL16x4_E
- b sgemm_kernel_L4_M16_44
+ b .Lsgemm_kernel_L4_M16_44
-sgemm_kernel_L4_M16_40:
+.Lsgemm_kernel_L4_M16_40:
INIT16x4
-sgemm_kernel_L4_M16_44:
+.Lsgemm_kernel_L4_M16_44:
ands counterL , origK, #1
- ble sgemm_kernel_L4_M16_100
+ ble .Lsgemm_kernel_L4_M16_100
-sgemm_kernel_L4_M16_46:
+.Lsgemm_kernel_L4_M16_46:
KERNEL16x4_SUB
-sgemm_kernel_L4_M16_100:
+.Lsgemm_kernel_L4_M16_100:
SAVE16x4
-sgemm_kernel_L4_M16_END:
+.Lsgemm_kernel_L4_M16_END:
lsl temp, origK, #4 // k * 4 * 4 = Four rows of A
add pA_0, pA_0, temp
add pA_0, pA_0, temp
add pA_2, pA_1, temp
add pA_3, pA_2, temp
subs counterI, counterI, #1
- bne sgemm_kernel_L4_M16_20
+ bne .Lsgemm_kernel_L4_M16_20
-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #8
- ble sgemm_kernel_L4_M4_BEGIN
+ ble .Lsgemm_kernel_L4_M4_BEGIN
-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
INIT8x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble sgemm_kernel_L4_M8_40
+ ble .Lsgemm_kernel_L4_M8_40
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
KERNEL8x4_SUB
KERNEL8x4_SUB
KERNEL8x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M8_22
+ bgt .Lsgemm_kernel_L4_M8_22
-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M8_100
+ ble .Lsgemm_kernel_L4_M8_100
-sgemm_kernel_L4_M8_42:
+.Lsgemm_kernel_L4_M8_42:
KERNEL8x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M8_42
+ bgt .Lsgemm_kernel_L4_M8_42
-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
SAVE8x4
-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
lsl temp, origK, #4 // k * 4 * 4
add pA_0, pA_0, temp
-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #4
- ble sgemm_kernel_L4_M2_BEGIN
+ ble .Lsgemm_kernel_L4_M2_BEGIN
-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
INIT4x4
mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble sgemm_kernel_L4_M4_40
+ ble .Lsgemm_kernel_L4_M4_40
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
KERNEL4x4_SUB
KERNEL4x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M4_22
+ bgt .Lsgemm_kernel_L4_M4_22
-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M4_100
+ ble .Lsgemm_kernel_L4_M4_100
-sgemm_kernel_L4_M4_42:
+.Lsgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M4_42
+ bgt .Lsgemm_kernel_L4_M4_42
-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
SAVE4x4
-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:
-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L4_M1_BEGIN
+ ble .Lsgemm_kernel_L4_M1_BEGIN
-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L4_M2_40
+ ble .Lsgemm_kernel_L4_M2_40
-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M2_22
+ bgt .Lsgemm_kernel_L4_M2_22
-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M2_100
+ ble .Lsgemm_kernel_L4_M2_100
-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M2_42
+ bgt .Lsgemm_kernel_L4_M2_42
-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
SAVE2x4
-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:
-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L4_M1_40
+ ble .Lsgemm_kernel_L4_M1_40
-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M1_22
+ bgt .Lsgemm_kernel_L4_M1_22
-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M1_100
+ ble .Lsgemm_kernel_L4_M1_100
-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M1_42
+ bgt .Lsgemm_kernel_L4_M1_42
-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
SAVE1x4
-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
lsl temp, origK, #4
add origPB, origPB, temp // B = B + K * 4 * 4
subs counterJ, counterJ , #1 // j--
- bgt sgemm_kernel_L4_BEGIN
+ bgt .Lsgemm_kernel_L4_BEGIN
/******************************************************************************/
-sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble sgemm_kernel_L999
+ ble .Lsgemm_kernel_L999
tst counterJ , #2
- ble sgemm_kernel_L1_BEGIN
+ ble .Lsgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble sgemm_kernel_L2_M2_BEGIN
+ ble .Lsgemm_kernel_L2_M2_BEGIN
-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M4_40
+ ble .Lsgemm_kernel_L2_M4_40
.align 5
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M4_22
+ bgt .Lsgemm_kernel_L2_M4_22
-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M4_100
+ ble .Lsgemm_kernel_L2_M4_100
-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M4_42
+ bgt .Lsgemm_kernel_L2_M4_42
-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
SAVE4x2
-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt sgemm_kernel_L2_M4_20
+ bgt .Lsgemm_kernel_L2_M4_20
-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L2_M1_BEGIN
+ ble .Lsgemm_kernel_L2_M1_BEGIN
-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M2_40
+ ble .Lsgemm_kernel_L2_M2_40
-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M2_22
+ bgt .Lsgemm_kernel_L2_M2_22
-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M2_100
+ ble .Lsgemm_kernel_L2_M2_100
-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M2_42
+ bgt .Lsgemm_kernel_L2_M2_42
-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
SAVE2x2
-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:
-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble sgemm_kernel_L2_M1_40
+ ble .Lsgemm_kernel_L2_M1_40
-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M1_22
+ bgt .Lsgemm_kernel_L2_M1_22
-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M1_100
+ ble .Lsgemm_kernel_L2_M1_100
-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M1_42
+ bgt .Lsgemm_kernel_L2_M1_42
-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
SAVE1x2
-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
/******************************************************************************/
-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble sgemm_kernel_L999 // done
+ ble .Lsgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble sgemm_kernel_L1_M2_BEGIN
+ ble .Lsgemm_kernel_L1_M2_BEGIN
-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M4_40
+ ble .Lsgemm_kernel_L1_M4_40
.align 5
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M4_22
+ bgt .Lsgemm_kernel_L1_M4_22
-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M4_100
+ ble .Lsgemm_kernel_L1_M4_100
-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M4_42
+ bgt .Lsgemm_kernel_L1_M4_42
-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
SAVE4x1
-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt sgemm_kernel_L1_M4_20
+ bgt .Lsgemm_kernel_L1_M4_20
-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L1_M1_BEGIN
+ ble .Lsgemm_kernel_L1_M1_BEGIN
-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M2_40
+ ble .Lsgemm_kernel_L1_M2_40
-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M2_22
+ bgt .Lsgemm_kernel_L1_M2_22
-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M2_100
+ ble .Lsgemm_kernel_L1_M2_100
-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M2_42
+ bgt .Lsgemm_kernel_L1_M2_42
-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
SAVE2x1
-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:
-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M1_40
+ ble .Lsgemm_kernel_L1_M1_40
-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M1_22
+ bgt .Lsgemm_kernel_L1_M1_22
-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M1_100
+ ble .Lsgemm_kernel_L1_M1_100
-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M1_42
+ bgt .Lsgemm_kernel_L1_M1_42
-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
SAVE1x1
-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:
-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
PROLOGUE
-sgemm_kernel_begin:
+.Lsgemm_kernel_begin:
.align 5
add sp, sp, #-(11 * 16)
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
- ble sgemm_kernel_L4_BEGIN
+ ble .Lsgemm_kernel_L4_BEGIN
/******************************************************************************/
/******************************************************************************/
-sgemm_kernel_L8_BEGIN:
+.Lsgemm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3
/******************************************************************************/
-sgemm_kernel_L8_M8_BEGIN:
+.Lsgemm_kernel_L8_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble sgemm_kernel_L8_M4_BEGIN
+ ble .Lsgemm_kernel_L8_M4_BEGIN
-sgemm_kernel_L8_M8_20:
+.Lsgemm_kernel_L8_M8_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt sgemm_kernel_L8_M8_32
+ blt .Lsgemm_kernel_L8_M8_32
KERNEL8x8_I // do one in the K
KERNEL8x8_M2 // do another in the K
subs counterL, counterL, #2
- ble sgemm_kernel_L8_M8_22a
+ ble .Lsgemm_kernel_L8_M8_22a
.align 5
-sgemm_kernel_L8_M8_22:
+.Lsgemm_kernel_L8_M8_22:
KERNEL8x8_M1
KERNEL8x8_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L8_M8_22
+ bgt .Lsgemm_kernel_L8_M8_22
-sgemm_kernel_L8_M8_22a:
+.Lsgemm_kernel_L8_M8_22a:
KERNEL8x8_M1
KERNEL8x8_E
- b sgemm_kernel_L8_M8_44
+ b .Lsgemm_kernel_L8_M8_44
-sgemm_kernel_L8_M8_32:
+.Lsgemm_kernel_L8_M8_32:
tst counterL, #1
- ble sgemm_kernel_L8_M8_40
+ ble .Lsgemm_kernel_L8_M8_40
KERNEL8x8_I
KERNEL8x8_E
- b sgemm_kernel_L8_M8_44
+ b .Lsgemm_kernel_L8_M8_44
-sgemm_kernel_L8_M8_40:
+.Lsgemm_kernel_L8_M8_40:
INIT8x8
-sgemm_kernel_L8_M8_44:
+.Lsgemm_kernel_L8_M8_44:
ands counterL , origK, #1
- ble sgemm_kernel_L8_M8_100
+ ble .Lsgemm_kernel_L8_M8_100
-sgemm_kernel_L8_M8_46:
+.Lsgemm_kernel_L8_M8_46:
KERNEL8x8_SUB
-sgemm_kernel_L8_M8_100:
+.Lsgemm_kernel_L8_M8_100:
SAVE8x8
-sgemm_kernel_L8_M8_END:
+.Lsgemm_kernel_L8_M8_END:
subs counterI, counterI, #1
- bne sgemm_kernel_L8_M8_20
+ bne .Lsgemm_kernel_L8_M8_20
/******************************************************************************/
-sgemm_kernel_L8_M4_BEGIN:
+.Lsgemm_kernel_L8_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L8_END
+ ble .Lsgemm_kernel_L8_END
tst counterI, #4
- ble sgemm_kernel_L8_M2_BEGIN
+ ble .Lsgemm_kernel_L8_M2_BEGIN
-sgemm_kernel_L8_M4_20:
+.Lsgemm_kernel_L8_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt sgemm_kernel_L8_M4_32
+ blt .Lsgemm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2
- ble sgemm_kernel_L8_M4_22a
+ ble .Lsgemm_kernel_L8_M4_22a
.align 5
-sgemm_kernel_L8_M4_22:
+.Lsgemm_kernel_L8_M4_22:
KERNEL4x8_M1
KERNEL4x8_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L8_M4_22
+ bgt .Lsgemm_kernel_L8_M4_22
-sgemm_kernel_L8_M4_22a:
+.Lsgemm_kernel_L8_M4_22a:
KERNEL4x8_M1
KERNEL4x8_E
- b sgemm_kernel_L8_M4_44
+ b .Lsgemm_kernel_L8_M4_44
-sgemm_kernel_L8_M4_32:
+.Lsgemm_kernel_L8_M4_32:
tst counterL, #1
- ble sgemm_kernel_L8_M4_40
+ ble .Lsgemm_kernel_L8_M4_40
KERNEL4x8_I
KERNEL4x8_E
- b sgemm_kernel_L8_M4_44
+ b .Lsgemm_kernel_L8_M4_44
-sgemm_kernel_L8_M4_40:
+.Lsgemm_kernel_L8_M4_40:
INIT4x8
-sgemm_kernel_L8_M4_44:
+.Lsgemm_kernel_L8_M4_44:
ands counterL , origK, #1
- ble sgemm_kernel_L8_M4_100
+ ble .Lsgemm_kernel_L8_M4_100
-sgemm_kernel_L8_M4_46:
+.Lsgemm_kernel_L8_M4_46:
KERNEL4x8_SUB
-sgemm_kernel_L8_M4_100:
+.Lsgemm_kernel_L8_M4_100:
SAVE4x8
-sgemm_kernel_L8_M4_END:
+.Lsgemm_kernel_L8_M4_END:
/******************************************************************************/
-sgemm_kernel_L8_M2_BEGIN:
+.Lsgemm_kernel_L8_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L8_END
+ ble .Lsgemm_kernel_L8_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L8_M1_BEGIN
+ ble .Lsgemm_kernel_L8_M1_BEGIN
-sgemm_kernel_L8_M2_20:
+.Lsgemm_kernel_L8_M2_20:
INIT2x8
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L8_M2_40
+ ble .Lsgemm_kernel_L8_M2_40
-sgemm_kernel_L8_M2_22:
+.Lsgemm_kernel_L8_M2_22:
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L8_M2_22
+ bgt .Lsgemm_kernel_L8_M2_22
-sgemm_kernel_L8_M2_40:
+.Lsgemm_kernel_L8_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L8_M2_100
+ ble .Lsgemm_kernel_L8_M2_100
-sgemm_kernel_L8_M2_42:
+.Lsgemm_kernel_L8_M2_42:
KERNEL2x8_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L8_M2_42
+ bgt .Lsgemm_kernel_L8_M2_42
-sgemm_kernel_L8_M2_100:
+.Lsgemm_kernel_L8_M2_100:
SAVE2x8
-sgemm_kernel_L8_M2_END:
+.Lsgemm_kernel_L8_M2_END:
/******************************************************************************/
-sgemm_kernel_L8_M1_BEGIN:
+.Lsgemm_kernel_L8_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L8_END
+ ble .Lsgemm_kernel_L8_END
-sgemm_kernel_L8_M1_20:
+.Lsgemm_kernel_L8_M1_20:
INIT1x8
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L8_M1_40
+ ble .Lsgemm_kernel_L8_M1_40
-sgemm_kernel_L8_M1_22:
+.Lsgemm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L8_M1_22
+ bgt .Lsgemm_kernel_L8_M1_22
-sgemm_kernel_L8_M1_40:
+.Lsgemm_kernel_L8_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L8_M1_100
+ ble .Lsgemm_kernel_L8_M1_100
-sgemm_kernel_L8_M1_42:
+.Lsgemm_kernel_L8_M1_42:
KERNEL1x8_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L8_M1_42
+ bgt .Lsgemm_kernel_L8_M1_42
-sgemm_kernel_L8_M1_100:
+.Lsgemm_kernel_L8_M1_100:
SAVE1x8
-sgemm_kernel_L8_END:
+.Lsgemm_kernel_L8_END:
lsl temp, origK, #5 // B = B + K * 4 * 8
add origPB, origPB, temp
subs counterJ, counterJ , #1 // j--
- bgt sgemm_kernel_L8_BEGIN
+ bgt .Lsgemm_kernel_L8_BEGIN
/******************************************************************************/
/******************************************************************************/
-sgemm_kernel_L4_BEGIN:
+.Lsgemm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #7
- ble sgemm_kernel_L999
+ ble .Lsgemm_kernel_L999
tst counterJ , #4
- ble sgemm_kernel_L2_BEGIN
+ ble .Lsgemm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = pC
/******************************************************************************/
-sgemm_kernel_L4_M8_BEGIN:
+.Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble sgemm_kernel_L4_M4_BEGIN
+ ble .Lsgemm_kernel_L4_M4_BEGIN
-sgemm_kernel_L4_M8_20:
+.Lsgemm_kernel_L4_M8_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt sgemm_kernel_L4_M8_32
+ blt .Lsgemm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
subs counterL, counterL, #2
- ble sgemm_kernel_L4_M8_22a
+ ble .Lsgemm_kernel_L4_M8_22a
.align 5
-sgemm_kernel_L4_M8_22:
+.Lsgemm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M8_22
+ bgt .Lsgemm_kernel_L4_M8_22
-sgemm_kernel_L4_M8_22a:
+.Lsgemm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
- b sgemm_kernel_L4_M8_44
+ b .Lsgemm_kernel_L4_M8_44
-sgemm_kernel_L4_M8_32:
+.Lsgemm_kernel_L4_M8_32:
tst counterL, #1
- ble sgemm_kernel_L4_M8_40
+ ble .Lsgemm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_E
- b sgemm_kernel_L4_M8_44
+ b .Lsgemm_kernel_L4_M8_44
-sgemm_kernel_L4_M8_40:
+.Lsgemm_kernel_L4_M8_40:
INIT8x4
-sgemm_kernel_L4_M8_44:
+.Lsgemm_kernel_L4_M8_44:
ands counterL , origK, #1
- ble sgemm_kernel_L4_M8_100
+ ble .Lsgemm_kernel_L4_M8_100
-sgemm_kernel_L4_M8_46:
+.Lsgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
-sgemm_kernel_L4_M8_100:
+.Lsgemm_kernel_L4_M8_100:
SAVE8x4
-sgemm_kernel_L4_M8_END:
+.Lsgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
- bne sgemm_kernel_L4_M8_20
+ bne .Lsgemm_kernel_L4_M8_20
/******************************************************************************/
-sgemm_kernel_L4_M4_BEGIN:
+.Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #4
- ble sgemm_kernel_L4_M2_BEGIN
+ ble .Lsgemm_kernel_L4_M2_BEGIN
-sgemm_kernel_L4_M4_20:
+.Lsgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt sgemm_kernel_L4_M4_32
+ blt .Lsgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble sgemm_kernel_L4_M4_22a
+ ble .Lsgemm_kernel_L4_M4_22a
.align 5
-sgemm_kernel_L4_M4_22:
+.Lsgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M4_22
+ bgt .Lsgemm_kernel_L4_M4_22
-sgemm_kernel_L4_M4_22a:
+.Lsgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b sgemm_kernel_L4_M4_44
+ b .Lsgemm_kernel_L4_M4_44
-sgemm_kernel_L4_M4_32:
+.Lsgemm_kernel_L4_M4_32:
tst counterL, #1
- ble sgemm_kernel_L4_M4_40
+ ble .Lsgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b sgemm_kernel_L4_M4_44
+ b .Lsgemm_kernel_L4_M4_44
-sgemm_kernel_L4_M4_40:
+.Lsgemm_kernel_L4_M4_40:
INIT4x4
-sgemm_kernel_L4_M4_44:
+.Lsgemm_kernel_L4_M4_44:
ands counterL , origK, #1
- ble sgemm_kernel_L4_M4_100
+ ble .Lsgemm_kernel_L4_M4_100
-sgemm_kernel_L4_M4_46:
+.Lsgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
-sgemm_kernel_L4_M4_100:
+.Lsgemm_kernel_L4_M4_100:
SAVE4x4
-sgemm_kernel_L4_M4_END:
+.Lsgemm_kernel_L4_M4_END:
/******************************************************************************/
-sgemm_kernel_L4_M2_BEGIN:
+.Lsgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L4_M1_BEGIN
+ ble .Lsgemm_kernel_L4_M1_BEGIN
-sgemm_kernel_L4_M2_20:
+.Lsgemm_kernel_L4_M2_20:
INIT2x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L4_M2_40
+ ble .Lsgemm_kernel_L4_M2_40
-sgemm_kernel_L4_M2_22:
+.Lsgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M2_22
+ bgt .Lsgemm_kernel_L4_M2_22
-sgemm_kernel_L4_M2_40:
+.Lsgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M2_100
+ ble .Lsgemm_kernel_L4_M2_100
-sgemm_kernel_L4_M2_42:
+.Lsgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M2_42
+ bgt .Lsgemm_kernel_L4_M2_42
-sgemm_kernel_L4_M2_100:
+.Lsgemm_kernel_L4_M2_100:
SAVE2x4
-sgemm_kernel_L4_M2_END:
+.Lsgemm_kernel_L4_M2_END:
/******************************************************************************/
-sgemm_kernel_L4_M1_BEGIN:
+.Lsgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L4_END
+ ble .Lsgemm_kernel_L4_END
-sgemm_kernel_L4_M1_20:
+.Lsgemm_kernel_L4_M1_20:
INIT1x4
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L4_M1_40
+ ble .Lsgemm_kernel_L4_M1_40
-sgemm_kernel_L4_M1_22:
+.Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M1_22
+ bgt .Lsgemm_kernel_L4_M1_22
-sgemm_kernel_L4_M1_40:
+.Lsgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L4_M1_100
+ ble .Lsgemm_kernel_L4_M1_100
-sgemm_kernel_L4_M1_42:
+.Lsgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L4_M1_42
+ bgt .Lsgemm_kernel_L4_M1_42
-sgemm_kernel_L4_M1_100:
+.Lsgemm_kernel_L4_M1_100:
SAVE1x4
-sgemm_kernel_L4_END:
+.Lsgemm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
/******************************************************************************/
/******************************************************************************/
-sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble sgemm_kernel_L999
+ ble .Lsgemm_kernel_L999
tst counterJ , #2
- ble sgemm_kernel_L1_BEGIN
+ ble .Lsgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
/******************************************************************************/
-sgemm_kernel_L2_M8_BEGIN:
+.Lsgemm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI,#0
- ble sgemm_kernel_L2_M4_BEGIN
+ ble .Lsgemm_kernel_L2_M4_BEGIN
-sgemm_kernel_L2_M8_20:
+.Lsgemm_kernel_L2_M8_20:
INIT8x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M8_40
+ ble .Lsgemm_kernel_L2_M8_40
.align 5
-sgemm_kernel_L2_M8_22:
+.Lsgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M8_22
+ bgt .Lsgemm_kernel_L2_M8_22
-sgemm_kernel_L2_M8_40:
+.Lsgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M8_100
+ ble .Lsgemm_kernel_L2_M8_100
-sgemm_kernel_L2_M8_42:
+.Lsgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M8_42
+ bgt .Lsgemm_kernel_L2_M8_42
-sgemm_kernel_L2_M8_100:
+.Lsgemm_kernel_L2_M8_100:
SAVE8x2
-sgemm_kernel_L2_M8_END:
+.Lsgemm_kernel_L2_M8_END:
subs counterI, counterI, #1
- bgt sgemm_kernel_L2_M8_20
+ bgt .Lsgemm_kernel_L2_M8_20
/******************************************************************************/
-sgemm_kernel_L2_M4_BEGIN:
+.Lsgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
tst counterI, #4
- ble sgemm_kernel_L2_M2_BEGIN
+ ble .Lsgemm_kernel_L2_M2_BEGIN
-sgemm_kernel_L2_M4_20:
+.Lsgemm_kernel_L2_M4_20:
INIT4x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M4_40
+ ble .Lsgemm_kernel_L2_M4_40
.align 5
-sgemm_kernel_L2_M4_22:
+.Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M4_22
+ bgt .Lsgemm_kernel_L2_M4_22
-sgemm_kernel_L2_M4_40:
+.Lsgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M4_100
+ ble .Lsgemm_kernel_L2_M4_100
-sgemm_kernel_L2_M4_42:
+.Lsgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M4_42
+ bgt .Lsgemm_kernel_L2_M4_42
-sgemm_kernel_L2_M4_100:
+.Lsgemm_kernel_L2_M4_100:
SAVE4x2
-sgemm_kernel_L2_M4_END:
+.Lsgemm_kernel_L2_M4_END:
/******************************************************************************/
-sgemm_kernel_L2_M2_BEGIN:
+.Lsgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L2_M1_BEGIN
+ ble .Lsgemm_kernel_L2_M1_BEGIN
-sgemm_kernel_L2_M2_20:
+.Lsgemm_kernel_L2_M2_20:
INIT2x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble sgemm_kernel_L2_M2_40
+ ble .Lsgemm_kernel_L2_M2_40
-sgemm_kernel_L2_M2_22:
+.Lsgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M2_22
+ bgt .Lsgemm_kernel_L2_M2_22
-sgemm_kernel_L2_M2_40:
+.Lsgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M2_100
+ ble .Lsgemm_kernel_L2_M2_100
-sgemm_kernel_L2_M2_42:
+.Lsgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M2_42
+ bgt .Lsgemm_kernel_L2_M2_42
-sgemm_kernel_L2_M2_100:
+.Lsgemm_kernel_L2_M2_100:
SAVE2x2
-sgemm_kernel_L2_M2_END:
+.Lsgemm_kernel_L2_M2_END:
/******************************************************************************/
-sgemm_kernel_L2_M1_BEGIN:
+.Lsgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L2_END
+ ble .Lsgemm_kernel_L2_END
-sgemm_kernel_L2_M1_20:
+.Lsgemm_kernel_L2_M1_20:
INIT1x2
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble sgemm_kernel_L2_M1_40
+ ble .Lsgemm_kernel_L2_M1_40
-sgemm_kernel_L2_M1_22:
+.Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M1_22
+ bgt .Lsgemm_kernel_L2_M1_22
-sgemm_kernel_L2_M1_40:
+.Lsgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L2_M1_100
+ ble .Lsgemm_kernel_L2_M1_100
-sgemm_kernel_L2_M1_42:
+.Lsgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L2_M1_42
+ bgt .Lsgemm_kernel_L2_M1_42
-sgemm_kernel_L2_M1_100:
+.Lsgemm_kernel_L2_M1_100:
SAVE1x2
-sgemm_kernel_L2_END:
+.Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
/******************************************************************************/
/******************************************************************************/
-sgemm_kernel_L1_BEGIN:
+.Lsgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble sgemm_kernel_L999 // done
+ ble .Lsgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
/******************************************************************************/
-sgemm_kernel_L1_M8_BEGIN:
+.Lsgemm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3
cmp counterI, #0
- ble sgemm_kernel_L1_M4_BEGIN
+ ble .Lsgemm_kernel_L1_M4_BEGIN
-sgemm_kernel_L1_M8_20:
+.Lsgemm_kernel_L1_M8_20:
INIT8x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M8_40
+ ble .Lsgemm_kernel_L1_M8_40
.align 5
-sgemm_kernel_L1_M8_22:
+.Lsgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M8_22
+ bgt .Lsgemm_kernel_L1_M8_22
-sgemm_kernel_L1_M8_40:
+.Lsgemm_kernel_L1_M8_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M8_100
+ ble .Lsgemm_kernel_L1_M8_100
-sgemm_kernel_L1_M8_42:
+.Lsgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M8_42
+ bgt .Lsgemm_kernel_L1_M8_42
-sgemm_kernel_L1_M8_100:
+.Lsgemm_kernel_L1_M8_100:
SAVE8x1
-sgemm_kernel_L1_M8_END:
+.Lsgemm_kernel_L1_M8_END:
subs counterI, counterI, #1
- bgt sgemm_kernel_L1_M8_20
+ bgt .Lsgemm_kernel_L1_M8_20
/******************************************************************************/
-sgemm_kernel_L1_M4_BEGIN:
+.Lsgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
tst counterI, #4
- ble sgemm_kernel_L1_M2_BEGIN
+ ble .Lsgemm_kernel_L1_M2_BEGIN
-sgemm_kernel_L1_M4_20:
+.Lsgemm_kernel_L1_M4_20:
INIT4x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M4_40
+ ble .Lsgemm_kernel_L1_M4_40
.align 5
-sgemm_kernel_L1_M4_22:
+.Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M4_22
+ bgt .Lsgemm_kernel_L1_M4_22
-sgemm_kernel_L1_M4_40:
+.Lsgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M4_100
+ ble .Lsgemm_kernel_L1_M4_100
-sgemm_kernel_L1_M4_42:
+.Lsgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M4_42
+ bgt .Lsgemm_kernel_L1_M4_42
-sgemm_kernel_L1_M4_100:
+.Lsgemm_kernel_L1_M4_100:
SAVE4x1
-sgemm_kernel_L1_M4_END:
+.Lsgemm_kernel_L1_M4_END:
/******************************************************************************/
-sgemm_kernel_L1_M2_BEGIN:
+.Lsgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble sgemm_kernel_L1_M1_BEGIN
+ ble .Lsgemm_kernel_L1_M1_BEGIN
-sgemm_kernel_L1_M2_20:
+.Lsgemm_kernel_L1_M2_20:
INIT2x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M2_40
+ ble .Lsgemm_kernel_L1_M2_40
-sgemm_kernel_L1_M2_22:
+.Lsgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M2_22
+ bgt .Lsgemm_kernel_L1_M2_22
-sgemm_kernel_L1_M2_40:
+.Lsgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M2_100
+ ble .Lsgemm_kernel_L1_M2_100
-sgemm_kernel_L1_M2_42:
+.Lsgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M2_42
+ bgt .Lsgemm_kernel_L1_M2_42
-sgemm_kernel_L1_M2_100:
+.Lsgemm_kernel_L1_M2_100:
SAVE2x1
-sgemm_kernel_L1_M2_END:
+.Lsgemm_kernel_L1_M2_END:
/******************************************************************************/
-sgemm_kernel_L1_M1_BEGIN:
+.Lsgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble sgemm_kernel_L1_END
+ ble .Lsgemm_kernel_L1_END
-sgemm_kernel_L1_M1_20:
+.Lsgemm_kernel_L1_M1_20:
INIT1x1
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble sgemm_kernel_L1_M1_40
+ ble .Lsgemm_kernel_L1_M1_40
-sgemm_kernel_L1_M1_22:
+.Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M1_22
+ bgt .Lsgemm_kernel_L1_M1_22
-sgemm_kernel_L1_M1_40:
+.Lsgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble sgemm_kernel_L1_M1_100
+ ble .Lsgemm_kernel_L1_M1_100
-sgemm_kernel_L1_M1_42:
+.Lsgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt sgemm_kernel_L1_M1_42
+ bgt .Lsgemm_kernel_L1_M1_42
-sgemm_kernel_L1_M1_100:
+.Lsgemm_kernel_L1_M1_100:
SAVE1x1
-sgemm_kernel_L1_END:
+.Lsgemm_kernel_L1_END:
/******************************************************************************/
-sgemm_kernel_L999:
+.Lsgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
PROLOGUE
-strmm_kernel_begin:
+.Lstrmm_kernel_begin:
.align 5
add sp, sp, #-(11 * 16)
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble strmm_kernel_L2_BEGIN
+ ble .Lstrmm_kernel_L2_BEGIN
/******************************************************************************/
-strmm_kernel_L4_BEGIN:
+.Lstrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
#endif
mov pA, origPA // pA = start of A array
-strmm_kernel_L4_M16_BEGIN:
+.Lstrmm_kernel_L4_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
- ble strmm_kernel_L4_M8_BEGIN
+ ble .Lstrmm_kernel_L4_M8_BEGIN
.align 5
-strmm_kernel_L4_M16_20:
+.Lstrmm_kernel_L4_M16_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #3
cmp counterL , #2
- blt strmm_kernel_L4_M16_32
+ blt .Lstrmm_kernel_L4_M16_32
KERNEL16x4_I
KERNEL16x4_M2
KERNEL16x4_M2
subs counterL, counterL, #2
- ble strmm_kernel_L4_M16_22a
+ ble .Lstrmm_kernel_L4_M16_22a
.align 5
-strmm_kernel_L4_M16_22:
+.Lstrmm_kernel_L4_M16_22:
KERNEL16x4_M1
KERNEL16x4_M2
KERNEL16x4_M2
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M16_22
+ bgt .Lstrmm_kernel_L4_M16_22
.align 5
-strmm_kernel_L4_M16_22a:
+.Lstrmm_kernel_L4_M16_22a:
KERNEL16x4_M1
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_E
- b strmm_kernel_L4_M16_44
+ b .Lstrmm_kernel_L4_M16_44
.align 5
-strmm_kernel_L4_M16_32:
+.Lstrmm_kernel_L4_M16_32:
tst counterL, #1
- ble strmm_kernel_L4_M16_40
+ ble .Lstrmm_kernel_L4_M16_40
KERNEL16x4_I
KERNEL16x4_M2
KERNEL16x4_M1
KERNEL16x4_E
- b strmm_kernel_L4_M16_44
+ b .Lstrmm_kernel_L4_M16_44
-strmm_kernel_L4_M16_40:
+.Lstrmm_kernel_L4_M16_40:
INIT16x4
-strmm_kernel_L4_M16_44:
+.Lstrmm_kernel_L4_M16_44:
ands counterL , tempK, #7
- ble strmm_kernel_L4_M16_100
+ ble .Lstrmm_kernel_L4_M16_100
.align 5
-strmm_kernel_L4_M16_46:
+.Lstrmm_kernel_L4_M16_46:
KERNEL16x4_SUB
subs counterL, counterL, #1
- bne strmm_kernel_L4_M16_46
+ bne .Lstrmm_kernel_L4_M16_46
-strmm_kernel_L4_M16_100:
+.Lstrmm_kernel_L4_M16_100:
SAVE16x4
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
-strmm_kernel_L4_M16_END:
+.Lstrmm_kernel_L4_M16_END:
subs counterI, counterI, #1
- bne strmm_kernel_L4_M16_20
+ bne .Lstrmm_kernel_L4_M16_20
//------------------------------------------------------------------------------
-strmm_kernel_L4_M8_BEGIN:
+.Lstrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END
tst counterI, #8
- ble strmm_kernel_L4_M4_BEGIN
+ ble .Lstrmm_kernel_L4_M4_BEGIN
-strmm_kernel_L4_M8_20:
+.Lstrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt strmm_kernel_L4_M8_32
+ blt .Lstrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
subs counterL, counterL, #2
- ble strmm_kernel_L4_M8_22a
+ ble .Lstrmm_kernel_L4_M8_22a
.align 5
-strmm_kernel_L4_M8_22:
+.Lstrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M8_22
+ bgt .Lstrmm_kernel_L4_M8_22
-strmm_kernel_L4_M8_22a:
+.Lstrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
- b strmm_kernel_L4_M8_44
+ b .Lstrmm_kernel_L4_M8_44
-strmm_kernel_L4_M8_32:
+.Lstrmm_kernel_L4_M8_32:
tst counterL, #1
- ble strmm_kernel_L4_M8_40
+ ble .Lstrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_E
- b strmm_kernel_L4_M8_44
+ b .Lstrmm_kernel_L4_M8_44
-strmm_kernel_L4_M8_40:
+.Lstrmm_kernel_L4_M8_40:
INIT8x4
-strmm_kernel_L4_M8_44:
+.Lstrmm_kernel_L4_M8_44:
ands counterL , tempK, #1
- ble strmm_kernel_L4_M8_100
+ ble .Lstrmm_kernel_L4_M8_100
-strmm_kernel_L4_M8_46:
+.Lstrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
-strmm_kernel_L4_M8_100:
+.Lstrmm_kernel_L4_M8_100:
SAVE8x4
add tempOffset, tempOffset, #8
#endif
-strmm_kernel_L4_M8_END:
+.Lstrmm_kernel_L4_M8_END:
//------------------------------------------------------------------------------
-strmm_kernel_L4_M4_BEGIN:
+.Lstrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END
tst counterI, #4
- ble strmm_kernel_L4_M2_BEGIN
+ ble .Lstrmm_kernel_L4_M2_BEGIN
-strmm_kernel_L4_M4_20:
+.Lstrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#endif
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt strmm_kernel_L4_M4_32
+ blt .Lstrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble strmm_kernel_L4_M4_22a
+ ble .Lstrmm_kernel_L4_M4_22a
.align 5
-strmm_kernel_L4_M4_22:
+.Lstrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M4_22
+ bgt .Lstrmm_kernel_L4_M4_22
-strmm_kernel_L4_M4_22a:
+.Lstrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b strmm_kernel_L4_M4_44
+ b .Lstrmm_kernel_L4_M4_44
-strmm_kernel_L4_M4_32:
+.Lstrmm_kernel_L4_M4_32:
tst counterL, #1
- ble strmm_kernel_L4_M4_40
+ ble .Lstrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b strmm_kernel_L4_M4_44
+ b .Lstrmm_kernel_L4_M4_44
-strmm_kernel_L4_M4_40:
+.Lstrmm_kernel_L4_M4_40:
INIT4x4
-strmm_kernel_L4_M4_44:
+.Lstrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
- ble strmm_kernel_L4_M4_100
+ ble .Lstrmm_kernel_L4_M4_100
-strmm_kernel_L4_M4_46:
+.Lstrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
-strmm_kernel_L4_M4_100:
+.Lstrmm_kernel_L4_M4_100:
SAVE4x4
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L4_M4_END:
+.Lstrmm_kernel_L4_M4_END:
//------------------------------------------------------------------------------
-strmm_kernel_L4_M2_BEGIN:
+.Lstrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L4_M1_BEGIN
+ ble .Lstrmm_kernel_L4_M1_BEGIN
-strmm_kernel_L4_M2_20:
+.Lstrmm_kernel_L4_M2_20:
INIT2x4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L4_M2_40
+ ble .Lstrmm_kernel_L4_M2_40
-strmm_kernel_L4_M2_22:
+.Lstrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M2_22
+ bgt .Lstrmm_kernel_L4_M2_22
-strmm_kernel_L4_M2_40:
+.Lstrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L4_M2_100
+ ble .Lstrmm_kernel_L4_M2_100
-strmm_kernel_L4_M2_42:
+.Lstrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M2_42
+ bgt .Lstrmm_kernel_L4_M2_42
-strmm_kernel_L4_M2_100:
+.Lstrmm_kernel_L4_M2_100:
SAVE2x4
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
-strmm_kernel_L4_M2_END:
+.Lstrmm_kernel_L4_M2_END:
-strmm_kernel_L4_M1_BEGIN:
+.Lstrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END
-strmm_kernel_L4_M1_20:
+.Lstrmm_kernel_L4_M1_20:
INIT1x4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L4_M1_40
+ ble .Lstrmm_kernel_L4_M1_40
-strmm_kernel_L4_M1_22:
+.Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M1_22
+ bgt .Lstrmm_kernel_L4_M1_22
-strmm_kernel_L4_M1_40:
+.Lstrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L4_M1_100
+ ble .Lstrmm_kernel_L4_M1_100
-strmm_kernel_L4_M1_42:
+.Lstrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M1_42
+ bgt .Lstrmm_kernel_L4_M1_42
-strmm_kernel_L4_M1_100:
+.Lstrmm_kernel_L4_M1_100:
SAVE1x4
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
-strmm_kernel_L4_END:
+.Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
#if !defined(LEFT)
add tempOffset, tempOffset, #4
#endif
subs counterJ, counterJ , #1 // j--
- bgt strmm_kernel_L4_BEGIN
+ bgt .Lstrmm_kernel_L4_BEGIN
/******************************************************************************/
-strmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble strmm_kernel_L999
+ ble .Lstrmm_kernel_L999
tst counterJ , #2
- ble strmm_kernel_L1_BEGIN
+ ble .Lstrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
#endif
mov pA, origPA // pA = A
-strmm_kernel_L2_M16_BEGIN:
+.Lstrmm_kernel_L2_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI,#0
- ble strmm_kernel_L2_M8_BEGIN
+ ble .Lstrmm_kernel_L2_M8_BEGIN
-strmm_kernel_L2_M16_20:
+.Lstrmm_kernel_L2_M16_20:
INIT16x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble strmm_kernel_L2_M16_40
+ ble .Lstrmm_kernel_L2_M16_40
.align 5
-strmm_kernel_L2_M16_22:
+.Lstrmm_kernel_L2_M16_22:
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
KERNEL16x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M16_22
+ bgt .Lstrmm_kernel_L2_M16_22
-strmm_kernel_L2_M16_40:
+.Lstrmm_kernel_L2_M16_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M16_100
+ ble .Lstrmm_kernel_L2_M16_100
-strmm_kernel_L2_M16_42:
+.Lstrmm_kernel_L2_M16_42:
KERNEL16x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M16_42
+ bgt .Lstrmm_kernel_L2_M16_42
-strmm_kernel_L2_M16_100:
+.Lstrmm_kernel_L2_M16_100:
SAVE16x2
add tempOffset, tempOffset, #16
#endif
-strmm_kernel_L2_M16_END:
+.Lstrmm_kernel_L2_M16_END:
subs counterI, counterI, #1
- bgt strmm_kernel_L2_M16_20
+ bgt .Lstrmm_kernel_L2_M16_20
//------------------------------------------------------------------------------
-strmm_kernel_L2_M8_BEGIN:
+.Lstrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END
tst counterI, #8
- ble strmm_kernel_L2_M4_BEGIN
+ ble .Lstrmm_kernel_L2_M4_BEGIN
-strmm_kernel_L2_M8_20:
+.Lstrmm_kernel_L2_M8_20:
INIT8x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble strmm_kernel_L2_M8_40
+ ble .Lstrmm_kernel_L2_M8_40
.align 5
-strmm_kernel_L2_M8_22:
+.Lstrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M8_22
+ bgt .Lstrmm_kernel_L2_M8_22
-strmm_kernel_L2_M8_40:
+.Lstrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M8_100
+ ble .Lstrmm_kernel_L2_M8_100
-strmm_kernel_L2_M8_42:
+.Lstrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M8_42
+ bgt .Lstrmm_kernel_L2_M8_42
-strmm_kernel_L2_M8_100:
+.Lstrmm_kernel_L2_M8_100:
SAVE8x2
add tempOffset, tempOffset, #8
#endif
-strmm_kernel_L2_M8_END:
+.Lstrmm_kernel_L2_M8_END:
//------------------------------------------------------------------------------
-strmm_kernel_L2_M4_BEGIN:
+.Lstrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END
tst counterI, #4
- ble strmm_kernel_L2_M2_BEGIN
+ ble .Lstrmm_kernel_L2_M2_BEGIN
-strmm_kernel_L2_M4_20:
+.Lstrmm_kernel_L2_M4_20:
INIT4x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble strmm_kernel_L2_M4_40
+ ble .Lstrmm_kernel_L2_M4_40
.align 5
-strmm_kernel_L2_M4_22:
+.Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M4_22
+ bgt .Lstrmm_kernel_L2_M4_22
-strmm_kernel_L2_M4_40:
+.Lstrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M4_100
+ ble .Lstrmm_kernel_L2_M4_100
-strmm_kernel_L2_M4_42:
+.Lstrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M4_42
+ bgt .Lstrmm_kernel_L2_M4_42
-strmm_kernel_L2_M4_100:
+.Lstrmm_kernel_L2_M4_100:
SAVE4x2
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L2_M4_END:
+.Lstrmm_kernel_L2_M4_END:
//------------------------------------------------------------------------------
-strmm_kernel_L2_M2_BEGIN:
+.Lstrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L2_M1_BEGIN
+ ble .Lstrmm_kernel_L2_M1_BEGIN
-strmm_kernel_L2_M2_20:
+.Lstrmm_kernel_L2_M2_20:
INIT2x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble strmm_kernel_L2_M2_40
+ ble .Lstrmm_kernel_L2_M2_40
-strmm_kernel_L2_M2_22:
+.Lstrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M2_22
+ bgt .Lstrmm_kernel_L2_M2_22
-strmm_kernel_L2_M2_40:
+.Lstrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M2_100
+ ble .Lstrmm_kernel_L2_M2_100
-strmm_kernel_L2_M2_42:
+.Lstrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M2_42
+ bgt .Lstrmm_kernel_L2_M2_42
-strmm_kernel_L2_M2_100:
+.Lstrmm_kernel_L2_M2_100:
SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
add tempOffset, tempOffset, #2
#endif
-strmm_kernel_L2_M2_END:
+.Lstrmm_kernel_L2_M2_END:
-strmm_kernel_L2_M1_BEGIN:
+.Lstrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END
-strmm_kernel_L2_M1_20:
+.Lstrmm_kernel_L2_M1_20:
INIT1x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble strmm_kernel_L2_M1_40
+ ble .Lstrmm_kernel_L2_M1_40
-strmm_kernel_L2_M1_22:
+.Lstrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M1_22
+ bgt .Lstrmm_kernel_L2_M1_22
-strmm_kernel_L2_M1_40:
+.Lstrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M1_100
+ ble .Lstrmm_kernel_L2_M1_100
-strmm_kernel_L2_M1_42:
+.Lstrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M1_42
+ bgt .Lstrmm_kernel_L2_M1_42
-strmm_kernel_L2_M1_100:
+.Lstrmm_kernel_L2_M1_100:
SAVE1x2
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
-strmm_kernel_L2_END:
+.Lstrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
/******************************************************************************/
-strmm_kernel_L1_BEGIN:
+.Lstrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble strmm_kernel_L999 // done
+ ble .Lstrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
#endif
mov pA, origPA // pA = A
-strmm_kernel_L1_M16_BEGIN:
+.Lstrmm_kernel_L1_M16_BEGIN:
mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0
- ble strmm_kernel_L1_M8_BEGIN
+ ble .Lstrmm_kernel_L1_M8_BEGIN
-strmm_kernel_L1_M16_20:
+.Lstrmm_kernel_L1_M16_20:
INIT16x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M16_40
+ ble .Lstrmm_kernel_L1_M16_40
.align 5
-strmm_kernel_L1_M16_22:
+.Lstrmm_kernel_L1_M16_22:
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
KERNEL16x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M16_22
+ bgt .Lstrmm_kernel_L1_M16_22
-strmm_kernel_L1_M16_40:
+.Lstrmm_kernel_L1_M16_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M16_100
+ ble .Lstrmm_kernel_L1_M16_100
-strmm_kernel_L1_M16_42:
+.Lstrmm_kernel_L1_M16_42:
KERNEL16x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M16_42
+ bgt .Lstrmm_kernel_L1_M16_42
-strmm_kernel_L1_M16_100:
+.Lstrmm_kernel_L1_M16_100:
SAVE16x1
add tempOffset, tempOffset, #16
#endif
-strmm_kernel_L1_M16_END:
+.Lstrmm_kernel_L1_M16_END:
subs counterI, counterI, #1
- bgt strmm_kernel_L1_M16_20
+ bgt .Lstrmm_kernel_L1_M16_20
//------------------------------------------------------------------------------
-strmm_kernel_L1_M8_BEGIN:
+.Lstrmm_kernel_L1_M8_BEGIN:
mov counterI, origM
tst counterI , #15
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END
tst counterI, #8
- ble strmm_kernel_L1_M4_BEGIN
+ ble .Lstrmm_kernel_L1_M4_BEGIN
-strmm_kernel_L1_M8_20:
+.Lstrmm_kernel_L1_M8_20:
INIT8x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M8_40
+ ble .Lstrmm_kernel_L1_M8_40
.align 5
-strmm_kernel_L1_M8_22:
+.Lstrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M8_22
+ bgt .Lstrmm_kernel_L1_M8_22
-strmm_kernel_L1_M8_40:
+.Lstrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M8_100
+ ble .Lstrmm_kernel_L1_M8_100
-strmm_kernel_L1_M8_42:
+.Lstrmm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M8_42
+ bgt .Lstrmm_kernel_L1_M8_42
-strmm_kernel_L1_M8_100:
+.Lstrmm_kernel_L1_M8_100:
SAVE8x1
add tempOffset, tempOffset, #8
#endif
-strmm_kernel_L1_M8_END:
+.Lstrmm_kernel_L1_M8_END:
//------------------------------------------------------------------------------
-strmm_kernel_L1_M4_BEGIN:
+.Lstrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END
tst counterI, #4
- ble strmm_kernel_L1_M2_BEGIN
+ ble .Lstrmm_kernel_L1_M2_BEGIN
-strmm_kernel_L1_M4_20:
+.Lstrmm_kernel_L1_M4_20:
INIT4x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M4_40
+ ble .Lstrmm_kernel_L1_M4_40
.align 5
-strmm_kernel_L1_M4_22:
+.Lstrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M4_22
+ bgt .Lstrmm_kernel_L1_M4_22
-strmm_kernel_L1_M4_40:
+.Lstrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M4_100
+ ble .Lstrmm_kernel_L1_M4_100
-strmm_kernel_L1_M4_42:
+.Lstrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M4_42
+ bgt .Lstrmm_kernel_L1_M4_42
-strmm_kernel_L1_M4_100:
+.Lstrmm_kernel_L1_M4_100:
SAVE4x1
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L1_M4_END:
+.Lstrmm_kernel_L1_M4_END:
//------------------------------------------------------------------------------
-strmm_kernel_L1_M2_BEGIN:
+.Lstrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L1_M1_BEGIN
+ ble .Lstrmm_kernel_L1_M1_BEGIN
-strmm_kernel_L1_M2_20:
+.Lstrmm_kernel_L1_M2_20:
INIT2x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M2_40
+ ble .Lstrmm_kernel_L1_M2_40
-strmm_kernel_L1_M2_22:
+.Lstrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M2_22
+ bgt .Lstrmm_kernel_L1_M2_22
-strmm_kernel_L1_M2_40:
+.Lstrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M2_100
+ ble .Lstrmm_kernel_L1_M2_100
-strmm_kernel_L1_M2_42:
+.Lstrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M2_42
+ bgt .Lstrmm_kernel_L1_M2_42
-strmm_kernel_L1_M2_100:
+.Lstrmm_kernel_L1_M2_100:
SAVE2x1
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
-strmm_kernel_L1_M2_END:
+.Lstrmm_kernel_L1_M2_END:
-strmm_kernel_L1_M1_BEGIN:
+.Lstrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END
-strmm_kernel_L1_M1_20:
+.Lstrmm_kernel_L1_M1_20:
INIT1x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M1_40
+ ble .Lstrmm_kernel_L1_M1_40
-strmm_kernel_L1_M1_22:
+.Lstrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M1_22
+ bgt .Lstrmm_kernel_L1_M1_22
-strmm_kernel_L1_M1_40:
+.Lstrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M1_100
+ ble .Lstrmm_kernel_L1_M1_100
-strmm_kernel_L1_M1_42:
+.Lstrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M1_42
+ bgt .Lstrmm_kernel_L1_M1_42
-strmm_kernel_L1_M1_100:
+.Lstrmm_kernel_L1_M1_100:
SAVE1x1
-strmm_kernel_L1_END:
+.Lstrmm_kernel_L1_END:
-strmm_kernel_L999:
+.Lstrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
PROLOGUE
-strmm_kernel_begin:
+.Lstrmm_kernel_begin:
.align 5
add sp, sp, #-(11 * 16)
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble strmm_kernel_L2_BEGIN
+ ble .Lstrmm_kernel_L2_BEGIN
/******************************************************************************/
-strmm_kernel_L4_BEGIN:
+.Lstrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pA, origPA // pA = start of A array
-strmm_kernel_L4_M4_BEGIN:
+.Lstrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble strmm_kernel_L4_M2_BEGIN
+ ble .Lstrmm_kernel_L4_M2_BEGIN
-strmm_kernel_L4_M4_20:
+.Lstrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt strmm_kernel_L4_M4_32
+ blt .Lstrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble strmm_kernel_L4_M4_22a
+ ble .Lstrmm_kernel_L4_M4_22a
.align 5
-strmm_kernel_L4_M4_22:
+.Lstrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M4_22
+ bgt .Lstrmm_kernel_L4_M4_22
-strmm_kernel_L4_M4_22a:
+.Lstrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b strmm_kernel_L4_M4_44
+ b .Lstrmm_kernel_L4_M4_44
-strmm_kernel_L4_M4_32:
+.Lstrmm_kernel_L4_M4_32:
tst counterL, #1
- ble strmm_kernel_L4_M4_40
+ ble .Lstrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b strmm_kernel_L4_M4_44
+ b .Lstrmm_kernel_L4_M4_44
-strmm_kernel_L4_M4_40:
+.Lstrmm_kernel_L4_M4_40:
INIT4x4
-strmm_kernel_L4_M4_44:
+.Lstrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
- ble strmm_kernel_L4_M4_100
+ ble .Lstrmm_kernel_L4_M4_100
-strmm_kernel_L4_M4_46:
+.Lstrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
-strmm_kernel_L4_M4_100:
+.Lstrmm_kernel_L4_M4_100:
SAVE4x4
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L4_M4_END:
+.Lstrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
- bne strmm_kernel_L4_M4_20
+ bne .Lstrmm_kernel_L4_M4_20
-strmm_kernel_L4_M2_BEGIN:
+.Lstrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L4_M1_BEGIN
+ ble .Lstrmm_kernel_L4_M1_BEGIN
-strmm_kernel_L4_M2_20:
+.Lstrmm_kernel_L4_M2_20:
INIT2x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L4_M2_40
+ ble .Lstrmm_kernel_L4_M2_40
-strmm_kernel_L4_M2_22:
+.Lstrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M2_22
+ bgt .Lstrmm_kernel_L4_M2_22
-strmm_kernel_L4_M2_40:
+.Lstrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L4_M2_100
+ ble .Lstrmm_kernel_L4_M2_100
-strmm_kernel_L4_M2_42:
+.Lstrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M2_42
+ bgt .Lstrmm_kernel_L4_M2_42
-strmm_kernel_L4_M2_100:
+.Lstrmm_kernel_L4_M2_100:
SAVE2x4
#endif
-strmm_kernel_L4_M2_END:
+.Lstrmm_kernel_L4_M2_END:
-strmm_kernel_L4_M1_BEGIN:
+.Lstrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END
-strmm_kernel_L4_M1_20:
+.Lstrmm_kernel_L4_M1_20:
INIT1x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L4_M1_40
+ ble .Lstrmm_kernel_L4_M1_40
-strmm_kernel_L4_M1_22:
+.Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M1_22
+ bgt .Lstrmm_kernel_L4_M1_22
-strmm_kernel_L4_M1_40:
+.Lstrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L4_M1_100
+ ble .Lstrmm_kernel_L4_M1_100
-strmm_kernel_L4_M1_42:
+.Lstrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M1_42
+ bgt .Lstrmm_kernel_L4_M1_42
-strmm_kernel_L4_M1_100:
+.Lstrmm_kernel_L4_M1_100:
SAVE1x4
#endif
-strmm_kernel_L4_END:
+.Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
#if !defined(LEFT)
#endif
subs counterJ, counterJ , #1 // j--
- bgt strmm_kernel_L4_BEGIN
+ bgt .Lstrmm_kernel_L4_BEGIN
/******************************************************************************/
-strmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble strmm_kernel_L999
+ ble .Lstrmm_kernel_L999
tst counterJ , #2
- ble strmm_kernel_L1_BEGIN
+ ble .Lstrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pA, origPA // pA = A
-strmm_kernel_L2_M4_BEGIN:
+.Lstrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble strmm_kernel_L2_M2_BEGIN
+ ble .Lstrmm_kernel_L2_M2_BEGIN
-strmm_kernel_L2_M4_20:
+.Lstrmm_kernel_L2_M4_20:
INIT4x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble strmm_kernel_L2_M4_40
+ ble .Lstrmm_kernel_L2_M4_40
.align 5
-strmm_kernel_L2_M4_22:
+.Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M4_22
+ bgt .Lstrmm_kernel_L2_M4_22
-strmm_kernel_L2_M4_40:
+.Lstrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M4_100
+ ble .Lstrmm_kernel_L2_M4_100
-strmm_kernel_L2_M4_42:
+.Lstrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M4_42
+ bgt .Lstrmm_kernel_L2_M4_42
-strmm_kernel_L2_M4_100:
+.Lstrmm_kernel_L2_M4_100:
SAVE4x2
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L2_M4_END:
+.Lstrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt strmm_kernel_L2_M4_20
+ bgt .Lstrmm_kernel_L2_M4_20
-strmm_kernel_L2_M2_BEGIN:
+.Lstrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L2_M1_BEGIN
+ ble .Lstrmm_kernel_L2_M1_BEGIN
-strmm_kernel_L2_M2_20:
+.Lstrmm_kernel_L2_M2_20:
INIT2x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble strmm_kernel_L2_M2_40
+ ble .Lstrmm_kernel_L2_M2_40
-strmm_kernel_L2_M2_22:
+.Lstrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M2_22
+ bgt .Lstrmm_kernel_L2_M2_22
-strmm_kernel_L2_M2_40:
+.Lstrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M2_100
+ ble .Lstrmm_kernel_L2_M2_100
-strmm_kernel_L2_M2_42:
+.Lstrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M2_42
+ bgt .Lstrmm_kernel_L2_M2_42
-strmm_kernel_L2_M2_100:
+.Lstrmm_kernel_L2_M2_100:
SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
add tempOffset, tempOffset, #2
#endif
-strmm_kernel_L2_M2_END:
+.Lstrmm_kernel_L2_M2_END:
-strmm_kernel_L2_M1_BEGIN:
+.Lstrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END
-strmm_kernel_L2_M1_20:
+.Lstrmm_kernel_L2_M1_20:
INIT1x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble strmm_kernel_L2_M1_40
+ ble .Lstrmm_kernel_L2_M1_40
-strmm_kernel_L2_M1_22:
+.Lstrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M1_22
+ bgt .Lstrmm_kernel_L2_M1_22
-strmm_kernel_L2_M1_40:
+.Lstrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M1_100
+ ble .Lstrmm_kernel_L2_M1_100
-strmm_kernel_L2_M1_42:
+.Lstrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M1_42
+ bgt .Lstrmm_kernel_L2_M1_42
-strmm_kernel_L2_M1_100:
+.Lstrmm_kernel_L2_M1_100:
SAVE1x2
add tempOffset, tempOffset, #1
#endif
-strmm_kernel_L2_END:
+.Lstrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
/******************************************************************************/
-strmm_kernel_L1_BEGIN:
+.Lstrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble strmm_kernel_L999 // done
+ ble .Lstrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
mov pA, origPA // pA = A
-strmm_kernel_L1_M4_BEGIN:
+.Lstrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble strmm_kernel_L1_M2_BEGIN
+ ble .Lstrmm_kernel_L1_M2_BEGIN
-strmm_kernel_L1_M4_20:
+.Lstrmm_kernel_L1_M4_20:
INIT4x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M4_40
+ ble .Lstrmm_kernel_L1_M4_40
.align 5
-strmm_kernel_L1_M4_22:
+.Lstrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M4_22
+ bgt .Lstrmm_kernel_L1_M4_22
-strmm_kernel_L1_M4_40:
+.Lstrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M4_100
+ ble .Lstrmm_kernel_L1_M4_100
-strmm_kernel_L1_M4_42:
+.Lstrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M4_42
+ bgt .Lstrmm_kernel_L1_M4_42
-strmm_kernel_L1_M4_100:
+.Lstrmm_kernel_L1_M4_100:
SAVE4x1
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L1_M4_END:
+.Lstrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt strmm_kernel_L1_M4_20
+ bgt .Lstrmm_kernel_L1_M4_20
-strmm_kernel_L1_M2_BEGIN:
+.Lstrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L1_M1_BEGIN
+ ble .Lstrmm_kernel_L1_M1_BEGIN
-strmm_kernel_L1_M2_20:
+.Lstrmm_kernel_L1_M2_20:
INIT2x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M2_40
+ ble .Lstrmm_kernel_L1_M2_40
-strmm_kernel_L1_M2_22:
+.Lstrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M2_22
+ bgt .Lstrmm_kernel_L1_M2_22
-strmm_kernel_L1_M2_40:
+.Lstrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M2_100
+ ble .Lstrmm_kernel_L1_M2_100
-strmm_kernel_L1_M2_42:
+.Lstrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M2_42
+ bgt .Lstrmm_kernel_L1_M2_42
-strmm_kernel_L1_M2_100:
+.Lstrmm_kernel_L1_M2_100:
SAVE2x1
#endif
-strmm_kernel_L1_M2_END:
+.Lstrmm_kernel_L1_M2_END:
-strmm_kernel_L1_M1_BEGIN:
+.Lstrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END
-strmm_kernel_L1_M1_20:
+.Lstrmm_kernel_L1_M1_20:
INIT1x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M1_40
+ ble .Lstrmm_kernel_L1_M1_40
-strmm_kernel_L1_M1_22:
+.Lstrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M1_22
+ bgt .Lstrmm_kernel_L1_M1_22
-strmm_kernel_L1_M1_40:
+.Lstrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M1_100
+ ble .Lstrmm_kernel_L1_M1_100
-strmm_kernel_L1_M1_42:
+.Lstrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M1_42
+ bgt .Lstrmm_kernel_L1_M1_42
-strmm_kernel_L1_M1_100:
+.Lstrmm_kernel_L1_M1_100:
SAVE1x1
#endif
#endif
-strmm_kernel_L1_END:
+.Lstrmm_kernel_L1_END:
#if 0
#if !defined(LEFT)
#endif
#endif
-strmm_kernel_L999:
+.Lstrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
PROLOGUE
-strmm_kernel_begin:
+.Lstrmm_kernel_begin:
.align 5
add sp, sp, #-(11 * 16)
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
- ble strmm_kernel_L4_BEGIN
+ ble .Lstrmm_kernel_L4_BEGIN
/******************************************************************************/
/******************************************************************************/
-strmm_kernel_L8_BEGIN:
+.Lstrmm_kernel_L8_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3
/******************************************************************************/
-strmm_kernel_L8_M8_BEGIN:
+.Lstrmm_kernel_L8_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble strmm_kernel_L8_M4_BEGIN
+ ble .Lstrmm_kernel_L8_M4_BEGIN
-strmm_kernel_L8_M8_20:
+.Lstrmm_kernel_L8_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt strmm_kernel_L8_M8_32
+ blt .Lstrmm_kernel_L8_M8_32
KERNEL8x8_I // do one in the K
KERNEL8x8_M2 // do another in the K
subs counterL, counterL, #2
- ble strmm_kernel_L8_M8_22a
+ ble .Lstrmm_kernel_L8_M8_22a
.align 5
-strmm_kernel_L8_M8_22:
+.Lstrmm_kernel_L8_M8_22:
KERNEL8x8_M1
KERNEL8x8_M2
subs counterL, counterL, #1
- bgt strmm_kernel_L8_M8_22
+ bgt .Lstrmm_kernel_L8_M8_22
-strmm_kernel_L8_M8_22a:
+.Lstrmm_kernel_L8_M8_22a:
KERNEL8x8_M1
KERNEL8x8_E
- b strmm_kernel_L8_M8_44
+ b .Lstrmm_kernel_L8_M8_44
-strmm_kernel_L8_M8_32:
+.Lstrmm_kernel_L8_M8_32:
tst counterL, #1
- ble strmm_kernel_L8_M8_40
+ ble .Lstrmm_kernel_L8_M8_40
KERNEL8x8_I
KERNEL8x8_E
- b strmm_kernel_L8_M8_44
+ b .Lstrmm_kernel_L8_M8_44
-strmm_kernel_L8_M8_40:
+.Lstrmm_kernel_L8_M8_40:
INIT8x8
-strmm_kernel_L8_M8_44:
+.Lstrmm_kernel_L8_M8_44:
ands counterL , tempK, #1
- ble strmm_kernel_L8_M8_100
+ ble .Lstrmm_kernel_L8_M8_100
-strmm_kernel_L8_M8_46:
+.Lstrmm_kernel_L8_M8_46:
KERNEL8x8_SUB
-strmm_kernel_L8_M8_100:
+.Lstrmm_kernel_L8_M8_100:
SAVE8x8
add tempOffset, tempOffset, #8
#endif
-strmm_kernel_L8_M8_END:
+.Lstrmm_kernel_L8_M8_END:
subs counterI, counterI, #1
- bne strmm_kernel_L8_M8_20
+ bne .Lstrmm_kernel_L8_M8_20
/******************************************************************************/
-strmm_kernel_L8_M4_BEGIN:
+.Lstrmm_kernel_L8_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble strmm_kernel_L8_END
+ ble .Lstrmm_kernel_L8_END
tst counterI, #4
- ble strmm_kernel_L8_M2_BEGIN
+ ble .Lstrmm_kernel_L8_M2_BEGIN
-strmm_kernel_L8_M4_20:
+.Lstrmm_kernel_L8_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt strmm_kernel_L8_M4_32
+ blt .Lstrmm_kernel_L8_M4_32
KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K
subs counterL, counterL, #2
- ble strmm_kernel_L8_M4_22a
+ ble .Lstrmm_kernel_L8_M4_22a
.align 5
-strmm_kernel_L8_M4_22:
+.Lstrmm_kernel_L8_M4_22:
KERNEL4x8_M1
KERNEL4x8_M2
subs counterL, counterL, #1
- bgt strmm_kernel_L8_M4_22
+ bgt .Lstrmm_kernel_L8_M4_22
-strmm_kernel_L8_M4_22a:
+.Lstrmm_kernel_L8_M4_22a:
KERNEL4x8_M1
KERNEL4x8_E
- b strmm_kernel_L8_M4_44
+ b .Lstrmm_kernel_L8_M4_44
-strmm_kernel_L8_M4_32:
+.Lstrmm_kernel_L8_M4_32:
tst counterL, #1
- ble strmm_kernel_L8_M4_40
+ ble .Lstrmm_kernel_L8_M4_40
KERNEL4x8_I
KERNEL4x8_E
- b strmm_kernel_L8_M4_44
+ b .Lstrmm_kernel_L8_M4_44
-strmm_kernel_L8_M4_40:
+.Lstrmm_kernel_L8_M4_40:
INIT4x8
-strmm_kernel_L8_M4_44:
+.Lstrmm_kernel_L8_M4_44:
ands counterL , tempK, #1
- ble strmm_kernel_L8_M4_100
+ ble .Lstrmm_kernel_L8_M4_100
-strmm_kernel_L8_M4_46:
+.Lstrmm_kernel_L8_M4_46:
KERNEL4x8_SUB
-strmm_kernel_L8_M4_100:
+.Lstrmm_kernel_L8_M4_100:
SAVE4x8
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L8_M4_END:
+.Lstrmm_kernel_L8_M4_END:
/******************************************************************************/
-strmm_kernel_L8_M2_BEGIN:
+.Lstrmm_kernel_L8_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L8_END
+ ble .Lstrmm_kernel_L8_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L8_M1_BEGIN
+ ble .Lstrmm_kernel_L8_M1_BEGIN
-strmm_kernel_L8_M2_20:
+.Lstrmm_kernel_L8_M2_20:
INIT2x8
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L8_M2_40
+ ble .Lstrmm_kernel_L8_M2_40
-strmm_kernel_L8_M2_22:
+.Lstrmm_kernel_L8_M2_22:
KERNEL2x8_SUB
KERNEL2x8_SUB
KERNEL2x8_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L8_M2_22
+ bgt .Lstrmm_kernel_L8_M2_22
-strmm_kernel_L8_M2_40:
+.Lstrmm_kernel_L8_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L8_M2_100
+ ble .Lstrmm_kernel_L8_M2_100
-strmm_kernel_L8_M2_42:
+.Lstrmm_kernel_L8_M2_42:
KERNEL2x8_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L8_M2_42
+ bgt .Lstrmm_kernel_L8_M2_42
-strmm_kernel_L8_M2_100:
+.Lstrmm_kernel_L8_M2_100:
SAVE2x8
add tempOffset, tempOffset, #2
#endif
-strmm_kernel_L8_M2_END:
+.Lstrmm_kernel_L8_M2_END:
/******************************************************************************/
-strmm_kernel_L8_M1_BEGIN:
+.Lstrmm_kernel_L8_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L8_END
+ ble .Lstrmm_kernel_L8_END
-strmm_kernel_L8_M1_20:
+.Lstrmm_kernel_L8_M1_20:
INIT1x8
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L8_M1_40
+ ble .Lstrmm_kernel_L8_M1_40
-strmm_kernel_L8_M1_22:
+.Lstrmm_kernel_L8_M1_22:
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
KERNEL1x8_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L8_M1_22
+ bgt .Lstrmm_kernel_L8_M1_22
-strmm_kernel_L8_M1_40:
+.Lstrmm_kernel_L8_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L8_M1_100
+ ble .Lstrmm_kernel_L8_M1_100
-strmm_kernel_L8_M1_42:
+.Lstrmm_kernel_L8_M1_42:
KERNEL1x8_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L8_M1_42
+ bgt .Lstrmm_kernel_L8_M1_42
-strmm_kernel_L8_M1_100:
+.Lstrmm_kernel_L8_M1_100:
SAVE1x8
add tempOffset, tempOffset, #1
#endif
-strmm_kernel_L8_END:
+.Lstrmm_kernel_L8_END:
lsl temp, origK, #5 // B = B + K * 4 * 8
add origPB, origPB, temp
#endif
subs counterJ, counterJ , #1 // j--
- bgt strmm_kernel_L8_BEGIN
+ bgt .Lstrmm_kernel_L8_BEGIN
/******************************************************************************/
/******************************************************************************/
-strmm_kernel_L4_BEGIN:
+.Lstrmm_kernel_L4_BEGIN:
mov counterJ , origN
tst counterJ , #7
- ble strmm_kernel_L999
+ ble .Lstrmm_kernel_L999
tst counterJ , #4
- ble strmm_kernel_L2_BEGIN
+ ble .Lstrmm_kernel_L2_BEGIN
mov pCRow0, pC // pCRow0 = pC
/******************************************************************************/
-strmm_kernel_L4_M8_BEGIN:
+.Lstrmm_kernel_L4_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0
- ble strmm_kernel_L4_M4_BEGIN
+ ble .Lstrmm_kernel_L4_M4_BEGIN
-strmm_kernel_L4_M8_20:
+.Lstrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt strmm_kernel_L4_M8_32
+ blt .Lstrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
subs counterL, counterL, #2
- ble strmm_kernel_L4_M8_22a
+ ble .Lstrmm_kernel_L4_M8_22a
.align 5
-strmm_kernel_L4_M8_22:
+.Lstrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M8_22
+ bgt .Lstrmm_kernel_L4_M8_22
-strmm_kernel_L4_M8_22a:
+.Lstrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_E
- b strmm_kernel_L4_M8_44
+ b .Lstrmm_kernel_L4_M8_44
-strmm_kernel_L4_M8_32:
+.Lstrmm_kernel_L4_M8_32:
tst counterL, #1
- ble strmm_kernel_L4_M8_40
+ ble .Lstrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_E
- b strmm_kernel_L4_M8_44
+ b .Lstrmm_kernel_L4_M8_44
-strmm_kernel_L4_M8_40:
+.Lstrmm_kernel_L4_M8_40:
INIT8x4
-strmm_kernel_L4_M8_44:
+.Lstrmm_kernel_L4_M8_44:
ands counterL , tempK, #1
- ble strmm_kernel_L4_M8_100
+ ble .Lstrmm_kernel_L4_M8_100
-strmm_kernel_L4_M8_46:
+.Lstrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
-strmm_kernel_L4_M8_100:
+.Lstrmm_kernel_L4_M8_100:
SAVE8x4
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
-strmm_kernel_L4_M8_END:
+.Lstrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
- bne strmm_kernel_L4_M8_20
+ bne .Lstrmm_kernel_L4_M8_20
/******************************************************************************/
-strmm_kernel_L4_M4_BEGIN:
+.Lstrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END
tst counterI, #4
- ble strmm_kernel_L4_M2_BEGIN
+ ble .Lstrmm_kernel_L4_M2_BEGIN
-strmm_kernel_L4_M4_20:
+.Lstrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
#endif
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
- blt strmm_kernel_L4_M4_32
+ blt .Lstrmm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
subs counterL, counterL, #2
- ble strmm_kernel_L4_M4_22a
+ ble .Lstrmm_kernel_L4_M4_22a
.align 5
-strmm_kernel_L4_M4_22:
+.Lstrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M4_22
+ bgt .Lstrmm_kernel_L4_M4_22
-strmm_kernel_L4_M4_22a:
+.Lstrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_E
- b strmm_kernel_L4_M4_44
+ b .Lstrmm_kernel_L4_M4_44
-strmm_kernel_L4_M4_32:
+.Lstrmm_kernel_L4_M4_32:
tst counterL, #1
- ble strmm_kernel_L4_M4_40
+ ble .Lstrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_E
- b strmm_kernel_L4_M4_44
+ b .Lstrmm_kernel_L4_M4_44
-strmm_kernel_L4_M4_40:
+.Lstrmm_kernel_L4_M4_40:
INIT4x4
-strmm_kernel_L4_M4_44:
+.Lstrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
- ble strmm_kernel_L4_M4_100
+ ble .Lstrmm_kernel_L4_M4_100
-strmm_kernel_L4_M4_46:
+.Lstrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
-strmm_kernel_L4_M4_100:
+.Lstrmm_kernel_L4_M4_100:
SAVE4x4
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L4_M4_END:
+.Lstrmm_kernel_L4_M4_END:
/******************************************************************************/
-strmm_kernel_L4_M2_BEGIN:
+.Lstrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L4_M1_BEGIN
+ ble .Lstrmm_kernel_L4_M1_BEGIN
-strmm_kernel_L4_M2_20:
+.Lstrmm_kernel_L4_M2_20:
INIT2x4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L4_M2_40
+ ble .Lstrmm_kernel_L4_M2_40
-strmm_kernel_L4_M2_22:
+.Lstrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M2_22
+ bgt .Lstrmm_kernel_L4_M2_22
-strmm_kernel_L4_M2_40:
+.Lstrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L4_M2_100
+ ble .Lstrmm_kernel_L4_M2_100
-strmm_kernel_L4_M2_42:
+.Lstrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M2_42
+ bgt .Lstrmm_kernel_L4_M2_42
-strmm_kernel_L4_M2_100:
+.Lstrmm_kernel_L4_M2_100:
SAVE2x4
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
-strmm_kernel_L4_M2_END:
+.Lstrmm_kernel_L4_M2_END:
/******************************************************************************/
-strmm_kernel_L4_M1_BEGIN:
+.Lstrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L4_END
+ ble .Lstrmm_kernel_L4_END
-strmm_kernel_L4_M1_20:
+.Lstrmm_kernel_L4_M1_20:
INIT1x4
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L4_M1_40
+ ble .Lstrmm_kernel_L4_M1_40
-strmm_kernel_L4_M1_22:
+.Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M1_22
+ bgt .Lstrmm_kernel_L4_M1_22
-strmm_kernel_L4_M1_40:
+.Lstrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L4_M1_100
+ ble .Lstrmm_kernel_L4_M1_100
-strmm_kernel_L4_M1_42:
+.Lstrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L4_M1_42
+ bgt .Lstrmm_kernel_L4_M1_42
-strmm_kernel_L4_M1_100:
+.Lstrmm_kernel_L4_M1_100:
SAVE1x4
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
-strmm_kernel_L4_END:
+.Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
#if !defined(LEFT)
add tempOffset, tempOffset, #4
/******************************************************************************/
/******************************************************************************/
-strmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble strmm_kernel_L999
+ ble .Lstrmm_kernel_L999
tst counterJ , #2
- ble strmm_kernel_L1_BEGIN
+ ble .Lstrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
/******************************************************************************/
-strmm_kernel_L2_M8_BEGIN:
+.Lstrmm_kernel_L2_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI,#0
- ble strmm_kernel_L2_M4_BEGIN
+ ble .Lstrmm_kernel_L2_M4_BEGIN
-strmm_kernel_L2_M8_20:
+.Lstrmm_kernel_L2_M8_20:
INIT8x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble strmm_kernel_L2_M8_40
+ ble .Lstrmm_kernel_L2_M8_40
.align 5
-strmm_kernel_L2_M8_22:
+.Lstrmm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M8_22
+ bgt .Lstrmm_kernel_L2_M8_22
-strmm_kernel_L2_M8_40:
+.Lstrmm_kernel_L2_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M8_100
+ ble .Lstrmm_kernel_L2_M8_100
-strmm_kernel_L2_M8_42:
+.Lstrmm_kernel_L2_M8_42:
KERNEL8x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M8_42
+ bgt .Lstrmm_kernel_L2_M8_42
-strmm_kernel_L2_M8_100:
+.Lstrmm_kernel_L2_M8_100:
SAVE8x2
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
-strmm_kernel_L2_M8_END:
+.Lstrmm_kernel_L2_M8_END:
subs counterI, counterI, #1
- bgt strmm_kernel_L2_M8_20
+ bgt .Lstrmm_kernel_L2_M8_20
/******************************************************************************/
-strmm_kernel_L2_M4_BEGIN:
+.Lstrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END
tst counterI, #4
- ble strmm_kernel_L2_M2_BEGIN
+ ble .Lstrmm_kernel_L2_M2_BEGIN
-strmm_kernel_L2_M4_20:
+.Lstrmm_kernel_L2_M4_20:
INIT4x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble strmm_kernel_L2_M4_40
+ ble .Lstrmm_kernel_L2_M4_40
.align 5
-strmm_kernel_L2_M4_22:
+.Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M4_22
+ bgt .Lstrmm_kernel_L2_M4_22
-strmm_kernel_L2_M4_40:
+.Lstrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M4_100
+ ble .Lstrmm_kernel_L2_M4_100
-strmm_kernel_L2_M4_42:
+.Lstrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M4_42
+ bgt .Lstrmm_kernel_L2_M4_42
-strmm_kernel_L2_M4_100:
+.Lstrmm_kernel_L2_M4_100:
SAVE4x2
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L2_M4_END:
+.Lstrmm_kernel_L2_M4_END:
/******************************************************************************/
-strmm_kernel_L2_M2_BEGIN:
+.Lstrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L2_M1_BEGIN
+ ble .Lstrmm_kernel_L2_M1_BEGIN
-strmm_kernel_L2_M2_20:
+.Lstrmm_kernel_L2_M2_20:
INIT2x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble strmm_kernel_L2_M2_40
+ ble .Lstrmm_kernel_L2_M2_40
-strmm_kernel_L2_M2_22:
+.Lstrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M2_22
+ bgt .Lstrmm_kernel_L2_M2_22
-strmm_kernel_L2_M2_40:
+.Lstrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M2_100
+ ble .Lstrmm_kernel_L2_M2_100
-strmm_kernel_L2_M2_42:
+.Lstrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M2_42
+ bgt .Lstrmm_kernel_L2_M2_42
-strmm_kernel_L2_M2_100:
+.Lstrmm_kernel_L2_M2_100:
SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
add tempOffset, tempOffset, #2
#endif
-strmm_kernel_L2_M2_END:
+.Lstrmm_kernel_L2_M2_END:
/******************************************************************************/
-strmm_kernel_L2_M1_BEGIN:
+.Lstrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L2_END
+ ble .Lstrmm_kernel_L2_END
-strmm_kernel_L2_M1_20:
+.Lstrmm_kernel_L2_M1_20:
INIT1x2
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble strmm_kernel_L2_M1_40
+ ble .Lstrmm_kernel_L2_M1_40
-strmm_kernel_L2_M1_22:
+.Lstrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M1_22
+ bgt .Lstrmm_kernel_L2_M1_22
-strmm_kernel_L2_M1_40:
+.Lstrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L2_M1_100
+ ble .Lstrmm_kernel_L2_M1_100
-strmm_kernel_L2_M1_42:
+.Lstrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L2_M1_42
+ bgt .Lstrmm_kernel_L2_M1_42
-strmm_kernel_L2_M1_100:
+.Lstrmm_kernel_L2_M1_100:
SAVE1x2
#if defined(LEFT)
add tempOffset, tempOffset, #1
#endif
-strmm_kernel_L2_END:
+.Lstrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
/******************************************************************************/
/******************************************************************************/
-strmm_kernel_L1_BEGIN:
+.Lstrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble strmm_kernel_L999 // done
+ ble .Lstrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
/******************************************************************************/
-strmm_kernel_L1_M8_BEGIN:
+.Lstrmm_kernel_L1_M8_BEGIN:
mov counterI, origM
asr counterI, counterI, #3
cmp counterI, #0
- ble strmm_kernel_L1_M4_BEGIN
+ ble .Lstrmm_kernel_L1_M4_BEGIN
-strmm_kernel_L1_M8_20:
+.Lstrmm_kernel_L1_M8_20:
INIT8x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M8_40
+ ble .Lstrmm_kernel_L1_M8_40
.align 5
-strmm_kernel_L1_M8_22:
+.Lstrmm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M8_22
+ bgt .Lstrmm_kernel_L1_M8_22
-strmm_kernel_L1_M8_40:
+.Lstrmm_kernel_L1_M8_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M8_100
+ ble .Lstrmm_kernel_L1_M8_100
-strmm_kernel_L1_M8_42:
+.Lstrmm_kernel_L1_M8_42:
KERNEL8x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M8_42
+ bgt .Lstrmm_kernel_L1_M8_42
-strmm_kernel_L1_M8_100:
+.Lstrmm_kernel_L1_M8_100:
SAVE8x1
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
-strmm_kernel_L1_M8_END:
+.Lstrmm_kernel_L1_M8_END:
subs counterI, counterI, #1
- bgt strmm_kernel_L1_M8_20
+ bgt .Lstrmm_kernel_L1_M8_20
/******************************************************************************/
-strmm_kernel_L1_M4_BEGIN:
+.Lstrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
tst counterI , #7
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END
tst counterI, #4
- ble strmm_kernel_L1_M2_BEGIN
+ ble .Lstrmm_kernel_L1_M2_BEGIN
-strmm_kernel_L1_M4_20:
+.Lstrmm_kernel_L1_M4_20:
INIT4x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M4_40
+ ble .Lstrmm_kernel_L1_M4_40
.align 5
-strmm_kernel_L1_M4_22:
+.Lstrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M4_22
+ bgt .Lstrmm_kernel_L1_M4_22
-strmm_kernel_L1_M4_40:
+.Lstrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M4_100
+ ble .Lstrmm_kernel_L1_M4_100
-strmm_kernel_L1_M4_42:
+.Lstrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M4_42
+ bgt .Lstrmm_kernel_L1_M4_42
-strmm_kernel_L1_M4_100:
+.Lstrmm_kernel_L1_M4_100:
SAVE4x1
#if defined(LEFT)
add tempOffset, tempOffset, #4
#endif
-strmm_kernel_L1_M4_END:
+.Lstrmm_kernel_L1_M4_END:
/******************************************************************************/
-strmm_kernel_L1_M2_BEGIN:
+.Lstrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble strmm_kernel_L1_M1_BEGIN
+ ble .Lstrmm_kernel_L1_M1_BEGIN
-strmm_kernel_L1_M2_20:
+.Lstrmm_kernel_L1_M2_20:
INIT2x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M2_40
+ ble .Lstrmm_kernel_L1_M2_40
-strmm_kernel_L1_M2_22:
+.Lstrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M2_22
+ bgt .Lstrmm_kernel_L1_M2_22
-strmm_kernel_L1_M2_40:
+.Lstrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M2_100
+ ble .Lstrmm_kernel_L1_M2_100
-strmm_kernel_L1_M2_42:
+.Lstrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M2_42
+ bgt .Lstrmm_kernel_L1_M2_42
-strmm_kernel_L1_M2_100:
+.Lstrmm_kernel_L1_M2_100:
SAVE2x1
#if defined(LEFT)
add tempOffset, tempOffset, #2
#endif
-strmm_kernel_L1_M2_END:
+.Lstrmm_kernel_L1_M2_END:
/******************************************************************************/
-strmm_kernel_L1_M1_BEGIN:
+.Lstrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble strmm_kernel_L1_END
+ ble .Lstrmm_kernel_L1_END
-strmm_kernel_L1_M1_20:
+.Lstrmm_kernel_L1_M1_20:
INIT1x1
#endif
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble strmm_kernel_L1_M1_40
+ ble .Lstrmm_kernel_L1_M1_40
-strmm_kernel_L1_M1_22:
+.Lstrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M1_22
+ bgt .Lstrmm_kernel_L1_M1_22
-strmm_kernel_L1_M1_40:
+.Lstrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble strmm_kernel_L1_M1_100
+ ble .Lstrmm_kernel_L1_M1_100
-strmm_kernel_L1_M1_42:
+.Lstrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt strmm_kernel_L1_M1_42
+ bgt .Lstrmm_kernel_L1_M1_42
-strmm_kernel_L1_M1_100:
+.Lstrmm_kernel_L1_M1_100:
SAVE1x1
-strmm_kernel_L1_END:
+.Lstrmm_kernel_L1_END:
/******************************************************************************/
-strmm_kernel_L999:
+.Lstrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
PROLOGUE
cmp N, xzr
- ble swap_kernel_L999
+ ble .Lswap_kernel_L999
cmp INC_X, #1
- bne swap_kernel_S_BEGIN
+ bne .Lswap_kernel_S_BEGIN
cmp INC_Y, #1
- bne swap_kernel_S_BEGIN
+ bne .Lswap_kernel_S_BEGIN
-swap_kernel_F_BEGIN:
+.Lswap_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
- beq swap_kernel_F1
+ beq .Lswap_kernel_F1
-swap_kernel_F8:
+.Lswap_kernel_F8:
KERNEL_F8
subs I, I, #1
- bne swap_kernel_F8
+ bne .Lswap_kernel_F8
-swap_kernel_F1:
+.Lswap_kernel_F1:
ands I, N, #7
- ble swap_kernel_L999
+ ble .Lswap_kernel_L999
-swap_kernel_F10:
+.Lswap_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne swap_kernel_F10
+ bne .Lswap_kernel_F10
- b swap_kernel_L999
+ b .Lswap_kernel_L999
-swap_kernel_S_BEGIN:
+.Lswap_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
- ble swap_kernel_S1
+ ble .Lswap_kernel_S1
-swap_kernel_S4:
+.Lswap_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne swap_kernel_S4
+ bne .Lswap_kernel_S4
-swap_kernel_S1:
+.Lswap_kernel_S1:
ands I, N, #3
- ble swap_kernel_L999
+ ble .Lswap_kernel_L999
-swap_kernel_S10:
+.Lswap_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne swap_kernel_S10
+ bne .Lswap_kernel_S10
-swap_kernel_L999:
+.Lswap_kernel_L999:
mov w0, wzr
ret
PROLOGUE
cmp N, xzr
- ble amax_kernel_zero
+ ble .Lzamax_kernel_zero
cmp INC_X, xzr
- ble amax_kernel_zero
+ ble .Lzamax_kernel_zero
cmp INC_X, #1
- bne amax_kernel_S_BEGIN
+ bne .Lzamax_kernel_S_BEGIN
-amax_kernel_F_BEGIN:
+.Lzamax_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
- beq amax_kernel_F1_INIT
+ beq .Lzamax_kernel_F1_INIT
INIT_F4
subs I, I, #1
- beq amax_kernel_F1
+ beq .Lzamax_kernel_F1
-amax_kernel_F4:
+.Lzamax_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne amax_kernel_F4
+ bne .Lzamax_kernel_F4
-amax_kernel_F1:
+.Lzamax_kernel_F1:
ands I, N, #3
- ble amax_kernel_L999
+ ble .Lzamax_kernel_L999
-amax_kernel_F10:
+.Lzamax_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne amax_kernel_F10
+ bne .Lzamax_kernel_F10
ret
-amax_kernel_F1_INIT:
+.Lzamax_kernel_F1_INIT:
INIT_F1
subs N, N, #1
- b amax_kernel_F1
+ b .Lzamax_kernel_F1
-amax_kernel_S_BEGIN:
+.Lzamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
- ble amax_kernel_L999
+ ble .Lzamax_kernel_L999
asr I, N, #2
cmp I, xzr
- ble amax_kernel_S1
+ ble .Lzamax_kernel_S1
-amax_kernel_S4:
+.Lzamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne amax_kernel_S4
+ bne .Lzamax_kernel_S4
-amax_kernel_S1:
+.Lzamax_kernel_S1:
ands I, N, #3
- ble amax_kernel_L999
+ ble .Lzamax_kernel_L999
-amax_kernel_S10:
+.Lzamax_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne amax_kernel_S10
+ bne .Lzamax_kernel_S10
-amax_kernel_L999:
+.Lzamax_kernel_L999:
ret
-amax_kernel_zero:
+.Lzamax_kernel_zero:
fmov MAXF, REG0
ret
fmov SUMF, REG0
cmp N, xzr
- ble asum_kernel_L999
+ ble .Lzasum_kernel_L999
cmp INC_X, xzr
- ble asum_kernel_L999
+ ble .Lzasum_kernel_L999
cmp INC_X, #1
- bne asum_kernel_S_BEGIN
+ bne .Lzasum_kernel_S_BEGIN
-asum_kernel_F_BEGIN:
+.Lzasum_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
- beq asum_kernel_F1
+ beq .Lzasum_kernel_F1
-asum_kernel_F4:
+.Lzasum_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne asum_kernel_F4
+ bne .Lzasum_kernel_F4
KERNEL_F4_FINALIZE
-asum_kernel_F1:
+.Lzasum_kernel_F1:
ands I, N, #3
- ble asum_kernel_L999
+ ble .Lzasum_kernel_L999
-asum_kernel_F10:
+.Lzasum_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne asum_kernel_F10
+ bne .Lzasum_kernel_F10
-asum_kernel_L999:
+.Lzasum_kernel_L999:
ret
-asum_kernel_S_BEGIN:
+.Lzasum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
- ble asum_kernel_S1
+ ble .Lzasum_kernel_S1
-asum_kernel_S4:
+.Lzasum_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne asum_kernel_S4
+ bne .Lzasum_kernel_S4
-asum_kernel_S1:
+.Lzasum_kernel_S1:
ands I, N, #3
- ble asum_kernel_L999
+ ble .Lzasum_kernel_L999
-asum_kernel_S10:
+.Lzasum_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne asum_kernel_S10
+ bne .Lzasum_kernel_S10
ret
PROLOGUE
cmp N, xzr
- ble zaxpy_kernel_L999
+ ble .Lzaxpy_kernel_L999
mov Y_COPY, Y
fcmp DA_R, #0.0
bne .L1
fcmp DA_I, #0.0
- beq zaxpy_kernel_L999
+ beq .Lzaxpy_kernel_L999
.L1:
INIT
cmp INC_X, #1
- bne zaxpy_kernel_S_BEGIN
+ bne .Lzaxpy_kernel_S_BEGIN
cmp INC_Y, #1
- bne zaxpy_kernel_S_BEGIN
+ bne .Lzaxpy_kernel_S_BEGIN
-zaxpy_kernel_F_BEGIN:
+.Lzaxpy_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
- beq zaxpy_kernel_F1
+ beq .Lzaxpy_kernel_F1
KERNEL_INIT_F4
-zaxpy_kernel_F4:
+.Lzaxpy_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne zaxpy_kernel_F4
+ bne .Lzaxpy_kernel_F4
-zaxpy_kernel_F1:
+.Lzaxpy_kernel_F1:
ands I, N, #3
- ble zaxpy_kernel_L999
+ ble .Lzaxpy_kernel_L999
-zaxpy_kernel_F10:
+.Lzaxpy_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne zaxpy_kernel_F10
+ bne .Lzaxpy_kernel_F10
mov w0, wzr
ret
-zaxpy_kernel_S_BEGIN:
+.Lzaxpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
- ble zaxpy_kernel_S1
+ ble .Lzaxpy_kernel_S1
-zaxpy_kernel_S4:
+.Lzaxpy_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne zaxpy_kernel_S4
+ bne .Lzaxpy_kernel_S4
-zaxpy_kernel_S1:
+.Lzaxpy_kernel_S1:
ands I, N, #3
- ble zaxpy_kernel_L999
+ ble .Lzaxpy_kernel_L999
-zaxpy_kernel_S10:
+.Lzaxpy_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne zaxpy_kernel_S10
+ bne .Lzaxpy_kernel_S10
-zaxpy_kernel_L999:
+.Lzaxpy_kernel_L999:
mov w0, wzr
ret
#endif
cmp N, xzr
- ble dot_kernel_L999
+ ble .Lzdot_kernel_L999
cmp INC_X, #1
- bne dot_kernel_S_BEGIN
+ bne .Lzdot_kernel_S_BEGIN
cmp INC_Y, #1
- bne dot_kernel_S_BEGIN
+ bne .Lzdot_kernel_S_BEGIN
-dot_kernel_F_BEGIN:
+.Lzdot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
- beq dot_kernel_F1
+ beq .Lzdot_kernel_F1
-dot_kernel_F4:
+.Lzdot_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne dot_kernel_F4
+ bne .Lzdot_kernel_F4
KERNEL_F4_FINALIZE
-dot_kernel_F1:
+.Lzdot_kernel_F1:
ands I, N, #3
- ble dot_kernel_L999
+ ble .Lzdot_kernel_L999
-dot_kernel_F10:
+.Lzdot_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne dot_kernel_F10
+ bne .Lzdot_kernel_F10
ret
-dot_kernel_S_BEGIN:
+.Lzdot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
- ble dot_kernel_S1
+ ble .Lzdot_kernel_S1
-dot_kernel_S4:
+.Lzdot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne dot_kernel_S4
+ bne .Lzdot_kernel_S4
-dot_kernel_S1:
+.Lzdot_kernel_S1:
ands I, N, #3
- ble dot_kernel_L999
+ ble .Lzdot_kernel_L999
-dot_kernel_S10:
+.Lzdot_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne dot_kernel_S10
+ bne .Lzdot_kernel_S10
-dot_kernel_L999:
+.Lzdot_kernel_L999:
ret
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble zgemm_kernel_L2_BEGIN
+ ble .Lzgemm_kernel_L2_BEGIN
-zgemm_kernel_L4_BEGIN:
+.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
mov pA, origPA // pA = start of A array
-zgemm_kernel_L4_M4_BEGIN:
+.Lzgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble zgemm_kernel_L4_M2_BEGIN
+ ble .Lzgemm_kernel_L4_M2_BEGIN
.align 5
-zgemm_kernel_L4_M4_20:
+.Lzgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
- blt zgemm_kernel_L4_M4_32
+ blt .Lzgemm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
- ble zgemm_kernel_L4_M4_22a
+ ble .Lzgemm_kernel_L4_M4_22a
.align 5
-zgemm_kernel_L4_M4_22:
+.Lzgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M4_22
+ bgt .Lzgemm_kernel_L4_M4_22
.align 5
-zgemm_kernel_L4_M4_22a:
+.Lzgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
- b zgemm_kernel_L4_M4_44
+ b .Lzgemm_kernel_L4_M4_44
.align 5
-zgemm_kernel_L4_M4_32:
+.Lzgemm_kernel_L4_M4_32:
tst counterL, #1
- ble zgemm_kernel_L4_M4_40
+ ble .Lzgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
- b zgemm_kernel_L4_M4_44
+ b .Lzgemm_kernel_L4_M4_44
-zgemm_kernel_L4_M4_40:
+.Lzgemm_kernel_L4_M4_40:
INIT4x4
-zgemm_kernel_L4_M4_44:
+.Lzgemm_kernel_L4_M4_44:
ands counterL , origK, #7
- ble zgemm_kernel_L4_M4_100
+ ble .Lzgemm_kernel_L4_M4_100
.align 5
-zgemm_kernel_L4_M4_46:
+.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
- bne zgemm_kernel_L4_M4_46
+ bne .Lzgemm_kernel_L4_M4_46
-zgemm_kernel_L4_M4_100:
+.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4
-zgemm_kernel_L4_M4_END:
+.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
- bne zgemm_kernel_L4_M4_20
+ bne .Lzgemm_kernel_L4_M4_20
-zgemm_kernel_L4_M2_BEGIN:
+.Lzgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble zgemm_kernel_L4_END
+ ble .Lzgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L4_M1_BEGIN
+ ble .Lzgemm_kernel_L4_M1_BEGIN
-zgemm_kernel_L4_M2_20:
+.Lzgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L4_M2_40
+ ble .Lzgemm_kernel_L4_M2_40
-zgemm_kernel_L4_M2_22:
+.Lzgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M2_22
+ bgt .Lzgemm_kernel_L4_M2_22
-zgemm_kernel_L4_M2_40:
+.Lzgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L4_M2_100
+ ble .Lzgemm_kernel_L4_M2_100
-zgemm_kernel_L4_M2_42:
+.Lzgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M2_42
+ bgt .Lzgemm_kernel_L4_M2_42
-zgemm_kernel_L4_M2_100:
+.Lzgemm_kernel_L4_M2_100:
SAVE2x4
-zgemm_kernel_L4_M2_END:
+.Lzgemm_kernel_L4_M2_END:
-zgemm_kernel_L4_M1_BEGIN:
+.Lzgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L4_END
+ ble .Lzgemm_kernel_L4_END
-zgemm_kernel_L4_M1_20:
+.Lzgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L4_M1_40
+ ble .Lzgemm_kernel_L4_M1_40
-zgemm_kernel_L4_M1_22:
+.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M1_22
+ bgt .Lzgemm_kernel_L4_M1_22
-zgemm_kernel_L4_M1_40:
+.Lzgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L4_M1_100
+ ble .Lzgemm_kernel_L4_M1_100
-zgemm_kernel_L4_M1_42:
+.Lzgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M1_42
+ bgt .Lzgemm_kernel_L4_M1_42
-zgemm_kernel_L4_M1_100:
+.Lzgemm_kernel_L4_M1_100:
SAVE1x4
-zgemm_kernel_L4_END:
+.Lzgemm_kernel_L4_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
subs counterJ, counterJ , #1 // j--
- bgt zgemm_kernel_L4_BEGIN
+ bgt .Lzgemm_kernel_L4_BEGIN
/******************************************************************************/
-zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble zgemm_kernel_L999
+ ble .Lzgemm_kernel_L999
tst counterJ , #2
- ble zgemm_kernel_L1_BEGIN
+ ble .Lzgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
-zgemm_kernel_L2_M4_BEGIN:
+.Lzgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble zgemm_kernel_L2_M2_BEGIN
+ ble .Lzgemm_kernel_L2_M2_BEGIN
-zgemm_kernel_L2_M4_20:
+.Lzgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble zgemm_kernel_L2_M4_40
+ ble .Lzgemm_kernel_L2_M4_40
.align 5
-zgemm_kernel_L2_M4_22:
+.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M4_22
+ bgt .Lzgemm_kernel_L2_M4_22
-zgemm_kernel_L2_M4_40:
+.Lzgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M4_100
+ ble .Lzgemm_kernel_L2_M4_100
-zgemm_kernel_L2_M4_42:
+.Lzgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M4_42
+ bgt .Lzgemm_kernel_L2_M4_42
-zgemm_kernel_L2_M4_100:
+.Lzgemm_kernel_L2_M4_100:
SAVE4x2
-zgemm_kernel_L2_M4_END:
+.Lzgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt zgemm_kernel_L2_M4_20
+ bgt .Lzgemm_kernel_L2_M4_20
-zgemm_kernel_L2_M2_BEGIN:
+.Lzgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble zgemm_kernel_L2_END
+ ble .Lzgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L2_M1_BEGIN
+ ble .Lzgemm_kernel_L2_M1_BEGIN
-zgemm_kernel_L2_M2_20:
+.Lzgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble zgemm_kernel_L2_M2_40
+ ble .Lzgemm_kernel_L2_M2_40
-zgemm_kernel_L2_M2_22:
+.Lzgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M2_22
+ bgt .Lzgemm_kernel_L2_M2_22
-zgemm_kernel_L2_M2_40:
+.Lzgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M2_100
+ ble .Lzgemm_kernel_L2_M2_100
-zgemm_kernel_L2_M2_42:
+.Lzgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M2_42
+ bgt .Lzgemm_kernel_L2_M2_42
-zgemm_kernel_L2_M2_100:
+.Lzgemm_kernel_L2_M2_100:
SAVE2x2
-zgemm_kernel_L2_M2_END:
+.Lzgemm_kernel_L2_M2_END:
-zgemm_kernel_L2_M1_BEGIN:
+.Lzgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L2_END
+ ble .Lzgemm_kernel_L2_END
-zgemm_kernel_L2_M1_20:
+.Lzgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble zgemm_kernel_L2_M1_40
+ ble .Lzgemm_kernel_L2_M1_40
-zgemm_kernel_L2_M1_22:
+.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M1_22
+ bgt .Lzgemm_kernel_L2_M1_22
-zgemm_kernel_L2_M1_40:
+.Lzgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M1_100
+ ble .Lzgemm_kernel_L2_M1_100
-zgemm_kernel_L2_M1_42:
+.Lzgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M1_42
+ bgt .Lzgemm_kernel_L2_M1_42
-zgemm_kernel_L2_M1_100:
+.Lzgemm_kernel_L2_M1_100:
SAVE1x2
-zgemm_kernel_L2_END:
+.Lzgemm_kernel_L2_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2
/******************************************************************************/
-zgemm_kernel_L1_BEGIN:
+.Lzgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble zgemm_kernel_L999 // done
+ ble .Lzgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
-zgemm_kernel_L1_M4_BEGIN:
+.Lzgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble zgemm_kernel_L1_M2_BEGIN
+ ble .Lzgemm_kernel_L1_M2_BEGIN
-zgemm_kernel_L1_M4_20:
+.Lzgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L1_M4_40
+ ble .Lzgemm_kernel_L1_M4_40
.align 5
-zgemm_kernel_L1_M4_22:
+.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M4_22
+ bgt .Lzgemm_kernel_L1_M4_22
-zgemm_kernel_L1_M4_40:
+.Lzgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M4_100
+ ble .Lzgemm_kernel_L1_M4_100
-zgemm_kernel_L1_M4_42:
+.Lzgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M4_42
+ bgt .Lzgemm_kernel_L1_M4_42
-zgemm_kernel_L1_M4_100:
+.Lzgemm_kernel_L1_M4_100:
SAVE4x1
-zgemm_kernel_L1_M4_END:
+.Lzgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt zgemm_kernel_L1_M4_20
+ bgt .Lzgemm_kernel_L1_M4_20
-zgemm_kernel_L1_M2_BEGIN:
+.Lzgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble zgemm_kernel_L1_END
+ ble .Lzgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L1_M1_BEGIN
+ ble .Lzgemm_kernel_L1_M1_BEGIN
-zgemm_kernel_L1_M2_20:
+.Lzgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L1_M2_40
+ ble .Lzgemm_kernel_L1_M2_40
-zgemm_kernel_L1_M2_22:
+.Lzgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M2_22
+ bgt .Lzgemm_kernel_L1_M2_22
-zgemm_kernel_L1_M2_40:
+.Lzgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M2_100
+ ble .Lzgemm_kernel_L1_M2_100
-zgemm_kernel_L1_M2_42:
+.Lzgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M2_42
+ bgt .Lzgemm_kernel_L1_M2_42
-zgemm_kernel_L1_M2_100:
+.Lzgemm_kernel_L1_M2_100:
SAVE2x1
-zgemm_kernel_L1_M2_END:
+.Lzgemm_kernel_L1_M2_END:
-zgemm_kernel_L1_M1_BEGIN:
+.Lzgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L1_END
+ ble .Lzgemm_kernel_L1_END
-zgemm_kernel_L1_M1_20:
+.Lzgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L1_M1_40
+ ble .Lzgemm_kernel_L1_M1_40
-zgemm_kernel_L1_M1_22:
+.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M1_22
+ bgt .Lzgemm_kernel_L1_M1_22
-zgemm_kernel_L1_M1_40:
+.Lzgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M1_100
+ ble .Lzgemm_kernel_L1_M1_100
-zgemm_kernel_L1_M1_42:
+.Lzgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M1_42
+ bgt .Lzgemm_kernel_L1_M1_42
-zgemm_kernel_L1_M1_100:
+.Lzgemm_kernel_L1_M1_100:
SAVE1x1
-zgemm_kernel_L1_END:
+.Lzgemm_kernel_L1_END:
-zgemm_kernel_L999:
+.Lzgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble zgemm_kernel_L2_BEGIN
+ ble .Lzgemm_kernel_L2_BEGIN
-zgemm_kernel_L4_BEGIN:
+.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
mov pA, origPA // pA = start of A array
-zgemm_kernel_L4_M4_BEGIN:
+.Lzgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble zgemm_kernel_L4_M2_BEGIN
+ ble .Lzgemm_kernel_L4_M2_BEGIN
.align 5
-zgemm_kernel_L4_M4_20:
+.Lzgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #3
cmp counterL , #2
- blt zgemm_kernel_L4_M4_32
+ blt .Lzgemm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
- ble zgemm_kernel_L4_M4_22a
+ ble .Lzgemm_kernel_L4_M4_22a
.align 5
-zgemm_kernel_L4_M4_22:
+.Lzgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M4_22
+ bgt .Lzgemm_kernel_L4_M4_22
.align 5
-zgemm_kernel_L4_M4_22a:
+.Lzgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
- b zgemm_kernel_L4_M4_44
+ b .Lzgemm_kernel_L4_M4_44
.align 5
-zgemm_kernel_L4_M4_32:
+.Lzgemm_kernel_L4_M4_32:
tst counterL, #1
- ble zgemm_kernel_L4_M4_40
+ ble .Lzgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
- b zgemm_kernel_L4_M4_44
+ b .Lzgemm_kernel_L4_M4_44
-zgemm_kernel_L4_M4_40:
+.Lzgemm_kernel_L4_M4_40:
INIT4x4
-zgemm_kernel_L4_M4_44:
+.Lzgemm_kernel_L4_M4_44:
ands counterL , origK, #7
- ble zgemm_kernel_L4_M4_100
+ ble .Lzgemm_kernel_L4_M4_100
.align 5
-zgemm_kernel_L4_M4_46:
+.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
- bne zgemm_kernel_L4_M4_46
+ bne .Lzgemm_kernel_L4_M4_46
-zgemm_kernel_L4_M4_100:
+.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4
-zgemm_kernel_L4_M4_END:
+.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1
- bne zgemm_kernel_L4_M4_20
+ bne .Lzgemm_kernel_L4_M4_20
-zgemm_kernel_L4_M2_BEGIN:
+.Lzgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble zgemm_kernel_L4_END
+ ble .Lzgemm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L4_M1_BEGIN
+ ble .Lzgemm_kernel_L4_M1_BEGIN
-zgemm_kernel_L4_M2_20:
+.Lzgemm_kernel_L4_M2_20:
INIT2x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L4_M2_40
+ ble .Lzgemm_kernel_L4_M2_40
-zgemm_kernel_L4_M2_22:
+.Lzgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M2_22
+ bgt .Lzgemm_kernel_L4_M2_22
-zgemm_kernel_L4_M2_40:
+.Lzgemm_kernel_L4_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L4_M2_100
+ ble .Lzgemm_kernel_L4_M2_100
-zgemm_kernel_L4_M2_42:
+.Lzgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M2_42
+ bgt .Lzgemm_kernel_L4_M2_42
-zgemm_kernel_L4_M2_100:
+.Lzgemm_kernel_L4_M2_100:
SAVE2x4
-zgemm_kernel_L4_M2_END:
+.Lzgemm_kernel_L4_M2_END:
-zgemm_kernel_L4_M1_BEGIN:
+.Lzgemm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L4_END
+ ble .Lzgemm_kernel_L4_END
-zgemm_kernel_L4_M1_20:
+.Lzgemm_kernel_L4_M1_20:
INIT1x4
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L4_M1_40
+ ble .Lzgemm_kernel_L4_M1_40
-zgemm_kernel_L4_M1_22:
+.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M1_22
+ bgt .Lzgemm_kernel_L4_M1_22
-zgemm_kernel_L4_M1_40:
+.Lzgemm_kernel_L4_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L4_M1_100
+ ble .Lzgemm_kernel_L4_M1_100
-zgemm_kernel_L4_M1_42:
+.Lzgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L4_M1_42
+ bgt .Lzgemm_kernel_L4_M1_42
-zgemm_kernel_L4_M1_100:
+.Lzgemm_kernel_L4_M1_100:
SAVE1x4
-zgemm_kernel_L4_END:
+.Lzgemm_kernel_L4_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
subs counterJ, counterJ , #1 // j--
- bgt zgemm_kernel_L4_BEGIN
+ bgt .Lzgemm_kernel_L4_BEGIN
/******************************************************************************/
-zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble zgemm_kernel_L999
+ ble .Lzgemm_kernel_L999
tst counterJ , #2
- ble zgemm_kernel_L1_BEGIN
+ ble .Lzgemm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
-zgemm_kernel_L2_M4_BEGIN:
+.Lzgemm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble zgemm_kernel_L2_M2_BEGIN
+ ble .Lzgemm_kernel_L2_M2_BEGIN
-zgemm_kernel_L2_M4_20:
+.Lzgemm_kernel_L2_M4_20:
INIT4x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble zgemm_kernel_L2_M4_40
+ ble .Lzgemm_kernel_L2_M4_40
.align 5
-zgemm_kernel_L2_M4_22:
+.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M4_22
+ bgt .Lzgemm_kernel_L2_M4_22
-zgemm_kernel_L2_M4_40:
+.Lzgemm_kernel_L2_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M4_100
+ ble .Lzgemm_kernel_L2_M4_100
-zgemm_kernel_L2_M4_42:
+.Lzgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M4_42
+ bgt .Lzgemm_kernel_L2_M4_42
-zgemm_kernel_L2_M4_100:
+.Lzgemm_kernel_L2_M4_100:
SAVE4x2
-zgemm_kernel_L2_M4_END:
+.Lzgemm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt zgemm_kernel_L2_M4_20
+ bgt .Lzgemm_kernel_L2_M4_20
-zgemm_kernel_L2_M2_BEGIN:
+.Lzgemm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble zgemm_kernel_L2_END
+ ble .Lzgemm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L2_M1_BEGIN
+ ble .Lzgemm_kernel_L2_M1_BEGIN
-zgemm_kernel_L2_M2_20:
+.Lzgemm_kernel_L2_M2_20:
INIT2x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble zgemm_kernel_L2_M2_40
+ ble .Lzgemm_kernel_L2_M2_40
-zgemm_kernel_L2_M2_22:
+.Lzgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M2_22
+ bgt .Lzgemm_kernel_L2_M2_22
-zgemm_kernel_L2_M2_40:
+.Lzgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M2_100
+ ble .Lzgemm_kernel_L2_M2_100
-zgemm_kernel_L2_M2_42:
+.Lzgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M2_42
+ bgt .Lzgemm_kernel_L2_M2_42
-zgemm_kernel_L2_M2_100:
+.Lzgemm_kernel_L2_M2_100:
SAVE2x2
-zgemm_kernel_L2_M2_END:
+.Lzgemm_kernel_L2_M2_END:
-zgemm_kernel_L2_M1_BEGIN:
+.Lzgemm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L2_END
+ ble .Lzgemm_kernel_L2_END
-zgemm_kernel_L2_M1_20:
+.Lzgemm_kernel_L2_M1_20:
INIT1x2
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble zgemm_kernel_L2_M1_40
+ ble .Lzgemm_kernel_L2_M1_40
-zgemm_kernel_L2_M1_22:
+.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M1_22
+ bgt .Lzgemm_kernel_L2_M1_22
-zgemm_kernel_L2_M1_40:
+.Lzgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L2_M1_100
+ ble .Lzgemm_kernel_L2_M1_100
-zgemm_kernel_L2_M1_42:
+.Lzgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L2_M1_42
+ bgt .Lzgemm_kernel_L2_M1_42
-zgemm_kernel_L2_M1_100:
+.Lzgemm_kernel_L2_M1_100:
SAVE1x2
-zgemm_kernel_L2_END:
+.Lzgemm_kernel_L2_END:
lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2
/******************************************************************************/
-zgemm_kernel_L1_BEGIN:
+.Lzgemm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble zgemm_kernel_L999 // done
+ ble .Lzgemm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
-zgemm_kernel_L1_M4_BEGIN:
+.Lzgemm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble zgemm_kernel_L1_M2_BEGIN
+ ble .Lzgemm_kernel_L1_M2_BEGIN
-zgemm_kernel_L1_M4_20:
+.Lzgemm_kernel_L1_M4_20:
INIT4x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L1_M4_40
+ ble .Lzgemm_kernel_L1_M4_40
.align 5
-zgemm_kernel_L1_M4_22:
+.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M4_22
+ bgt .Lzgemm_kernel_L1_M4_22
-zgemm_kernel_L1_M4_40:
+.Lzgemm_kernel_L1_M4_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M4_100
+ ble .Lzgemm_kernel_L1_M4_100
-zgemm_kernel_L1_M4_42:
+.Lzgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M4_42
+ bgt .Lzgemm_kernel_L1_M4_42
-zgemm_kernel_L1_M4_100:
+.Lzgemm_kernel_L1_M4_100:
SAVE4x1
-zgemm_kernel_L1_M4_END:
+.Lzgemm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt zgemm_kernel_L1_M4_20
+ bgt .Lzgemm_kernel_L1_M4_20
-zgemm_kernel_L1_M2_BEGIN:
+.Lzgemm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble zgemm_kernel_L1_END
+ ble .Lzgemm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble zgemm_kernel_L1_M1_BEGIN
+ ble .Lzgemm_kernel_L1_M1_BEGIN
-zgemm_kernel_L1_M2_20:
+.Lzgemm_kernel_L1_M2_20:
INIT2x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L1_M2_40
+ ble .Lzgemm_kernel_L1_M2_40
-zgemm_kernel_L1_M2_22:
+.Lzgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M2_22
+ bgt .Lzgemm_kernel_L1_M2_22
-zgemm_kernel_L1_M2_40:
+.Lzgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M2_100
+ ble .Lzgemm_kernel_L1_M2_100
-zgemm_kernel_L1_M2_42:
+.Lzgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M2_42
+ bgt .Lzgemm_kernel_L1_M2_42
-zgemm_kernel_L1_M2_100:
+.Lzgemm_kernel_L1_M2_100:
SAVE2x1
-zgemm_kernel_L1_M2_END:
+.Lzgemm_kernel_L1_M2_END:
-zgemm_kernel_L1_M1_BEGIN:
+.Lzgemm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble zgemm_kernel_L1_END
+ ble .Lzgemm_kernel_L1_END
-zgemm_kernel_L1_M1_20:
+.Lzgemm_kernel_L1_M1_20:
INIT1x1
mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble zgemm_kernel_L1_M1_40
+ ble .Lzgemm_kernel_L1_M1_40
-zgemm_kernel_L1_M1_22:
+.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M1_22
+ bgt .Lzgemm_kernel_L1_M1_22
-zgemm_kernel_L1_M1_40:
+.Lzgemm_kernel_L1_M1_40:
ands counterL , origK, #7 // counterL = counterL % 8
- ble zgemm_kernel_L1_M1_100
+ ble .Lzgemm_kernel_L1_M1_100
-zgemm_kernel_L1_M1_42:
+.Lzgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt zgemm_kernel_L1_M1_42
+ bgt .Lzgemm_kernel_L1_M1_42
-zgemm_kernel_L1_M1_100:
+.Lzgemm_kernel_L1_M1_100:
SAVE1x1
-zgemm_kernel_L1_END:
+.Lzgemm_kernel_L1_END:
-zgemm_kernel_L999:
+.Lzgemm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
SAVE_REGS
cmp N, xzr
- ble zgemv_n_kernel_L999
+ ble .Lzgemv_n_kernel_L999
cmp M, xzr
- ble zgemv_n_kernel_L999
+ ble .Lzgemv_n_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
INIT
cmp INC_Y, #1
- bne zgemv_n_kernel_S_BEGIN
+ bne .Lzgemv_n_kernel_S_BEGIN
-zgemv_n_kernel_F_LOOP:
+.Lzgemv_n_kernel_F_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
asr I, M, #2
cmp I, xzr
- beq zgemv_n_kernel_F1
+ beq .Lzgemv_n_kernel_F1
-zgemv_n_kernel_F4:
+.Lzgemv_n_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne zgemv_n_kernel_F4
+ bne .Lzgemv_n_kernel_F4
-zgemv_n_kernel_F1:
+.Lzgemv_n_kernel_F1:
ands I, M, #3
- ble zgemv_n_kernel_F_END
+ ble .Lzgemv_n_kernel_F_END
-zgemv_n_kernel_F10:
+.Lzgemv_n_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne zgemv_n_kernel_F10
+ bne .Lzgemv_n_kernel_F10
-zgemv_n_kernel_F_END:
+.Lzgemv_n_kernel_F_END:
add A, A, LDA
subs J, J, #1
- bne zgemv_n_kernel_F_LOOP
+ bne .Lzgemv_n_kernel_F_LOOP
- b zgemv_n_kernel_L999
+ b .Lzgemv_n_kernel_L999
-zgemv_n_kernel_S_BEGIN:
+.Lzgemv_n_kernel_S_BEGIN:
INIT_S
-zgemv_n_kernel_S_LOOP:
+.Lzgemv_n_kernel_S_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
asr I, M, #2
cmp I, xzr
- ble zgemv_n_kernel_S1
+ ble .Lzgemv_n_kernel_S1
-zgemv_n_kernel_S4:
+.Lzgemv_n_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne zgemv_n_kernel_S4
+ bne .Lzgemv_n_kernel_S4
-zgemv_n_kernel_S1:
+.Lzgemv_n_kernel_S1:
ands I, M, #3
- ble zgemv_n_kernel_S_END
+ ble .Lzgemv_n_kernel_S_END
-zgemv_n_kernel_S10:
+.Lzgemv_n_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne zgemv_n_kernel_S10
+ bne .Lzgemv_n_kernel_S10
-zgemv_n_kernel_S_END:
+.Lzgemv_n_kernel_S_END:
add A, A, LDA
subs J, J, #1
- bne zgemv_n_kernel_S_LOOP
+ bne .Lzgemv_n_kernel_S_LOOP
-zgemv_n_kernel_L999:
+.Lzgemv_n_kernel_L999:
RESTORE_REGS
mov w0, wzr
SAVE_REGS
cmp N, xzr
- ble zgemv_t_kernel_L999
+ ble .Lzgemv_t_kernel_L999
cmp M, xzr
- ble zgemv_t_kernel_L999
+ ble .Lzgemv_t_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
INIT
cmp INC_X, #1
- bne zgemv_t_kernel_S_BEGIN
+ bne .Lzgemv_t_kernel_S_BEGIN
-zgemv_t_kernel_F_LOOP:
+.Lzgemv_t_kernel_F_LOOP:
mov A_PTR, A
mov X_PTR, X
asr I, M, #2
cmp I, xzr
- beq zgemv_t_kernel_F1
+ beq .Lzgemv_t_kernel_F1
-zgemv_t_kernel_F4:
+.Lzgemv_t_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne zgemv_t_kernel_F4
+ bne .Lzgemv_t_kernel_F4
KERNEL_F4_FINALIZE
-zgemv_t_kernel_F1:
+.Lzgemv_t_kernel_F1:
ands I, M, #3
- ble zgemv_t_kernel_F_END
+ ble .Lzgemv_t_kernel_F_END
-zgemv_t_kernel_F10:
+.Lzgemv_t_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne zgemv_t_kernel_F10
+ bne .Lzgemv_t_kernel_F10
-zgemv_t_kernel_F_END:
+.Lzgemv_t_kernel_F_END:
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
add A, A, LDA
subs J, J, #1
- bne zgemv_t_kernel_F_LOOP
+ bne .Lzgemv_t_kernel_F_LOOP
- b zgemv_t_kernel_L999
+ b .Lzgemv_t_kernel_L999
-zgemv_t_kernel_S_BEGIN:
+.Lzgemv_t_kernel_S_BEGIN:
INIT_S
-zgemv_t_kernel_S_LOOP:
+.Lzgemv_t_kernel_S_LOOP:
mov A_PTR, A
mov X_PTR, X
asr I, M, #2
cmp I, xzr
- ble zgemv_t_kernel_S1
+ ble .Lzgemv_t_kernel_S1
-zgemv_t_kernel_S4:
+.Lzgemv_t_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne zgemv_t_kernel_S4
+ bne .Lzgemv_t_kernel_S4
-zgemv_t_kernel_S1:
+.Lzgemv_t_kernel_S1:
ands I, M, #3
- ble zgemv_t_kernel_S_END
+ ble .Lzgemv_t_kernel_S_END
-zgemv_t_kernel_S10:
+.Lzgemv_t_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne zgemv_t_kernel_S10
+ bne .Lzgemv_t_kernel_S10
-zgemv_t_kernel_S_END:
+.Lzgemv_t_kernel_S_END:
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
add A, A, LDA
subs J, J, #1
- bne zgemv_t_kernel_S_LOOP
+ bne .Lzgemv_t_kernel_S_LOOP
-zgemv_t_kernel_L999:
+.Lzgemv_t_kernel_L999:
RESTORE_REGS
mov w0, wzr
ret
INIT
cmp N, #0
- ble nrm2_kernel_L999
+ ble .Lznrm2_kernel_L999
cmp INC_X, #0
- beq nrm2_kernel_L999
+ beq .Lznrm2_kernel_L999
cmp INC_X, #1
- bne nrm2_kernel_S_BEGIN
+ bne .Lznrm2_kernel_S_BEGIN
-nrm2_kernel_F_BEGIN:
+.Lznrm2_kernel_F_BEGIN:
asr I, N, #3 // I = N / 8
cmp I, xzr
- ble nrm2_kernel_F1
+ ble .Lznrm2_kernel_F1
-nrm2_kernel_F8:
+.Lznrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
- bne nrm2_kernel_F8
+ bne .Lznrm2_kernel_F8
-nrm2_kernel_F1:
+.Lznrm2_kernel_F1:
ands I, N, #7
- ble nrm2_kernel_L999
+ ble .Lznrm2_kernel_L999
-nrm2_kernel_F10:
+.Lznrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne nrm2_kernel_F10
+ bne .Lznrm2_kernel_F10
- b nrm2_kernel_L999
+ b .Lznrm2_kernel_L999
-nrm2_kernel_S_BEGIN:
+.Lznrm2_kernel_S_BEGIN:
INIT_S
.align 5
-nrm2_kernel_S10:
+.Lznrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne nrm2_kernel_S10
+ bne .Lznrm2_kernel_S10
-nrm2_kernel_L999:
+.Lznrm2_kernel_L999:
fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ
PROLOGUE
cmp N, xzr
- ble rot_kernel_L999
+ ble .Lzrot_kernel_L999
INIT
cmp INC_X, #1
- bne rot_kernel_S_BEGIN
+ bne .Lzrot_kernel_S_BEGIN
cmp INC_Y, #1
- bne rot_kernel_S_BEGIN
+ bne .Lzrot_kernel_S_BEGIN
-rot_kernel_F_BEGIN:
+.Lzrot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
- beq rot_kernel_F1
+ beq .Lzrot_kernel_F1
KERNEL_INIT_F4
-rot_kernel_F4:
+.Lzrot_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne rot_kernel_F4
+ bne .Lzrot_kernel_F4
-rot_kernel_F1:
+.Lzrot_kernel_F1:
ands I, N, #3
- ble rot_kernel_L999
+ ble .Lzrot_kernel_L999
-rot_kernel_F10:
+.Lzrot_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne rot_kernel_F10
+ bne .Lzrot_kernel_F10
mov w0, wzr
ret
-rot_kernel_S_BEGIN:
+.Lzrot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
- ble rot_kernel_S1
+ ble .Lzrot_kernel_S1
-rot_kernel_S4:
+.Lzrot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne rot_kernel_S4
+ bne .Lzrot_kernel_S4
-rot_kernel_S1:
+.Lzrot_kernel_S1:
ands I, N, #3
- ble rot_kernel_L999
+ ble .Lzrot_kernel_L999
-rot_kernel_S10:
+.Lzrot_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne rot_kernel_S10
+ bne .Lzrot_kernel_S10
-rot_kernel_L999:
+.Lzrot_kernel_L999:
mov w0, wzr
ret
mov X_COPY, X
cmp N, xzr
- ble zscal_kernel_L999
+ ble .Lzscal_kernel_L999
fcmp DA_R, #0.0
- bne zscal_kernel_R_non_zero
+ bne .Lzscal_kernel_R_non_zero
fcmp DA_I, #0.0
- beq zscal_kernel_RI_zero
+ beq .Lzscal_kernel_RI_zero
- b zscal_kernel_R_zero
+ b .Lzscal_kernel_R_zero
-zscal_kernel_R_non_zero:
+.Lzscal_kernel_R_non_zero:
fcmp DA_I, #0.0
- beq zscal_kernel_I_zero
+ beq .Lzscal_kernel_I_zero
/*******************************************************************************
* A_R != 0 && A_I != 0
*******************************************************************************/
-zscal_kernel_RI_non_zero:
+.Lzscal_kernel_RI_non_zero:
INIT
cmp INC_X, #1
- bne zscal_kernel_S_BEGIN
+ bne .Lzscal_kernel_S_BEGIN
-zscal_kernel_F_BEGIN:
+.Lzscal_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
- beq zscal_kernel_F1
+ beq .Lzscal_kernel_F1
KERNEL_INIT_F4
-zscal_kernel_F4:
+.Lzscal_kernel_F4:
KERNEL_F4
subs I, I, #1
- bne zscal_kernel_F4
+ bne .Lzscal_kernel_F4
-zscal_kernel_F1:
+.Lzscal_kernel_F1:
ands I, N, #3
- ble zscal_kernel_L999
+ ble .Lzscal_kernel_L999
-zscal_kernel_F10:
+.Lzscal_kernel_F10:
KERNEL_F1
subs I, I, #1
- bne zscal_kernel_F10
+ bne .Lzscal_kernel_F10
mov w0, wzr
ret
-zscal_kernel_S_BEGIN:
+.Lzscal_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
- ble zscal_kernel_S1
+ ble .Lzscal_kernel_S1
-zscal_kernel_S4:
+.Lzscal_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
- bne zscal_kernel_S4
+ bne .Lzscal_kernel_S4
-zscal_kernel_S1:
+.Lzscal_kernel_S1:
ands I, N, #3
- ble zscal_kernel_L999
+ ble .Lzscal_kernel_L999
-zscal_kernel_S10:
+.Lzscal_kernel_S10:
KERNEL_S1
subs I, I, #1
- bne zscal_kernel_S10
+ bne .Lzscal_kernel_S10
-zscal_kernel_L999:
+.Lzscal_kernel_L999:
mov w0, wzr
ret
* A_R == 0 && A_I != 0
*******************************************************************************/
-zscal_kernel_R_zero:
+.Lzscal_kernel_R_zero:
INIT_S
#if !defined(DOUBLE)
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
#endif
-zscal_kernel_R_zero_1:
+.Lzscal_kernel_R_zero_1:
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0
#endif
add X, X, INC_X
subs N, N, #1
- bne zscal_kernel_R_zero_1
+ bne .Lzscal_kernel_R_zero_1
mov w0, wzr
ret
* A_R != 0 && A_I == 0
*******************************************************************************/
-zscal_kernel_I_zero:
+.Lzscal_kernel_I_zero:
INIT_S
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
#endif
-zscal_kernel_I_zero_1:
+.Lzscal_kernel_I_zero_1:
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
#endif
add X, X, INC_X
subs N, N, #1
- bne zscal_kernel_I_zero_1
+ bne .Lzscal_kernel_I_zero_1
mov w0, wzr
ret
* A_R == 0 && A_I == 0
*******************************************************************************/
-zscal_kernel_RI_zero:
+.Lzscal_kernel_RI_zero:
INIT_S
-zscal_kernel_RI_zero_1:
+.Lzscal_kernel_RI_zero_1:
stp DA_R, DA_I, [X]
add X, X, INC_X
subs N, N, #1
- bne zscal_kernel_RI_zero_1
+ bne .Lzscal_kernel_RI_zero_1
mov w0, wzr
ret
mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0
- ble ztrmm_kernel_L2_BEGIN
+ ble .Lztrmm_kernel_L2_BEGIN
-ztrmm_kernel_L4_BEGIN:
+.Lztrmm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
#endif
mov pA, origPA // pA = start of A array
-ztrmm_kernel_L4_M4_BEGIN:
+.Lztrmm_kernel_L4_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble ztrmm_kernel_L4_M2_BEGIN
+ ble .Lztrmm_kernel_L4_M2_BEGIN
.align 5
-ztrmm_kernel_L4_M4_20:
+.Lztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB
asr counterL , tempK, #3
cmp counterL , #2
- blt ztrmm_kernel_L4_M4_32
+ blt .Lztrmm_kernel_L4_M4_32
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M2
subs counterL, counterL, #2
- ble ztrmm_kernel_L4_M4_22a
+ ble .Lztrmm_kernel_L4_M4_22a
.align 5
-ztrmm_kernel_L4_M4_22:
+.Lztrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M2
subs counterL, counterL, #1
- bgt ztrmm_kernel_L4_M4_22
+ bgt .Lztrmm_kernel_L4_M4_22
.align 5
-ztrmm_kernel_L4_M4_22a:
+.Lztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
- b ztrmm_kernel_L4_M4_44
+ b .Lztrmm_kernel_L4_M4_44
.align 5
-ztrmm_kernel_L4_M4_32:
+.Lztrmm_kernel_L4_M4_32:
tst counterL, #1
- ble ztrmm_kernel_L4_M4_40
+ ble .Lztrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
- b ztrmm_kernel_L4_M4_44
+ b .Lztrmm_kernel_L4_M4_44
-ztrmm_kernel_L4_M4_40:
+.Lztrmm_kernel_L4_M4_40:
INIT4x4
-ztrmm_kernel_L4_M4_44:
+.Lztrmm_kernel_L4_M4_44:
ands counterL , tempK, #7
- ble ztrmm_kernel_L4_M4_100
+ ble .Lztrmm_kernel_L4_M4_100
.align 5
-ztrmm_kernel_L4_M4_46:
+.Lztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
- bne ztrmm_kernel_L4_M4_46
+ bne .Lztrmm_kernel_L4_M4_46
-ztrmm_kernel_L4_M4_100:
+.Lztrmm_kernel_L4_M4_100:
SAVE4x4
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
-ztrmm_kernel_L4_M4_END:
+.Lztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
- bne ztrmm_kernel_L4_M4_20
+ bne .Lztrmm_kernel_L4_M4_20
-ztrmm_kernel_L4_M2_BEGIN:
+.Lztrmm_kernel_L4_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble ztrmm_kernel_L4_END
+ ble .Lztrmm_kernel_L4_END
tst counterI, #2 // counterI = counterI / 2
- ble ztrmm_kernel_L4_M1_BEGIN
+ ble .Lztrmm_kernel_L4_M1_BEGIN
-ztrmm_kernel_L4_M2_20:
+.Lztrmm_kernel_L4_M2_20:
INIT2x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ztrmm_kernel_L4_M2_40
+ ble .Lztrmm_kernel_L4_M2_40
-ztrmm_kernel_L4_M2_22:
+.Lztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB
KERNEL2x4_SUB
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L4_M2_22
+ bgt .Lztrmm_kernel_L4_M2_22
-ztrmm_kernel_L4_M2_40:
+.Lztrmm_kernel_L4_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ztrmm_kernel_L4_M2_100
+ ble .Lztrmm_kernel_L4_M2_100
-ztrmm_kernel_L4_M2_42:
+.Lztrmm_kernel_L4_M2_42:
KERNEL2x4_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L4_M2_42
+ bgt .Lztrmm_kernel_L4_M2_42
-ztrmm_kernel_L4_M2_100:
+.Lztrmm_kernel_L4_M2_100:
SAVE2x4
add tempOffset, tempOffset, #2
#endif
-ztrmm_kernel_L4_M2_END:
+.Lztrmm_kernel_L4_M2_END:
-ztrmm_kernel_L4_M1_BEGIN:
+.Lztrmm_kernel_L4_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble ztrmm_kernel_L4_END
+ ble .Lztrmm_kernel_L4_END
-ztrmm_kernel_L4_M1_20:
+.Lztrmm_kernel_L4_M1_20:
INIT1x4
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ztrmm_kernel_L4_M1_40
+ ble .Lztrmm_kernel_L4_M1_40
-ztrmm_kernel_L4_M1_22:
+.Lztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L4_M1_22
+ bgt .Lztrmm_kernel_L4_M1_22
-ztrmm_kernel_L4_M1_40:
+.Lztrmm_kernel_L4_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ztrmm_kernel_L4_M1_100
+ ble .Lztrmm_kernel_L4_M1_100
-ztrmm_kernel_L4_M1_42:
+.Lztrmm_kernel_L4_M1_42:
KERNEL1x4_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L4_M1_42
+ bgt .Lztrmm_kernel_L4_M1_42
-ztrmm_kernel_L4_M1_100:
+.Lztrmm_kernel_L4_M1_100:
SAVE1x4
#endif
-ztrmm_kernel_L4_END:
+.Lztrmm_kernel_L4_END:
lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2
#endif
subs counterJ, counterJ , #1 // j--
- bgt ztrmm_kernel_L4_BEGIN
+ bgt .Lztrmm_kernel_L4_BEGIN
/******************************************************************************/
-ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
+.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov counterJ , origN
tst counterJ , #3
- ble ztrmm_kernel_L999 // error, N was less than 4?
+ ble .Lztrmm_kernel_L999 // error, N was less than 4?
tst counterJ , #2
- ble ztrmm_kernel_L1_BEGIN
+ ble .Lztrmm_kernel_L1_BEGIN
mov pCRow0, pC // pCRow0 = pC
mov pA, origPA // pA = A
-ztrmm_kernel_L2_M4_BEGIN:
+.Lztrmm_kernel_L2_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0
- ble ztrmm_kernel_L2_M2_BEGIN
+ ble .Lztrmm_kernel_L2_M2_BEGIN
-ztrmm_kernel_L2_M4_20:
+.Lztrmm_kernel_L2_M4_20:
INIT4x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble ztrmm_kernel_L2_M4_40
+ ble .Lztrmm_kernel_L2_M4_40
.align 5
-ztrmm_kernel_L2_M4_22:
+.Lztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L2_M4_22
+ bgt .Lztrmm_kernel_L2_M4_22
-ztrmm_kernel_L2_M4_40:
+.Lztrmm_kernel_L2_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ztrmm_kernel_L2_M4_100
+ ble .Lztrmm_kernel_L2_M4_100
-ztrmm_kernel_L2_M4_42:
+.Lztrmm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L2_M4_42
+ bgt .Lztrmm_kernel_L2_M4_42
-ztrmm_kernel_L2_M4_100:
+.Lztrmm_kernel_L2_M4_100:
SAVE4x2
add tempOffset, tempOffset, #4
#endif
-ztrmm_kernel_L2_M4_END:
+.Lztrmm_kernel_L2_M4_END:
subs counterI, counterI, #1
- bgt ztrmm_kernel_L2_M4_20
+ bgt .Lztrmm_kernel_L2_M4_20
-ztrmm_kernel_L2_M2_BEGIN:
+.Lztrmm_kernel_L2_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble ztrmm_kernel_L2_END
+ ble .Lztrmm_kernel_L2_END
tst counterI, #2 // counterI = counterI / 2
- ble ztrmm_kernel_L2_M1_BEGIN
+ ble .Lztrmm_kernel_L2_M1_BEGIN
-ztrmm_kernel_L2_M2_20:
+.Lztrmm_kernel_L2_M2_20:
INIT2x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0
- ble ztrmm_kernel_L2_M2_40
+ ble .Lztrmm_kernel_L2_M2_40
-ztrmm_kernel_L2_M2_22:
+.Lztrmm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L2_M2_22
+ bgt .Lztrmm_kernel_L2_M2_22
-ztrmm_kernel_L2_M2_40:
+.Lztrmm_kernel_L2_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ztrmm_kernel_L2_M2_100
+ ble .Lztrmm_kernel_L2_M2_100
-ztrmm_kernel_L2_M2_42:
+.Lztrmm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L2_M2_42
+ bgt .Lztrmm_kernel_L2_M2_42
-ztrmm_kernel_L2_M2_100:
+.Lztrmm_kernel_L2_M2_100:
SAVE2x2
add tempOffset, tempOffset, #2
#endif
-ztrmm_kernel_L2_M2_END:
+.Lztrmm_kernel_L2_M2_END:
-ztrmm_kernel_L2_M1_BEGIN:
+.Lztrmm_kernel_L2_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble ztrmm_kernel_L2_END
+ ble .Lztrmm_kernel_L2_END
-ztrmm_kernel_L2_M1_20:
+.Lztrmm_kernel_L2_M1_20:
INIT1x2
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0
- ble ztrmm_kernel_L2_M1_40
+ ble .Lztrmm_kernel_L2_M1_40
-ztrmm_kernel_L2_M1_22:
+.Lztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L2_M1_22
+ bgt .Lztrmm_kernel_L2_M1_22
-ztrmm_kernel_L2_M1_40:
+.Lztrmm_kernel_L2_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ztrmm_kernel_L2_M1_100
+ ble .Lztrmm_kernel_L2_M1_100
-ztrmm_kernel_L2_M1_42:
+.Lztrmm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L2_M1_42
+ bgt .Lztrmm_kernel_L2_M1_42
-ztrmm_kernel_L2_M1_100:
+.Lztrmm_kernel_L2_M1_100:
SAVE1x2
#endif
-ztrmm_kernel_L2_END:
+.Lztrmm_kernel_L2_END:
#if !defined(LEFT)
add tempOffset, tempOffset, #2
#endif
/******************************************************************************/
-ztrmm_kernel_L1_BEGIN:
+.Lztrmm_kernel_L1_BEGIN:
mov counterJ , origN
tst counterJ , #1
- ble ztrmm_kernel_L999 // done
+ ble .Lztrmm_kernel_L999 // done
mov pCRow0, pC // pCRow0 = C
-ztrmm_kernel_L1_M4_BEGIN:
+.Lztrmm_kernel_L1_M4_BEGIN:
mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0
- ble ztrmm_kernel_L1_M2_BEGIN
+ ble .Lztrmm_kernel_L1_M2_BEGIN
-ztrmm_kernel_L1_M4_20:
+.Lztrmm_kernel_L1_M4_20:
INIT4x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ztrmm_kernel_L1_M4_40
+ ble .Lztrmm_kernel_L1_M4_40
.align 5
-ztrmm_kernel_L1_M4_22:
+.Lztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L1_M4_22
+ bgt .Lztrmm_kernel_L1_M4_22
-ztrmm_kernel_L1_M4_40:
+.Lztrmm_kernel_L1_M4_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ztrmm_kernel_L1_M4_100
+ ble .Lztrmm_kernel_L1_M4_100
-ztrmm_kernel_L1_M4_42:
+.Lztrmm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L1_M4_42
+ bgt .Lztrmm_kernel_L1_M4_42
-ztrmm_kernel_L1_M4_100:
+.Lztrmm_kernel_L1_M4_100:
SAVE4x1
add tempOffset, tempOffset, #4
#endif
-ztrmm_kernel_L1_M4_END:
+.Lztrmm_kernel_L1_M4_END:
subs counterI, counterI, #1
- bgt ztrmm_kernel_L1_M4_20
+ bgt .Lztrmm_kernel_L1_M4_20
-ztrmm_kernel_L1_M2_BEGIN:
+.Lztrmm_kernel_L1_M2_BEGIN:
mov counterI, origM
tst counterI , #3
- ble ztrmm_kernel_L1_END
+ ble .Lztrmm_kernel_L1_END
tst counterI, #2 // counterI = counterI / 2
- ble ztrmm_kernel_L1_M1_BEGIN
+ ble .Lztrmm_kernel_L1_M1_BEGIN
-ztrmm_kernel_L1_M2_20:
+.Lztrmm_kernel_L1_M2_20:
INIT2x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ztrmm_kernel_L1_M2_40
+ ble .Lztrmm_kernel_L1_M2_40
-ztrmm_kernel_L1_M2_22:
+.Lztrmm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L1_M2_22
+ bgt .Lztrmm_kernel_L1_M2_22
-ztrmm_kernel_L1_M2_40:
+.Lztrmm_kernel_L1_M2_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ztrmm_kernel_L1_M2_100
+ ble .Lztrmm_kernel_L1_M2_100
-ztrmm_kernel_L1_M2_42:
+.Lztrmm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L1_M2_42
+ bgt .Lztrmm_kernel_L1_M2_42
-ztrmm_kernel_L1_M2_100:
+.Lztrmm_kernel_L1_M2_100:
SAVE2x1
add tempOffset, tempOffset, #2
#endif
-ztrmm_kernel_L1_M2_END:
+.Lztrmm_kernel_L1_M2_END:
-ztrmm_kernel_L1_M1_BEGIN:
+.Lztrmm_kernel_L1_M1_BEGIN:
tst counterI, #1 // counterI = counterI % 2
- ble ztrmm_kernel_L1_END
+ ble .Lztrmm_kernel_L1_END
-ztrmm_kernel_L1_M1_20:
+.Lztrmm_kernel_L1_M1_20:
INIT1x1
asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0
- ble ztrmm_kernel_L1_M1_40
+ ble .Lztrmm_kernel_L1_M1_40
-ztrmm_kernel_L1_M1_22:
+.Lztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L1_M1_22
+ bgt .Lztrmm_kernel_L1_M1_22
-ztrmm_kernel_L1_M1_40:
+.Lztrmm_kernel_L1_M1_40:
ands counterL , tempK, #7 // counterL = counterL % 8
- ble ztrmm_kernel_L1_M1_100
+ ble .Lztrmm_kernel_L1_M1_100
-ztrmm_kernel_L1_M1_42:
+.Lztrmm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs counterL, counterL, #1
- bgt ztrmm_kernel_L1_M1_42
+ bgt .Lztrmm_kernel_L1_M1_42
-ztrmm_kernel_L1_M1_100:
+.Lztrmm_kernel_L1_M1_100:
SAVE1x1
-ztrmm_kernel_L1_END:
+.Lztrmm_kernel_L1_END:
-ztrmm_kernel_L999:
+.Lztrmm_kernel_L999:
mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]