stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #32
- prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ldp q2, q3, [pCRow0]
fmla v2.2d, v18.2d, alphaV0
stp q4, q5, [pCRow1]
add pCRow1, pCRow1, #32
- prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ldp q6, q7, [pCRow1]
fmla v6.2d, v22.2d, alphaV0
stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
- prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ldp q2, q3, [pCRow2]
fmla v2.2d, v26.2d, alphaV0
stp q4, q5, [pCRow3]
add pCRow3, pCRow3, #32
- prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ldp q6, q7, [pCRow3]
fmla v6.2d, v30.2d, alphaV0
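// The SAVE macros below write the accumulators back into C: each row of the
// tile is loaded through its pCRow pointer, updated as C += alpha * acc
// (fmla with alphaV0), stored back, the next elements of that row are
// prefetched into L2, and the pointer is advanced past the bytes just written.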
.macro SAVE4x4
fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
- add pCRow1, pCRow0, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #32
ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
- add pCRow2, pCRow1, LDC
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #32
ld1 {v8.2d, v9.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
fmla v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
- add pCRow1, pCRow2, LDC
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+ add pCRow2, pCRow2, #32
- ld1 {v12.2d, v13.2d}, [pCRow1]
+ ld1 {v12.2d, v13.2d}, [pCRow3]
fmla v12.2d, v28.2d, alphaV0
fmla v13.2d, v29.2d, alphaV0
- st1 {v12.2d, v13.2d}, [pCRow1]
+ st1 {v12.2d, v13.2d}, [pCRow3]
- add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+ add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
.macro SAVE2x4
fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
- add pCRow1, pCRow0, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #16
ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
- add pCRow2, pCRow1, LDC
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #16
ld1 {v8.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
- add pCRow1, pCRow2, LDC
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+ add pCRow2, pCRow2, #16
- ld1 {v12.2d}, [pCRow1]
+ ld1 {v12.2d}, [pCRow3]
fmla v12.2d, v28.2d, alphaV0
- st1 {v12.2d}, [pCRow1]
+ st1 {v12.2d}, [pCRow3]
- add pCRow0, pCRow0, #16
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+ add pCRow3, pCRow3, #16
.endm
/******************************************************************************/
.macro SAVE1x4
fmov alpha0, alpha
- add pCRow1, pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
ld1 {v8.d}[1], [pCRow1]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.d}[0], [pCRow0]
st1 {v8.d}[1], [pCRow1]
- add pCRow2, pCRow1, LDC
- add pCRow1, pCRow2, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #8
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #8
ld1 {v12.d}[0], [pCRow2]
- ld1 {v12.d}[1], [pCRow1]
+ ld1 {v12.d}[1], [pCRow3]
fmla v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
- st1 {v12.d}[1], [pCRow1]
+ st1 {v12.d}[1], [pCRow3]
- add pCRow0, pCRow0, #8
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+ add pCRow2, pCRow2, #8
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+ add pCRow3, pCRow3, #8
.endm
/******************************************************************************/
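// Fragment of the inner multiply-accumulate kernel: the added prfm pulls the
// next part of the A panel into L1 between the fmla groups so the load can
// overlap with the arithmetic.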
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
.macro SAVE8x2
fmov alpha0, alpha
- add pCRow1, pCRow0, LDC
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV0
fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #64
+
ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0
fmla v5.2d, v21.2d, alphaV0
fmla v6.2d, v22.2d, alphaV0
fmla v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
- add pCRow0, pCRow0, #64
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #64
.endm
/******************************************************************************/
.macro SAVE4x2
fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
- add pCRow1, pCRow0, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #32
ld1 {v12.2d, v13.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
- add pCRow0, pCRow0, #32
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #32
.endm
/******************************************************************************/
.macro SAVE2x2
fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
- add pCRow1 , pCRow0, LDC
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
+ add pCRow0, pCRow0, #16
ld1 {v12.2d}, [pCRow1]
fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
- add pCRow0, pCRow0, #16
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #16
.endm
/******************************************************************************/
.macro SAVE1x2
fmov alpha0, alpha
- add pCRow1 , pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
ld1 {v8.d}[1], [pCRow1]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.d}[0], [pCRow0]
st1 {v8.d}[1], [pCRow1]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #8
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add pCRow1, pCRow1, #8
.endm
/******************************************************************************/
fmla v16.2d, v0.2d, v8.d[0]
fmla v17.2d, v1.2d, v8.d[0]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x1
fmov alpha0, alpha
+
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
fmla v1.2d, v17.2d, alphaV0
fmla v2.2d, v18.2d, alphaV0
fmla v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #64
.endm
.macro SAVE4x1
fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #32
.endm
.macro SAVE2x1
fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #16
.endm
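// Scalar tail of the C update: d8 = d16 * alpha0 + d8 is stored back, the
// row is prefetched, and the pointer advances by one double (8 bytes).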
fmadd d8, d16, alpha0, d8
str d8, [pCRow0]
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add pCRow0, pCRow0, #8
.endm
/******************************************************************************/
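// Main loop over N: each iteration updates a panel of four C columns through
// pCRow0..pCRow3 (LDC apart), tiling M four, two, then one row at a time.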
+ .align 5
dgemm_kernel_L4_BEGIN:
mov pCRow0, pC
add pCRow1, pCRow0, LDC
cmp counterL , #0
ble dgemm_kernel_L4_M4_40
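// K loop for the 4x4 tile, unrolled by a factor of 8; prefetches of the next
// A and B panels are interleaved between the KERNEL4x4_SUB blocks so they
// can overlap with the multiply-accumulates.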
+ .align 5
dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
dgemm_kernel_L4_M4_42:
KERNEL4x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
dgemm_kernel_L4_M4_END:
-
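// Rows left over from the 4-wide tiling are handled below with a 2-wide tile
// and then a 1-wide tile.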
dgemm_kernel_L4_M2_BEGIN:
mov counterI, origM
cmp counterL , #0
ble dgemm_kernel_L4_M2_40
+ .align 5
dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x4_SUB
subs counterL, counterL, #1
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L4_M2_100
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42:
KERNEL2x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
cmp counterL , #0
ble dgemm_kernel_L4_M1_40
+ .align 5
dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB
subs counterL, counterL, #1
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L4_M1_100
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
KERNEL1x4_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
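// N remainder: if bit 1 of counterJ is set a two-column panel of C is
// processed, otherwise control skips ahead to the single-column section.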
tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
- mov pCRow0, pC // pCRow0 = pC
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
- add pC,pC,LDC, lsl #1
+ add pC, pCRow1, LDC
mov pA, origPA // pA = A
cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
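// Two-column section: M is tiled eight rows at a time first, then 4, 2 and
// 1 rows for the remainder.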
+ .align 5
dgemm_kernel_L2_M8_20:
INIT8x2
asr counterL , origK, #3 // counterL = origK / 8
cmp counterL,#0
ble dgemm_kernel_L2_M8_40
- .align 5
+ .align 5
dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB
KERNEL8x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
KERNEL8x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL8x2_SUB
KERNEL8x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
-
dgemm_kernel_L2_M8_40:
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L2_M8_100
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42:
KERNEL8x2_SUB
asr counterL , origK, #3 // counterL = origK / 8
cmp counterL,#0
ble dgemm_kernel_L2_M4_40
- .align 5
+ .align 5
dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB
subs counterL, counterL, #1
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L2_M4_100
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x2_SUB
KERNEL2x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x2_SUB
KERNEL2x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL2x2_SUB
KERNEL2x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
-
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M2_40:
ands counterL , origK, #7 // counterL = origK % 8
dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x2_SUB
KERNEL1x2_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+
KERNEL1x2_SUB
KERNEL1x2_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x2_SUB
KERNEL1x2_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
-
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M1_40:
ands counterL , origK, #7 // counterL = origK % 8
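// Final single column of C: M is tiled 8/4/2/1 rows at a time, with the
// K loop again unrolled by 8.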
cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
+ .align 5
dgemm_kernel_L1_M8_20:
INIT8x1
asr counterL , origK, #3 // counterL = origK / 8
cmp counterL , #0
ble dgemm_kernel_L1_M8_40
- .align 5
+ .align 5
dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
KERNEL8x1_SUB
KERNEL8x1_SUB
KERNEL8x1_SUB
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M8_100
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
KERNEL8x1_SUB
asr counterL , origK, #3 // counterL = origK / 8
cmp counterL , #0
ble dgemm_kernel_L1_M4_40
- .align 5
+ .align 5
dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB
subs counterL, counterL, #1
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M4_100
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
KERNEL2x1_SUB
KERNEL2x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x1_SUB
KERNEL2x1_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
KERNEL2x1_SUB
KERNEL2x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL2x1_SUB
KERNEL2x1_SUB
subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
-
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
ands counterL , origK, #7 // counterL = origK % 8
cmp counterL , #0
ble dgemm_kernel_L1_M1_40
+
dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x1_SUB
KERNEL1x1_SUB
ands counterL , origK, #7 // counterL = origK % 8
ble dgemm_kernel_L1_M1_100
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
KERNEL1x1_SUB