Optimize Dgemm 4x4 for Cortex A57
author     Ashwin Sekhar T K <ashwin@broadcom.com>   Mon, 14 Mar 2016 14:05:23 +0000 (19:35 +0530)
committer  Ashwin Sekhar T K <ashwin@broadcom.com>   Mon, 14 Mar 2016 14:05:23 +0000 (19:35 +0530)
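In summary, the patch reworks the 8x4 micro-kernel for Cortex-A57: B is loaded as scalar pairs (ldp d8/d9 and d10/d11) instead of vector ld1 loads, so every fmla references lane 0; A and C are accessed with ldp/stp of q registers; the four per-vector alpha copies are collapsed into a single GPR alpha (x23) that each SAVE macro moves into d10 with fmov, freeing v11/v14/v15; explicit PLDL1KEEP/PLDL2KEEP prefetches are added using the new A_PRE_SIZE/B_PRE_SIZE/C_PRE_SIZE constants; the four C row pointers pCRow0..pCRow3 are computed once per 4-column block; and the main K loop of the M=8 path is unrolled by 4 (instead of 2), with .align 5 on the loop heads and a looped KERNEL8x4_SUB tail for K mod 4.

For orientation, the block below is a minimal C reference sketch of what the KERNEL8x4_* macros accumulate and SAVE8x4 writes back. It is not part of the patch; the function name is illustrative, and it assumes the usual OpenBLAS packing implied by the kernel's pointer arithmetic (A as two consecutive 4xK panels addressed by pA and ppA, B as a Kx4 panel, C column-major with leading dimension ldc):

    /* Reference semantics of the 8x4 micro-kernel: C[0:8, 0:4] += alpha * A * B.
     * Illustrative only; the real kernel keeps acc[][] in v16..v31. */
    static void dgemm_8x4_ref(long K, double alpha,
                              const double *pA,   /* packed 4 x K panel, rows 0..3 */
                              const double *ppA,  /* packed 4 x K panel, rows 4..7 */
                              const double *pB,   /* packed K x 4 panel            */
                              double *C, long ldc)
    {
        double acc[8][4] = {{0.0}};

        for (long k = 0; k < K; k++)            /* one KERNEL8x4_* step per k */
            for (long j = 0; j < 4; j++)
                for (long i = 0; i < 4; i++) {
                    acc[i][j]     += pA[4 * k + i]  * pB[4 * k + j];
                    acc[i + 4][j] += ppA[4 * k + i] * pB[4 * k + j];
                }

        for (long j = 0; j < 4; j++)            /* SAVE8x4: C += alpha * acc */
            for (long i = 0; i < 8; i++)
                C[j * ldc + i] += alpha * acc[i][j];
    }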
kernel/arm64/dgemm_kernel_4x4.S

index e88253a..e2ad114 100644
@@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define pCRow0         x12
 #define pCRow1         x13
 #define pCRow2         x14
-#define pA             x15
-#define ppC            x16
-#define ppCRow0                x17
-#define ppCRow1                x18
-#define ppCRow2                x19
-#define ppA            x20
+#define pCRow3         x15
+#define pA             x16
+#define ppC            x17
+#define ppCRow0                x18
+#define ppCRow1                x19
+#define ppCRow2                x20
+#define ppCRow3                x21
+#define ppA            x22
+#define alpha          x23
 
 #define alpha0         d10
 #define alphaV0                v10.d[0]
-#define alpha1         d11
-#define alphaV1                v11.d[0]
-#define alpha2         d14
-#define alphaV2                v14.d[0]
-#define alpha3         d15
-#define alphaV3                v15.d[0]
+
+#define A_PRE_SIZE     1024
+#define B_PRE_SIZE     1024
+#define C_PRE_SIZE     128
 
 // 00 origM
 // 01 origN
@@ -77,15 +78,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 // 12 pCRow0
 // 13 pCRow1
 // 14 pCRow2
-// 15 pA
-// 16 ppC
-// 17 ppCRow0
-// 18 must save ppCRow1
-// 19 must save ppCRow2
-// 20 must save ppA
-// 21 must save
-// 22 must save
-// 23 must save
+// 15 pCRow3
+// 16 pA
+// 17 ppC
+// 18 must save ppCRow0
+// 19 must save ppCRow1
+// 20 must save ppCRow2
+// 21 must save ppCRow3
+// 22 must save ppA
+// 23 must save alpha
 // 24 must save
 // 25 must save
 // 26 must save
@@ -106,11 +107,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //v08 must save pB00, pB01
 //v09 must save pB02, pB03
 //v10 must save ALPHA0
-//v11 must save ALPHA1
+//v11 must save
 //v12 must save pB10, pB11
 //v13 must save pB12, pB13
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v14 must save
+//v15 must save
 //v16 must save C00, C01
 //v17 must save C02, C03
 //v18 ppC00, ppC01
@@ -152,222 +153,254 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_I
-       ld1     {v8.2d, v9.2d}, [pB]
-       add     pB, pB, #32
-       ld1     {v0.2d, v1.2d}, [pA]
+       ldp     d8, d9, [pB]
+       add     pB, pB, #16
+       ldp     d10, d11, [pB]
+       add     pB, pB, #16
+
+       ldp     q0, q1, [pA]
        add     pA, pA, #32
 
        fmul    v16.2d, v0.2d, v8.2d[0]
-       fmul    v29.2d, v1.2d, v9.2d[1]
+       fmul    v29.2d, v1.2d, v11.2d[0]
 
-       ld1     {v2.2d, v3.2d}, [ppA]
+       ldp     q2, q3, [ppA]
        add     ppA, ppA, #32
 
-       fmul    v20.2d, v0.2d, v8.2d[1]
-       fmul    v25.2d, v1.2d, v9.2d[0]
+       fmul    v20.2d, v0.2d, v9.2d[0]
+       fmul    v25.2d, v1.2d, v10.2d[0]
+
+       prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
        fmul    v18.2d, v2.2d, v8.2d[0]
-       fmul    v31.2d, v3.2d, v9.2d[1]
-       fmul    v22.2d, v2.2d, v8.2d[1]
-       fmul    v27.2d, v3.2d, v9.2d[0]
+       fmul    v31.2d, v3.2d, v11.2d[0]
 
-       ld1     {v12.2d, v13.2d}, [pB]          // for next round
-       add     pB, pB, #32
+       prfm    PLDL1KEEP, [ppA, #A_PRE_SIZE]
+
+       fmul    v22.2d, v2.2d, v9.2d[0]
+       fmul    v27.2d, v3.2d, v10.2d[0]
+
+       ldp     d12, d13, [pB]
+       add     pB, pB, #16
 
-       fmul    v24.2d, v0.2d, v9.2d[0]
-       fmul    v21.2d, v1.2d, v8.2d[1]
+       fmul    v24.2d, v0.2d, v10.2d[0]
+       fmul    v21.2d, v1.2d, v9.2d[0]
 
-       ld1     {v4.2d, v5.2d} , [pA]           // for next round
+       ldp     q4, q5, [pA]            // for next round
        add     pA, pA, #32
 
-       fmul    v26.2d, v2.2d, v9.2d[0]
-       fmul    v23.2d, v3.2d, v8.2d[1]
+       fmul    v26.2d, v2.2d, v10.2d[0]
+       fmul    v23.2d, v3.2d, v9.2d[0]
 
-       ld1     {v6.2d, v7.2d} , [ppA]          // for next round
+       ldp     q6, q7, [ppA]           // for next round
        add     ppA, ppA, #32
 
-       fmul    v28.2d, v0.2d, v9.2d[1]
+       fmul    v28.2d, v0.2d, v11.2d[0]
        fmul    v17.2d, v1.2d, v8.2d[0]
-       fmul    v30.2d, v2.2d, v9.2d[1]
+
+       ldp     d14, d15, [pB]
+       add     pB, pB, #16
+
+       fmul    v30.2d, v2.2d, v11.2d[0]
        fmul    v19.2d, v3.2d, v8.2d[0]
 .endm
 
 .macro KERNEL8x4_M2
        fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v29.2d, v5.2d, v13.2d[1]
+       fmla    v29.2d, v5.2d, v15.2d[0]
 
-       ld1     {v8.2d, v9.2d}, [pB]
-       add     pB, pB, #32
+       ldp     d8, d9, [pB]
+       add     pB, pB, #16
 
        fmla    v18.2d, v6.2d, v12.2d[0]
-       fmla    v31.2d, v7.2d, v13.2d[1]
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v25.2d, v5.2d, v13.2d[0]
+       fmla    v31.2d, v7.2d, v15.2d[0]
 
-       prfm    PLDL1KEEP, [pB, #512]
+       ldp     d10, d11, [pB]
+       add     pB, pB, #16
 
-       fmla    v22.2d, v6.2d, v12.2d[1]
-       fmla    v27.2d, v7.2d, v13.2d[0]
-       fmla    v24.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v12.2d[1]
+       fmla    v20.2d, v4.2d, v13.2d[0]
+       fmla    v25.2d, v5.2d, v14.2d[0]
 
-       ld1     {v0.2d, v1.2d}, [pA]
+       prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+       fmla    v22.2d, v6.2d, v13.2d[0]
+       fmla    v27.2d, v7.2d, v14.2d[0]
+       fmla    v24.2d, v4.2d, v14.2d[0]
+       fmla    v21.2d, v5.2d, v13.2d[0]
+
+       ldp     q0, q1, [pA]
        add     pA, pA, #32
 
-       fmla    v26.2d, v6.2d, v13.2d[0]
-       fmla    v23.2d, v7.2d, v12.2d[1]
-       fmla    v28.2d, v4.2d, v13.2d[1]
+       fmla    v26.2d, v6.2d, v14.2d[0]
+       fmla    v23.2d, v7.2d, v13.2d[0]
+       fmla    v28.2d, v4.2d, v15.2d[0]
        fmla    v17.2d, v5.2d, v12.2d[0]
 
-       ld1     {v2.2d, v3.2d}, [ppA]
+       ldp     q2, q3, [ppA]
        add     ppA, ppA, #32
 
-       fmla    v30.2d, v6.2d, v13.2d[1]
+       fmla    v30.2d, v6.2d, v15.2d[0]
        fmla    v19.2d, v7.2d, v12.2d[0]
 .endm
 
 .macro KERNEL8x4_M1
        fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v29.2d, v1.2d, v11.2d[0]
 
-       ld1     {v12.2d, v13.2d}, [pB]          // for next round
-       add     pB, pB, #32
+       ldp     d12, d13, [pB]
+       add     pB, pB, #16
 
        fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v31.2d, v3.2d, v9.2d[1]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v31.2d, v3.2d, v11.2d[0]
 
-       prfm    PLDL1KEEP, [pA, #512]
+       ldp     d14, d15, [pB]
+       add     pB, pB, #16
 
-       fmla    v22.2d, v2.2d, v8.2d[1]
-       fmla    v27.2d, v3.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v9.2d[0]
+       fmla    v25.2d, v1.2d, v10.2d[0]
 
-       prfm    PLDL1KEEP, [ppA, #512]
+       prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v22.2d, v2.2d, v9.2d[0]
+       fmla    v27.2d, v3.2d, v10.2d[0]
+
+       prfm    PLDL1KEEP, [ppA, #A_PRE_SIZE]
+
+       fmla    v24.2d, v0.2d, v10.2d[0]
+       fmla    v21.2d, v1.2d, v9.2d[0]
 
-       ld1     {v4.2d, v5.2d} , [pA]           // for next round
+       ldp     q4, q5, [pA]
        add     pA, pA, #32
 
-       fmla    v26.2d, v2.2d, v9.2d[0]
-       fmla    v23.2d, v3.2d, v8.2d[1]
-       fmla    v28.2d, v0.2d, v9.2d[1]
+       fmla    v26.2d, v2.2d, v10.2d[0]
+       fmla    v23.2d, v3.2d, v9.2d[0]
+
+       fmla    v28.2d, v0.2d, v11.2d[0]
        fmla    v17.2d, v1.2d, v8.2d[0]
 
-       ld1     {v6.2d, v7.2d} , [ppA]          // for next round
+       ldp     q6, q7, [ppA]
        add     ppA, ppA, #32
 
-       fmla    v30.2d, v2.2d, v9.2d[1]
+       fmla    v30.2d, v2.2d, v11.2d[0]
        fmla    v19.2d, v3.2d, v8.2d[0]
 .endm
 
 .macro KERNEL8x4_E
        fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v25.2d, v5.2d, v13.2d[0]
+       fmla    v25.2d, v5.2d, v14.2d[0]
        fmla    v18.2d, v6.2d, v12.2d[0]
-       fmla    v27.2d, v7.2d, v13.2d[0]
+       fmla    v27.2d, v7.2d, v14.2d[0]
 
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v29.2d, v5.2d, v13.2d[1]
-       fmla    v22.2d, v6.2d, v12.2d[1]
-       fmla    v31.2d, v7.2d, v13.2d[1]
+       fmla    v20.2d, v4.2d, v13.2d[0]
+       fmla    v29.2d, v5.2d, v15.2d[0]
+       fmla    v22.2d, v6.2d, v13.2d[0]
+       fmla    v31.2d, v7.2d, v15.2d[0]
 
-       fmla    v24.2d, v4.2d, v13.2d[0]
+       fmla    v24.2d, v4.2d, v14.2d[0]
        fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v26.2d, v6.2d, v13.2d[0]
+       fmla    v26.2d, v6.2d, v14.2d[0]
        fmla    v19.2d, v7.2d, v12.2d[0]
 
-       fmla    v28.2d, v4.2d, v13.2d[1]
-       fmla    v21.2d, v5.2d, v12.2d[1]
-       fmla    v30.2d, v6.2d, v13.2d[1]
-       fmla    v23.2d, v7.2d, v12.2d[1]
+       fmla    v28.2d, v4.2d, v15.2d[0]
+       fmla    v21.2d, v5.2d, v13.2d[0]
+       fmla    v30.2d, v6.2d, v15.2d[0]
+       fmla    v23.2d, v7.2d, v13.2d[0]
 .endm
 
 .macro KERNEL8x4_SUB
-       ld1     {v8.2d, v9.2d}, [pB]
-       add     pB, pB, #32
-       ld1     {v0.2d, v1.2d}, [pA]
+       ldp     d8, d9, [pB]
+       add     pB, pB, #16
+       ldp     d10, d11, [pB]
+       add     pB, pB, #16
+       ldp     q0, q1, [pA]
        add     pA, pA, #32
 
        fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v29.2d, v1.2d, v11.2d[0]
+       fmla    v20.2d, v0.2d, v9.2d[0]
+       fmla    v25.2d, v1.2d, v10.2d[0]
 
-       ld1     {v2.2d, v3.2d}, [ppA]
+       ldp     q2, q3, [ppA]
        add     ppA, ppA, #32
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
-       fmla    v28.2d, v0.2d, v9.2d[1]
+       fmla    v24.2d, v0.2d, v10.2d[0]
+       fmla    v21.2d, v1.2d, v9.2d[0]
+       fmla    v28.2d, v0.2d, v11.2d[0]
        fmla    v17.2d, v1.2d, v8.2d[0]
 
        fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v31.2d, v3.2d, v9.2d[1]
-       fmla    v22.2d, v2.2d, v8.2d[1]
-       fmla    v27.2d, v3.2d, v9.2d[0]
+       fmla    v31.2d, v3.2d, v11.2d[0]
+       fmla    v22.2d, v2.2d, v9.2d[0]
+       fmla    v27.2d, v3.2d, v10.2d[0]
 
-       fmla    v26.2d, v2.2d, v9.2d[0]
-       fmla    v23.2d, v3.2d, v8.2d[1]
-       fmla    v30.2d, v2.2d, v9.2d[1]
+       fmla    v26.2d, v2.2d, v10.2d[0]
+       fmla    v23.2d, v3.2d, v9.2d[0]
+       fmla    v30.2d, v2.2d, v11.2d[0]
        fmla    v19.2d, v3.2d, v8.2d[0]
 .endm
 
 .macro SAVE8x4
+       fmov    alpha0, alpha
+
+       prfm    PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
        add     ppCRow0, pCRow0, #32
 
-       ld1     {v0.2d, v1.2d}, [pCRow0]
+       ldp     q0, q1, [pCRow0]
        fmla    v0.2d, v16.2d, alphaV0
-       fmla    v1.2d, v17.2d, alphaV1
-       st1     {v0.2d, v1.2d}, [pCRow0]
+       fmla    v1.2d, v17.2d, alphaV0
+       stp     q0, q1, [pCRow0]
 
-       ld1     {v2.2d, v3.2d}, [ppCRow0]
-       fmla    v2.2d, v18.2d, alphaV2
-       fmla    v3.2d, v19.2d, alphaV3
-       st1     {v2.2d, v3.2d}, [ppCRow0]
+       add     pCRow0, pCRow0, #64
 
-       add     pCRow1, pCRow0, LDC
-       add     ppCRow1, ppCRow0, LDC
+       ldp     q2, q3, [ppCRow0]
+       fmla    v2.2d, v18.2d, alphaV0
+       fmla    v3.2d, v19.2d, alphaV0
+       stp     q2, q3, [ppCRow0]
 
-       ld1     {v4.2d, v5.2d}, [pCRow1]
+       prfm    PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+       add     ppCRow1, pCRow1, #32
+
+       ldp     q4, q5, [pCRow1]
        fmla    v4.2d, v20.2d, alphaV0
-       fmla    v5.2d, v21.2d, alphaV1
-       st1     {v4.2d, v5.2d}, [pCRow1]
+       fmla    v5.2d, v21.2d, alphaV0
+       stp     q4, q5, [pCRow1]
 
-       ld1     {v6.2d, v7.2d}, [ppCRow1]
-       fmla    v6.2d, v22.2d, alphaV2
-       fmla    v7.2d, v23.2d, alphaV3
-       st1     {v6.2d, v7.2d}, [ppCRow1]
+       add     pCRow1, pCRow1, #64
 
-       add     pCRow2, pCRow1, LDC
-       add     ppCRow2, ppCRow1, LDC
+       ldp     q6, q7, [ppCRow1]
+       fmla    v6.2d, v22.2d, alphaV0
+       fmla    v7.2d, v23.2d, alphaV0
+       stp     q6, q7, [ppCRow1]
+
+       prfm    PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+       add     ppCRow2, pCRow2, #32
 
-       ld1     {v0.2d, v1.2d}, [pCRow2]
+       ldp     q0, q1, [pCRow2]
        fmla    v0.2d, v24.2d, alphaV0
-       fmla    v1.2d, v25.2d, alphaV1
-       st1     {v0.2d, v1.2d}, [pCRow2]
+       fmla    v1.2d, v25.2d, alphaV0
+       stp     q0, q1, [pCRow2]
 
-       ld1     {v2.2d, v3.2d}, [ppCRow2]
-       fmla    v2.2d, v26.2d, alphaV2
-       fmla    v3.2d, v27.2d, alphaV3
-       st1     {v2.2d, v3.2d}, [ppCRow2]
+       add     pCRow2, pCRow2, #64
 
-       add     pCRow1, pCRow2, LDC
-       add     ppCRow1, ppCRow2, LDC
+       ldp     q2, q3, [ppCRow2]
+       fmla    v2.2d, v26.2d, alphaV0
+       fmla    v3.2d, v27.2d, alphaV0
+       stp     q2, q3, [ppCRow2]
+
+       prfm    PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+       add     ppCRow3, pCRow3, #32
 
-       ld1     {v4.2d, v5.2d}, [pCRow1]
+       ldp     q4, q5, [pCRow3]
        fmla    v4.2d, v28.2d, alphaV0
-       fmla    v5.2d, v29.2d, alphaV1
-       st1     {v4.2d, v5.2d}, [pCRow1]
+       fmla    v5.2d, v29.2d, alphaV0
+       stp     q4, q5, [pCRow3]
 
-       ld1     {v6.2d, v7.2d}, [ppCRow1]
-       fmla    v6.2d, v30.2d, alphaV2
-       fmla    v7.2d, v31.2d, alphaV3
-       st1     {v6.2d, v7.2d}, [ppCRow1]
+       add     pCRow3, pCRow3, #64
 
-       add     pCRow0, pCRow0, #64
+       ldp     q6, q7, [ppCRow3]
+       fmla    v6.2d, v30.2d, alphaV0
+       fmla    v7.2d, v31.2d, alphaV0
+       stp     q6, q7, [ppCRow3]
 .endm
 
 /******************************************************************************/
@@ -403,30 +436,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x4
+       fmov    alpha0, alpha
+
        ld1     {v8.2d, v9.2d}, [pCRow0]
        fmla    v8.2d, v16.2d, alphaV0
-       fmla    v9.2d, v17.2d, alphaV1
+       fmla    v9.2d, v17.2d, alphaV0
        st1     {v8.2d, v9.2d}, [pCRow0]
 
        add     pCRow1, pCRow0, LDC
 
        ld1     {v12.2d, v13.2d}, [pCRow1]
-       fmla    v12.2d, v20.2d, alphaV2
-       fmla    v13.2d, v21.2d, alphaV3
+       fmla    v12.2d, v20.2d, alphaV0
+       fmla    v13.2d, v21.2d, alphaV0
        st1     {v12.2d, v13.2d}, [pCRow1]
 
        add     pCRow2, pCRow1, LDC
 
        ld1     {v8.2d, v9.2d}, [pCRow2]
        fmla    v8.2d, v24.2d, alphaV0
-       fmla    v9.2d, v25.2d, alphaV1
+       fmla    v9.2d, v25.2d, alphaV0
        st1     {v8.2d, v9.2d}, [pCRow2]
 
        add     pCRow1, pCRow2, LDC
 
        ld1     {v12.2d, v13.2d}, [pCRow1]
-       fmla    v12.2d, v28.2d, alphaV2
-       fmla    v13.2d, v29.2d, alphaV3
+       fmla    v12.2d, v28.2d, alphaV0
+       fmla    v13.2d, v29.2d, alphaV0
        st1     {v12.2d, v13.2d}, [pCRow1]
 
        add     pCRow0, pCRow0, #32
@@ -454,6 +489,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x4
+       fmov    alpha0, alpha
+
        ld1     {v8.2d}, [pCRow0]
        fmla    v8.2d, v16.2d, alphaV0
        st1     {v8.2d}, [pCRow0]
@@ -461,19 +498,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        add     pCRow1, pCRow0, LDC
 
        ld1     {v12.2d}, [pCRow1]
-       fmla    v12.2d, v20.2d, alphaV1
+       fmla    v12.2d, v20.2d, alphaV0
        st1     {v12.2d}, [pCRow1]
 
        add     pCRow2, pCRow1, LDC
 
        ld1     {v8.2d}, [pCRow2]
-       fmla    v8.2d, v24.2d, alphaV2
+       fmla    v8.2d, v24.2d, alphaV0
        st1     {v8.2d}, [pCRow2]
 
        add     pCRow1, pCRow2, LDC
 
        ld1     {v12.2d}, [pCRow1]
-       fmla    v12.2d, v28.2d, alphaV3
+       fmla    v12.2d, v28.2d, alphaV0
        st1     {v12.2d}, [pCRow1]
 
        add     pCRow0, pCRow0, #16
@@ -498,6 +535,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x4
+       fmov    alpha0, alpha
+
        add     pCRow1, pCRow0, LDC
 
        ld1     {v8.d}[0], [pCRow0]
@@ -511,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        ld1     {v12.d}[0], [pCRow2]
        ld1     {v12.d}[1], [pCRow1]
-       fmla    v12.2d, v20.2d, alphaV1
+       fmla    v12.2d, v20.2d, alphaV0
        st1     {v12.d}[0], [pCRow2]
        st1     {v12.d}[1], [pCRow1]
 
@@ -540,16 +579,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x2
+       fmov    alpha0, alpha
+
        ld1     {v8.2d, v9.2d}, [pCRow0]
        fmla    v8.2d, v16.2d, alphaV0
-       fmla    v9.2d, v17.2d, alphaV1
+       fmla    v9.2d, v17.2d, alphaV0
        st1     {v8.2d, v9.2d}, [pCRow0]
 
        add     pCRow1, pCRow0, LDC
 
        ld1     {v12.2d, v13.2d}, [pCRow1]
-       fmla    v12.2d, v20.2d, alphaV2
-       fmla    v13.2d, v21.2d, alphaV3
+       fmla    v12.2d, v20.2d, alphaV0
+       fmla    v13.2d, v21.2d, alphaV0
        st1     {v12.2d, v13.2d}, [pCRow1]
 
        add     pCRow0, pCRow0, #32
@@ -574,6 +615,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x2
+       fmov    alpha0, alpha
+
        ld1     {v8.2d}, [pCRow0]
        fmla    v8.2d, v16.2d, alphaV0
        st1     {v8.2d}, [pCRow0]
@@ -581,7 +624,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        add     pCRow1 , pCRow0, LDC
 
        ld1     {v12.2d}, [pCRow1]
-       fmla    v12.2d, v20.2d, alphaV1
+       fmla    v12.2d, v20.2d, alphaV0
        st1     {v12.2d}, [pCRow1]
 
        add     pCRow0, pCRow0, #16
@@ -604,6 +647,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x2
+       fmov    alpha0, alpha
+
        add     pCRow1 , pCRow0, LDC
 
        ld1     {v8.d}[0], [pCRow0]
@@ -634,9 +679,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE4x1
+       fmov    alpha0, alpha
+
        ld1     {v8.2d, v9.2d}, [pCRow0]
        fmla    v8.2d, v16.2d, alphaV0
-       fmla    v9.2d, v17.2d, alphaV1
+       fmla    v9.2d, v17.2d, alphaV0
        st1     {v8.2d, v9.2d}, [pCRow0]
 
        add     pCRow0, pCRow0, #32
@@ -662,6 +709,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE2x1
+       fmov    alpha0, alpha
+
        ld1     {v8.2d}, [pCRow0]
        fmla    v8.2d, v16.2d, alphaV0
        st1     {v8.2d}, [pCRow0]
@@ -686,6 +735,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVE1x1
+       fmov    alpha0, alpha
+
        ldr     d8, [pCRow0]
        fmadd   d8, d16, alpha0, d8
        str     d8, [pCRow0]
@@ -713,10 +764,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        stp     x26, x27, [sp, #(9 * 16)]
        str     x28, [sp, #(10 * 16)]
 
-       fmov    alpha0, d0
-       fmov    alpha1, d0
-       fmov    alpha2, d0
-       fmov    alpha3, d0
+       fmov    alpha, d0
+       prfm    PLDL1KEEP, [origPA]
+       prfm    PLDL1KEEP, [origPB]
 
        lsl     LDC, LDC, #3                    // ldc = ldc * 8
 
@@ -728,12 +778,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ble     dgemm_kernel_L2_BEGIN
 
 dgemm_kernel_L4_BEGIN:
-       mov     pCRow0, pC                      // pCRow0 = C
-       add     pC, pC, LDC, lsl #2
+       mov     pCRow0, pC
+       add     pCRow1, pCRow0, LDC
+       add     pCRow2, pCRow1, LDC
+       add     pCRow3, pCRow2, LDC
+       add     pC, pCRow3, LDC
 
        lsl     temp, origK, #5                 // k * 4 * 8
        mov     pA, origPA                      // pA = start of A array
        add     ppA, temp, pA
+       prfm    PLDL1KEEP, [ppA]
 
 //------------------------------------------------------------------------------
 
@@ -744,43 +798,51 @@ dgemm_kernel_L4_M8_BEGIN:
        cmp     counterI, #0
        ble     dgemm_kernel_L4_M4_BEGIN
 
+       .align 5
 dgemm_kernel_L4_M8_20:
 
        mov     pB, origPB
-       asr     counterL , origK, #1            // L = K / 2
-       cmp     counterL , #2                   // is there at least 4 to do?
+       asr     counterL , origK, #2            // L = K / 4
+       cmp     counterL , #2
        blt     dgemm_kernel_L4_M8_32
 
-       KERNEL8x4_I                             // do one in the K
-       KERNEL8x4_M2                            // do another in the K
+       KERNEL8x4_I
+       KERNEL8x4_M2
+       KERNEL8x4_M1
+       KERNEL8x4_M2
 
        subs    counterL, counterL, #2          // subtract 2
        ble     dgemm_kernel_L4_M8_22a
-       .align 5
 
+       .align 5
 dgemm_kernel_L4_M8_22:
-
+       KERNEL8x4_M1
+       KERNEL8x4_M2
        KERNEL8x4_M1
        KERNEL8x4_M2
 
        subs    counterL, counterL, #1
        bgt     dgemm_kernel_L4_M8_22
 
-
+       .align 5
 dgemm_kernel_L4_M8_22a:
 
        KERNEL8x4_M1
+       KERNEL8x4_M2
+       KERNEL8x4_M1
        KERNEL8x4_E
 
        b        dgemm_kernel_L4_M8_44
 
+       .align 5
 dgemm_kernel_L4_M8_32:
 
        tst     counterL, #1
        ble     dgemm_kernel_L4_M8_40
 
        KERNEL8x4_I
-
+       KERNEL8x4_M2
+       KERNEL8x4_M1
        KERNEL8x4_E
 
        b       dgemm_kernel_L4_M8_44
@@ -792,14 +854,22 @@ dgemm_kernel_L4_M8_40:
 
 dgemm_kernel_L4_M8_44:
 
-       ands    counterL , origK, #1
+       ands    counterL , origK, #3
        ble     dgemm_kernel_L4_M8_100
 
+       .align 5
 dgemm_kernel_L4_M8_46:
 
        KERNEL8x4_SUB
 
+       subs    counterL, counterL, #1
+       bne     dgemm_kernel_L4_M8_46
+
 dgemm_kernel_L4_M8_100:
+       lsl     temp, origK, #5
+       prfm    PLDL1KEEP, [pA, temp]
+       prfm    PLDL1KEEP, [ppA, temp]
+       prfm    PLDL1KEEP, [origPB]
 
        SAVE8x4
 
@@ -810,7 +880,6 @@ dgemm_kernel_L4_M8_END:
        subs    counterI, counterI, #1
        bne     dgemm_kernel_L4_M8_20
 
-
 dgemm_kernel_L4_M4_BEGIN:
        mov     counterI, origM
        tst     counterI , #7