Cortex-A57: Fix clang compilation errors
author Ashwin Sekhar T K <ashwin@broadcom.com>
Thu, 24 Mar 2016 05:01:28 +0000 (10:31 +0530)
committer Ashwin Sekhar T K <ashwin@broadcom.com>
Thu, 24 Mar 2016 05:12:04 +0000 (10:42 +0530)
18 files changed:
kernel/arm64/cgemm_kernel_4x4.S
kernel/arm64/cgemm_kernel_8x4.S [changed mode: 0755->0644]
kernel/arm64/ctrmm_kernel_4x4.S
kernel/arm64/ctrmm_kernel_8x4.S [changed mode: 0755->0644]
kernel/arm64/dgemm_kernel_4x4.S
kernel/arm64/dgemm_kernel_4x8.S [changed mode: 0755->0644]
kernel/arm64/dgemm_kernel_8x4.S [changed mode: 0755->0644]
kernel/arm64/dtrmm_kernel_4x4.S
kernel/arm64/dtrmm_kernel_4x8.S [changed mode: 0755->0644]
kernel/arm64/dtrmm_kernel_8x4.S [changed mode: 0755->0644]
kernel/arm64/sgemm_kernel_16x4.S
kernel/arm64/sgemm_kernel_4x4.S
kernel/arm64/sgemm_kernel_8x8.S
kernel/arm64/strmm_kernel_16x4.S [changed mode: 0755->0644]
kernel/arm64/strmm_kernel_4x4.S
kernel/arm64/strmm_kernel_8x8.S [changed mode: 0755->0644]
kernel/arm64/zgemm_kernel_4x4.S
kernel/arm64/ztrmm_kernel_4x4.S

index 7a70264..7f2ddea 100644 (file)
@@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.4s, v3.4s}, [ppA]
        add     ppA, ppA, #32
 
-       fmul    v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
-       fmls    v17.4s, v0.4s, v9.4s[0]
+       fmls    v17.4s, v0.4s, v9.s[0]
 #else
-       fmul    v17.4s, v0.4s, v9.4s[0]
+       fmul    v17.4s, v0.4s, v9.s[0]
 #endif
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       fmul    v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
-       fmls    v21.4s, v0.4s, v9.4s[1]
+       fmls    v21.4s, v0.4s, v9.s[1]
 #else
-       fmul    v21.4s, v0.4s, v9.4s[1]
+       fmul    v21.4s, v0.4s, v9.s[1]
 #endif
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
-       fmul    v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
+       fmul    v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
-       fmls    v25.4s, v0.4s, v9.4s[2]
+       fmls    v25.4s, v0.4s, v9.s[2]
 #else
-       fmul    v25.4s, v0.4s, v9.4s[2]
+       fmul    v25.4s, v0.4s, v9.s[2]
 #endif
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
-       fmul    v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
+       fmul    v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
-       fmls    v29.4s, v0.4s, v9.4s[3]
+       fmls    v29.4s, v0.4s, v9.s[3]
 #else
-       fmul    v29.4s, v0.4s, v9.4s[3]
+       fmul    v29.4s, v0.4s, v9.s[3]
 #endif
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 
-       fmul    v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v9.4s[0]
+       fmul    v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v19.16b, v19.16b, v19.16b
-       fmls    v19.4s, v2.4s, v9.4s[0]
+       fmls    v19.4s, v2.4s, v9.s[0]
 #else
-       fmul    v19.4s, v2.4s, v9.4s[0]
+       fmul    v19.4s, v2.4s, v9.s[0]
 #endif
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
 
-       fmul    v22.4s, v2.4s, v8.4s[1]
-       OP_ii   v22.4s, v3.4s, v9.4s[1]
+       fmul    v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v23.16b, v23.16b, v23.16b
-       fmls    v23.4s, v2.4s, v9.4s[1]
+       fmls    v23.4s, v2.4s, v9.s[1]
 #else
-       fmul    v23.4s, v2.4s, v9.4s[1]
+       fmul    v23.4s, v2.4s, v9.s[1]
 #endif
-       OP_ir   v23.4s, v3.4s, v8.4s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
 
-       fmul    v26.4s, v2.4s, v8.4s[2]
-       OP_ii   v26.4s, v3.4s, v9.4s[2]
+       fmul    v26.4s, v2.4s, v8.s[2]
+       OP_ii   v26.4s, v3.4s, v9.s[2]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v27.16b, v27.16b, v27.16b
-       fmls    v27.4s, v2.4s, v9.4s[2]
+       fmls    v27.4s, v2.4s, v9.s[2]
 #else
-       fmul    v27.4s, v2.4s, v9.4s[2]
+       fmul    v27.4s, v2.4s, v9.s[2]
 #endif
-       OP_ir   v27.4s, v3.4s, v8.4s[2]
+       OP_ir   v27.4s, v3.4s, v8.s[2]
 
-       fmul    v30.4s, v2.4s, v8.4s[3]
-       OP_ii   v30.4s, v3.4s, v9.4s[3]
+       fmul    v30.4s, v2.4s, v8.s[3]
+       OP_ii   v30.4s, v3.4s, v9.s[3]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v31.16b, v31.16b, v31.16b
-       fmls    v31.4s, v2.4s, v9.4s[3]
+       fmls    v31.4s, v2.4s, v9.s[3]
 #else
-       fmul    v31.4s, v2.4s, v9.4s[3]
+       fmul    v31.4s, v2.4s, v9.s[3]
 #endif
-       OP_ir   v31.4s, v3.4s, v8.4s[3]
+       OP_ir   v31.4s, v3.4s, v8.s[3]
 
        ld2     {v12.4s, v13.4s}, [pB]
        add     pB, pB, #32
@@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M1
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
        ld2     {v12.4s, v13.4s}, [pB]          // for next round
        add     pB, pB, #32
 
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
        ld2     {v4.4s, v5.4s} , [pA]           // for next round
        add     pA, pA, #32
 
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       OP_rr   v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v9.4s[0]
-       OP_ri   v19.4s, v2.4s, v9.4s[0]
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
+       OP_ri   v19.4s, v2.4s, v9.s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
 
        ld2     {v6.4s, v7.4s} , [ppA]          // for next round
        add     ppA, ppA, #32
 
-       OP_rr   v22.4s, v2.4s, v8.4s[1]
-       OP_ii   v22.4s, v3.4s, v9.4s[1]
-       OP_ri   v23.4s, v2.4s, v9.4s[1]
-       OP_ir   v23.4s, v3.4s, v8.4s[1]
+       OP_rr   v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
+       OP_ri   v23.4s, v2.4s, v9.s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
 
        prfm    PLDL1KEEP, [ppA, #512]
 
-       OP_rr   v26.4s, v2.4s, v8.4s[2]
-       OP_ii   v26.4s, v3.4s, v9.4s[2]
-       OP_ri   v27.4s, v2.4s, v9.4s[2]
-       OP_ir   v27.4s, v3.4s, v8.4s[2]
+       OP_rr   v26.4s, v2.4s, v8.s[2]
+       OP_ii   v26.4s, v3.4s, v9.s[2]
+       OP_ri   v27.4s, v2.4s, v9.s[2]
+       OP_ir   v27.4s, v3.4s, v8.s[2]
 
-       OP_rr   v30.4s, v2.4s, v8.4s[3]
-       OP_ii   v30.4s, v3.4s, v9.4s[3]
-       OP_ri   v31.4s, v2.4s, v9.4s[3]
-       OP_ir   v31.4s, v3.4s, v8.4s[3]
+       OP_rr   v30.4s, v2.4s, v8.s[3]
+       OP_ii   v30.4s, v3.4s, v9.s[3]
+       OP_ri   v31.4s, v2.4s, v9.s[3]
+       OP_ir   v31.4s, v3.4s, v8.s[3]
 .endm
 
 .macro KERNEL8x4_M2
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
 
        ld2     {v8.4s, v9.4s}, [pB]            // for next round
        add     pB, pB, #32
 
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
 
        ld2     {v0.4s, v1.4s}, [pA]            // for next round
        add     pA, pA, #32
 
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
 
        prfm    PLDL1KEEP, [ppA, #512]
 
-       OP_rr   v18.4s, v6.4s, v12.4s[0]
-       OP_ii   v18.4s, v7.4s, v13.4s[0]
-       OP_ri   v19.4s, v6.4s, v13.4s[0]
-       OP_ir   v19.4s, v7.4s, v12.4s[0]
+       OP_rr   v18.4s, v6.4s, v12.s[0]
+       OP_ii   v18.4s, v7.4s, v13.s[0]
+       OP_ri   v19.4s, v6.4s, v13.s[0]
+       OP_ir   v19.4s, v7.4s, v12.s[0]
 
        ld2     {v2.4s, v3.4s}, [ppA]           // for next round
        add     ppA, ppA, #32
 
-       OP_rr   v22.4s, v6.4s, v12.4s[1]
-       OP_ii   v22.4s, v7.4s, v13.4s[1]
-       OP_ri   v23.4s, v6.4s, v13.4s[1]
-       OP_ir   v23.4s, v7.4s, v12.4s[1]
+       OP_rr   v22.4s, v6.4s, v12.s[1]
+       OP_ii   v22.4s, v7.4s, v13.s[1]
+       OP_ri   v23.4s, v6.4s, v13.s[1]
+       OP_ir   v23.4s, v7.4s, v12.s[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       OP_rr   v26.4s, v6.4s, v12.4s[2]
-       OP_ii   v26.4s, v7.4s, v13.4s[2]
-       OP_ri   v27.4s, v6.4s, v13.4s[2]
-       OP_ir   v27.4s, v7.4s, v12.4s[2]
+       OP_rr   v26.4s, v6.4s, v12.s[2]
+       OP_ii   v26.4s, v7.4s, v13.s[2]
+       OP_ri   v27.4s, v6.4s, v13.s[2]
+       OP_ir   v27.4s, v7.4s, v12.s[2]
 
-       OP_rr   v30.4s, v6.4s, v12.4s[3]
-       OP_ii   v30.4s, v7.4s, v13.4s[3]
-       OP_ri   v31.4s, v6.4s, v13.4s[3]
-       OP_ir   v31.4s, v7.4s, v12.4s[3]
+       OP_rr   v30.4s, v6.4s, v12.s[3]
+       OP_ii   v30.4s, v7.4s, v13.s[3]
+       OP_ri   v31.4s, v6.4s, v13.s[3]
+       OP_ir   v31.4s, v7.4s, v12.s[3]
 .endm
 
 .macro KERNEL8x4_E
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
-
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
-
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
-
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
-
-       OP_rr   v18.4s, v6.4s, v12.4s[0]
-       OP_ii   v18.4s, v7.4s, v13.4s[0]
-       OP_ri   v19.4s, v6.4s, v13.4s[0]
-       OP_ir   v19.4s, v7.4s, v12.4s[0]
-
-       OP_rr   v22.4s, v6.4s, v12.4s[1]
-       OP_ii   v22.4s, v7.4s, v13.4s[1]
-       OP_ri   v23.4s, v6.4s, v13.4s[1]
-       OP_ir   v23.4s, v7.4s, v12.4s[1]
-
-       OP_rr   v26.4s, v6.4s, v12.4s[2]
-       OP_ii   v26.4s, v7.4s, v13.4s[2]
-       OP_ri   v27.4s, v6.4s, v13.4s[2]
-       OP_ir   v27.4s, v7.4s, v12.4s[2]
-
-       OP_rr   v30.4s, v6.4s, v12.4s[3]
-       OP_ii   v30.4s, v7.4s, v13.4s[3]
-       OP_ri   v31.4s, v6.4s, v13.4s[3]
-       OP_ir   v31.4s, v7.4s, v12.4s[3]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
+
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
+
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
+
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
+
+       OP_rr   v18.4s, v6.4s, v12.s[0]
+       OP_ii   v18.4s, v7.4s, v13.s[0]
+       OP_ri   v19.4s, v6.4s, v13.s[0]
+       OP_ir   v19.4s, v7.4s, v12.s[0]
+
+       OP_rr   v22.4s, v6.4s, v12.s[1]
+       OP_ii   v22.4s, v7.4s, v13.s[1]
+       OP_ri   v23.4s, v6.4s, v13.s[1]
+       OP_ir   v23.4s, v7.4s, v12.s[1]
+
+       OP_rr   v26.4s, v6.4s, v12.s[2]
+       OP_ii   v26.4s, v7.4s, v13.s[2]
+       OP_ri   v27.4s, v6.4s, v13.s[2]
+       OP_ir   v27.4s, v7.4s, v12.s[2]
+
+       OP_rr   v30.4s, v6.4s, v12.s[3]
+       OP_ii   v30.4s, v7.4s, v13.s[3]
+       OP_ri   v31.4s, v6.4s, v13.s[3]
+       OP_ir   v31.4s, v7.4s, v12.s[3]
 .endm
 
 .macro KERNEL8x4_SUB
@@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
        ld2     {v2.4s, v3.4s}, [ppA]
        add     ppA, ppA, #32
 
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
-
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
-
-       OP_rr   v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v9.4s[0]
-       OP_ri   v19.4s, v2.4s, v9.4s[0]
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
-
-       OP_rr   v22.4s, v2.4s, v8.4s[1]
-       OP_ii   v22.4s, v3.4s, v9.4s[1]
-       OP_ri   v23.4s, v2.4s, v9.4s[1]
-       OP_ir   v23.4s, v3.4s, v8.4s[1]
-
-       OP_rr   v26.4s, v2.4s, v8.4s[2]
-       OP_ii   v26.4s, v3.4s, v9.4s[2]
-       OP_ri   v27.4s, v2.4s, v9.4s[2]
-       OP_ir   v27.4s, v3.4s, v8.4s[2]
-
-       OP_rr   v30.4s, v2.4s, v8.4s[3]
-       OP_ii   v30.4s, v3.4s, v9.4s[3]
-       OP_ri   v31.4s, v2.4s, v9.4s[3]
-       OP_ir   v31.4s, v3.4s, v8.4s[3]
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
+
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
+
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
+       OP_ri   v19.4s, v2.4s, v9.s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
+
+       OP_rr   v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
+       OP_ri   v23.4s, v2.4s, v9.s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
+
+       OP_rr   v26.4s, v2.4s, v8.s[2]
+       OP_ii   v26.4s, v3.4s, v9.s[2]
+       OP_ri   v27.4s, v2.4s, v9.s[2]
+       OP_ir   v27.4s, v3.4s, v8.s[2]
+
+       OP_rr   v30.4s, v2.4s, v8.s[3]
+       OP_ii   v30.4s, v3.4s, v9.s[3]
+       OP_ri   v31.4s, v2.4s, v9.s[3]
+       OP_ir   v31.4s, v3.4s, v8.s[3]
 .endm
 
 .macro SAVE8x4
@@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
-
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
-
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
+
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 .endm
 
 .macro SAVE4x4
@@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       OP_rr   v16.2s, v0.2s, v8.4s[0]
-       OP_ii   v16.2s, v1.2s, v9.4s[0]
-       OP_ri   v17.2s, v0.2s, v9.4s[0]
-       OP_ir   v17.2s, v1.2s, v8.4s[0]
-
-       OP_rr   v20.2s, v0.2s, v8.4s[1]
-       OP_ii   v20.2s, v1.2s, v9.4s[1]
-       OP_ri   v21.2s, v0.2s, v9.4s[1]
-       OP_ir   v21.2s, v1.2s, v8.4s[1]
-
-       OP_rr   v24.2s, v0.2s, v8.4s[2]
-       OP_ii   v24.2s, v1.2s, v9.4s[2]
-       OP_ri   v25.2s, v0.2s, v9.4s[2]
-       OP_ir   v25.2s, v1.2s, v8.4s[2]
-
-       OP_rr   v28.2s, v0.2s, v8.4s[3]
-       OP_ii   v28.2s, v1.2s, v9.4s[3]
-       OP_ri   v29.2s, v0.2s, v9.4s[3]
-       OP_ir   v29.2s, v1.2s, v8.4s[3]
+       OP_rr   v16.2s, v0.2s, v8.s[0]
+       OP_ii   v16.2s, v1.2s, v9.s[0]
+       OP_ri   v17.2s, v0.2s, v9.s[0]
+       OP_ir   v17.2s, v1.2s, v8.s[0]
+
+       OP_rr   v20.2s, v0.2s, v8.s[1]
+       OP_ii   v20.2s, v1.2s, v9.s[1]
+       OP_ri   v21.2s, v0.2s, v9.s[1]
+       OP_ir   v21.2s, v1.2s, v8.s[1]
+
+       OP_rr   v24.2s, v0.2s, v8.s[2]
+       OP_ii   v24.2s, v1.2s, v9.s[2]
+       OP_ri   v25.2s, v0.2s, v9.s[2]
+       OP_ir   v25.2s, v1.2s, v8.s[2]
+
+       OP_rr   v28.2s, v0.2s, v8.s[3]
+       OP_ii   v28.2s, v1.2s, v9.s[3]
+       OP_ri   v29.2s, v0.2s, v9.s[3]
+       OP_ir   v29.2s, v1.2s, v8.s[3]
 .endm
 
 .macro SAVE2x4
@@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.s, v1.s}[0], [pA]
        add     pA, pA, #8
 
-       OP_rr   s16, s0, v8.4s[0]
-       OP_ii   s16, s1, v9.4s[0]
-       OP_ri   s17, s0, v9.4s[0]
-       OP_ir   s17, s1, v8.4s[0]
-
-       OP_rr   s20, s0, v8.4s[1]
-       OP_ii   s20, s1, v9.4s[1]
-       OP_ri   s21, s0, v9.4s[1]
-       OP_ir   s21, s1, v8.4s[1]
-
-       OP_rr   s24, s0, v8.4s[2]
-       OP_ii   s24, s1, v9.4s[2]
-       OP_ri   s25, s0, v9.4s[2]
-       OP_ir   s25, s1, v8.4s[2]
-
-       OP_rr   s28, s0, v8.4s[3]
-       OP_ii   s28, s1, v9.4s[3]
-       OP_ri   s29, s0, v9.4s[3]
-       OP_ir   s29, s1, v8.4s[3]
+       OP_rr   s16, s0, v8.s[0]
+       OP_ii   s16, s1, v9.s[0]
+       OP_ri   s17, s0, v9.s[0]
+       OP_ir   s17, s1, v8.s[0]
+
+       OP_rr   s20, s0, v8.s[1]
+       OP_ii   s20, s1, v9.s[1]
+       OP_ri   s21, s0, v9.s[1]
+       OP_ir   s21, s1, v8.s[1]
+
+       OP_rr   s24, s0, v8.s[2]
+       OP_ii   s24, s1, v9.s[2]
+       OP_ri   s25, s0, v9.s[2]
+       OP_ir   s25, s1, v8.s[2]
+
+       OP_rr   s28, s0, v8.s[3]
+       OP_ii   s28, s1, v9.s[3]
+       OP_ri   s29, s0, v9.s[3]
+       OP_ir   s29, s1, v8.s[3]
 .endm
 
 .macro SAVE1x4
@@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.2s[0]
-       OP_ii   v16.4s, v1.4s, v9.2s[0]
-       OP_ri   v17.4s, v0.4s, v9.2s[0]
-       OP_ir   v17.4s, v1.4s, v8.2s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       OP_rr   v20.4s, v0.4s, v8.2s[1]
-       OP_ii   v20.4s, v1.4s, v9.2s[1]
-       OP_ri   v21.4s, v0.4s, v9.2s[1]
-       OP_ir   v21.4s, v1.4s, v8.2s[1]
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       OP_rr   v16.2s, v0.2s, v8.2s[0]
-       OP_ii   v16.2s, v1.2s, v9.2s[0]
-       OP_ri   v17.2s, v0.2s, v9.2s[0]
-       OP_ir   v17.2s, v1.2s, v8.2s[0]
+       OP_rr   v16.2s, v0.2s, v8.s[0]
+       OP_ii   v16.2s, v1.2s, v9.s[0]
+       OP_ri   v17.2s, v0.2s, v9.s[0]
+       OP_ir   v17.2s, v1.2s, v8.s[0]
 
-       OP_rr   v20.2s, v0.2s, v8.2s[1]
-       OP_ii   v20.2s, v1.2s, v9.2s[1]
-       OP_ri   v21.2s, v0.2s, v9.2s[1]
-       OP_ir   v21.2s, v1.2s, v8.2s[1]
+       OP_rr   v20.2s, v0.2s, v8.s[1]
+       OP_ii   v20.2s, v1.2s, v9.s[1]
+       OP_ri   v21.2s, v0.2s, v9.s[1]
+       OP_ir   v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.s, v1.s}[0], [pA]
        add     pA, pA, #8
 
-       OP_rr   s16, s0, v8.2s[0]
-       OP_ii   s16, s1, v9.2s[0]
-       OP_ri   s17, s0, v9.2s[0]
-       OP_ir   s17, s1, v8.2s[0]
+       OP_rr   s16, s0, v8.s[0]
+       OP_ii   s16, s1, v9.s[0]
+       OP_ri   s17, s0, v9.s[0]
+       OP_ir   s17, s1, v8.s[0]
 
-       OP_rr   s20, s0, v8.2s[1]
-       OP_ii   s20, s1, v9.2s[1]
-       OP_ri   s21, s0, v9.2s[1]
-       OP_ir   s21, s1, v8.2s[1]
+       OP_rr   s20, s0, v8.s[1]
+       OP_ii   s20, s1, v9.s[1]
+       OP_ri   s21, s0, v9.s[1]
+       OP_ir   s21, s1, v8.s[1]
 .endm
 
 .macro SAVE1x2
old mode 100755 (executable)
new mode 100644 (file)
index 40b98ce..d58cef5
@@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
-       fmls    v17.4s, v0.4s, v9.4s[0]
+       fmls    v17.4s, v0.4s, v9.s[0]
 #else
-       fmul    v17.4s, v0.4s, v9.4s[0]
+       fmul    v17.4s, v0.4s, v9.s[0]
 #endif
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       fmul    v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v9.4s[0]
+       fmul    v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v19.16b, v19.16b, v19.16b
-       fmls    v19.4s, v2.4s, v9.4s[0]
+       fmls    v19.4s, v2.4s, v9.s[0]
 #else
-       fmul    v19.4s, v2.4s, v9.4s[0]
+       fmul    v19.4s, v2.4s, v9.s[0]
 #endif
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
 
-       fmul    v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
-       fmls    v21.4s, v0.4s, v9.4s[1]
+       fmls    v21.4s, v0.4s, v9.s[1]
 #else
-       fmul    v21.4s, v0.4s, v9.4s[1]
+       fmul    v21.4s, v0.4s, v9.s[1]
 #endif
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
-       fmul    v22.4s, v2.4s, v8.4s[1]
-       OP_ii   v22.4s, v3.4s, v9.4s[1]
+       fmul    v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v23.16b, v23.16b, v23.16b
-       fmls    v23.4s, v2.4s, v9.4s[1]
+       fmls    v23.4s, v2.4s, v9.s[1]
 #else
-       fmul    v23.4s, v2.4s, v9.4s[1]
+       fmul    v23.4s, v2.4s, v9.s[1]
 #endif
-       OP_ir   v23.4s, v3.4s, v8.4s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
 
-       fmul    v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
+       fmul    v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
-       fmls    v25.4s, v0.4s, v9.4s[2]
+       fmls    v25.4s, v0.4s, v9.s[2]
 #else
-       fmul    v25.4s, v0.4s, v9.4s[2]
+       fmul    v25.4s, v0.4s, v9.s[2]
 #endif
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
-       fmul    v26.4s, v2.4s, v8.4s[2]
-       OP_ii   v26.4s, v3.4s, v9.4s[2]
+       fmul    v26.4s, v2.4s, v8.s[2]
+       OP_ii   v26.4s, v3.4s, v9.s[2]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v27.16b, v27.16b, v27.16b
-       fmls    v27.4s, v2.4s, v9.4s[2]
+       fmls    v27.4s, v2.4s, v9.s[2]
 #else
-       fmul    v27.4s, v2.4s, v9.4s[2]
+       fmul    v27.4s, v2.4s, v9.s[2]
 #endif
-       OP_ir   v27.4s, v3.4s, v8.4s[2]
+       OP_ir   v27.4s, v3.4s, v8.s[2]
 
-       fmul    v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
+       fmul    v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
-       fmls    v29.4s, v0.4s, v9.4s[3]
+       fmls    v29.4s, v0.4s, v9.s[3]
 #else
-       fmul    v29.4s, v0.4s, v9.4s[3]
+       fmul    v29.4s, v0.4s, v9.s[3]
 #endif
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 
-       fmul    v30.4s, v2.4s, v8.4s[3]
-       OP_ii   v30.4s, v3.4s, v9.4s[3]
+       fmul    v30.4s, v2.4s, v8.s[3]
+       OP_ii   v30.4s, v3.4s, v9.s[3]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v31.16b, v31.16b, v31.16b
-       fmls    v31.4s, v2.4s, v9.4s[3]
+       fmls    v31.4s, v2.4s, v9.s[3]
 #else
-       fmul    v31.4s, v2.4s, v9.4s[3]
+       fmul    v31.4s, v2.4s, v9.s[3]
 #endif
-       OP_ir   v31.4s, v3.4s, v8.4s[3]
+       OP_ir   v31.4s, v3.4s, v8.s[3]
 
        ld2     {v12.4s, v13.4s}, [pB]
        add     pB, pB, #32
@@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M1
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
-
-       OP_rr   v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v9.4s[0]
-       OP_ri   v19.4s, v2.4s, v9.4s[0]
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
-
-       OP_rr   v22.4s, v2.4s, v8.4s[1]
-       OP_ii   v22.4s, v3.4s, v9.4s[1]
-       OP_ri   v23.4s, v2.4s, v9.4s[1]
-       OP_ir   v23.4s, v3.4s, v8.4s[1]
-
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
-
-       OP_rr   v26.4s, v2.4s, v8.4s[2]
-       OP_ii   v26.4s, v3.4s, v9.4s[2]
-       OP_ri   v27.4s, v2.4s, v9.4s[2]
-       OP_ir   v27.4s, v3.4s, v8.4s[2]
-
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
-
-       OP_rr   v30.4s, v2.4s, v8.4s[3]
-       OP_ii   v30.4s, v3.4s, v9.4s[3]
-       OP_ri   v31.4s, v2.4s, v9.4s[3]
-       OP_ir   v31.4s, v3.4s, v8.4s[3]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
+       OP_ri   v19.4s, v2.4s, v9.s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
+       OP_ri   v23.4s, v2.4s, v9.s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
+
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
+
+       OP_rr   v26.4s, v2.4s, v8.s[2]
+       OP_ii   v26.4s, v3.4s, v9.s[2]
+       OP_ri   v27.4s, v2.4s, v9.s[2]
+       OP_ir   v27.4s, v3.4s, v8.s[2]
+
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
+
+       OP_rr   v30.4s, v2.4s, v8.s[3]
+       OP_ii   v30.4s, v3.4s, v9.s[3]
+       OP_ri   v31.4s, v2.4s, v9.s[3]
+       OP_ir   v31.4s, v3.4s, v8.s[3]
 
        ld2     {v12.4s, v13.4s}, [pB]          // For next round
        add     pB, pB, #32
@@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M2
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
-
-       OP_rr   v18.4s, v6.4s, v12.4s[0]
-       OP_ii   v18.4s, v7.4s, v13.4s[0]
-       OP_ri   v19.4s, v6.4s, v13.4s[0]
-       OP_ir   v19.4s, v7.4s, v12.4s[0]
-
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
-
-       OP_rr   v22.4s, v6.4s, v12.4s[1]
-       OP_ii   v22.4s, v7.4s, v13.4s[1]
-       OP_ri   v23.4s, v6.4s, v13.4s[1]
-       OP_ir   v23.4s, v7.4s, v12.4s[1]
-
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
-
-       OP_rr   v26.4s, v6.4s, v12.4s[2]
-       OP_ii   v26.4s, v7.4s, v13.4s[2]
-       OP_ri   v27.4s, v6.4s, v13.4s[2]
-       OP_ir   v27.4s, v7.4s, v12.4s[2]
-
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
-
-       OP_rr   v30.4s, v6.4s, v12.4s[3]
-       OP_ii   v30.4s, v7.4s, v13.4s[3]
-       OP_ri   v31.4s, v6.4s, v13.4s[3]
-       OP_ir   v31.4s, v7.4s, v12.4s[3]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
+
+       OP_rr   v18.4s, v6.4s, v12.s[0]
+       OP_ii   v18.4s, v7.4s, v13.s[0]
+       OP_ri   v19.4s, v6.4s, v13.s[0]
+       OP_ir   v19.4s, v7.4s, v12.s[0]
+
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
+
+       OP_rr   v22.4s, v6.4s, v12.s[1]
+       OP_ii   v22.4s, v7.4s, v13.s[1]
+       OP_ri   v23.4s, v6.4s, v13.s[1]
+       OP_ir   v23.4s, v7.4s, v12.s[1]
+
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
+
+       OP_rr   v26.4s, v6.4s, v12.s[2]
+       OP_ii   v26.4s, v7.4s, v13.s[2]
+       OP_ri   v27.4s, v6.4s, v13.s[2]
+       OP_ir   v27.4s, v7.4s, v12.s[2]
+
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
+
+       OP_rr   v30.4s, v6.4s, v12.s[3]
+       OP_ii   v30.4s, v7.4s, v13.s[3]
+       OP_ri   v31.4s, v6.4s, v13.s[3]
+       OP_ir   v31.4s, v7.4s, v12.s[3]
 
        ld2     {v8.4s, v9.4s}, [pB]
        add     pB, pB, #32
@@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_E
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
-
-       OP_rr   v18.4s, v6.4s, v12.4s[0]
-       OP_ii   v18.4s, v7.4s, v13.4s[0]
-       OP_ri   v19.4s, v6.4s, v13.4s[0]
-       OP_ir   v19.4s, v7.4s, v12.4s[0]
-
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
-
-       OP_rr   v22.4s, v6.4s, v12.4s[1]
-       OP_ii   v22.4s, v7.4s, v13.4s[1]
-       OP_ri   v23.4s, v6.4s, v13.4s[1]
-       OP_ir   v23.4s, v7.4s, v12.4s[1]
-
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
-
-       OP_rr   v26.4s, v6.4s, v12.4s[2]
-       OP_ii   v26.4s, v7.4s, v13.4s[2]
-       OP_ri   v27.4s, v6.4s, v13.4s[2]
-       OP_ir   v27.4s, v7.4s, v12.4s[2]
-
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
-
-       OP_rr   v30.4s, v6.4s, v12.4s[3]
-       OP_ii   v30.4s, v7.4s, v13.4s[3]
-       OP_ri   v31.4s, v6.4s, v13.4s[3]
-       OP_ir   v31.4s, v7.4s, v12.4s[3]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
+
+       OP_rr   v18.4s, v6.4s, v12.s[0]
+       OP_ii   v18.4s, v7.4s, v13.s[0]
+       OP_ri   v19.4s, v6.4s, v13.s[0]
+       OP_ir   v19.4s, v7.4s, v12.s[0]
+
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
+
+       OP_rr   v22.4s, v6.4s, v12.s[1]
+       OP_ii   v22.4s, v7.4s, v13.s[1]
+       OP_ri   v23.4s, v6.4s, v13.s[1]
+       OP_ir   v23.4s, v7.4s, v12.s[1]
+
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
+
+       OP_rr   v26.4s, v6.4s, v12.s[2]
+       OP_ii   v26.4s, v7.4s, v13.s[2]
+       OP_ri   v27.4s, v6.4s, v13.s[2]
+       OP_ir   v27.4s, v7.4s, v12.s[2]
+
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
+
+       OP_rr   v30.4s, v6.4s, v12.s[3]
+       OP_ii   v30.4s, v7.4s, v13.s[3]
+       OP_ri   v31.4s, v6.4s, v13.s[3]
+       OP_ir   v31.4s, v7.4s, v12.s[3]
 
 .endm
 
@@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
-
-       OP_rr   v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v9.4s[0]
-       OP_ri   v19.4s, v2.4s, v9.4s[0]
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
-
-       OP_rr   v22.4s, v2.4s, v8.4s[1]
-       OP_ii   v22.4s, v3.4s, v9.4s[1]
-       OP_ri   v23.4s, v2.4s, v9.4s[1]
-       OP_ir   v23.4s, v3.4s, v8.4s[1]
-
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
-
-       OP_rr   v26.4s, v2.4s, v8.4s[2]
-       OP_ii   v26.4s, v3.4s, v9.4s[2]
-       OP_ri   v27.4s, v2.4s, v9.4s[2]
-       OP_ir   v27.4s, v3.4s, v8.4s[2]
-
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
-
-       OP_rr   v30.4s, v2.4s, v8.4s[3]
-       OP_ii   v30.4s, v3.4s, v9.4s[3]
-       OP_ri   v31.4s, v2.4s, v9.4s[3]
-       OP_ir   v31.4s, v3.4s, v8.4s[3]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
+       OP_ri   v19.4s, v2.4s, v9.s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
+       OP_ri   v23.4s, v2.4s, v9.s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
+
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
+
+       OP_rr   v26.4s, v2.4s, v8.s[2]
+       OP_ii   v26.4s, v3.4s, v9.s[2]
+       OP_ri   v27.4s, v2.4s, v9.s[2]
+       OP_ir   v27.4s, v3.4s, v8.s[2]
+
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
+
+       OP_rr   v30.4s, v2.4s, v8.s[3]
+       OP_ii   v30.4s, v3.4s, v9.s[3]
+       OP_ri   v31.4s, v2.4s, v9.s[3]
+       OP_ir   v31.4s, v3.4s, v8.s[3]
 
 .endm
 
@@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
-       fmls    v17.4s, v0.4s, v9.4s[0]
+       fmls    v17.4s, v0.4s, v9.s[0]
 #else
-       fmul    v17.4s, v0.4s, v9.4s[0]
+       fmul    v17.4s, v0.4s, v9.s[0]
 #endif
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       fmul    v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
-       fmls    v21.4s, v0.4s, v9.4s[1]
+       fmls    v21.4s, v0.4s, v9.s[1]
 #else
-       fmul    v21.4s, v0.4s, v9.4s[1]
+       fmul    v21.4s, v0.4s, v9.s[1]
 #endif
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
-       fmul    v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
+       fmul    v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
-       fmls    v25.4s, v0.4s, v9.4s[2]
+       fmls    v25.4s, v0.4s, v9.s[2]
 #else
-       fmul    v25.4s, v0.4s, v9.4s[2]
+       fmul    v25.4s, v0.4s, v9.s[2]
 #endif
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
-       fmul    v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
+       fmul    v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
-       fmls    v29.4s, v0.4s, v9.4s[3]
+       fmls    v29.4s, v0.4s, v9.s[3]
 #else
-       fmul    v29.4s, v0.4s, v9.4s[3]
+       fmul    v29.4s, v0.4s, v9.s[3]
 #endif
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 
        ld2     {v12.4s, v13.4s}, [pB]
        add     pB, pB, #32
@@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
        ld2     {v12.4s, v13.4s}, [pB]          // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
        ld2     {v4.4s, v5.4s}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 .endm
 
 .macro KERNEL4x4_M2
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
 
        ld2     {v8.4s, v9.4s}, [pB]            // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
 
        ld2     {v0.4s, v1.4s}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
 .endm
 
 .macro KERNEL4x4_E
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
-
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
-
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
-
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
+
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
+
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
+
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
-
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
-
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
+
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 .endm
 
 .macro SAVE4x4
@@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       OP_rr   v16.2s, v0.2s, v8.4s[0]
-       OP_ii   v16.2s, v1.2s, v9.4s[0]
-       OP_ri   v17.2s, v0.2s, v9.4s[0]
-       OP_ir   v17.2s, v1.2s, v8.4s[0]
-
-       OP_rr   v20.2s, v0.2s, v8.4s[1]
-       OP_ii   v20.2s, v1.2s, v9.4s[1]
-       OP_ri   v21.2s, v0.2s, v9.4s[1]
-       OP_ir   v21.2s, v1.2s, v8.4s[1]
-
-       OP_rr   v24.2s, v0.2s, v8.4s[2]
-       OP_ii   v24.2s, v1.2s, v9.4s[2]
-       OP_ri   v25.2s, v0.2s, v9.4s[2]
-       OP_ir   v25.2s, v1.2s, v8.4s[2]
-
-       OP_rr   v28.2s, v0.2s, v8.4s[3]
-       OP_ii   v28.2s, v1.2s, v9.4s[3]
-       OP_ri   v29.2s, v0.2s, v9.4s[3]
-       OP_ir   v29.2s, v1.2s, v8.4s[3]
+       OP_rr   v16.2s, v0.2s, v8.s[0]
+       OP_ii   v16.2s, v1.2s, v9.s[0]
+       OP_ri   v17.2s, v0.2s, v9.s[0]
+       OP_ir   v17.2s, v1.2s, v8.s[0]
+
+       OP_rr   v20.2s, v0.2s, v8.s[1]
+       OP_ii   v20.2s, v1.2s, v9.s[1]
+       OP_ri   v21.2s, v0.2s, v9.s[1]
+       OP_ir   v21.2s, v1.2s, v8.s[1]
+
+       OP_rr   v24.2s, v0.2s, v8.s[2]
+       OP_ii   v24.2s, v1.2s, v9.s[2]
+       OP_ri   v25.2s, v0.2s, v9.s[2]
+       OP_ir   v25.2s, v1.2s, v8.s[2]
+
+       OP_rr   v28.2s, v0.2s, v8.s[3]
+       OP_ii   v28.2s, v1.2s, v9.s[3]
+       OP_ri   v29.2s, v0.2s, v9.s[3]
+       OP_ir   v29.2s, v1.2s, v8.s[3]
 .endm
 
 .macro SAVE2x4
@@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.s, v1.s}[0], [pA]
        add     pA, pA, #8
 
-       OP_rr   s16, s0, v8.4s[0]
-       OP_ii   s16, s1, v9.4s[0]
-       OP_ri   s17, s0, v9.4s[0]
-       OP_ir   s17, s1, v8.4s[0]
-
-       OP_rr   s20, s0, v8.4s[1]
-       OP_ii   s20, s1, v9.4s[1]
-       OP_ri   s21, s0, v9.4s[1]
-       OP_ir   s21, s1, v8.4s[1]
-
-       OP_rr   s24, s0, v8.4s[2]
-       OP_ii   s24, s1, v9.4s[2]
-       OP_ri   s25, s0, v9.4s[2]
-       OP_ir   s25, s1, v8.4s[2]
-
-       OP_rr   s28, s0, v8.4s[3]
-       OP_ii   s28, s1, v9.4s[3]
-       OP_ri   s29, s0, v9.4s[3]
-       OP_ir   s29, s1, v8.4s[3]
+       OP_rr   s16, s0, v8.s[0]
+       OP_ii   s16, s1, v9.s[0]
+       OP_ri   s17, s0, v9.s[0]
+       OP_ir   s17, s1, v8.s[0]
+
+       OP_rr   s20, s0, v8.s[1]
+       OP_ii   s20, s1, v9.s[1]
+       OP_ri   s21, s0, v9.s[1]
+       OP_ir   s21, s1, v8.s[1]
+
+       OP_rr   s24, s0, v8.s[2]
+       OP_ii   s24, s1, v9.s[2]
+       OP_ri   s25, s0, v9.s[2]
+       OP_ir   s25, s1, v8.s[2]
+
+       OP_rr   s28, s0, v8.s[3]
+       OP_ii   s28, s1, v9.s[3]
+       OP_ri   s29, s0, v9.s[3]
+       OP_ir   s29, s1, v8.s[3]
 .endm
 
 .macro SAVE1x4
@@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.2s[0]
-       OP_ii   v16.4s, v1.4s, v9.2s[0]
-       OP_ri   v17.4s, v0.4s, v9.2s[0]
-       OP_ir   v17.4s, v1.4s, v8.2s[0]
-
-       OP_rr   v18.4s, v2.4s, v8.2s[0]
-       OP_ii   v18.4s, v3.4s, v9.2s[0]
-       OP_ri   v19.4s, v2.4s, v9.2s[0]
-       OP_ir   v19.4s, v3.4s, v8.2s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.2s[1]
-       OP_ii   v20.4s, v1.4s, v9.2s[1]
-       OP_ri   v21.4s, v0.4s, v9.2s[1]
-       OP_ir   v21.4s, v1.4s, v8.2s[1]
-
-       OP_rr   v22.4s, v2.4s, v8.2s[1]
-       OP_ii   v22.4s, v3.4s, v9.2s[1]
-       OP_ri   v23.4s, v2.4s, v9.2s[1]
-       OP_ir   v23.4s, v3.4s, v8.2s[1]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
+       OP_ri   v19.4s, v2.4s, v9.s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
+       OP_ri   v23.4s, v2.4s, v9.s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
 .endm
 
 .macro SAVE8x2
@@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.2s[0]
-       OP_ii   v16.4s, v1.4s, v9.2s[0]
-       OP_ri   v17.4s, v0.4s, v9.2s[0]
-       OP_ir   v17.4s, v1.4s, v8.2s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       OP_rr   v20.4s, v0.4s, v8.2s[1]
-       OP_ii   v20.4s, v1.4s, v9.2s[1]
-       OP_ri   v21.4s, v0.4s, v9.2s[1]
-       OP_ir   v21.4s, v1.4s, v8.2s[1]
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       OP_rr   v16.2s, v0.2s, v8.2s[0]
-       OP_ii   v16.2s, v1.2s, v9.2s[0]
-       OP_ri   v17.2s, v0.2s, v9.2s[0]
-       OP_ir   v17.2s, v1.2s, v8.2s[0]
+       OP_rr   v16.2s, v0.2s, v8.s[0]
+       OP_ii   v16.2s, v1.2s, v9.s[0]
+       OP_ri   v17.2s, v0.2s, v9.s[0]
+       OP_ir   v17.2s, v1.2s, v8.s[0]
 
-       OP_rr   v20.2s, v0.2s, v8.2s[1]
-       OP_ii   v20.2s, v1.2s, v9.2s[1]
-       OP_ri   v21.2s, v0.2s, v9.2s[1]
-       OP_ir   v21.2s, v1.2s, v8.2s[1]
+       OP_rr   v20.2s, v0.2s, v8.s[1]
+       OP_ii   v20.2s, v1.2s, v9.s[1]
+       OP_ri   v21.2s, v0.2s, v9.s[1]
+       OP_ir   v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.s, v1.s}[0], [pA]
        add     pA, pA, #8
 
-       OP_rr   s16, s0, v8.2s[0]
-       OP_ii   s16, s1, v9.2s[0]
-       OP_ri   s17, s0, v9.2s[0]
-       OP_ir   s17, s1, v8.2s[0]
+       OP_rr   s16, s0, v8.s[0]
+       OP_ii   s16, s1, v9.s[0]
+       OP_ri   s17, s0, v9.s[0]
+       OP_ir   s17, s1, v8.s[0]
 
-       OP_rr   s20, s0, v8.2s[1]
-       OP_ii   s20, s1, v9.2s[1]
-       OP_ri   s21, s0, v9.2s[1]
-       OP_ir   s21, s1, v8.2s[1]
+       OP_rr   s20, s0, v8.s[1]
+       OP_ii   s20, s1, v9.s[1]
+       OP_ri   s21, s0, v9.s[1]
+       OP_ir   s21, s1, v8.s[1]
 .endm
 
 .macro SAVE1x2
@@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v8.4s[1]
-       OP_ri   v17.4s, v0.4s, v8.4s[1]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v8.s[1]
+       OP_ri   v17.4s, v0.4s, v8.s[1]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       OP_rr   v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v8.4s[1]
-       OP_ri   v19.4s, v2.4s, v8.4s[1]
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v8.s[1]
+       OP_ri   v19.4s, v2.4s, v8.s[1]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
 .endm
 
 .macro SAVE8x1
index be0e9bd..3de2725 100644 (file)
@@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
-       fmls    v17.4s, v0.4s, v9.4s[0]
+       fmls    v17.4s, v0.4s, v9.s[0]
 #else
-       fmul    v17.4s, v0.4s, v9.4s[0]
+       fmul    v17.4s, v0.4s, v9.s[0]
 #endif
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       fmul    v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
-       fmls    v21.4s, v0.4s, v9.4s[1]
+       fmls    v21.4s, v0.4s, v9.s[1]
 #else
-       fmul    v21.4s, v0.4s, v9.4s[1]
+       fmul    v21.4s, v0.4s, v9.s[1]
 #endif
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
-       fmul    v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
+       fmul    v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
-       fmls    v25.4s, v0.4s, v9.4s[2]
+       fmls    v25.4s, v0.4s, v9.s[2]
 #else
-       fmul    v25.4s, v0.4s, v9.4s[2]
+       fmul    v25.4s, v0.4s, v9.s[2]
 #endif
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
-       fmul    v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
+       fmul    v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
-       fmls    v29.4s, v0.4s, v9.4s[3]
+       fmls    v29.4s, v0.4s, v9.s[3]
 #else
-       fmul    v29.4s, v0.4s, v9.4s[3]
+       fmul    v29.4s, v0.4s, v9.s[3]
 #endif
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 
        ld2     {v12.4s, v13.4s}, [pB]
        add     pB, pB, #32
@@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
        ld2     {v12.4s, v13.4s}, [pB]          // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
        ld2     {v4.4s, v5.4s}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 .endm
 
 .macro KERNEL4x4_M2
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
 
        ld2     {v8.4s, v9.4s}, [pB]            // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
 
        ld2     {v0.4s, v1.4s}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
 .endm
 
 .macro KERNEL4x4_E
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
-
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
-
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
-
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
+
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
+
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
+
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
-
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
-
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
+
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 .endm
 
 .macro SAVE4x4
@@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       OP_rr   v16.2s, v0.2s, v8.4s[0]
-       OP_ii   v16.2s, v1.2s, v9.4s[0]
-       OP_ri   v17.2s, v0.2s, v9.4s[0]
-       OP_ir   v17.2s, v1.2s, v8.4s[0]
-
-       OP_rr   v20.2s, v0.2s, v8.4s[1]
-       OP_ii   v20.2s, v1.2s, v9.4s[1]
-       OP_ri   v21.2s, v0.2s, v9.4s[1]
-       OP_ir   v21.2s, v1.2s, v8.4s[1]
-
-       OP_rr   v24.2s, v0.2s, v8.4s[2]
-       OP_ii   v24.2s, v1.2s, v9.4s[2]
-       OP_ri   v25.2s, v0.2s, v9.4s[2]
-       OP_ir   v25.2s, v1.2s, v8.4s[2]
-
-       OP_rr   v28.2s, v0.2s, v8.4s[3]
-       OP_ii   v28.2s, v1.2s, v9.4s[3]
-       OP_ri   v29.2s, v0.2s, v9.4s[3]
-       OP_ir   v29.2s, v1.2s, v8.4s[3]
+       OP_rr   v16.2s, v0.2s, v8.s[0]
+       OP_ii   v16.2s, v1.2s, v9.s[0]
+       OP_ri   v17.2s, v0.2s, v9.s[0]
+       OP_ir   v17.2s, v1.2s, v8.s[0]
+
+       OP_rr   v20.2s, v0.2s, v8.s[1]
+       OP_ii   v20.2s, v1.2s, v9.s[1]
+       OP_ri   v21.2s, v0.2s, v9.s[1]
+       OP_ir   v21.2s, v1.2s, v8.s[1]
+
+       OP_rr   v24.2s, v0.2s, v8.s[2]
+       OP_ii   v24.2s, v1.2s, v9.s[2]
+       OP_ri   v25.2s, v0.2s, v9.s[2]
+       OP_ir   v25.2s, v1.2s, v8.s[2]
+
+       OP_rr   v28.2s, v0.2s, v8.s[3]
+       OP_ii   v28.2s, v1.2s, v9.s[3]
+       OP_ri   v29.2s, v0.2s, v9.s[3]
+       OP_ir   v29.2s, v1.2s, v8.s[3]
 .endm
 
 .macro SAVE2x4
@@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.s, v1.s}[0], [pA]
        add     pA, pA, #8
 
-       OP_rr   s16, s0, v8.4s[0]
-       OP_ii   s16, s1, v9.4s[0]
-       OP_ri   s17, s0, v9.4s[0]
-       OP_ir   s17, s1, v8.4s[0]
-
-       OP_rr   s20, s0, v8.4s[1]
-       OP_ii   s20, s1, v9.4s[1]
-       OP_ri   s21, s0, v9.4s[1]
-       OP_ir   s21, s1, v8.4s[1]
-
-       OP_rr   s24, s0, v8.4s[2]
-       OP_ii   s24, s1, v9.4s[2]
-       OP_ri   s25, s0, v9.4s[2]
-       OP_ir   s25, s1, v8.4s[2]
-
-       OP_rr   s28, s0, v8.4s[3]
-       OP_ii   s28, s1, v9.4s[3]
-       OP_ri   s29, s0, v9.4s[3]
-       OP_ir   s29, s1, v8.4s[3]
+       OP_rr   s16, s0, v8.s[0]
+       OP_ii   s16, s1, v9.s[0]
+       OP_ri   s17, s0, v9.s[0]
+       OP_ir   s17, s1, v8.s[0]
+
+       OP_rr   s20, s0, v8.s[1]
+       OP_ii   s20, s1, v9.s[1]
+       OP_ri   s21, s0, v9.s[1]
+       OP_ir   s21, s1, v8.s[1]
+
+       OP_rr   s24, s0, v8.s[2]
+       OP_ii   s24, s1, v9.s[2]
+       OP_ri   s25, s0, v9.s[2]
+       OP_ir   s25, s1, v8.s[2]
+
+       OP_rr   s28, s0, v8.s[3]
+       OP_ii   s28, s1, v9.s[3]
+       OP_ri   s29, s0, v9.s[3]
+       OP_ir   s29, s1, v8.s[3]
 .endm
 
 .macro SAVE1x4
@@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.2s[0]
-       OP_ii   v16.4s, v1.4s, v9.2s[0]
-       OP_ri   v17.4s, v0.4s, v9.2s[0]
-       OP_ir   v17.4s, v1.4s, v8.2s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       OP_rr   v20.4s, v0.4s, v8.2s[1]
-       OP_ii   v20.4s, v1.4s, v9.2s[1]
-       OP_ri   v21.4s, v0.4s, v9.2s[1]
-       OP_ir   v21.4s, v1.4s, v8.2s[1]
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       OP_rr   v16.2s, v0.2s, v8.2s[0]
-       OP_ii   v16.2s, v1.2s, v9.2s[0]
-       OP_ri   v17.2s, v0.2s, v9.2s[0]
-       OP_ir   v17.2s, v1.2s, v8.2s[0]
+       OP_rr   v16.2s, v0.2s, v8.s[0]
+       OP_ii   v16.2s, v1.2s, v9.s[0]
+       OP_ri   v17.2s, v0.2s, v9.s[0]
+       OP_ir   v17.2s, v1.2s, v8.s[0]
 
-       OP_rr   v20.2s, v0.2s, v8.2s[1]
-       OP_ii   v20.2s, v1.2s, v9.2s[1]
-       OP_ri   v21.2s, v0.2s, v9.2s[1]
-       OP_ir   v21.2s, v1.2s, v8.2s[1]
+       OP_rr   v20.2s, v0.2s, v8.s[1]
+       OP_ii   v20.2s, v1.2s, v9.s[1]
+       OP_ri   v21.2s, v0.2s, v9.s[1]
+       OP_ir   v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.s, v1.s}[0], [pA]
        add     pA, pA, #8
 
-       OP_rr   s16, s0, v8.2s[0]
-       OP_ii   s16, s1, v9.2s[0]
-       OP_ri   s17, s0, v9.2s[0]
-       OP_ir   s17, s1, v8.2s[0]
+       OP_rr   s16, s0, v8.s[0]
+       OP_ii   s16, s1, v9.s[0]
+       OP_ri   s17, s0, v9.s[0]
+       OP_ir   s17, s1, v8.s[0]
 
-       OP_rr   s20, s0, v8.2s[1]
-       OP_ii   s20, s1, v9.2s[1]
-       OP_ri   s21, s0, v9.2s[1]
-       OP_ir   s21, s1, v8.2s[1]
+       OP_rr   s20, s0, v8.s[1]
+       OP_ii   s20, s1, v9.s[1]
+       OP_ri   s21, s0, v9.s[1]
+       OP_ir   s21, s1, v8.s[1]
 .endm
 
 .macro SAVE1x2
old mode 100755 (executable)
new mode 100644 (file)
index 3131541..ce5cb04
@@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
-       fmls    v17.4s, v0.4s, v9.4s[0]
+       fmls    v17.4s, v0.4s, v9.s[0]
 #else
-       fmul    v17.4s, v0.4s, v9.4s[0]
+       fmul    v17.4s, v0.4s, v9.s[0]
 #endif
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       fmul    v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v9.4s[0]
+       fmul    v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v19.16b, v19.16b, v19.16b
-       fmls    v19.4s, v2.4s, v9.4s[0]
+       fmls    v19.4s, v2.4s, v9.s[0]
 #else
-       fmul    v19.4s, v2.4s, v9.4s[0]
+       fmul    v19.4s, v2.4s, v9.s[0]
 #endif
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
 
-       fmul    v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
-       fmls    v21.4s, v0.4s, v9.4s[1]
+       fmls    v21.4s, v0.4s, v9.s[1]
 #else
-       fmul    v21.4s, v0.4s, v9.4s[1]
+       fmul    v21.4s, v0.4s, v9.s[1]
 #endif
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
-       fmul    v22.4s, v2.4s, v8.4s[1]
-       OP_ii   v22.4s, v3.4s, v9.4s[1]
+       fmul    v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v23.16b, v23.16b, v23.16b
-       fmls    v23.4s, v2.4s, v9.4s[1]
+       fmls    v23.4s, v2.4s, v9.s[1]
 #else
-       fmul    v23.4s, v2.4s, v9.4s[1]
+       fmul    v23.4s, v2.4s, v9.s[1]
 #endif
-       OP_ir   v23.4s, v3.4s, v8.4s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
 
-       fmul    v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
+       fmul    v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
-       fmls    v25.4s, v0.4s, v9.4s[2]
+       fmls    v25.4s, v0.4s, v9.s[2]
 #else
-       fmul    v25.4s, v0.4s, v9.4s[2]
+       fmul    v25.4s, v0.4s, v9.s[2]
 #endif
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
-       fmul    v26.4s, v2.4s, v8.4s[2]
-       OP_ii   v26.4s, v3.4s, v9.4s[2]
+       fmul    v26.4s, v2.4s, v8.s[2]
+       OP_ii   v26.4s, v3.4s, v9.s[2]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v27.16b, v27.16b, v27.16b
-       fmls    v27.4s, v2.4s, v9.4s[2]
+       fmls    v27.4s, v2.4s, v9.s[2]
 #else
-       fmul    v27.4s, v2.4s, v9.4s[2]
+       fmul    v27.4s, v2.4s, v9.s[2]
 #endif
-       OP_ir   v27.4s, v3.4s, v8.4s[2]
+       OP_ir   v27.4s, v3.4s, v8.s[2]
 
-       fmul    v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
+       fmul    v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
-       fmls    v29.4s, v0.4s, v9.4s[3]
+       fmls    v29.4s, v0.4s, v9.s[3]
 #else
-       fmul    v29.4s, v0.4s, v9.4s[3]
+       fmul    v29.4s, v0.4s, v9.s[3]
 #endif
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 
-       fmul    v30.4s, v2.4s, v8.4s[3]
-       OP_ii   v30.4s, v3.4s, v9.4s[3]
+       fmul    v30.4s, v2.4s, v8.s[3]
+       OP_ii   v30.4s, v3.4s, v9.s[3]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v31.16b, v31.16b, v31.16b
-       fmls    v31.4s, v2.4s, v9.4s[3]
+       fmls    v31.4s, v2.4s, v9.s[3]
 #else
-       fmul    v31.4s, v2.4s, v9.4s[3]
+       fmul    v31.4s, v2.4s, v9.s[3]
 #endif
-       OP_ir   v31.4s, v3.4s, v8.4s[3]
+       OP_ir   v31.4s, v3.4s, v8.s[3]
 
        ld2     {v12.4s, v13.4s}, [pB]
        add     pB, pB, #32
@@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M1
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
-
-       OP_rr   v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v9.4s[0]
-       OP_ri   v19.4s, v2.4s, v9.4s[0]
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
-
-       OP_rr   v22.4s, v2.4s, v8.4s[1]
-       OP_ii   v22.4s, v3.4s, v9.4s[1]
-       OP_ri   v23.4s, v2.4s, v9.4s[1]
-       OP_ir   v23.4s, v3.4s, v8.4s[1]
-
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
-
-       OP_rr   v26.4s, v2.4s, v8.4s[2]
-       OP_ii   v26.4s, v3.4s, v9.4s[2]
-       OP_ri   v27.4s, v2.4s, v9.4s[2]
-       OP_ir   v27.4s, v3.4s, v8.4s[2]
-
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
-
-       OP_rr   v30.4s, v2.4s, v8.4s[3]
-       OP_ii   v30.4s, v3.4s, v9.4s[3]
-       OP_ri   v31.4s, v2.4s, v9.4s[3]
-       OP_ir   v31.4s, v3.4s, v8.4s[3]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
+       OP_ri   v19.4s, v2.4s, v9.s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
+       OP_ri   v23.4s, v2.4s, v9.s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
+
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
+
+       OP_rr   v26.4s, v2.4s, v8.s[2]
+       OP_ii   v26.4s, v3.4s, v9.s[2]
+       OP_ri   v27.4s, v2.4s, v9.s[2]
+       OP_ir   v27.4s, v3.4s, v8.s[2]
+
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
+
+       OP_rr   v30.4s, v2.4s, v8.s[3]
+       OP_ii   v30.4s, v3.4s, v9.s[3]
+       OP_ri   v31.4s, v2.4s, v9.s[3]
+       OP_ir   v31.4s, v3.4s, v8.s[3]
 
        ld2     {v12.4s, v13.4s}, [pB]          // For next round
        add     pB, pB, #32
@@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M2
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
-
-       OP_rr   v18.4s, v6.4s, v12.4s[0]
-       OP_ii   v18.4s, v7.4s, v13.4s[0]
-       OP_ri   v19.4s, v6.4s, v13.4s[0]
-       OP_ir   v19.4s, v7.4s, v12.4s[0]
-
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
-
-       OP_rr   v22.4s, v6.4s, v12.4s[1]
-       OP_ii   v22.4s, v7.4s, v13.4s[1]
-       OP_ri   v23.4s, v6.4s, v13.4s[1]
-       OP_ir   v23.4s, v7.4s, v12.4s[1]
-
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
-
-       OP_rr   v26.4s, v6.4s, v12.4s[2]
-       OP_ii   v26.4s, v7.4s, v13.4s[2]
-       OP_ri   v27.4s, v6.4s, v13.4s[2]
-       OP_ir   v27.4s, v7.4s, v12.4s[2]
-
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
-
-       OP_rr   v30.4s, v6.4s, v12.4s[3]
-       OP_ii   v30.4s, v7.4s, v13.4s[3]
-       OP_ri   v31.4s, v6.4s, v13.4s[3]
-       OP_ir   v31.4s, v7.4s, v12.4s[3]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
+
+       OP_rr   v18.4s, v6.4s, v12.s[0]
+       OP_ii   v18.4s, v7.4s, v13.s[0]
+       OP_ri   v19.4s, v6.4s, v13.s[0]
+       OP_ir   v19.4s, v7.4s, v12.s[0]
+
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
+
+       OP_rr   v22.4s, v6.4s, v12.s[1]
+       OP_ii   v22.4s, v7.4s, v13.s[1]
+       OP_ri   v23.4s, v6.4s, v13.s[1]
+       OP_ir   v23.4s, v7.4s, v12.s[1]
+
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
+
+       OP_rr   v26.4s, v6.4s, v12.s[2]
+       OP_ii   v26.4s, v7.4s, v13.s[2]
+       OP_ri   v27.4s, v6.4s, v13.s[2]
+       OP_ir   v27.4s, v7.4s, v12.s[2]
+
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
+
+       OP_rr   v30.4s, v6.4s, v12.s[3]
+       OP_ii   v30.4s, v7.4s, v13.s[3]
+       OP_ri   v31.4s, v6.4s, v13.s[3]
+       OP_ir   v31.4s, v7.4s, v12.s[3]
 
        ld2     {v8.4s, v9.4s}, [pB]
        add     pB, pB, #32
@@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_E
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
-
-       OP_rr   v18.4s, v6.4s, v12.4s[0]
-       OP_ii   v18.4s, v7.4s, v13.4s[0]
-       OP_ri   v19.4s, v6.4s, v13.4s[0]
-       OP_ir   v19.4s, v7.4s, v12.4s[0]
-
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
-
-       OP_rr   v22.4s, v6.4s, v12.4s[1]
-       OP_ii   v22.4s, v7.4s, v13.4s[1]
-       OP_ri   v23.4s, v6.4s, v13.4s[1]
-       OP_ir   v23.4s, v7.4s, v12.4s[1]
-
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
-
-       OP_rr   v26.4s, v6.4s, v12.4s[2]
-       OP_ii   v26.4s, v7.4s, v13.4s[2]
-       OP_ri   v27.4s, v6.4s, v13.4s[2]
-       OP_ir   v27.4s, v7.4s, v12.4s[2]
-
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
-
-       OP_rr   v30.4s, v6.4s, v12.4s[3]
-       OP_ii   v30.4s, v7.4s, v13.4s[3]
-       OP_ri   v31.4s, v6.4s, v13.4s[3]
-       OP_ir   v31.4s, v7.4s, v12.4s[3]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
+
+       OP_rr   v18.4s, v6.4s, v12.s[0]
+       OP_ii   v18.4s, v7.4s, v13.s[0]
+       OP_ri   v19.4s, v6.4s, v13.s[0]
+       OP_ir   v19.4s, v7.4s, v12.s[0]
+
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
+
+       OP_rr   v22.4s, v6.4s, v12.s[1]
+       OP_ii   v22.4s, v7.4s, v13.s[1]
+       OP_ri   v23.4s, v6.4s, v13.s[1]
+       OP_ir   v23.4s, v7.4s, v12.s[1]
+
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
+
+       OP_rr   v26.4s, v6.4s, v12.s[2]
+       OP_ii   v26.4s, v7.4s, v13.s[2]
+       OP_ri   v27.4s, v6.4s, v13.s[2]
+       OP_ir   v27.4s, v7.4s, v12.s[2]
+
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
+
+       OP_rr   v30.4s, v6.4s, v12.s[3]
+       OP_ii   v30.4s, v7.4s, v13.s[3]
+       OP_ri   v31.4s, v6.4s, v13.s[3]
+       OP_ir   v31.4s, v7.4s, v12.s[3]
 
 .endm
 
@@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
-
-       OP_rr   v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v9.4s[0]
-       OP_ri   v19.4s, v2.4s, v9.4s[0]
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
-
-       OP_rr   v22.4s, v2.4s, v8.4s[1]
-       OP_ii   v22.4s, v3.4s, v9.4s[1]
-       OP_ri   v23.4s, v2.4s, v9.4s[1]
-       OP_ir   v23.4s, v3.4s, v8.4s[1]
-
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
-
-       OP_rr   v26.4s, v2.4s, v8.4s[2]
-       OP_ii   v26.4s, v3.4s, v9.4s[2]
-       OP_ri   v27.4s, v2.4s, v9.4s[2]
-       OP_ir   v27.4s, v3.4s, v8.4s[2]
-
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
-
-       OP_rr   v30.4s, v2.4s, v8.4s[3]
-       OP_ii   v30.4s, v3.4s, v9.4s[3]
-       OP_ri   v31.4s, v2.4s, v9.4s[3]
-       OP_ir   v31.4s, v3.4s, v8.4s[3]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
+       OP_ri   v19.4s, v2.4s, v9.s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
+       OP_ri   v23.4s, v2.4s, v9.s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
+
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
+
+       OP_rr   v26.4s, v2.4s, v8.s[2]
+       OP_ii   v26.4s, v3.4s, v9.s[2]
+       OP_ri   v27.4s, v2.4s, v9.s[2]
+       OP_ir   v27.4s, v3.4s, v8.s[2]
+
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
+
+       OP_rr   v30.4s, v2.4s, v8.s[3]
+       OP_ii   v30.4s, v3.4s, v9.s[3]
+       OP_ri   v31.4s, v2.4s, v9.s[3]
+       OP_ir   v31.4s, v3.4s, v8.s[3]
 
 .endm
 
@@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
-       fmls    v17.4s, v0.4s, v9.4s[0]
+       fmls    v17.4s, v0.4s, v9.s[0]
 #else
-       fmul    v17.4s, v0.4s, v9.4s[0]
+       fmul    v17.4s, v0.4s, v9.s[0]
 #endif
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       fmul    v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
-       fmls    v21.4s, v0.4s, v9.4s[1]
+       fmls    v21.4s, v0.4s, v9.s[1]
 #else
-       fmul    v21.4s, v0.4s, v9.4s[1]
+       fmul    v21.4s, v0.4s, v9.s[1]
 #endif
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
-       fmul    v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
+       fmul    v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
-       fmls    v25.4s, v0.4s, v9.4s[2]
+       fmls    v25.4s, v0.4s, v9.s[2]
 #else
-       fmul    v25.4s, v0.4s, v9.4s[2]
+       fmul    v25.4s, v0.4s, v9.s[2]
 #endif
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
-       fmul    v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
+       fmul    v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
-       fmls    v29.4s, v0.4s, v9.4s[3]
+       fmls    v29.4s, v0.4s, v9.s[3]
 #else
-       fmul    v29.4s, v0.4s, v9.4s[3]
+       fmul    v29.4s, v0.4s, v9.s[3]
 #endif
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 
        ld2     {v12.4s, v13.4s}, [pB]
        add     pB, pB, #32
@@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
        ld2     {v12.4s, v13.4s}, [pB]          // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 
        ld2     {v4.4s, v5.4s}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 .endm
 
 .macro KERNEL4x4_M2
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
 
        ld2     {v8.4s, v9.4s}, [pB]            // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
 
        ld2     {v0.4s, v1.4s}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
 .endm
 
 .macro KERNEL4x4_E
-       OP_rr   v16.4s, v4.4s, v12.4s[0]
-       OP_ii   v16.4s, v5.4s, v13.4s[0]
-       OP_ri   v17.4s, v4.4s, v13.4s[0]
-       OP_ir   v17.4s, v5.4s, v12.4s[0]
-
-       OP_rr   v20.4s, v4.4s, v12.4s[1]
-       OP_ii   v20.4s, v5.4s, v13.4s[1]
-       OP_ri   v21.4s, v4.4s, v13.4s[1]
-       OP_ir   v21.4s, v5.4s, v12.4s[1]
-
-       OP_rr   v24.4s, v4.4s, v12.4s[2]
-       OP_ii   v24.4s, v5.4s, v13.4s[2]
-       OP_ri   v25.4s, v4.4s, v13.4s[2]
-       OP_ir   v25.4s, v5.4s, v12.4s[2]
-
-       OP_rr   v28.4s, v4.4s, v12.4s[3]
-       OP_ii   v28.4s, v5.4s, v13.4s[3]
-       OP_ri   v29.4s, v4.4s, v13.4s[3]
-       OP_ir   v29.4s, v5.4s, v12.4s[3]
+       OP_rr   v16.4s, v4.4s, v12.s[0]
+       OP_ii   v16.4s, v5.4s, v13.s[0]
+       OP_ri   v17.4s, v4.4s, v13.s[0]
+       OP_ir   v17.4s, v5.4s, v12.s[0]
+
+       OP_rr   v20.4s, v4.4s, v12.s[1]
+       OP_ii   v20.4s, v5.4s, v13.s[1]
+       OP_ri   v21.4s, v4.4s, v13.s[1]
+       OP_ir   v21.4s, v5.4s, v12.s[1]
+
+       OP_rr   v24.4s, v4.4s, v12.s[2]
+       OP_ii   v24.4s, v5.4s, v13.s[2]
+       OP_ri   v25.4s, v4.4s, v13.s[2]
+       OP_ir   v25.4s, v5.4s, v12.s[2]
+
+       OP_rr   v28.4s, v4.4s, v12.s[3]
+       OP_ii   v28.4s, v5.4s, v13.s[3]
+       OP_ri   v29.4s, v4.4s, v13.s[3]
+       OP_ir   v29.4s, v5.4s, v12.s[3]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v9.4s[0]
-       OP_ri   v17.4s, v0.4s, v9.4s[0]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.4s[1]
-       OP_ii   v20.4s, v1.4s, v9.4s[1]
-       OP_ri   v21.4s, v0.4s, v9.4s[1]
-       OP_ir   v21.4s, v1.4s, v8.4s[1]
-
-       OP_rr   v24.4s, v0.4s, v8.4s[2]
-       OP_ii   v24.4s, v1.4s, v9.4s[2]
-       OP_ri   v25.4s, v0.4s, v9.4s[2]
-       OP_ir   v25.4s, v1.4s, v8.4s[2]
-
-       OP_rr   v28.4s, v0.4s, v8.4s[3]
-       OP_ii   v28.4s, v1.4s, v9.4s[3]
-       OP_ri   v29.4s, v0.4s, v9.4s[3]
-       OP_ir   v29.4s, v1.4s, v8.4s[3]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v24.4s, v0.4s, v8.s[2]
+       OP_ii   v24.4s, v1.4s, v9.s[2]
+       OP_ri   v25.4s, v0.4s, v9.s[2]
+       OP_ir   v25.4s, v1.4s, v8.s[2]
+
+       OP_rr   v28.4s, v0.4s, v8.s[3]
+       OP_ii   v28.4s, v1.4s, v9.s[3]
+       OP_ri   v29.4s, v0.4s, v9.s[3]
+       OP_ir   v29.4s, v1.4s, v8.s[3]
 .endm
 
 .macro SAVE4x4
@@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       OP_rr   v16.2s, v0.2s, v8.4s[0]
-       OP_ii   v16.2s, v1.2s, v9.4s[0]
-       OP_ri   v17.2s, v0.2s, v9.4s[0]
-       OP_ir   v17.2s, v1.2s, v8.4s[0]
-
-       OP_rr   v20.2s, v0.2s, v8.4s[1]
-       OP_ii   v20.2s, v1.2s, v9.4s[1]
-       OP_ri   v21.2s, v0.2s, v9.4s[1]
-       OP_ir   v21.2s, v1.2s, v8.4s[1]
-
-       OP_rr   v24.2s, v0.2s, v8.4s[2]
-       OP_ii   v24.2s, v1.2s, v9.4s[2]
-       OP_ri   v25.2s, v0.2s, v9.4s[2]
-       OP_ir   v25.2s, v1.2s, v8.4s[2]
-
-       OP_rr   v28.2s, v0.2s, v8.4s[3]
-       OP_ii   v28.2s, v1.2s, v9.4s[3]
-       OP_ri   v29.2s, v0.2s, v9.4s[3]
-       OP_ir   v29.2s, v1.2s, v8.4s[3]
+       OP_rr   v16.2s, v0.2s, v8.s[0]
+       OP_ii   v16.2s, v1.2s, v9.s[0]
+       OP_ri   v17.2s, v0.2s, v9.s[0]
+       OP_ir   v17.2s, v1.2s, v8.s[0]
+
+       OP_rr   v20.2s, v0.2s, v8.s[1]
+       OP_ii   v20.2s, v1.2s, v9.s[1]
+       OP_ri   v21.2s, v0.2s, v9.s[1]
+       OP_ir   v21.2s, v1.2s, v8.s[1]
+
+       OP_rr   v24.2s, v0.2s, v8.s[2]
+       OP_ii   v24.2s, v1.2s, v9.s[2]
+       OP_ri   v25.2s, v0.2s, v9.s[2]
+       OP_ir   v25.2s, v1.2s, v8.s[2]
+
+       OP_rr   v28.2s, v0.2s, v8.s[3]
+       OP_ii   v28.2s, v1.2s, v9.s[3]
+       OP_ri   v29.2s, v0.2s, v9.s[3]
+       OP_ir   v29.2s, v1.2s, v8.s[3]
 .endm
 
 .macro SAVE2x4
@@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.s, v1.s}[0], [pA]
        add     pA, pA, #8
 
-       OP_rr   s16, s0, v8.4s[0]
-       OP_ii   s16, s1, v9.4s[0]
-       OP_ri   s17, s0, v9.4s[0]
-       OP_ir   s17, s1, v8.4s[0]
-
-       OP_rr   s20, s0, v8.4s[1]
-       OP_ii   s20, s1, v9.4s[1]
-       OP_ri   s21, s0, v9.4s[1]
-       OP_ir   s21, s1, v8.4s[1]
-
-       OP_rr   s24, s0, v8.4s[2]
-       OP_ii   s24, s1, v9.4s[2]
-       OP_ri   s25, s0, v9.4s[2]
-       OP_ir   s25, s1, v8.4s[2]
-
-       OP_rr   s28, s0, v8.4s[3]
-       OP_ii   s28, s1, v9.4s[3]
-       OP_ri   s29, s0, v9.4s[3]
-       OP_ir   s29, s1, v8.4s[3]
+       OP_rr   s16, s0, v8.s[0]
+       OP_ii   s16, s1, v9.s[0]
+       OP_ri   s17, s0, v9.s[0]
+       OP_ir   s17, s1, v8.s[0]
+
+       OP_rr   s20, s0, v8.s[1]
+       OP_ii   s20, s1, v9.s[1]
+       OP_ri   s21, s0, v9.s[1]
+       OP_ir   s21, s1, v8.s[1]
+
+       OP_rr   s24, s0, v8.s[2]
+       OP_ii   s24, s1, v9.s[2]
+       OP_ri   s25, s0, v9.s[2]
+       OP_ir   s25, s1, v8.s[2]
+
+       OP_rr   s28, s0, v8.s[3]
+       OP_ii   s28, s1, v9.s[3]
+       OP_ri   s29, s0, v9.s[3]
+       OP_ir   s29, s1, v8.s[3]
 .endm
 
 .macro SAVE1x4
@@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.2s[0]
-       OP_ii   v16.4s, v1.4s, v9.2s[0]
-       OP_ri   v17.4s, v0.4s, v9.2s[0]
-       OP_ir   v17.4s, v1.4s, v8.2s[0]
-
-       OP_rr   v18.4s, v2.4s, v8.2s[0]
-       OP_ii   v18.4s, v3.4s, v9.2s[0]
-       OP_ri   v19.4s, v2.4s, v9.2s[0]
-       OP_ir   v19.4s, v3.4s, v8.2s[0]
-
-       OP_rr   v20.4s, v0.4s, v8.2s[1]
-       OP_ii   v20.4s, v1.4s, v9.2s[1]
-       OP_ri   v21.4s, v0.4s, v9.2s[1]
-       OP_ir   v21.4s, v1.4s, v8.2s[1]
-
-       OP_rr   v22.4s, v2.4s, v8.2s[1]
-       OP_ii   v22.4s, v3.4s, v9.2s[1]
-       OP_ri   v23.4s, v2.4s, v9.2s[1]
-       OP_ir   v23.4s, v3.4s, v8.2s[1]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
+
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v9.s[0]
+       OP_ri   v19.4s, v2.4s, v9.s[0]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
+
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
+
+       OP_rr   v22.4s, v2.4s, v8.s[1]
+       OP_ii   v22.4s, v3.4s, v9.s[1]
+       OP_ri   v23.4s, v2.4s, v9.s[1]
+       OP_ir   v23.4s, v3.4s, v8.s[1]
 .endm
 
 .macro SAVE8x2
@@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.4s, v1.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.2s[0]
-       OP_ii   v16.4s, v1.4s, v9.2s[0]
-       OP_ri   v17.4s, v0.4s, v9.2s[0]
-       OP_ir   v17.4s, v1.4s, v8.2s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v9.s[0]
+       OP_ri   v17.4s, v0.4s, v9.s[0]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       OP_rr   v20.4s, v0.4s, v8.2s[1]
-       OP_ii   v20.4s, v1.4s, v9.2s[1]
-       OP_ri   v21.4s, v0.4s, v9.2s[1]
-       OP_ir   v21.4s, v1.4s, v8.2s[1]
+       OP_rr   v20.4s, v0.4s, v8.s[1]
+       OP_ii   v20.4s, v1.4s, v9.s[1]
+       OP_ri   v21.4s, v0.4s, v9.s[1]
+       OP_ir   v21.4s, v1.4s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       OP_rr   v16.2s, v0.2s, v8.2s[0]
-       OP_ii   v16.2s, v1.2s, v9.2s[0]
-       OP_ri   v17.2s, v0.2s, v9.2s[0]
-       OP_ir   v17.2s, v1.2s, v8.2s[0]
+       OP_rr   v16.2s, v0.2s, v8.s[0]
+       OP_ii   v16.2s, v1.2s, v9.s[0]
+       OP_ri   v17.2s, v0.2s, v9.s[0]
+       OP_ir   v17.2s, v1.2s, v8.s[0]
 
-       OP_rr   v20.2s, v0.2s, v8.2s[1]
-       OP_ii   v20.2s, v1.2s, v9.2s[1]
-       OP_ri   v21.2s, v0.2s, v9.2s[1]
-       OP_ir   v21.2s, v1.2s, v8.2s[1]
+       OP_rr   v20.2s, v0.2s, v8.s[1]
+       OP_ii   v20.2s, v1.2s, v9.s[1]
+       OP_ri   v21.2s, v0.2s, v9.s[1]
+       OP_ir   v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.s, v1.s}[0], [pA]
        add     pA, pA, #8
 
-       OP_rr   s16, s0, v8.2s[0]
-       OP_ii   s16, s1, v9.2s[0]
-       OP_ri   s17, s0, v9.2s[0]
-       OP_ir   s17, s1, v8.2s[0]
+       OP_rr   s16, s0, v8.s[0]
+       OP_ii   s16, s1, v9.s[0]
+       OP_ri   s17, s0, v9.s[0]
+       OP_ir   s17, s1, v8.s[0]
 
-       OP_rr   s20, s0, v8.2s[1]
-       OP_ii   s20, s1, v9.2s[1]
-       OP_ri   s21, s0, v9.2s[1]
-       OP_ir   s21, s1, v8.2s[1]
+       OP_rr   s20, s0, v8.s[1]
+       OP_ii   s20, s1, v9.s[1]
+       OP_ri   s21, s0, v9.s[1]
+       OP_ir   s21, s1, v8.s[1]
 .endm
 
 .macro SAVE1x2
@@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.4s, v3.4s}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.4s, v0.4s, v8.4s[0]
-       OP_ii   v16.4s, v1.4s, v8.4s[1]
-       OP_ri   v17.4s, v0.4s, v8.4s[1]
-       OP_ir   v17.4s, v1.4s, v8.4s[0]
+       OP_rr   v16.4s, v0.4s, v8.s[0]
+       OP_ii   v16.4s, v1.4s, v8.s[1]
+       OP_ri   v17.4s, v0.4s, v8.s[1]
+       OP_ir   v17.4s, v1.4s, v8.s[0]
 
-       OP_rr   v18.4s, v2.4s, v8.4s[0]
-       OP_ii   v18.4s, v3.4s, v8.4s[1]
-       OP_ri   v19.4s, v2.4s, v8.4s[1]
-       OP_ir   v19.4s, v3.4s, v8.4s[0]
+       OP_rr   v18.4s, v2.4s, v8.s[0]
+       OP_ii   v18.4s, v3.4s, v8.s[1]
+       OP_ri   v19.4s, v2.4s, v8.s[1]
+       OP_ir   v19.4s, v3.4s, v8.s[0]
 .endm
 
 .macro SAVE8x1
index e2ad114..44b0f7f 100644 (file)
@@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldp     q0, q1, [pA]
        add     pA, pA, #32
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       fmul    v29.2d, v1.2d, v11.2d[0]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       fmul    v29.2d, v1.2d, v11.d[0]
 
        ldp     q2, q3, [ppA]
        add     ppA, ppA, #32
 
-       fmul    v20.2d, v0.2d, v9.2d[0]
-       fmul    v25.2d, v1.2d, v10.2d[0]
+       fmul    v20.2d, v0.2d, v9.d[0]
+       fmul    v25.2d, v1.2d, v10.d[0]
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
-       fmul    v18.2d, v2.2d, v8.2d[0]
-       fmul    v31.2d, v3.2d, v11.2d[0]
+       fmul    v18.2d, v2.2d, v8.d[0]
+       fmul    v31.2d, v3.2d, v11.d[0]
 
        prfm    PLDL1KEEP, [ppA, #A_PRE_SIZE]
 
-       fmul    v22.2d, v2.2d, v9.2d[0]
-       fmul    v27.2d, v3.2d, v10.2d[0]
+       fmul    v22.2d, v2.2d, v9.d[0]
+       fmul    v27.2d, v3.2d, v10.d[0]
 
        ldp     d12, d13, [pB]
        add     pB, pB, #16
 
-       fmul    v24.2d, v0.2d, v10.2d[0]
-       fmul    v21.2d, v1.2d, v9.2d[0]
+       fmul    v24.2d, v0.2d, v10.d[0]
+       fmul    v21.2d, v1.2d, v9.d[0]
 
        ldp     q4, q5, [pA]            // for next round
        add     pA, pA, #32
 
-       fmul    v26.2d, v2.2d, v10.2d[0]
-       fmul    v23.2d, v3.2d, v9.2d[0]
+       fmul    v26.2d, v2.2d, v10.d[0]
+       fmul    v23.2d, v3.2d, v9.d[0]
 
        ldp     q6, q7, [ppA]           // for next round
        add     ppA, ppA, #32
 
-       fmul    v28.2d, v0.2d, v11.2d[0]
-       fmul    v17.2d, v1.2d, v8.2d[0]
+       fmul    v28.2d, v0.2d, v11.d[0]
+       fmul    v17.2d, v1.2d, v8.d[0]
 
        ldp     d14, d15, [pB]
        add     pB, pB, #16
 
-       fmul    v30.2d, v2.2d, v11.2d[0]
-       fmul    v19.2d, v3.2d, v8.2d[0]
+       fmul    v30.2d, v2.2d, v11.d[0]
+       fmul    v19.2d, v3.2d, v8.d[0]
 .endm
 
 .macro KERNEL8x4_M2
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v29.2d, v5.2d, v15.2d[0]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v29.2d, v5.2d, v15.d[0]
 
        ldp     d8, d9, [pB]
        add     pB, pB, #16
 
-       fmla    v18.2d, v6.2d, v12.2d[0]
-       fmla    v31.2d, v7.2d, v15.2d[0]
+       fmla    v18.2d, v6.2d, v12.d[0]
+       fmla    v31.2d, v7.2d, v15.d[0]
 
        ldp     d10, d11, [pB]
        add     pB, pB, #16
 
-       fmla    v20.2d, v4.2d, v13.2d[0]
-       fmla    v25.2d, v5.2d, v14.2d[0]
+       fmla    v20.2d, v4.2d, v13.d[0]
+       fmla    v25.2d, v5.2d, v14.d[0]
 
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
 
-       fmla    v22.2d, v6.2d, v13.2d[0]
-       fmla    v27.2d, v7.2d, v14.2d[0]
-       fmla    v24.2d, v4.2d, v14.2d[0]
-       fmla    v21.2d, v5.2d, v13.2d[0]
+       fmla    v22.2d, v6.2d, v13.d[0]
+       fmla    v27.2d, v7.2d, v14.d[0]
+       fmla    v24.2d, v4.2d, v14.d[0]
+       fmla    v21.2d, v5.2d, v13.d[0]
 
        ldp     q0, q1, [pA]
        add     pA, pA, #32
 
-       fmla    v26.2d, v6.2d, v14.2d[0]
-       fmla    v23.2d, v7.2d, v13.2d[0]
-       fmla    v28.2d, v4.2d, v15.2d[0]
-       fmla    v17.2d, v5.2d, v12.2d[0]
+       fmla    v26.2d, v6.2d, v14.d[0]
+       fmla    v23.2d, v7.2d, v13.d[0]
+       fmla    v28.2d, v4.2d, v15.d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
 
        ldp     q2, q3, [ppA]
        add     ppA, ppA, #32
 
-       fmla    v30.2d, v6.2d, v15.2d[0]
-       fmla    v19.2d, v7.2d, v12.2d[0]
+       fmla    v30.2d, v6.2d, v15.d[0]
+       fmla    v19.2d, v7.2d, v12.d[0]
 .endm
 
 .macro KERNEL8x4_M1
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v11.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v11.d[0]
 
        ldp     d12, d13, [pB]
        add     pB, pB, #16
 
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v31.2d, v3.2d, v11.2d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v31.2d, v3.2d, v11.d[0]
 
        ldp     d14, d15, [pB]
        add     pB, pB, #16
 
-       fmla    v20.2d, v0.2d, v9.2d[0]
-       fmla    v25.2d, v1.2d, v10.2d[0]
+       fmla    v20.2d, v0.2d, v9.d[0]
+       fmla    v25.2d, v1.2d, v10.d[0]
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
-       fmla    v22.2d, v2.2d, v9.2d[0]
-       fmla    v27.2d, v3.2d, v10.2d[0]
+       fmla    v22.2d, v2.2d, v9.d[0]
+       fmla    v27.2d, v3.2d, v10.d[0]
 
        prfm    PLDL1KEEP, [ppA, #A_PRE_SIZE]
 
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v21.2d, v1.2d, v9.2d[0]
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v21.2d, v1.2d, v9.d[0]
 
        ldp     q4, q5, [pA]
        add     pA, pA, #32
 
-       fmla    v26.2d, v2.2d, v10.2d[0]
-       fmla    v23.2d, v3.2d, v9.2d[0]
+       fmla    v26.2d, v2.2d, v10.d[0]
+       fmla    v23.2d, v3.2d, v9.d[0]
 
-       fmla    v28.2d, v0.2d, v11.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v11.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
 
        ldp     q6, q7, [ppA]
        add     ppA, ppA, #32
 
-       fmla    v30.2d, v2.2d, v11.2d[0]
-       fmla    v19.2d, v3.2d, v8.2d[0]
+       fmla    v30.2d, v2.2d, v11.d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
 .endm
 
 .macro KERNEL8x4_E
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v25.2d, v5.2d, v14.2d[0]
-       fmla    v18.2d, v6.2d, v12.2d[0]
-       fmla    v27.2d, v7.2d, v14.2d[0]
-
-       fmla    v20.2d, v4.2d, v13.2d[0]
-       fmla    v29.2d, v5.2d, v15.2d[0]
-       fmla    v22.2d, v6.2d, v13.2d[0]
-       fmla    v31.2d, v7.2d, v15.2d[0]
-
-       fmla    v24.2d, v4.2d, v14.2d[0]
-       fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v26.2d, v6.2d, v14.2d[0]
-       fmla    v19.2d, v7.2d, v12.2d[0]
-
-       fmla    v28.2d, v4.2d, v15.2d[0]
-       fmla    v21.2d, v5.2d, v13.2d[0]
-       fmla    v30.2d, v6.2d, v15.2d[0]
-       fmla    v23.2d, v7.2d, v13.2d[0]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v25.2d, v5.2d, v14.d[0]
+       fmla    v18.2d, v6.2d, v12.d[0]
+       fmla    v27.2d, v7.2d, v14.d[0]
+
+       fmla    v20.2d, v4.2d, v13.d[0]
+       fmla    v29.2d, v5.2d, v15.d[0]
+       fmla    v22.2d, v6.2d, v13.d[0]
+       fmla    v31.2d, v7.2d, v15.d[0]
+
+       fmla    v24.2d, v4.2d, v14.d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
+       fmla    v26.2d, v6.2d, v14.d[0]
+       fmla    v19.2d, v7.2d, v12.d[0]
+
+       fmla    v28.2d, v4.2d, v15.d[0]
+       fmla    v21.2d, v5.2d, v13.d[0]
+       fmla    v30.2d, v6.2d, v15.d[0]
+       fmla    v23.2d, v7.2d, v13.d[0]
 .endm
 
 .macro KERNEL8x4_SUB
@@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldp     q0, q1, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v11.2d[0]
-       fmla    v20.2d, v0.2d, v9.2d[0]
-       fmla    v25.2d, v1.2d, v10.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v11.d[0]
+       fmla    v20.2d, v0.2d, v9.d[0]
+       fmla    v25.2d, v1.2d, v10.d[0]
 
        ldp     q2, q3, [ppA]
        add     ppA, ppA, #32
 
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v21.2d, v1.2d, v9.2d[0]
-       fmla    v28.2d, v0.2d, v11.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v21.2d, v1.2d, v9.d[0]
+       fmla    v28.2d, v0.2d, v11.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
 
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v31.2d, v3.2d, v11.2d[0]
-       fmla    v22.2d, v2.2d, v9.2d[0]
-       fmla    v27.2d, v3.2d, v10.2d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v31.2d, v3.2d, v11.d[0]
+       fmla    v22.2d, v2.2d, v9.d[0]
+       fmla    v27.2d, v3.2d, v10.d[0]
 
-       fmla    v26.2d, v2.2d, v10.2d[0]
-       fmla    v23.2d, v3.2d, v9.2d[0]
-       fmla    v30.2d, v2.2d, v11.2d[0]
-       fmla    v19.2d, v3.2d, v8.2d[0]
+       fmla    v26.2d, v2.2d, v10.d[0]
+       fmla    v23.2d, v3.2d, v9.d[0]
+       fmla    v30.2d, v2.2d, v11.d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
 .endm
 
 .macro SAVE8x4
@@ -422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v9.d[1]
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v25.2d, v1.2d, v9.d[0]
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v8.d[1]
 
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x4
@@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v28.2d, v0.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
 .endm
 
 .macro SAVE2x4
@@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
 .endm
 
 .macro SAVE4x2
@@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
 .endm
 
 .macro SAVE2x2
@@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     d0 , [pA]
        add     pA, pA, #8
 
-       fmla    v16.2d, v8.2d, v0.2d[0]
+       fmla    v16.2d, v8.2d, v0.d[0]
 .endm
 
 .macro SAVE1x2
@@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA , pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x1
@@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
 .endm
 
 .macro SAVE2x1
old mode 100755 (executable)
new mode 100644 (file)
index 88e9a77..b04dbb5
@@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v10.2d, v11.2d}, [pB]
        add     pB, pB, #32
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       fmul    v17.2d, v1.2d, v8.2d[0]
-       fmul    v18.2d, v0.2d, v8.2d[1]
-       fmul    v19.2d, v1.2d, v8.2d[1]
-
-       fmul    v20.2d, v0.2d, v9.2d[0]
-       fmul    v21.2d, v1.2d, v9.2d[0]
-       fmul    v22.2d, v0.2d, v9.2d[1]
-       fmul    v23.2d, v1.2d, v9.2d[1]
-
-       fmul    v24.2d, v0.2d, v10.2d[0]
-       fmul    v25.2d, v1.2d, v10.2d[0]
-       fmul    v26.2d, v0.2d, v10.2d[1]
-       fmul    v27.2d, v1.2d, v10.2d[1]
-
-       fmul    v28.2d, v0.2d, v11.2d[0]
-       fmul    v29.2d, v1.2d, v11.2d[0]
-       fmul    v30.2d, v0.2d, v11.2d[1]
-       fmul    v31.2d, v1.2d, v11.2d[1]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       fmul    v17.2d, v1.2d, v8.d[0]
+       fmul    v18.2d, v0.2d, v8.d[1]
+       fmul    v19.2d, v1.2d, v8.d[1]
+
+       fmul    v20.2d, v0.2d, v9.d[0]
+       fmul    v21.2d, v1.2d, v9.d[0]
+       fmul    v22.2d, v0.2d, v9.d[1]
+       fmul    v23.2d, v1.2d, v9.d[1]
+
+       fmul    v24.2d, v0.2d, v10.d[0]
+       fmul    v25.2d, v1.2d, v10.d[0]
+       fmul    v26.2d, v0.2d, v10.d[1]
+       fmul    v27.2d, v1.2d, v10.d[1]
+
+       fmul    v28.2d, v0.2d, v11.d[0]
+       fmul    v29.2d, v1.2d, v11.d[0]
+       fmul    v30.2d, v0.2d, v11.d[1]
+       fmul    v31.2d, v1.2d, v11.d[1]
 
        ld1     {v12.2d, v13.2d}, [pB]
        add     pB, pB, #32
@@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_M1
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v0.2d, v8.2d[1]
-       fmla    v19.2d, v1.2d, v8.2d[1]
-
-       fmla    v20.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v9.2d[0]
-       fmla    v22.2d, v0.2d, v9.2d[1]
-       fmla    v23.2d, v1.2d, v9.2d[1]
-
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v25.2d, v1.2d, v10.2d[0]
-       fmla    v26.2d, v0.2d, v10.2d[1]
-       fmla    v27.2d, v1.2d, v10.2d[1]
-
-       fmla    v28.2d, v0.2d, v11.2d[0]
-       fmla    v29.2d, v1.2d, v11.2d[0]
-       fmla    v30.2d, v0.2d, v11.2d[1]
-       fmla    v31.2d, v1.2d, v11.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v0.2d, v8.d[1]
+       fmla    v19.2d, v1.2d, v8.d[1]
+
+       fmla    v20.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v9.d[0]
+       fmla    v22.2d, v0.2d, v9.d[1]
+       fmla    v23.2d, v1.2d, v9.d[1]
+
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v25.2d, v1.2d, v10.d[0]
+       fmla    v26.2d, v0.2d, v10.d[1]
+       fmla    v27.2d, v1.2d, v10.d[1]
+
+       fmla    v28.2d, v0.2d, v11.d[0]
+       fmla    v29.2d, v1.2d, v11.d[0]
+       fmla    v30.2d, v0.2d, v11.d[1]
+       fmla    v31.2d, v1.2d, v11.d[1]
 
        ld1     {v12.2d, v13.2d}, [pB]          // For next round
        add     pB, pB, #32
@@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_M2
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v18.2d, v4.2d, v12.2d[1]
-       fmla    v19.2d, v5.2d, v12.2d[1]
-
-       fmla    v20.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v13.2d[0]
-       fmla    v22.2d, v4.2d, v13.2d[1]
-       fmla    v23.2d, v5.2d, v13.2d[1]
-
-       fmla    v24.2d, v4.2d, v14.2d[0]
-       fmla    v25.2d, v5.2d, v14.2d[0]
-       fmla    v26.2d, v4.2d, v14.2d[1]
-       fmla    v27.2d, v5.2d, v14.2d[1]
-
-       fmla    v28.2d, v4.2d, v15.2d[0]
-       fmla    v29.2d, v5.2d, v15.2d[0]
-       fmla    v30.2d, v4.2d, v15.2d[1]
-       fmla    v31.2d, v5.2d, v15.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
+       fmla    v18.2d, v4.2d, v12.d[1]
+       fmla    v19.2d, v5.2d, v12.d[1]
+
+       fmla    v20.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v13.d[0]
+       fmla    v22.2d, v4.2d, v13.d[1]
+       fmla    v23.2d, v5.2d, v13.d[1]
+
+       fmla    v24.2d, v4.2d, v14.d[0]
+       fmla    v25.2d, v5.2d, v14.d[0]
+       fmla    v26.2d, v4.2d, v14.d[1]
+       fmla    v27.2d, v5.2d, v14.d[1]
+
+       fmla    v28.2d, v4.2d, v15.d[0]
+       fmla    v29.2d, v5.2d, v15.d[0]
+       fmla    v30.2d, v4.2d, v15.d[1]
+       fmla    v31.2d, v5.2d, v15.d[1]
 
        ld1     {v8.2d, v9.2d}, [pB]            // For next round
        add     pB, pB, #32
@@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_E
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v18.2d, v4.2d, v12.2d[1]
-       fmla    v19.2d, v5.2d, v12.2d[1]
-
-       fmla    v20.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v13.2d[0]
-       fmla    v22.2d, v4.2d, v13.2d[1]
-       fmla    v23.2d, v5.2d, v13.2d[1]
-
-       fmla    v24.2d, v4.2d, v14.2d[0]
-       fmla    v25.2d, v5.2d, v14.2d[0]
-       fmla    v26.2d, v4.2d, v14.2d[1]
-       fmla    v27.2d, v5.2d, v14.2d[1]
-
-       fmla    v28.2d, v4.2d, v15.2d[0]
-       fmla    v29.2d, v5.2d, v15.2d[0]
-       fmla    v30.2d, v4.2d, v15.2d[1]
-       fmla    v31.2d, v5.2d, v15.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
+       fmla    v18.2d, v4.2d, v12.d[1]
+       fmla    v19.2d, v5.2d, v12.d[1]
+
+       fmla    v20.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v13.d[0]
+       fmla    v22.2d, v4.2d, v13.d[1]
+       fmla    v23.2d, v5.2d, v13.d[1]
+
+       fmla    v24.2d, v4.2d, v14.d[0]
+       fmla    v25.2d, v5.2d, v14.d[0]
+       fmla    v26.2d, v4.2d, v14.d[1]
+       fmla    v27.2d, v5.2d, v14.d[1]
+
+       fmla    v28.2d, v4.2d, v15.d[0]
+       fmla    v29.2d, v5.2d, v15.d[0]
+       fmla    v30.2d, v4.2d, v15.d[1]
+       fmla    v31.2d, v5.2d, v15.d[1]
 .endm
 
 .macro KERNEL4x8_SUB
@@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v10.2d, v11.2d}, [pB]
        add     pB, pB, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v0.2d, v8.2d[1]
-       fmla    v19.2d, v1.2d, v8.2d[1]
-
-       fmla    v20.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v9.2d[0]
-       fmla    v22.2d, v0.2d, v9.2d[1]
-       fmla    v23.2d, v1.2d, v9.2d[1]
-
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v25.2d, v1.2d, v10.2d[0]
-       fmla    v26.2d, v0.2d, v10.2d[1]
-       fmla    v27.2d, v1.2d, v10.2d[1]
-
-       fmla    v28.2d, v0.2d, v11.2d[0]
-       fmla    v29.2d, v1.2d, v11.2d[0]
-       fmla    v30.2d, v0.2d, v11.2d[1]
-       fmla    v31.2d, v1.2d, v11.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v0.2d, v8.d[1]
+       fmla    v19.2d, v1.2d, v8.d[1]
+
+       fmla    v20.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v9.d[0]
+       fmla    v22.2d, v0.2d, v9.d[1]
+       fmla    v23.2d, v1.2d, v9.d[1]
+
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v25.2d, v1.2d, v10.d[0]
+       fmla    v26.2d, v0.2d, v10.d[1]
+       fmla    v27.2d, v1.2d, v10.d[1]
+
+       fmla    v28.2d, v0.2d, v11.d[0]
+       fmla    v29.2d, v1.2d, v11.d[0]
+       fmla    v30.2d, v0.2d, v11.d[1]
+       fmla    v31.2d, v1.2d, v11.d[1]
 .endm
 
 .macro SAVE4x8
@@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v10.2d, v11.2d}, [pB]
        add     pB, pB, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v18.2d, v0.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v18.2d, v0.2d, v8.d[1]
 
-       fmla    v20.2d, v0.2d, v9.2d[0]
-       fmla    v22.2d, v0.2d, v9.2d[1]
+       fmla    v20.2d, v0.2d, v9.d[0]
+       fmla    v22.2d, v0.2d, v9.d[1]
 
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v26.2d, v0.2d, v10.2d[1]
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v26.2d, v0.2d, v10.d[1]
 
-       fmla    v28.2d, v0.2d, v11.2d[0]
-       fmla    v30.2d, v0.2d, v11.2d[1]
+       fmla    v28.2d, v0.2d, v11.d[0]
+       fmla    v30.2d, v0.2d, v11.d[1]
 .endm
 
 .macro SAVE2x8
@@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       fmul    v29.2d, v1.2d, v9.2d[1]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       fmul    v29.2d, v1.2d, v9.d[1]
 
-       fmul    v20.2d, v0.2d, v8.2d[1]
-       fmul    v25.2d, v1.2d, v9.2d[0]
+       fmul    v20.2d, v0.2d, v8.d[1]
+       fmul    v25.2d, v1.2d, v9.d[0]
 
-       fmul    v24.2d, v0.2d, v9.2d[0]
-       fmul    v21.2d, v1.2d, v8.2d[1]
+       fmul    v24.2d, v0.2d, v9.d[0]
+       fmul    v21.2d, v1.2d, v8.d[1]
 
-       fmul    v28.2d, v0.2d, v9.2d[1]
-       fmul    v17.2d, v1.2d, v8.2d[0]
+       fmul    v28.2d, v0.2d, v9.d[1]
+       fmul    v17.2d, v1.2d, v8.d[0]
 
        ld1     {v12.2d, v13.2d}, [pB]
        add     pB, pB, #32
@@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v9.d[1]
 
        ld1     {v12.2d, v13.2d}, [pB]          // For next round
        add     pB, pB, #32
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v25.2d, v1.2d, v9.d[0]
 
        ld1     {v4.2d, v5.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v8.d[1]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro KERNEL4x4_M2
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v29.2d, v5.2d, v13.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v29.2d, v5.2d, v13.d[1]
 
        ld1     {v8.2d, v9.2d}, [pB]            // For next round
        add     pB, pB, #32
 
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v25.2d, v5.2d, v13.2d[0]
+       fmla    v20.2d, v4.2d, v12.d[1]
+       fmla    v25.2d, v5.2d, v13.d[0]
 
        ld1     {v0.2d, v1.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       fmla    v24.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v12.2d[1]
+       fmla    v24.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v12.d[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       fmla    v28.2d, v4.2d, v13.2d[1]
-       fmla    v17.2d, v5.2d, v12.2d[0]
+       fmla    v28.2d, v4.2d, v13.d[1]
+       fmla    v17.2d, v5.2d, v12.d[0]
 .endm
 
 .macro KERNEL4x4_E
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v29.2d, v5.2d, v13.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v29.2d, v5.2d, v13.d[1]
 
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v25.2d, v5.2d, v13.2d[0]
+       fmla    v20.2d, v4.2d, v12.d[1]
+       fmla    v25.2d, v5.2d, v13.d[0]
 
-       fmla    v24.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v12.2d[1]
+       fmla    v24.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v12.d[1]
 
-       fmla    v28.2d, v4.2d, v13.2d[1]
-       fmla    v17.2d, v5.2d, v12.2d[0]
+       fmla    v28.2d, v4.2d, v13.d[1]
+       fmla    v17.2d, v5.2d, v12.d[0]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v9.d[1]
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v25.2d, v1.2d, v9.d[0]
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v8.d[1]
 
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x4
@@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v28.2d, v0.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
 .endm
 
 .macro SAVE2x4
@@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
 .endm
 
 .macro SAVE4x2
@@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
 .endm
 
 .macro SAVE2x2
@@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     d0 , [pA]
        add     pA, pA, #8
 
-       fmla    v16.2d, v8.2d, v0.2d[0]
+       fmla    v16.2d, v8.2d, v0.d[0]
 .endm
 
 .macro SAVE1x2
@@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA , pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x1
@@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
 .endm
 
 .macro SAVE2x1
old mode 100755 (executable)
new mode 100644 (file)
index 33e076e..f3c3d5c
@@ -151,141 +151,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        ldp     d8, d9, [pB], #16
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       fmul    v20.2d, v0.2d, v9.2d[0]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       fmul    v20.2d, v0.2d, v9.d[0]
 
        ldp     d10, d11, [pB], #16
 
-       fmul    v17.2d, v1.2d, v8.2d[0]
-       fmul    v21.2d, v1.2d, v9.2d[0]
+       fmul    v17.2d, v1.2d, v8.d[0]
+       fmul    v21.2d, v1.2d, v9.d[0]
 
        ldp     q2, q3, [pA], #32
 
-       fmul    v24.2d, v0.2d, v10.2d[0]
-       fmul    v28.2d, v0.2d, v11.2d[0]
+       fmul    v24.2d, v0.2d, v10.d[0]
+       fmul    v28.2d, v0.2d, v11.d[0]
 
        ldp     q4, q5, [pA], #32
 
-       fmul    v25.2d, v1.2d, v10.2d[0]
-       fmul    v29.2d, v1.2d, v11.2d[0]
+       fmul    v25.2d, v1.2d, v10.d[0]
+       fmul    v29.2d, v1.2d, v11.d[0]
 
        ldp     d12, d13, [pB], #16
 
-       fmul    v18.2d, v2.2d, v8.2d[0]
-       fmul    v22.2d, v2.2d, v9.2d[0]
+       fmul    v18.2d, v2.2d, v8.d[0]
+       fmul    v22.2d, v2.2d, v9.d[0]
 
        ldp     d14, d15, [pB], #16
 
-       fmul    v26.2d, v2.2d, v10.2d[0]
-       fmul    v30.2d, v2.2d, v11.2d[0]
+       fmul    v26.2d, v2.2d, v10.d[0]
+       fmul    v30.2d, v2.2d, v11.d[0]
 
        ldp     q6, q7, [pA], #32
 
-       fmul    v19.2d, v3.2d, v8.2d[0]
-       fmul    v27.2d, v3.2d, v10.2d[0]
+       fmul    v19.2d, v3.2d, v8.d[0]
+       fmul    v27.2d, v3.2d, v10.d[0]
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
-       fmul    v31.2d, v3.2d, v11.2d[0]
-       fmul    v23.2d, v3.2d, v9.2d[0]
+       fmul    v31.2d, v3.2d, v11.d[0]
+       fmul    v23.2d, v3.2d, v9.d[0]
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNEL8x4_M1
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v9.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v9.d[0]
 
        ldp     q4, q5, [pA], #32
 
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v28.2d, v0.2d, v11.2d[0]
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v28.2d, v0.2d, v11.d[0]
 
        ldp     d12, d13, [pB], #16
 
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v25.2d, v1.2d, v10.2d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v25.2d, v1.2d, v10.d[0]
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 
-       fmla    v21.2d, v1.2d, v9.2d[0]
-       fmla    v29.2d, v1.2d, v11.2d[0]
+       fmla    v21.2d, v1.2d, v9.d[0]
+       fmla    v29.2d, v1.2d, v11.d[0]
 
        ldp     d14, d15, [pB], #16
 
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v22.2d, v2.2d, v9.2d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v22.2d, v2.2d, v9.d[0]
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
-       fmla    v26.2d, v2.2d, v10.2d[0]
-       fmla    v30.2d, v2.2d, v11.2d[0]
-       fmla    v19.2d, v3.2d, v8.2d[0]
-       fmla    v23.2d, v3.2d, v9.2d[0]
+       fmla    v26.2d, v2.2d, v10.d[0]
+       fmla    v30.2d, v2.2d, v11.d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
+       fmla    v23.2d, v3.2d, v9.d[0]
 
        ldp     q6, q7, [pA], #32
 
-       fmla    v27.2d, v3.2d, v10.2d[0]
-       fmla    v31.2d, v3.2d, v11.2d[0]
+       fmla    v27.2d, v3.2d, v10.d[0]
+       fmla    v31.2d, v3.2d, v11.d[0]
 .endm
 
 .macro KERNEL8x4_M2
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v20.2d, v4.2d, v13.2d[0]
-       fmla    v24.2d, v4.2d, v14.2d[0]
-       fmla    v28.2d, v4.2d, v15.2d[0]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v20.2d, v4.2d, v13.d[0]
+       fmla    v24.2d, v4.2d, v14.d[0]
+       fmla    v28.2d, v4.2d, v15.d[0]
 
        ldp     q0, q1, [pA], #32
 
-       fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v25.2d, v5.2d, v14.2d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
+       fmla    v25.2d, v5.2d, v14.d[0]
 
        ldp     d8, d9, [pB], #16
 
-       fmla    v21.2d, v5.2d, v13.2d[0]
-       fmla    v29.2d, v5.2d, v15.2d[0]
+       fmla    v21.2d, v5.2d, v13.d[0]
+       fmla    v29.2d, v5.2d, v15.d[0]
 
        ldp     d10, d11, [pB], #16
 
-       fmla    v18.2d, v6.2d, v12.2d[0]
-       fmla    v22.2d, v6.2d, v13.2d[0]
+       fmla    v18.2d, v6.2d, v12.d[0]
+       fmla    v22.2d, v6.2d, v13.d[0]
 
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
 
-       fmla    v26.2d, v6.2d, v14.2d[0]
-       fmla    v30.2d, v6.2d, v15.2d[0]
+       fmla    v26.2d, v6.2d, v14.d[0]
+       fmla    v30.2d, v6.2d, v15.d[0]
 
-       fmla    v19.2d, v7.2d, v12.2d[0]
-       fmla    v23.2d, v7.2d, v13.2d[0]
+       fmla    v19.2d, v7.2d, v12.d[0]
+       fmla    v23.2d, v7.2d, v13.d[0]
 
        ldp     q2, q3, [pA], #32
 
-       fmla    v27.2d, v7.2d, v14.2d[0]
-       fmla    v31.2d, v7.2d, v15.2d[0]
+       fmla    v27.2d, v7.2d, v14.d[0]
+       fmla    v31.2d, v7.2d, v15.d[0]
 .endm
 
 .macro KERNEL8x4_E
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v20.2d, v4.2d, v13.2d[0]
-       fmla    v24.2d, v4.2d, v14.2d[0]
-       fmla    v28.2d, v4.2d, v15.2d[0]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v20.2d, v4.2d, v13.d[0]
+       fmla    v24.2d, v4.2d, v14.d[0]
+       fmla    v28.2d, v4.2d, v15.d[0]
 
-       fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v25.2d, v5.2d, v14.2d[0]
-       fmla    v21.2d, v5.2d, v13.2d[0]
-       fmla    v29.2d, v5.2d, v15.2d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
+       fmla    v25.2d, v5.2d, v14.d[0]
+       fmla    v21.2d, v5.2d, v13.d[0]
+       fmla    v29.2d, v5.2d, v15.d[0]
 
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
 
-       fmla    v18.2d, v6.2d, v12.2d[0]
-       fmla    v22.2d, v6.2d, v13.2d[0]
-       fmla    v26.2d, v6.2d, v14.2d[0]
-       fmla    v30.2d, v6.2d, v15.2d[0]
+       fmla    v18.2d, v6.2d, v12.d[0]
+       fmla    v22.2d, v6.2d, v13.d[0]
+       fmla    v26.2d, v6.2d, v14.d[0]
+       fmla    v30.2d, v6.2d, v15.d[0]
 
-       fmla    v19.2d, v7.2d, v12.2d[0]
-       fmla    v23.2d, v7.2d, v13.2d[0]
-       fmla    v27.2d, v7.2d, v14.2d[0]
-       fmla    v31.2d, v7.2d, v15.2d[0]
+       fmla    v19.2d, v7.2d, v12.d[0]
+       fmla    v23.2d, v7.2d, v13.d[0]
+       fmla    v27.2d, v7.2d, v14.d[0]
+       fmla    v31.2d, v7.2d, v15.d[0]
 .endm
 
 .macro KERNEL8x4_SUB
@@ -293,39 +293,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        ldp     d8, d9, [pB], #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v9.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v9.d[0]
 
        ldp     d10, d11, [pB], #16
 
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v21.2d, v1.2d, v9.2d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v21.2d, v1.2d, v9.d[0]
 
        ldp     q2, q3, [pA], #32
 
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v28.2d, v0.2d, v11.2d[0]
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v28.2d, v0.2d, v11.d[0]
 
-       fmla    v25.2d, v1.2d, v10.2d[0]
-       fmla    v29.2d, v1.2d, v11.2d[0]
+       fmla    v25.2d, v1.2d, v10.d[0]
+       fmla    v29.2d, v1.2d, v11.d[0]
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE]
 
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v22.2d, v2.2d, v9.2d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v22.2d, v2.2d, v9.d[0]
 
        prfm    PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 
-       fmla    v26.2d, v2.2d, v10.2d[0]
-       fmla    v30.2d, v2.2d, v11.2d[0]
+       fmla    v26.2d, v2.2d, v10.d[0]
+       fmla    v30.2d, v2.2d, v11.d[0]
 
        prfm    PLDL1KEEP, [pB, #B_PRE_SIZE]
 
-       fmla    v19.2d, v3.2d, v8.2d[0]
-       fmla    v27.2d, v3.2d, v10.2d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
+       fmla    v27.2d, v3.2d, v10.d[0]
 
-       fmla    v31.2d, v3.2d, v11.2d[0]
-       fmla    v23.2d, v3.2d, v9.2d[0]
+       fmla    v31.2d, v3.2d, v11.d[0]
+       fmla    v23.2d, v3.2d, v9.d[0]
 .endm
 
 .macro SAVE8x4
@@ -419,17 +419,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v9.d[1]
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v25.2d, v1.2d, v9.d[0]
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v8.d[1]
 
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x4
@@ -479,10 +479,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v28.2d, v0.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
 .endm
 
 .macro SAVE2x4
@@ -573,15 +573,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v19.2d, v3.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
-       fmla    v22.2d, v2.2d, v8.2d[1]
-       fmla    v23.2d, v3.2d, v8.2d[1]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
+       fmla    v22.2d, v2.2d, v8.d[1]
+       fmla    v23.2d, v3.2d, v8.d[1]
 .endm
 
 .macro SAVE8x2
@@ -620,10 +620,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
 .endm
 
 .macro SAVE4x2
@@ -657,8 +657,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
 .endm
 
 .macro SAVE2x2
@@ -689,7 +689,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     d0 , [pA]
        add     pA, pA, #8
 
-       fmla    v16.2d, v8.2d, v0.2d[0]
+       fmla    v16.2d, v8.2d, v0.d[0]
 .endm
 
 .macro SAVE1x2
@@ -724,10 +724,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v19.2d, v3.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
 .endm
 
 .macro SAVE8x1
@@ -757,8 +757,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA , pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x1
@@ -785,7 +785,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
 .endm
 
 .macro SAVE2x1
index 0d1b128..34fb8c2 100644 (file)
@@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       fmul    v29.2d, v1.2d, v9.2d[1]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       fmul    v29.2d, v1.2d, v9.d[1]
 
-       fmul    v20.2d, v0.2d, v8.2d[1]
-       fmul    v25.2d, v1.2d, v9.2d[0]
+       fmul    v20.2d, v0.2d, v8.d[1]
+       fmul    v25.2d, v1.2d, v9.d[0]
 
-       fmul    v24.2d, v0.2d, v9.2d[0]
-       fmul    v21.2d, v1.2d, v8.2d[1]
+       fmul    v24.2d, v0.2d, v9.d[0]
+       fmul    v21.2d, v1.2d, v8.d[1]
 
-       fmul    v28.2d, v0.2d, v9.2d[1]
-       fmul    v17.2d, v1.2d, v8.2d[0]
+       fmul    v28.2d, v0.2d, v9.d[1]
+       fmul    v17.2d, v1.2d, v8.d[0]
 
        ld1     {v12.2d, v13.2d}, [pB]
        add     pB, pB, #32
@@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v9.d[1]
 
        ld1     {v12.2d, v13.2d}, [pB]          // For next round
        add     pB, pB, #32
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v25.2d, v1.2d, v9.d[0]
 
        ld1     {v4.2d, v5.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v8.d[1]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro KERNEL4x4_M2
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v29.2d, v5.2d, v13.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v29.2d, v5.2d, v13.d[1]
 
        ld1     {v8.2d, v9.2d}, [pB]            // For next round
        add     pB, pB, #32
 
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v25.2d, v5.2d, v13.2d[0]
+       fmla    v20.2d, v4.2d, v12.d[1]
+       fmla    v25.2d, v5.2d, v13.d[0]
 
        ld1     {v0.2d, v1.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       fmla    v24.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v12.2d[1]
+       fmla    v24.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v12.d[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       fmla    v28.2d, v4.2d, v13.2d[1]
-       fmla    v17.2d, v5.2d, v12.2d[0]
+       fmla    v28.2d, v4.2d, v13.d[1]
+       fmla    v17.2d, v5.2d, v12.d[0]
 .endm
 
 .macro KERNEL4x4_E
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v29.2d, v5.2d, v13.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v29.2d, v5.2d, v13.d[1]
 
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v25.2d, v5.2d, v13.2d[0]
+       fmla    v20.2d, v4.2d, v12.d[1]
+       fmla    v25.2d, v5.2d, v13.d[0]
 
-       fmla    v24.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v12.2d[1]
+       fmla    v24.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v12.d[1]
 
-       fmla    v28.2d, v4.2d, v13.2d[1]
-       fmla    v17.2d, v5.2d, v12.2d[0]
+       fmla    v28.2d, v4.2d, v13.d[1]
+       fmla    v17.2d, v5.2d, v12.d[0]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v9.d[1]
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v25.2d, v1.2d, v9.d[0]
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v8.d[1]
 
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x4
@@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v28.2d, v0.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
 .endm
 
 .macro SAVE2x4
@@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
 .endm
 
 .macro SAVE4x2
@@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
 .endm
 
 .macro SAVE2x2
@@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     d0 , [pA]
        add     pA, pA, #8
 
-       fmla    v16.2d, v8.2d, v0.2d[0]
+       fmla    v16.2d, v8.2d, v0.d[0]
 .endm
 
 .macro SAVE1x2
@@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA , pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x1
@@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
 .endm
 
 .macro SAVE2x1
old mode 100755 (executable)
new mode 100644 (file)
index eb7397f..4aecf28
@@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v10.2d, v11.2d}, [pB]
        add     pB, pB, #32
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       fmul    v17.2d, v1.2d, v8.2d[0]
-       fmul    v18.2d, v0.2d, v8.2d[1]
-       fmul    v19.2d, v1.2d, v8.2d[1]
-
-       fmul    v20.2d, v0.2d, v9.2d[0]
-       fmul    v21.2d, v1.2d, v9.2d[0]
-       fmul    v22.2d, v0.2d, v9.2d[1]
-       fmul    v23.2d, v1.2d, v9.2d[1]
-
-       fmul    v24.2d, v0.2d, v10.2d[0]
-       fmul    v25.2d, v1.2d, v10.2d[0]
-       fmul    v26.2d, v0.2d, v10.2d[1]
-       fmul    v27.2d, v1.2d, v10.2d[1]
-
-       fmul    v28.2d, v0.2d, v11.2d[0]
-       fmul    v29.2d, v1.2d, v11.2d[0]
-       fmul    v30.2d, v0.2d, v11.2d[1]
-       fmul    v31.2d, v1.2d, v11.2d[1]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       fmul    v17.2d, v1.2d, v8.d[0]
+       fmul    v18.2d, v0.2d, v8.d[1]
+       fmul    v19.2d, v1.2d, v8.d[1]
+
+       fmul    v20.2d, v0.2d, v9.d[0]
+       fmul    v21.2d, v1.2d, v9.d[0]
+       fmul    v22.2d, v0.2d, v9.d[1]
+       fmul    v23.2d, v1.2d, v9.d[1]
+
+       fmul    v24.2d, v0.2d, v10.d[0]
+       fmul    v25.2d, v1.2d, v10.d[0]
+       fmul    v26.2d, v0.2d, v10.d[1]
+       fmul    v27.2d, v1.2d, v10.d[1]
+
+       fmul    v28.2d, v0.2d, v11.d[0]
+       fmul    v29.2d, v1.2d, v11.d[0]
+       fmul    v30.2d, v0.2d, v11.d[1]
+       fmul    v31.2d, v1.2d, v11.d[1]
 
        ld1     {v12.2d, v13.2d}, [pB]
        add     pB, pB, #32
@@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_M1
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v0.2d, v8.2d[1]
-       fmla    v19.2d, v1.2d, v8.2d[1]
-
-       fmla    v20.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v9.2d[0]
-       fmla    v22.2d, v0.2d, v9.2d[1]
-       fmla    v23.2d, v1.2d, v9.2d[1]
-
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v25.2d, v1.2d, v10.2d[0]
-       fmla    v26.2d, v0.2d, v10.2d[1]
-       fmla    v27.2d, v1.2d, v10.2d[1]
-
-       fmla    v28.2d, v0.2d, v11.2d[0]
-       fmla    v29.2d, v1.2d, v11.2d[0]
-       fmla    v30.2d, v0.2d, v11.2d[1]
-       fmla    v31.2d, v1.2d, v11.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v0.2d, v8.d[1]
+       fmla    v19.2d, v1.2d, v8.d[1]
+
+       fmla    v20.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v9.d[0]
+       fmla    v22.2d, v0.2d, v9.d[1]
+       fmla    v23.2d, v1.2d, v9.d[1]
+
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v25.2d, v1.2d, v10.d[0]
+       fmla    v26.2d, v0.2d, v10.d[1]
+       fmla    v27.2d, v1.2d, v10.d[1]
+
+       fmla    v28.2d, v0.2d, v11.d[0]
+       fmla    v29.2d, v1.2d, v11.d[0]
+       fmla    v30.2d, v0.2d, v11.d[1]
+       fmla    v31.2d, v1.2d, v11.d[1]
 
        ld1     {v12.2d, v13.2d}, [pB]          // For next round
        add     pB, pB, #32
@@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_M2
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v18.2d, v4.2d, v12.2d[1]
-       fmla    v19.2d, v5.2d, v12.2d[1]
-
-       fmla    v20.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v13.2d[0]
-       fmla    v22.2d, v4.2d, v13.2d[1]
-       fmla    v23.2d, v5.2d, v13.2d[1]
-
-       fmla    v24.2d, v4.2d, v14.2d[0]
-       fmla    v25.2d, v5.2d, v14.2d[0]
-       fmla    v26.2d, v4.2d, v14.2d[1]
-       fmla    v27.2d, v5.2d, v14.2d[1]
-
-       fmla    v28.2d, v4.2d, v15.2d[0]
-       fmla    v29.2d, v5.2d, v15.2d[0]
-       fmla    v30.2d, v4.2d, v15.2d[1]
-       fmla    v31.2d, v5.2d, v15.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
+       fmla    v18.2d, v4.2d, v12.d[1]
+       fmla    v19.2d, v5.2d, v12.d[1]
+
+       fmla    v20.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v13.d[0]
+       fmla    v22.2d, v4.2d, v13.d[1]
+       fmla    v23.2d, v5.2d, v13.d[1]
+
+       fmla    v24.2d, v4.2d, v14.d[0]
+       fmla    v25.2d, v5.2d, v14.d[0]
+       fmla    v26.2d, v4.2d, v14.d[1]
+       fmla    v27.2d, v5.2d, v14.d[1]
+
+       fmla    v28.2d, v4.2d, v15.d[0]
+       fmla    v29.2d, v5.2d, v15.d[0]
+       fmla    v30.2d, v4.2d, v15.d[1]
+       fmla    v31.2d, v5.2d, v15.d[1]
 
        ld1     {v8.2d, v9.2d}, [pB]            // For next round
        add     pB, pB, #32
@@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_E
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v18.2d, v4.2d, v12.2d[1]
-       fmla    v19.2d, v5.2d, v12.2d[1]
-
-       fmla    v20.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v13.2d[0]
-       fmla    v22.2d, v4.2d, v13.2d[1]
-       fmla    v23.2d, v5.2d, v13.2d[1]
-
-       fmla    v24.2d, v4.2d, v14.2d[0]
-       fmla    v25.2d, v5.2d, v14.2d[0]
-       fmla    v26.2d, v4.2d, v14.2d[1]
-       fmla    v27.2d, v5.2d, v14.2d[1]
-
-       fmla    v28.2d, v4.2d, v15.2d[0]
-       fmla    v29.2d, v5.2d, v15.2d[0]
-       fmla    v30.2d, v4.2d, v15.2d[1]
-       fmla    v31.2d, v5.2d, v15.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
+       fmla    v18.2d, v4.2d, v12.d[1]
+       fmla    v19.2d, v5.2d, v12.d[1]
+
+       fmla    v20.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v13.d[0]
+       fmla    v22.2d, v4.2d, v13.d[1]
+       fmla    v23.2d, v5.2d, v13.d[1]
+
+       fmla    v24.2d, v4.2d, v14.d[0]
+       fmla    v25.2d, v5.2d, v14.d[0]
+       fmla    v26.2d, v4.2d, v14.d[1]
+       fmla    v27.2d, v5.2d, v14.d[1]
+
+       fmla    v28.2d, v4.2d, v15.d[0]
+       fmla    v29.2d, v5.2d, v15.d[0]
+       fmla    v30.2d, v4.2d, v15.d[1]
+       fmla    v31.2d, v5.2d, v15.d[1]
 .endm
 
 .macro KERNEL4x8_SUB
@@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v10.2d, v11.2d}, [pB]
        add     pB, pB, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v0.2d, v8.2d[1]
-       fmla    v19.2d, v1.2d, v8.2d[1]
-
-       fmla    v20.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v9.2d[0]
-       fmla    v22.2d, v0.2d, v9.2d[1]
-       fmla    v23.2d, v1.2d, v9.2d[1]
-
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v25.2d, v1.2d, v10.2d[0]
-       fmla    v26.2d, v0.2d, v10.2d[1]
-       fmla    v27.2d, v1.2d, v10.2d[1]
-
-       fmla    v28.2d, v0.2d, v11.2d[0]
-       fmla    v29.2d, v1.2d, v11.2d[0]
-       fmla    v30.2d, v0.2d, v11.2d[1]
-       fmla    v31.2d, v1.2d, v11.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v0.2d, v8.d[1]
+       fmla    v19.2d, v1.2d, v8.d[1]
+
+       fmla    v20.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v9.d[0]
+       fmla    v22.2d, v0.2d, v9.d[1]
+       fmla    v23.2d, v1.2d, v9.d[1]
+
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v25.2d, v1.2d, v10.d[0]
+       fmla    v26.2d, v0.2d, v10.d[1]
+       fmla    v27.2d, v1.2d, v10.d[1]
+
+       fmla    v28.2d, v0.2d, v11.d[0]
+       fmla    v29.2d, v1.2d, v11.d[0]
+       fmla    v30.2d, v0.2d, v11.d[1]
+       fmla    v31.2d, v1.2d, v11.d[1]
 .endm
 
 .macro SAVE4x8
@@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v10.2d, v11.2d}, [pB]
        add     pB, pB, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v18.2d, v0.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v18.2d, v0.2d, v8.d[1]
 
-       fmla    v20.2d, v0.2d, v9.2d[0]
-       fmla    v22.2d, v0.2d, v9.2d[1]
+       fmla    v20.2d, v0.2d, v9.d[0]
+       fmla    v22.2d, v0.2d, v9.d[1]
 
-       fmla    v24.2d, v0.2d, v10.2d[0]
-       fmla    v26.2d, v0.2d, v10.2d[1]
+       fmla    v24.2d, v0.2d, v10.d[0]
+       fmla    v26.2d, v0.2d, v10.d[1]
 
-       fmla    v28.2d, v0.2d, v11.2d[0]
-       fmla    v30.2d, v0.2d, v11.2d[1]
+       fmla    v28.2d, v0.2d, v11.d[0]
+       fmla    v30.2d, v0.2d, v11.d[1]
 .endm
 
 .macro SAVE2x8
@@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       fmul    v29.2d, v1.2d, v9.2d[1]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       fmul    v29.2d, v1.2d, v9.d[1]
 
-       fmul    v20.2d, v0.2d, v8.2d[1]
-       fmul    v25.2d, v1.2d, v9.2d[0]
+       fmul    v20.2d, v0.2d, v8.d[1]
+       fmul    v25.2d, v1.2d, v9.d[0]
 
-       fmul    v24.2d, v0.2d, v9.2d[0]
-       fmul    v21.2d, v1.2d, v8.2d[1]
+       fmul    v24.2d, v0.2d, v9.d[0]
+       fmul    v21.2d, v1.2d, v8.d[1]
 
-       fmul    v28.2d, v0.2d, v9.2d[1]
-       fmul    v17.2d, v1.2d, v8.2d[0]
+       fmul    v28.2d, v0.2d, v9.d[1]
+       fmul    v17.2d, v1.2d, v8.d[0]
 
        ld1     {v12.2d, v13.2d}, [pB]
        add     pB, pB, #32
@@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v9.d[1]
 
        ld1     {v12.2d, v13.2d}, [pB]          // For next round
        add     pB, pB, #32
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v25.2d, v1.2d, v9.d[0]
 
        ld1     {v4.2d, v5.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v8.d[1]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro KERNEL4x4_M2
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v29.2d, v5.2d, v13.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v29.2d, v5.2d, v13.d[1]
 
        ld1     {v8.2d, v9.2d}, [pB]            // For next round
        add     pB, pB, #32
 
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v25.2d, v5.2d, v13.2d[0]
+       fmla    v20.2d, v4.2d, v12.d[1]
+       fmla    v25.2d, v5.2d, v13.d[0]
 
        ld1     {v0.2d, v1.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       fmla    v24.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v12.2d[1]
+       fmla    v24.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v12.d[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       fmla    v28.2d, v4.2d, v13.2d[1]
-       fmla    v17.2d, v5.2d, v12.2d[0]
+       fmla    v28.2d, v4.2d, v13.d[1]
+       fmla    v17.2d, v5.2d, v12.d[0]
 .endm
 
 .macro KERNEL4x4_E
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v29.2d, v5.2d, v13.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v29.2d, v5.2d, v13.d[1]
 
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v25.2d, v5.2d, v13.2d[0]
+       fmla    v20.2d, v4.2d, v12.d[1]
+       fmla    v25.2d, v5.2d, v13.d[0]
 
-       fmla    v24.2d, v4.2d, v13.2d[0]
-       fmla    v21.2d, v5.2d, v12.2d[1]
+       fmla    v24.2d, v4.2d, v13.d[0]
+       fmla    v21.2d, v5.2d, v12.d[1]
 
-       fmla    v28.2d, v4.2d, v13.2d[1]
-       fmla    v17.2d, v5.2d, v12.2d[0]
+       fmla    v28.2d, v4.2d, v13.d[1]
+       fmla    v17.2d, v5.2d, v12.d[0]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v9.d[1]
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v25.2d, v1.2d, v9.d[0]
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v8.d[1]
 
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x4
@@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v28.2d, v0.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
 .endm
 
 .macro SAVE2x4
@@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
 .endm
 
 .macro SAVE4x2
@@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
 .endm
 
 .macro SAVE2x2
@@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     d0 , [pA]
        add     pA, pA, #8
 
-       fmla    v16.2d, v8.2d, v0.2d[0]
+       fmla    v16.2d, v8.2d, v0.d[0]
 .endm
 
 .macro SAVE1x2
@@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA , pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x1
@@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
 .endm
 
 .macro SAVE2x1
old mode 100755 (executable)
new mode 100644 (file)
index 6890505..b06c756
@@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       fmul    v17.2d, v1.2d, v8.2d[0]
-       fmul    v18.2d, v2.2d, v8.2d[0]
-       fmul    v19.2d, v3.2d, v8.2d[0]
-
-       fmul    v20.2d, v0.2d, v8.2d[1]
-       fmul    v21.2d, v1.2d, v8.2d[1]
-       fmul    v22.2d, v2.2d, v8.2d[1]
-       fmul    v23.2d, v3.2d, v8.2d[1]
-
-       fmul    v24.2d, v0.2d, v9.2d[0]
-       fmul    v25.2d, v1.2d, v9.2d[0]
-       fmul    v26.2d, v2.2d, v9.2d[0]
-       fmul    v27.2d, v3.2d, v9.2d[0]
-
-       fmul    v28.2d, v0.2d, v9.2d[1]
-       fmul    v29.2d, v1.2d, v9.2d[1]
-       fmul    v30.2d, v2.2d, v9.2d[1]
-       fmul    v31.2d, v3.2d, v9.2d[1]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       fmul    v17.2d, v1.2d, v8.d[0]
+       fmul    v18.2d, v2.2d, v8.d[0]
+       fmul    v19.2d, v3.2d, v8.d[0]
+
+       fmul    v20.2d, v0.2d, v8.d[1]
+       fmul    v21.2d, v1.2d, v8.d[1]
+       fmul    v22.2d, v2.2d, v8.d[1]
+       fmul    v23.2d, v3.2d, v8.d[1]
+
+       fmul    v24.2d, v0.2d, v9.d[0]
+       fmul    v25.2d, v1.2d, v9.d[0]
+       fmul    v26.2d, v2.2d, v9.d[0]
+       fmul    v27.2d, v3.2d, v9.d[0]
+
+       fmul    v28.2d, v0.2d, v9.d[1]
+       fmul    v29.2d, v1.2d, v9.d[1]
+       fmul    v30.2d, v2.2d, v9.d[1]
+       fmul    v31.2d, v3.2d, v9.d[1]
 
        ld1     {v4.2d, v5.2d}, [pA]
        add     pA, pA, #32
@@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M1
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v19.2d, v3.2d, v8.2d[0]
-
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
-       fmla    v22.2d, v2.2d, v8.2d[1]
-       fmla    v23.2d, v3.2d, v8.2d[1]
-
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v25.2d, v1.2d, v9.2d[0]
-       fmla    v26.2d, v2.2d, v9.2d[0]
-       fmla    v27.2d, v3.2d, v9.2d[0]
-
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v29.2d, v1.2d, v9.2d[1]
-       fmla    v30.2d, v2.2d, v9.2d[1]
-       fmla    v31.2d, v3.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
+
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
+       fmla    v22.2d, v2.2d, v8.d[1]
+       fmla    v23.2d, v3.2d, v8.d[1]
+
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v25.2d, v1.2d, v9.d[0]
+       fmla    v26.2d, v2.2d, v9.d[0]
+       fmla    v27.2d, v3.2d, v9.d[0]
+
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v29.2d, v1.2d, v9.d[1]
+       fmla    v30.2d, v2.2d, v9.d[1]
+       fmla    v31.2d, v3.2d, v9.d[1]
 
        ld1     {v4.2d, v5.2d}, [pA]
        add     pA, pA, #32
@@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M2
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v18.2d, v6.2d, v12.2d[0]
-       fmla    v19.2d, v7.2d, v12.2d[0]
-
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v21.2d, v5.2d, v12.2d[1]
-       fmla    v22.2d, v6.2d, v12.2d[1]
-       fmla    v23.2d, v7.2d, v12.2d[1]
-
-       fmla    v24.2d, v4.2d, v13.2d[0]
-       fmla    v25.2d, v5.2d, v13.2d[0]
-       fmla    v26.2d, v6.2d, v13.2d[0]
-       fmla    v27.2d, v7.2d, v13.2d[0]
-
-       fmla    v28.2d, v4.2d, v13.2d[1]
-       fmla    v29.2d, v5.2d, v13.2d[1]
-       fmla    v30.2d, v6.2d, v13.2d[1]
-       fmla    v31.2d, v7.2d, v13.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
+       fmla    v18.2d, v6.2d, v12.d[0]
+       fmla    v19.2d, v7.2d, v12.d[0]
+
+       fmla    v20.2d, v4.2d, v12.d[1]
+       fmla    v21.2d, v5.2d, v12.d[1]
+       fmla    v22.2d, v6.2d, v12.d[1]
+       fmla    v23.2d, v7.2d, v12.d[1]
+
+       fmla    v24.2d, v4.2d, v13.d[0]
+       fmla    v25.2d, v5.2d, v13.d[0]
+       fmla    v26.2d, v6.2d, v13.d[0]
+       fmla    v27.2d, v7.2d, v13.d[0]
+
+       fmla    v28.2d, v4.2d, v13.d[1]
+       fmla    v29.2d, v5.2d, v13.d[1]
+       fmla    v30.2d, v6.2d, v13.d[1]
+       fmla    v31.2d, v7.2d, v13.d[1]
 
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
@@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_E
-       fmla    v16.2d, v4.2d, v12.2d[0]
-       fmla    v17.2d, v5.2d, v12.2d[0]
-       fmla    v18.2d, v6.2d, v12.2d[0]
-       fmla    v19.2d, v7.2d, v12.2d[0]
-
-       fmla    v20.2d, v4.2d, v12.2d[1]
-       fmla    v21.2d, v5.2d, v12.2d[1]
-       fmla    v22.2d, v6.2d, v12.2d[1]
-       fmla    v23.2d, v7.2d, v12.2d[1]
-
-       fmla    v24.2d, v4.2d, v13.2d[0]
-       fmla    v25.2d, v5.2d, v13.2d[0]
-       fmla    v26.2d, v6.2d, v13.2d[0]
-       fmla    v27.2d, v7.2d, v13.2d[0]
-
-       fmla    v28.2d, v4.2d, v13.2d[1]
-       fmla    v29.2d, v5.2d, v13.2d[1]
-       fmla    v30.2d, v6.2d, v13.2d[1]
-       fmla    v31.2d, v7.2d, v13.2d[1]
+       fmla    v16.2d, v4.2d, v12.d[0]
+       fmla    v17.2d, v5.2d, v12.d[0]
+       fmla    v18.2d, v6.2d, v12.d[0]
+       fmla    v19.2d, v7.2d, v12.d[0]
+
+       fmla    v20.2d, v4.2d, v12.d[1]
+       fmla    v21.2d, v5.2d, v12.d[1]
+       fmla    v22.2d, v6.2d, v12.d[1]
+       fmla    v23.2d, v7.2d, v12.d[1]
+
+       fmla    v24.2d, v4.2d, v13.d[0]
+       fmla    v25.2d, v5.2d, v13.d[0]
+       fmla    v26.2d, v6.2d, v13.d[0]
+       fmla    v27.2d, v7.2d, v13.d[0]
+
+       fmla    v28.2d, v4.2d, v13.d[1]
+       fmla    v29.2d, v5.2d, v13.d[1]
+       fmla    v30.2d, v6.2d, v13.d[1]
+       fmla    v31.2d, v7.2d, v13.d[1]
 .endm
 
 .macro KERNEL8x4_SUB
@@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v19.2d, v3.2d, v8.2d[0]
-
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
-       fmla    v22.2d, v2.2d, v8.2d[1]
-       fmla    v23.2d, v3.2d, v8.2d[1]
-
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v25.2d, v1.2d, v9.2d[0]
-       fmla    v26.2d, v2.2d, v9.2d[0]
-       fmla    v27.2d, v3.2d, v9.2d[0]
-
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v29.2d, v1.2d, v9.2d[1]
-       fmla    v30.2d, v2.2d, v9.2d[1]
-       fmla    v31.2d, v3.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
+
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
+       fmla    v22.2d, v2.2d, v8.d[1]
+       fmla    v23.2d, v3.2d, v8.d[1]
+
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v25.2d, v1.2d, v9.d[0]
+       fmla    v26.2d, v2.2d, v9.d[0]
+       fmla    v27.2d, v3.2d, v9.d[0]
+
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v29.2d, v1.2d, v9.d[1]
+       fmla    v30.2d, v2.2d, v9.d[1]
+       fmla    v31.2d, v3.2d, v9.d[1]
 .endm
 
 .macro SAVE8x4
@@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v29.2d, v1.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v29.2d, v1.2d, v9.d[1]
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v25.2d, v1.2d, v9.2d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v25.2d, v1.2d, v9.d[0]
 
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v21.2d, v1.2d, v8.d[1]
 
-       fmla    v28.2d, v0.2d, v9.2d[1]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x4
@@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v24.2d, v0.2d, v9.2d[0]
-       fmla    v28.2d, v0.2d, v9.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v24.2d, v0.2d, v9.d[0]
+       fmla    v28.2d, v0.2d, v9.d[1]
 .endm
 
 .macro SAVE2x4
@@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v19.2d, v3.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
 
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
-       fmla    v22.2d, v2.2d, v8.2d[1]
-       fmla    v23.2d, v3.2d, v8.2d[1]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
+       fmla    v22.2d, v2.2d, v8.d[1]
+       fmla    v23.2d, v3.2d, v8.d[1]
 .endm
 
 .macro SAVE8x2
@@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
-       fmla    v21.2d, v1.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
+       fmla    v21.2d, v1.2d, v8.d[1]
 .endm
 
 .macro SAVE4x2
@@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v20.2d, v0.2d, v8.2d[1]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v20.2d, v0.2d, v8.d[1]
 .endm
 
 .macro SAVE2x2
@@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     d0 , [pA]
        add     pA, pA, #8
 
-       fmla    v16.2d, v8.2d, v0.2d[0]
+       fmla    v16.2d, v8.2d, v0.d[0]
 .endm
 
 .macro SAVE1x2
@@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
-       fmla    v18.2d, v2.2d, v8.2d[0]
-       fmla    v19.2d, v3.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
+       fmla    v18.2d, v2.2d, v8.d[0]
+       fmla    v19.2d, v3.2d, v8.d[0]
 .endm
 
 .macro SAVE8x1
@@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d, v1.2d}, [pA]
        add     pA , pA, #32
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
-       fmla    v17.2d, v1.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
+       fmla    v17.2d, v1.2d, v8.d[0]
 .endm
 
 .macro SAVE4x1
@@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2d}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2d, v0.2d, v8.2d[0]
+       fmla    v16.2d, v0.2d, v8.d[0]
 .endm
 
 .macro SAVE2x1
index 22b55b0..68366d9 100644 (file)
@@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v3.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v8.2s[0]
-       fmul    v17.4s, v1.4s, v8.2s[0]
-       fmul    v18.4s, v2.4s, v8.2s[0]
-       fmul    v19.4s, v3.4s, v8.2s[0]
-
-       fmul    v20.4s, v0.4s, v8.2s[1]
-       fmul    v21.4s, v1.4s, v8.2s[1]
-       fmul    v22.4s, v2.4s, v8.2s[1]
-       fmul    v23.4s, v3.4s, v8.2s[1]
-
-       fmul    v24.4s, v0.4s, v9.2s[0]
-       fmul    v25.4s, v1.4s, v9.2s[0]
-       fmul    v26.4s, v2.4s, v9.2s[0]
-       fmul    v27.4s, v3.4s, v9.2s[0]
-
-       fmul    v28.4s, v0.4s, v9.2s[1]
-       fmul    v29.4s, v1.4s, v9.2s[1]
-       fmul    v30.4s, v2.4s, v9.2s[1]
-       fmul    v31.4s, v3.4s, v9.2s[1]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       fmul    v17.4s, v1.4s, v8.s[0]
+       fmul    v18.4s, v2.4s, v8.s[0]
+       fmul    v19.4s, v3.4s, v8.s[0]
+
+       fmul    v20.4s, v0.4s, v8.s[1]
+       fmul    v21.4s, v1.4s, v8.s[1]
+       fmul    v22.4s, v2.4s, v8.s[1]
+       fmul    v23.4s, v3.4s, v8.s[1]
+
+       fmul    v24.4s, v0.4s, v9.s[0]
+       fmul    v25.4s, v1.4s, v9.s[0]
+       fmul    v26.4s, v2.4s, v9.s[0]
+       fmul    v27.4s, v3.4s, v9.s[0]
+
+       fmul    v28.4s, v0.4s, v9.s[1]
+       fmul    v29.4s, v1.4s, v9.s[1]
+       fmul    v30.4s, v2.4s, v9.s[1]
+       fmul    v31.4s, v3.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL16x4_M1
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v18.4s, v2.4s, v8.2s[0]
-       fmla    v19.4s, v3.4s, v8.2s[0]
-
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v22.4s, v2.4s, v8.2s[1]
-       fmla    v23.4s, v3.4s, v8.2s[1]
-
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v26.4s, v2.4s, v9.2s[0]
-       fmla    v27.4s, v3.4s, v9.2s[0]
-
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
-       fmla    v30.4s, v2.4s, v9.2s[1]
-       fmla    v31.4s, v3.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v18.4s, v2.4s, v8.s[0]
+       fmla    v19.4s, v3.4s, v8.s[0]
+
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v22.4s, v2.4s, v8.s[1]
+       fmla    v23.4s, v3.4s, v8.s[1]
+
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v26.4s, v2.4s, v9.s[0]
+       fmla    v27.4s, v3.4s, v9.s[0]
+
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
+       fmla    v30.4s, v2.4s, v9.s[1]
+       fmla    v31.4s, v3.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL16x4_M2
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v18.4s, v6.4s, v12.2s[0]
-       fmla    v19.4s, v7.4s, v12.2s[0]
-
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v22.4s, v6.4s, v12.2s[1]
-       fmla    v23.4s, v7.4s, v12.2s[1]
-
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v26.4s, v6.4s, v13.2s[0]
-       fmla    v27.4s, v7.4s, v13.2s[0]
-
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
-       fmla    v30.4s, v6.4s, v13.2s[1]
-       fmla    v31.4s, v7.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v18.4s, v6.4s, v12.s[0]
+       fmla    v19.4s, v7.4s, v12.s[0]
+
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v22.4s, v6.4s, v12.s[1]
+       fmla    v23.4s, v7.4s, v12.s[1]
+
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v26.4s, v6.4s, v13.s[0]
+       fmla    v27.4s, v7.4s, v13.s[0]
+
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
+       fmla    v30.4s, v6.4s, v13.s[1]
+       fmla    v31.4s, v7.4s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]
        add     pB, pB, #16
@@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL16x4_E
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v18.4s, v6.4s, v12.2s[0]
-       fmla    v19.4s, v7.4s, v12.2s[0]
-
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v22.4s, v6.4s, v12.2s[1]
-       fmla    v23.4s, v7.4s, v12.2s[1]
-
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v26.4s, v6.4s, v13.2s[0]
-       fmla    v27.4s, v7.4s, v13.2s[0]
-
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
-       fmla    v30.4s, v6.4s, v13.2s[1]
-       fmla    v31.4s, v7.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v18.4s, v6.4s, v12.s[0]
+       fmla    v19.4s, v7.4s, v12.s[0]
+
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v22.4s, v6.4s, v12.s[1]
+       fmla    v23.4s, v7.4s, v12.s[1]
+
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v26.4s, v6.4s, v13.s[0]
+       fmla    v27.4s, v7.4s, v13.s[0]
+
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
+       fmla    v30.4s, v6.4s, v13.s[1]
+       fmla    v31.4s, v7.4s, v13.s[1]
 .endm
 
 .macro KERNEL16x4_SUB
@@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v3.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v18.4s, v2.4s, v8.2s[0]
-       fmla    v19.4s, v3.4s, v8.2s[0]
-
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v22.4s, v2.4s, v8.2s[1]
-       fmla    v23.4s, v3.4s, v8.2s[1]
-
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v26.4s, v2.4s, v9.2s[0]
-       fmla    v27.4s, v3.4s, v9.2s[0]
-
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
-       fmla    v30.4s, v2.4s, v9.2s[1]
-       fmla    v31.4s, v3.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v18.4s, v2.4s, v8.s[0]
+       fmla    v19.4s, v3.4s, v8.s[0]
+
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v22.4s, v2.4s, v8.s[1]
+       fmla    v23.4s, v3.4s, v8.s[1]
+
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v26.4s, v2.4s, v9.s[0]
+       fmla    v27.4s, v3.4s, v9.s[0]
+
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
+       fmla    v30.4s, v2.4s, v9.s[1]
+       fmla    v31.4s, v3.4s, v9.s[1]
 .endm
 
 .macro SAVE16x4
@@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v8.2s[0]
-       fmul    v17.4s, v1.4s, v8.2s[0]
-       fmul    v20.4s, v0.4s, v8.2s[1]
-       fmul    v21.4s, v1.4s, v8.2s[1]
-       fmul    v24.4s, v0.4s, v9.2s[0]
-       fmul    v25.4s, v1.4s, v9.2s[0]
-       fmul    v28.4s, v0.4s, v9.2s[1]
-       fmul    v29.4s, v1.4s, v9.2s[1]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       fmul    v17.4s, v1.4s, v8.s[0]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       fmul    v21.4s, v1.4s, v8.s[1]
+       fmul    v24.4s, v0.4s, v9.s[0]
+       fmul    v25.4s, v1.4s, v9.s[0]
+       fmul    v28.4s, v0.4s, v9.s[1]
+       fmul    v29.4s, v1.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M1
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M2
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]
        add     pB, pB, #16
@@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_E
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
 .endm
 
 .macro KERNEL8x4_SUB
@@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
 .endm
 
 .macro SAVE8x4
@@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.2s, v0.2s, v8.2s[0]
-       fmul    v29.2s, v1.2s, v9.2s[1]
+       fmul    v16.2s, v0.2s, v8.s[0]
+       fmul    v29.2s, v1.2s, v9.s[1]
 
-       fmul    v20.2s, v0.2s, v8.2s[1]
-       fmul    v25.2s, v1.2s, v9.2s[0]
+       fmul    v20.2s, v0.2s, v8.s[1]
+       fmul    v25.2s, v1.2s, v9.s[0]
 
-       fmul    v24.2s, v0.2s, v9.2s[0]
-       fmul    v21.2s, v1.2s, v8.2s[1]
+       fmul    v24.2s, v0.2s, v9.s[0]
+       fmul    v21.2s, v1.2s, v8.s[1]
 
-       fmul    v28.2s, v0.2s, v9.2s[1]
-       fmul    v17.2s, v1.2s, v8.2s[0]
+       fmul    v28.2s, v0.2s, v9.s[1]
+       fmul    v17.2s, v1.2s, v8.s[0]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]          // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
        ld1     {v4.2s, v5.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro KERNEL4x4_M2
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]            // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
        ld1     {v0.2s, v1.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_E
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x4
@@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v28.2s, v0.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
 .endm
 
 .macro SAVE2x4
@@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v3.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v18.4s, v2.4s, v8.2s[0]
-       fmla    v19.4s, v3.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v18.4s, v2.4s, v8.s[0]
+       fmla    v19.4s, v3.4s, v8.s[0]
 
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v22.4s, v2.4s, v8.2s[1]
-       fmla    v23.4s, v3.4s, v8.2s[1]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v22.4s, v2.4s, v8.s[1]
+       fmla    v23.4s, v3.4s, v8.s[1]
 .endm
 
 .macro SAVE16x2
@@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
 
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
 .endm
 
 .macro SAVE8x2
@@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     s0 , [pA]
        add     pA, pA, #4
 
-       fmla    v16.2s, v8.2s, v0.2s[0]
+       fmla    v16.2s, v8.2s, v0.s[0]
 .endm
 
 .macro SAVE1x2
@@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v3.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v18.4s, v2.4s, v8.2s[0]
-       fmla    v19.4s, v3.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v18.4s, v2.4s, v8.s[0]
+       fmla    v19.4s, v3.4s, v8.s[0]
 .endm
 
 .macro SAVE16x1
@@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
 .endm
 
 .macro SAVE8x1
@@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x1
@@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA , pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
 .endm
 
 .macro SAVE2x1
index bfa80d5..a5cf7ba 100644 (file)
@@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.4s}, [pA_0]
        add     pA_0, pA_0, #16
 
-       fmul    v16.4s, v0.4s, v8.4s[0]
-       fmul    v20.4s, v0.4s, v8.4s[1]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       fmul    v20.4s, v0.4s, v8.s[1]
 
        ld1     {v2.4s}, [pA_1]
        add     pA_1, pA_1, #16
 
-       fmul    v24.4s, v0.4s, v8.4s[2]
-       fmul    v28.4s, v0.4s, v8.4s[3]
+       fmul    v24.4s, v0.4s, v8.s[2]
+       fmul    v28.4s, v0.4s, v8.s[3]
 
        ld1     {v4.4s}, [pA_2]
        add     pA_2, pA_2, #16
 
-       fmul    v17.4s, v2.4s, v8.4s[0]
-       fmul    v21.4s, v2.4s, v8.4s[1]
+       fmul    v17.4s, v2.4s, v8.s[0]
+       fmul    v21.4s, v2.4s, v8.s[1]
 
        ld1     {v6.4s}, [pA_3]
        add     pA_3, pA_3, #16
 
-       fmul    v25.4s, v2.4s, v8.4s[2]
-       fmul    v29.4s, v2.4s, v8.4s[3]
+       fmul    v25.4s, v2.4s, v8.s[2]
+       fmul    v29.4s, v2.4s, v8.s[3]
 
        ld1     {v12.4s}, [pB]          // for next round
        add     pB, pB, #16
 
-       fmul    v18.4s, v4.4s, v8.4s[0]
-       fmul    v19.4s, v6.4s, v8.4s[0]
+       fmul    v18.4s, v4.4s, v8.s[0]
+       fmul    v19.4s, v6.4s, v8.s[0]
 
        ld1     {v1.4s}, [pA_0]         // for next round
        add     pA_0, pA_0, #16
 
-       fmul    v22.4s, v4.4s, v8.4s[1]
-       fmul    v23.4s, v6.4s, v8.4s[1]
+       fmul    v22.4s, v4.4s, v8.s[1]
+       fmul    v23.4s, v6.4s, v8.s[1]
 
        ld1     {v3.4s}, [pA_1]         // for next round
        add     pA_1, pA_1, #16
 
-       fmul    v26.4s, v4.4s, v8.4s[2]
-       fmul    v27.4s, v6.4s, v8.4s[2]
+       fmul    v26.4s, v4.4s, v8.s[2]
+       fmul    v27.4s, v6.4s, v8.s[2]
 
        ld1     {v5.4s}, [pA_2]         // for next round
        add     pA_2, pA_2, #16
 
-       fmul    v30.4s, v4.4s, v8.4s[3]
-       fmul    v31.4s, v6.4s, v8.4s[3]
+       fmul    v30.4s, v4.4s, v8.s[3]
+       fmul    v31.4s, v6.4s, v8.s[3]
 
        ld1     {v7.4s}, [pA_3]         // for next round
        add     pA_3, pA_3, #16
 .endm
 
 .macro KERNEL16x4_M2
-       fmla    v16.4s, v1.4s, v12.4s[0]
-       fmla    v17.4s, v3.4s, v12.4s[0]
+       fmla    v16.4s, v1.4s, v12.s[0]
+       fmla    v17.4s, v3.4s, v12.s[0]
 
        ld1     {v8.4s}, [pB]           // for next round
        add     pB, pB, #16
 
-       fmla    v18.4s, v5.4s, v12.4s[0]
-       fmla    v19.4s, v7.4s, v12.4s[0]
+       fmla    v18.4s, v5.4s, v12.s[0]
+       fmla    v19.4s, v7.4s, v12.s[0]
 
        ld1     {v0.4s}, [pA_0]         // for next round
        add     pA_0, pA_0, #16
 
-       fmla    v20.4s, v1.4s, v12.4s[1]
-       fmla    v21.4s, v3.4s, v12.4s[1]
+       fmla    v20.4s, v1.4s, v12.s[1]
+       fmla    v21.4s, v3.4s, v12.s[1]
 
        ld1     {v2.4s}, [pA_1]         // for next round
        add     pA_1, pA_1, #16
 
-       fmla    v22.4s, v5.4s, v12.4s[1]
-       fmla    v23.4s, v7.4s, v12.4s[1]
+       fmla    v22.4s, v5.4s, v12.s[1]
+       fmla    v23.4s, v7.4s, v12.s[1]
 
        ld1     {v4.4s}, [pA_2]         // for next round
        add     pA_2, pA_2, #16
 
-       fmla    v24.4s, v1.4s, v12.4s[2]
-       fmla    v25.4s, v3.4s, v12.4s[2]
+       fmla    v24.4s, v1.4s, v12.s[2]
+       fmla    v25.4s, v3.4s, v12.s[2]
 
        ld1     {v6.4s}, [pA_3]         // for next round
        add     pA_3, pA_3, #16
 
-       fmla    v26.4s, v5.4s, v12.4s[2]
-       fmla    v27.4s, v7.4s, v12.4s[2]
+       fmla    v26.4s, v5.4s, v12.s[2]
+       fmla    v27.4s, v7.4s, v12.s[2]
 
        prfm    PLDL1KEEP, [pA_2, #512]
 
-       fmla    v28.4s, v1.4s, v12.4s[3]
-       fmla    v29.4s, v3.4s, v12.4s[3]
+       fmla    v28.4s, v1.4s, v12.s[3]
+       fmla    v29.4s, v3.4s, v12.s[3]
 
        prfm    PLDL1KEEP, [pA_3, #512]
 
-       fmla    v30.4s, v5.4s, v12.4s[3]
-       fmla    v31.4s, v7.4s, v12.4s[3]
+       fmla    v30.4s, v5.4s, v12.s[3]
+       fmla    v31.4s, v7.4s, v12.s[3]
 
        prfm    PLDL1KEEP, [pB, #512]
 .endm
 
 .macro KERNEL16x4_M1
-       fmla    v16.4s, v0.4s, v8.4s[0]
-       fmla    v17.4s, v2.4s, v8.4s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v2.4s, v8.s[0]
 
        ld1     {v12.4s}, [pB]          // for next round
        add     pB, pB, #16
 
-       fmla    v18.4s, v4.4s, v8.4s[0]
-       fmla    v19.4s, v6.4s, v8.4s[0]
+       fmla    v18.4s, v4.4s, v8.s[0]
+       fmla    v19.4s, v6.4s, v8.s[0]
 
        ld1     {v1.4s}, [pA_0]         // for next round
        add     pA_0, pA_0, #16
 
-       fmla    v20.4s, v0.4s, v8.4s[1]
-       fmla    v21.4s, v2.4s, v8.4s[1]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v2.4s, v8.s[1]
 
        ld1     {v3.4s}, [pA_1]         // for next round
        add     pA_1, pA_1, #16
 
-       fmla    v22.4s, v4.4s, v8.4s[1]
-       fmla    v23.4s, v6.4s, v8.4s[1]
+       fmla    v22.4s, v4.4s, v8.s[1]
+       fmla    v23.4s, v6.4s, v8.s[1]
 
        ld1     {v5.4s}, [pA_2]         // for next round
        add     pA_2, pA_2, #16
 
-       fmla    v24.4s, v0.4s, v8.4s[2]
-       fmla    v25.4s, v2.4s, v8.4s[2]
+       fmla    v24.4s, v0.4s, v8.s[2]
+       fmla    v25.4s, v2.4s, v8.s[2]
 
        ld1     {v7.4s}, [pA_3]         // for next round
        add     pA_3, pA_3, #16
 
-       fmla    v26.4s, v4.4s, v8.4s[2]
-       fmla    v27.4s, v6.4s, v8.4s[2]
+       fmla    v26.4s, v4.4s, v8.s[2]
+       fmla    v27.4s, v6.4s, v8.s[2]
 
        prfm    PLDL1KEEP, [pA_0, #512]
 
-       fmla    v28.4s, v0.4s, v8.4s[3]
-       fmla    v29.4s, v2.4s, v8.4s[3]
+       fmla    v28.4s, v0.4s, v8.s[3]
+       fmla    v29.4s, v2.4s, v8.s[3]
 
        prfm    PLDL1KEEP, [pA_1, #512]
 
-       fmla    v30.4s, v4.4s, v8.4s[3]
-       fmla    v31.4s, v6.4s, v8.4s[3]
+       fmla    v30.4s, v4.4s, v8.s[3]
+       fmla    v31.4s, v6.4s, v8.s[3]
 .endm
 
 .macro KERNEL16x4_E
-       fmla    v16.4s, v1.4s, v12.4s[0]
-       fmla    v17.4s, v3.4s, v12.4s[0]
-       fmla    v18.4s, v5.4s, v12.4s[0]
-       fmla    v19.4s, v7.4s, v12.4s[0]
-       fmla    v20.4s, v1.4s, v12.4s[1]
-       fmla    v21.4s, v3.4s, v12.4s[1]
-       fmla    v22.4s, v5.4s, v12.4s[1]
-       fmla    v23.4s, v7.4s, v12.4s[1]
-       fmla    v24.4s, v1.4s, v12.4s[2]
-       fmla    v25.4s, v3.4s, v12.4s[2]
-       fmla    v26.4s, v5.4s, v12.4s[2]
-       fmla    v27.4s, v7.4s, v12.4s[2]
-       fmla    v28.4s, v1.4s, v12.4s[3]
-       fmla    v29.4s, v3.4s, v12.4s[3]
-       fmla    v30.4s, v5.4s, v12.4s[3]
-       fmla    v31.4s, v7.4s, v12.4s[3]
+       fmla    v16.4s, v1.4s, v12.s[0]
+       fmla    v17.4s, v3.4s, v12.s[0]
+       fmla    v18.4s, v5.4s, v12.s[0]
+       fmla    v19.4s, v7.4s, v12.s[0]
+       fmla    v20.4s, v1.4s, v12.s[1]
+       fmla    v21.4s, v3.4s, v12.s[1]
+       fmla    v22.4s, v5.4s, v12.s[1]
+       fmla    v23.4s, v7.4s, v12.s[1]
+       fmla    v24.4s, v1.4s, v12.s[2]
+       fmla    v25.4s, v3.4s, v12.s[2]
+       fmla    v26.4s, v5.4s, v12.s[2]
+       fmla    v27.4s, v7.4s, v12.s[2]
+       fmla    v28.4s, v1.4s, v12.s[3]
+       fmla    v29.4s, v3.4s, v12.s[3]
+       fmla    v30.4s, v5.4s, v12.s[3]
+       fmla    v31.4s, v7.4s, v12.s[3]
 .endm
 
 .macro KERNEL16x4_SUB
@@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.4s}, [pA_0]
        add     pA_0, pA_0, #16
 
-       fmla    v16.4s, v0.4s, v8.4s[0]
-       fmla    v20.4s, v0.4s, v8.4s[1]
-       fmla    v24.4s, v0.4s, v8.4s[2]
-       fmla    v28.4s, v0.4s, v8.4s[3]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v8.s[2]
+       fmla    v28.4s, v0.4s, v8.s[3]
 
        ld1     {v2.4s}, [pA_1]
        add     pA_1, pA_1, #16
 
-       fmla    v17.4s, v2.4s, v8.4s[0]
-       fmla    v21.4s, v2.4s, v8.4s[1]
-       fmla    v25.4s, v2.4s, v8.4s[2]
-       fmla    v29.4s, v2.4s, v8.4s[3]
+       fmla    v17.4s, v2.4s, v8.s[0]
+       fmla    v21.4s, v2.4s, v8.s[1]
+       fmla    v25.4s, v2.4s, v8.s[2]
+       fmla    v29.4s, v2.4s, v8.s[3]
 
        ld1     {v4.4s}, [pA_2]
        add     pA_2, pA_2, #16
 
-       fmla    v18.4s, v4.4s, v8.4s[0]
-       fmla    v22.4s, v4.4s, v8.4s[1]
-       fmla    v26.4s, v4.4s, v8.4s[2]
-       fmla    v30.4s, v4.4s, v8.4s[3]
+       fmla    v18.4s, v4.4s, v8.s[0]
+       fmla    v22.4s, v4.4s, v8.s[1]
+       fmla    v26.4s, v4.4s, v8.s[2]
+       fmla    v30.4s, v4.4s, v8.s[3]
 
        ld1     {v6.4s}, [pA_3]
        add     pA_3, pA_3, #16
 
-       fmla    v19.4s, v6.4s, v8.4s[0]
-       fmla    v23.4s, v6.4s, v8.4s[1]
-       fmla    v27.4s, v6.4s, v8.4s[2]
-       fmla    v31.4s, v6.4s, v8.4s[3]
+       fmla    v19.4s, v6.4s, v8.s[0]
+       fmla    v23.4s, v6.4s, v8.s[1]
+       fmla    v27.4s, v6.4s, v8.s[2]
+       fmla    v31.4s, v6.4s, v8.s[3]
 .endm
 
 .macro SAVE16x4
@@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA_0]
        add     pA_0, pA_0, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
        ld1     {v2.2s, v3.2s}, [pA_1]
        add     pA_1, pA_1, #16
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 
-       fmla    v18.2s, v2.2s, v8.2s[0]
-       fmla    v31.2s, v3.2s, v9.2s[1]
-       fmla    v22.2s, v2.2s, v8.2s[1]
-       fmla    v27.2s, v3.2s, v9.2s[0]
+       fmla    v18.2s, v2.2s, v8.s[0]
+       fmla    v31.2s, v3.2s, v9.s[1]
+       fmla    v22.2s, v2.2s, v8.s[1]
+       fmla    v27.2s, v3.2s, v9.s[0]
 
-       fmla    v26.2s, v2.2s, v9.2s[0]
-       fmla    v23.2s, v3.2s, v8.2s[1]
-       fmla    v30.2s, v2.2s, v9.2s[1]
-       fmla    v19.2s, v3.2s, v8.2s[0]
+       fmla    v26.2s, v2.2s, v9.s[0]
+       fmla    v23.2s, v3.2s, v8.s[1]
+       fmla    v30.2s, v2.2s, v9.s[1]
+       fmla    v19.2s, v3.2s, v8.s[0]
 .endm
 
 .macro SAVE8x4
@@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA_0]
        add     pA_0, pA_0, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x4
@@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA_0]
        add     pA_0, pA_0, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v28.2s, v0.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
 .endm
 
 .macro SAVE2x4
@@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA_0]
        add     pA_0, pA_0, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA_0]
        add     pA_0, pA_0, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     s0 , [pA_0]
        add     pA_0, pA_0, #4
 
-       fmla    v16.2s, v8.2s, v0.2s[0]
+       fmla    v16.2s, v8.2s, v0.s[0]
 .endm
 
 .macro SAVE1x2
@@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA_0]
        add     pA_0 , pA_0, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x1
@@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA_0]
        add     pA_0 , pA_0, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
 .endm
 
 .macro SAVE2x1
index ac690e4..bd47bed 100644 (file)
@@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v4.4s[0]
-       fmul    v17.4s, v1.4s, v4.4s[0]
-       fmul    v18.4s, v0.4s, v4.4s[1]
-       fmul    v19.4s, v1.4s, v4.4s[1]
-       fmul    v20.4s, v0.4s, v4.4s[2]
-       fmul    v21.4s, v1.4s, v4.4s[2]
-       fmul    v22.4s, v0.4s, v4.4s[3]
-       fmul    v23.4s, v1.4s, v4.4s[3]
-       fmul    v24.4s, v0.4s, v5.4s[0]
-       fmul    v25.4s, v1.4s, v5.4s[0]
-       fmul    v26.4s, v0.4s, v5.4s[1]
-       fmul    v27.4s, v1.4s, v5.4s[1]
-       fmul    v28.4s, v0.4s, v5.4s[2]
-       fmul    v29.4s, v1.4s, v5.4s[2]
-       fmul    v30.4s, v0.4s, v5.4s[3]
-       fmul    v31.4s, v1.4s, v5.4s[3]
+       fmul    v16.4s, v0.4s, v4.s[0]
+       fmul    v17.4s, v1.4s, v4.s[0]
+       fmul    v18.4s, v0.4s, v4.s[1]
+       fmul    v19.4s, v1.4s, v4.s[1]
+       fmul    v20.4s, v0.4s, v4.s[2]
+       fmul    v21.4s, v1.4s, v4.s[2]
+       fmul    v22.4s, v0.4s, v4.s[3]
+       fmul    v23.4s, v1.4s, v4.s[3]
+       fmul    v24.4s, v0.4s, v5.s[0]
+       fmul    v25.4s, v1.4s, v5.s[0]
+       fmul    v26.4s, v0.4s, v5.s[1]
+       fmul    v27.4s, v1.4s, v5.s[1]
+       fmul    v28.4s, v0.4s, v5.s[2]
+       fmul    v29.4s, v1.4s, v5.s[2]
+       fmul    v30.4s, v0.4s, v5.s[3]
+       fmul    v31.4s, v1.4s, v5.s[3]
 
        ld1     {v6.4s}, [pB]
        add     pB, pB, #16
@@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x8_M1
-       fmla    v16.4s, v0.4s, v4.4s[0]
-       fmla    v17.4s, v1.4s, v4.4s[0]
-       fmla    v18.4s, v0.4s, v4.4s[1]
-       fmla    v19.4s, v1.4s, v4.4s[1]
-       fmla    v20.4s, v0.4s, v4.4s[2]
-       fmla    v21.4s, v1.4s, v4.4s[2]
-       fmla    v22.4s, v0.4s, v4.4s[3]
-       fmla    v23.4s, v1.4s, v4.4s[3]
-       fmla    v24.4s, v0.4s, v5.4s[0]
-       fmla    v25.4s, v1.4s, v5.4s[0]
-       fmla    v26.4s, v0.4s, v5.4s[1]
-       fmla    v27.4s, v1.4s, v5.4s[1]
-       fmla    v28.4s, v0.4s, v5.4s[2]
-       fmla    v29.4s, v1.4s, v5.4s[2]
-       fmla    v30.4s, v0.4s, v5.4s[3]
-       fmla    v31.4s, v1.4s, v5.4s[3]
+       fmla    v16.4s, v0.4s, v4.s[0]
+       fmla    v17.4s, v1.4s, v4.s[0]
+       fmla    v18.4s, v0.4s, v4.s[1]
+       fmla    v19.4s, v1.4s, v4.s[1]
+       fmla    v20.4s, v0.4s, v4.s[2]
+       fmla    v21.4s, v1.4s, v4.s[2]
+       fmla    v22.4s, v0.4s, v4.s[3]
+       fmla    v23.4s, v1.4s, v4.s[3]
+       fmla    v24.4s, v0.4s, v5.s[0]
+       fmla    v25.4s, v1.4s, v5.s[0]
+       fmla    v26.4s, v0.4s, v5.s[1]
+       fmla    v27.4s, v1.4s, v5.s[1]
+       fmla    v28.4s, v0.4s, v5.s[2]
+       fmla    v29.4s, v1.4s, v5.s[2]
+       fmla    v30.4s, v0.4s, v5.s[3]
+       fmla    v31.4s, v1.4s, v5.s[3]
 
        ld1     {v6.4s}, [pB]
        add     pB, pB, #16
@@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x8_M2
-       fmla    v16.4s, v2.4s, v6.4s[0]
-       fmla    v17.4s, v3.4s, v6.4s[0]
-       fmla    v18.4s, v2.4s, v6.4s[1]
-       fmla    v19.4s, v3.4s, v6.4s[1]
-       fmla    v20.4s, v2.4s, v6.4s[2]
-       fmla    v21.4s, v3.4s, v6.4s[2]
-       fmla    v22.4s, v2.4s, v6.4s[3]
-       fmla    v23.4s, v3.4s, v6.4s[3]
-       fmla    v24.4s, v2.4s, v7.4s[0]
-       fmla    v25.4s, v3.4s, v7.4s[0]
-       fmla    v26.4s, v2.4s, v7.4s[1]
-       fmla    v27.4s, v3.4s, v7.4s[1]
-       fmla    v28.4s, v2.4s, v7.4s[2]
-       fmla    v29.4s, v3.4s, v7.4s[2]
-       fmla    v30.4s, v2.4s, v7.4s[3]
-       fmla    v31.4s, v3.4s, v7.4s[3]
+       fmla    v16.4s, v2.4s, v6.s[0]
+       fmla    v17.4s, v3.4s, v6.s[0]
+       fmla    v18.4s, v2.4s, v6.s[1]
+       fmla    v19.4s, v3.4s, v6.s[1]
+       fmla    v20.4s, v2.4s, v6.s[2]
+       fmla    v21.4s, v3.4s, v6.s[2]
+       fmla    v22.4s, v2.4s, v6.s[3]
+       fmla    v23.4s, v3.4s, v6.s[3]
+       fmla    v24.4s, v2.4s, v7.s[0]
+       fmla    v25.4s, v3.4s, v7.s[0]
+       fmla    v26.4s, v2.4s, v7.s[1]
+       fmla    v27.4s, v3.4s, v7.s[1]
+       fmla    v28.4s, v2.4s, v7.s[2]
+       fmla    v29.4s, v3.4s, v7.s[2]
+       fmla    v30.4s, v2.4s, v7.s[3]
+       fmla    v31.4s, v3.4s, v7.s[3]
 
        ld1     {v4.4s}, [pB]
        add     pB, pB, #16
@@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x8_E
-       fmla    v16.4s, v2.4s, v6.4s[0]
-       fmla    v17.4s, v3.4s, v6.4s[0]
-       fmla    v18.4s, v2.4s, v6.4s[1]
-       fmla    v19.4s, v3.4s, v6.4s[1]
-       fmla    v20.4s, v2.4s, v6.4s[2]
-       fmla    v21.4s, v3.4s, v6.4s[2]
-       fmla    v22.4s, v2.4s, v6.4s[3]
-       fmla    v23.4s, v3.4s, v6.4s[3]
-       fmla    v24.4s, v2.4s, v7.4s[0]
-       fmla    v25.4s, v3.4s, v7.4s[0]
-       fmla    v26.4s, v2.4s, v7.4s[1]
-       fmla    v27.4s, v3.4s, v7.4s[1]
-       fmla    v28.4s, v2.4s, v7.4s[2]
-       fmla    v29.4s, v3.4s, v7.4s[2]
-       fmla    v30.4s, v2.4s, v7.4s[3]
-       fmla    v31.4s, v3.4s, v7.4s[3]
+       fmla    v16.4s, v2.4s, v6.s[0]
+       fmla    v17.4s, v3.4s, v6.s[0]
+       fmla    v18.4s, v2.4s, v6.s[1]
+       fmla    v19.4s, v3.4s, v6.s[1]
+       fmla    v20.4s, v2.4s, v6.s[2]
+       fmla    v21.4s, v3.4s, v6.s[2]
+       fmla    v22.4s, v2.4s, v6.s[3]
+       fmla    v23.4s, v3.4s, v6.s[3]
+       fmla    v24.4s, v2.4s, v7.s[0]
+       fmla    v25.4s, v3.4s, v7.s[0]
+       fmla    v26.4s, v2.4s, v7.s[1]
+       fmla    v27.4s, v3.4s, v7.s[1]
+       fmla    v28.4s, v2.4s, v7.s[2]
+       fmla    v29.4s, v3.4s, v7.s[2]
+       fmla    v30.4s, v2.4s, v7.s[3]
+       fmla    v31.4s, v3.4s, v7.s[3]
 .endm
 
 .macro KERNEL8x8_SUB
@@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v4.4s[0]
-       fmla    v17.4s, v1.4s, v4.4s[0]
-       fmla    v18.4s, v0.4s, v4.4s[1]
-       fmla    v19.4s, v1.4s, v4.4s[1]
-       fmla    v20.4s, v0.4s, v4.4s[2]
-       fmla    v21.4s, v1.4s, v4.4s[2]
-       fmla    v22.4s, v0.4s, v4.4s[3]
-       fmla    v23.4s, v1.4s, v4.4s[3]
-       fmla    v24.4s, v0.4s, v5.4s[0]
-       fmla    v25.4s, v1.4s, v5.4s[0]
-       fmla    v26.4s, v0.4s, v5.4s[1]
-       fmla    v27.4s, v1.4s, v5.4s[1]
-       fmla    v28.4s, v0.4s, v5.4s[2]
-       fmla    v29.4s, v1.4s, v5.4s[2]
-       fmla    v30.4s, v0.4s, v5.4s[3]
-       fmla    v31.4s, v1.4s, v5.4s[3]
+       fmla    v16.4s, v0.4s, v4.s[0]
+       fmla    v17.4s, v1.4s, v4.s[0]
+       fmla    v18.4s, v0.4s, v4.s[1]
+       fmla    v19.4s, v1.4s, v4.s[1]
+       fmla    v20.4s, v0.4s, v4.s[2]
+       fmla    v21.4s, v1.4s, v4.s[2]
+       fmla    v22.4s, v0.4s, v4.s[3]
+       fmla    v23.4s, v1.4s, v4.s[3]
+       fmla    v24.4s, v0.4s, v5.s[0]
+       fmla    v25.4s, v1.4s, v5.s[0]
+       fmla    v26.4s, v0.4s, v5.s[1]
+       fmla    v27.4s, v1.4s, v5.s[1]
+       fmla    v28.4s, v0.4s, v5.s[2]
+       fmla    v29.4s, v1.4s, v5.s[2]
+       fmla    v30.4s, v0.4s, v5.s[3]
+       fmla    v31.4s, v1.4s, v5.s[3]
 .endm
 
 .macro SAVE8x8
@@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v4.4s[0]
-       fmul    v18.4s, v0.4s, v4.4s[1]
-       fmul    v20.4s, v0.4s, v4.4s[2]
-       fmul    v22.4s, v0.4s, v4.4s[3]
-       fmul    v24.4s, v0.4s, v5.4s[0]
-       fmul    v26.4s, v0.4s, v5.4s[1]
-       fmul    v28.4s, v0.4s, v5.4s[2]
-       fmul    v30.4s, v0.4s, v5.4s[3]
+       fmul    v16.4s, v0.4s, v4.s[0]
+       fmul    v18.4s, v0.4s, v4.s[1]
+       fmul    v20.4s, v0.4s, v4.s[2]
+       fmul    v22.4s, v0.4s, v4.s[3]
+       fmul    v24.4s, v0.4s, v5.s[0]
+       fmul    v26.4s, v0.4s, v5.s[1]
+       fmul    v28.4s, v0.4s, v5.s[2]
+       fmul    v30.4s, v0.4s, v5.s[3]
 
        ld1     {v6.4s}, [pB]
        add     pB, pB, #16
@@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_M1
-       fmla    v16.4s, v0.4s, v4.4s[0]
-       fmla    v18.4s, v0.4s, v4.4s[1]
-       fmla    v20.4s, v0.4s, v4.4s[2]
-       fmla    v22.4s, v0.4s, v4.4s[3]
-       fmla    v24.4s, v0.4s, v5.4s[0]
-       fmla    v26.4s, v0.4s, v5.4s[1]
-       fmla    v28.4s, v0.4s, v5.4s[2]
-       fmla    v30.4s, v0.4s, v5.4s[3]
+       fmla    v16.4s, v0.4s, v4.s[0]
+       fmla    v18.4s, v0.4s, v4.s[1]
+       fmla    v20.4s, v0.4s, v4.s[2]
+       fmla    v22.4s, v0.4s, v4.s[3]
+       fmla    v24.4s, v0.4s, v5.s[0]
+       fmla    v26.4s, v0.4s, v5.s[1]
+       fmla    v28.4s, v0.4s, v5.s[2]
+       fmla    v30.4s, v0.4s, v5.s[3]
 
        ld1     {v6.4s}, [pB]
        add     pB, pB, #16
@@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_M2
-       fmla    v16.4s, v2.4s, v6.4s[0]
-       fmla    v18.4s, v2.4s, v6.4s[1]
-       fmla    v20.4s, v2.4s, v6.4s[2]
-       fmla    v22.4s, v2.4s, v6.4s[3]
-       fmla    v24.4s, v2.4s, v7.4s[0]
-       fmla    v26.4s, v2.4s, v7.4s[1]
-       fmla    v28.4s, v2.4s, v7.4s[2]
-       fmla    v30.4s, v2.4s, v7.4s[3]
+       fmla    v16.4s, v2.4s, v6.s[0]
+       fmla    v18.4s, v2.4s, v6.s[1]
+       fmla    v20.4s, v2.4s, v6.s[2]
+       fmla    v22.4s, v2.4s, v6.s[3]
+       fmla    v24.4s, v2.4s, v7.s[0]
+       fmla    v26.4s, v2.4s, v7.s[1]
+       fmla    v28.4s, v2.4s, v7.s[2]
+       fmla    v30.4s, v2.4s, v7.s[3]
 
        ld1     {v4.4s}, [pB]
        add     pB, pB, #16
@@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_E
-       fmla    v16.4s, v2.4s, v6.4s[0]
-       fmla    v18.4s, v2.4s, v6.4s[1]
-       fmla    v20.4s, v2.4s, v6.4s[2]
-       fmla    v22.4s, v2.4s, v6.4s[3]
-       fmla    v24.4s, v2.4s, v7.4s[0]
-       fmla    v26.4s, v2.4s, v7.4s[1]
-       fmla    v28.4s, v2.4s, v7.4s[2]
-       fmla    v30.4s, v2.4s, v7.4s[3]
+       fmla    v16.4s, v2.4s, v6.s[0]
+       fmla    v18.4s, v2.4s, v6.s[1]
+       fmla    v20.4s, v2.4s, v6.s[2]
+       fmla    v22.4s, v2.4s, v6.s[3]
+       fmla    v24.4s, v2.4s, v7.s[0]
+       fmla    v26.4s, v2.4s, v7.s[1]
+       fmla    v28.4s, v2.4s, v7.s[2]
+       fmla    v30.4s, v2.4s, v7.s[3]
 .endm
 
 .macro KERNEL4x8_SUB
@@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v4.4s[0]
-       fmla    v18.4s, v0.4s, v4.4s[1]
-       fmla    v20.4s, v0.4s, v4.4s[2]
-       fmla    v22.4s, v0.4s, v4.4s[3]
-       fmla    v24.4s, v0.4s, v5.4s[0]
-       fmla    v26.4s, v0.4s, v5.4s[1]
-       fmla    v28.4s, v0.4s, v5.4s[2]
-       fmla    v30.4s, v0.4s, v5.4s[3]
+       fmla    v16.4s, v0.4s, v4.s[0]
+       fmla    v18.4s, v0.4s, v4.s[1]
+       fmla    v20.4s, v0.4s, v4.s[2]
+       fmla    v22.4s, v0.4s, v4.s[3]
+       fmla    v24.4s, v0.4s, v5.s[0]
+       fmla    v26.4s, v0.4s, v5.s[1]
+       fmla    v28.4s, v0.4s, v5.s[2]
+       fmla    v30.4s, v0.4s, v5.s[3]
 .endm
 
 .macro SAVE4x8
@@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v4.4s[0]
-       fmla    v18.2s, v0.2s, v4.4s[1]
-       fmla    v20.2s, v0.2s, v4.4s[2]
-       fmla    v22.2s, v0.2s, v4.4s[3]
-       fmla    v24.2s, v0.2s, v5.4s[0]
-       fmla    v26.2s, v0.2s, v5.4s[1]
-       fmla    v28.2s, v0.2s, v5.4s[2]
-       fmla    v30.2s, v0.2s, v5.4s[3]
+       fmla    v16.2s, v0.2s, v4.s[0]
+       fmla    v18.2s, v0.2s, v4.s[1]
+       fmla    v20.2s, v0.2s, v4.s[2]
+       fmla    v22.2s, v0.2s, v4.s[3]
+       fmla    v24.2s, v0.2s, v5.s[0]
+       fmla    v26.2s, v0.2s, v5.s[1]
+       fmla    v28.2s, v0.2s, v5.s[2]
+       fmla    v30.2s, v0.2s, v5.s[3]
 .endm
 
 .macro SAVE2x8
@@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     s0, [pA]
        add     pA, pA, #4
 
-       fmla    s16, s0, v4.4s[0]
-       fmla    s18, s0, v4.4s[1]
-       fmla    s20, s0, v4.4s[2]
-       fmla    s22, s0, v4.4s[3]
-       fmla    s24, s0, v5.4s[0]
-       fmla    s26, s0, v5.4s[1]
-       fmla    s28, s0, v5.4s[2]
-       fmla    s30, s0, v5.4s[3]
+       fmla    s16, s0, v4.s[0]
+       fmla    s18, s0, v4.s[1]
+       fmla    s20, s0, v4.s[2]
+       fmla    s22, s0, v4.s[3]
+       fmla    s24, s0, v5.s[0]
+       fmla    s26, s0, v5.s[1]
+       fmla    s28, s0, v5.s[2]
+       fmla    s30, s0, v5.s[3]
 .endm
 
 .macro SAVE1x8
@@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v8.2s[0]
-       fmul    v17.4s, v1.4s, v8.2s[0]
-       fmul    v20.4s, v0.4s, v8.2s[1]
-       fmul    v21.4s, v1.4s, v8.2s[1]
-       fmul    v24.4s, v0.4s, v9.2s[0]
-       fmul    v25.4s, v1.4s, v9.2s[0]
-       fmul    v28.4s, v0.4s, v9.2s[1]
-       fmul    v29.4s, v1.4s, v9.2s[1]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       fmul    v17.4s, v1.4s, v8.s[0]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       fmul    v21.4s, v1.4s, v8.s[1]
+       fmul    v24.4s, v0.4s, v9.s[0]
+       fmul    v25.4s, v1.4s, v9.s[0]
+       fmul    v28.4s, v0.4s, v9.s[1]
+       fmul    v29.4s, v1.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M1
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M2
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]
        add     pB, pB, #16
@@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_E
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
 .endm
 
 .macro KERNEL8x4_SUB
@@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
 .endm
 
 .macro SAVE8x4
@@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.2s, v0.2s, v8.2s[0]
-       fmul    v29.2s, v1.2s, v9.2s[1]
+       fmul    v16.2s, v0.2s, v8.s[0]
+       fmul    v29.2s, v1.2s, v9.s[1]
 
-       fmul    v20.2s, v0.2s, v8.2s[1]
-       fmul    v25.2s, v1.2s, v9.2s[0]
+       fmul    v20.2s, v0.2s, v8.s[1]
+       fmul    v25.2s, v1.2s, v9.s[0]
 
-       fmul    v24.2s, v0.2s, v9.2s[0]
-       fmul    v21.2s, v1.2s, v8.2s[1]
+       fmul    v24.2s, v0.2s, v9.s[0]
+       fmul    v21.2s, v1.2s, v8.s[1]
 
-       fmul    v28.2s, v0.2s, v9.2s[1]
-       fmul    v17.2s, v1.2s, v8.2s[0]
+       fmul    v28.2s, v0.2s, v9.s[1]
+       fmul    v17.2s, v1.2s, v8.s[0]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]          // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
        ld1     {v4.2s, v5.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro KERNEL4x4_M2
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]            // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
        ld1     {v0.2s, v1.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_E
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x4
@@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v28.2s, v0.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
 .endm
 
 .macro SAVE2x4
@@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
 
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
 .endm
 
 .macro SAVE8x2
@@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     s0 , [pA]
        add     pA, pA, #4
 
-       fmla    v16.2s, v8.2s, v0.2s[0]
+       fmla    v16.2s, v8.2s, v0.s[0]
 .endm
 
 .macro SAVE1x2
@@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
 .endm
 
 .macro SAVE8x1
@@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x1
@@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA , pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
 .endm
 
 .macro SAVE2x1
old mode 100755 (executable)
new mode 100644 (file)
index b99760a..28b3216
@@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v3.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v8.2s[0]
-       fmul    v17.4s, v1.4s, v8.2s[0]
-       fmul    v18.4s, v2.4s, v8.2s[0]
-       fmul    v19.4s, v3.4s, v8.2s[0]
-
-       fmul    v20.4s, v0.4s, v8.2s[1]
-       fmul    v21.4s, v1.4s, v8.2s[1]
-       fmul    v22.4s, v2.4s, v8.2s[1]
-       fmul    v23.4s, v3.4s, v8.2s[1]
-
-       fmul    v24.4s, v0.4s, v9.2s[0]
-       fmul    v25.4s, v1.4s, v9.2s[0]
-       fmul    v26.4s, v2.4s, v9.2s[0]
-       fmul    v27.4s, v3.4s, v9.2s[0]
-
-       fmul    v28.4s, v0.4s, v9.2s[1]
-       fmul    v29.4s, v1.4s, v9.2s[1]
-       fmul    v30.4s, v2.4s, v9.2s[1]
-       fmul    v31.4s, v3.4s, v9.2s[1]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       fmul    v17.4s, v1.4s, v8.s[0]
+       fmul    v18.4s, v2.4s, v8.s[0]
+       fmul    v19.4s, v3.4s, v8.s[0]
+
+       fmul    v20.4s, v0.4s, v8.s[1]
+       fmul    v21.4s, v1.4s, v8.s[1]
+       fmul    v22.4s, v2.4s, v8.s[1]
+       fmul    v23.4s, v3.4s, v8.s[1]
+
+       fmul    v24.4s, v0.4s, v9.s[0]
+       fmul    v25.4s, v1.4s, v9.s[0]
+       fmul    v26.4s, v2.4s, v9.s[0]
+       fmul    v27.4s, v3.4s, v9.s[0]
+
+       fmul    v28.4s, v0.4s, v9.s[1]
+       fmul    v29.4s, v1.4s, v9.s[1]
+       fmul    v30.4s, v2.4s, v9.s[1]
+       fmul    v31.4s, v3.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL16x4_M1
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v18.4s, v2.4s, v8.2s[0]
-       fmla    v19.4s, v3.4s, v8.2s[0]
-
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v22.4s, v2.4s, v8.2s[1]
-       fmla    v23.4s, v3.4s, v8.2s[1]
-
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v26.4s, v2.4s, v9.2s[0]
-       fmla    v27.4s, v3.4s, v9.2s[0]
-
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
-       fmla    v30.4s, v2.4s, v9.2s[1]
-       fmla    v31.4s, v3.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v18.4s, v2.4s, v8.s[0]
+       fmla    v19.4s, v3.4s, v8.s[0]
+
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v22.4s, v2.4s, v8.s[1]
+       fmla    v23.4s, v3.4s, v8.s[1]
+
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v26.4s, v2.4s, v9.s[0]
+       fmla    v27.4s, v3.4s, v9.s[0]
+
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
+       fmla    v30.4s, v2.4s, v9.s[1]
+       fmla    v31.4s, v3.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL16x4_M2
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v18.4s, v6.4s, v12.2s[0]
-       fmla    v19.4s, v7.4s, v12.2s[0]
-
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v22.4s, v6.4s, v12.2s[1]
-       fmla    v23.4s, v7.4s, v12.2s[1]
-
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v26.4s, v6.4s, v13.2s[0]
-       fmla    v27.4s, v7.4s, v13.2s[0]
-
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
-       fmla    v30.4s, v6.4s, v13.2s[1]
-       fmla    v31.4s, v7.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v18.4s, v6.4s, v12.s[0]
+       fmla    v19.4s, v7.4s, v12.s[0]
+
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v22.4s, v6.4s, v12.s[1]
+       fmla    v23.4s, v7.4s, v12.s[1]
+
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v26.4s, v6.4s, v13.s[0]
+       fmla    v27.4s, v7.4s, v13.s[0]
+
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
+       fmla    v30.4s, v6.4s, v13.s[1]
+       fmla    v31.4s, v7.4s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]
        add     pB, pB, #16
@@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL16x4_E
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v18.4s, v6.4s, v12.2s[0]
-       fmla    v19.4s, v7.4s, v12.2s[0]
-
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v22.4s, v6.4s, v12.2s[1]
-       fmla    v23.4s, v7.4s, v12.2s[1]
-
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v26.4s, v6.4s, v13.2s[0]
-       fmla    v27.4s, v7.4s, v13.2s[0]
-
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
-       fmla    v30.4s, v6.4s, v13.2s[1]
-       fmla    v31.4s, v7.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v18.4s, v6.4s, v12.s[0]
+       fmla    v19.4s, v7.4s, v12.s[0]
+
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v22.4s, v6.4s, v12.s[1]
+       fmla    v23.4s, v7.4s, v12.s[1]
+
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v26.4s, v6.4s, v13.s[0]
+       fmla    v27.4s, v7.4s, v13.s[0]
+
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
+       fmla    v30.4s, v6.4s, v13.s[1]
+       fmla    v31.4s, v7.4s, v13.s[1]
 .endm
 
 .macro KERNEL16x4_SUB
@@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v3.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v18.4s, v2.4s, v8.2s[0]
-       fmla    v19.4s, v3.4s, v8.2s[0]
-
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v22.4s, v2.4s, v8.2s[1]
-       fmla    v23.4s, v3.4s, v8.2s[1]
-
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v26.4s, v2.4s, v9.2s[0]
-       fmla    v27.4s, v3.4s, v9.2s[0]
-
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
-       fmla    v30.4s, v2.4s, v9.2s[1]
-       fmla    v31.4s, v3.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v18.4s, v2.4s, v8.s[0]
+       fmla    v19.4s, v3.4s, v8.s[0]
+
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v22.4s, v2.4s, v8.s[1]
+       fmla    v23.4s, v3.4s, v8.s[1]
+
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v26.4s, v2.4s, v9.s[0]
+       fmla    v27.4s, v3.4s, v9.s[0]
+
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
+       fmla    v30.4s, v2.4s, v9.s[1]
+       fmla    v31.4s, v3.4s, v9.s[1]
 .endm
 
 .macro SAVE16x4
@@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v8.2s[0]
-       fmul    v17.4s, v1.4s, v8.2s[0]
-       fmul    v20.4s, v0.4s, v8.2s[1]
-       fmul    v21.4s, v1.4s, v8.2s[1]
-       fmul    v24.4s, v0.4s, v9.2s[0]
-       fmul    v25.4s, v1.4s, v9.2s[0]
-       fmul    v28.4s, v0.4s, v9.2s[1]
-       fmul    v29.4s, v1.4s, v9.2s[1]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       fmul    v17.4s, v1.4s, v8.s[0]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       fmul    v21.4s, v1.4s, v8.s[1]
+       fmul    v24.4s, v0.4s, v9.s[0]
+       fmul    v25.4s, v1.4s, v9.s[0]
+       fmul    v28.4s, v0.4s, v9.s[1]
+       fmul    v29.4s, v1.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M1
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M2
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]
        add     pB, pB, #16
@@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_E
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
 .endm
 
 .macro KERNEL8x4_SUB
@@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
 .endm
 
 .macro SAVE8x4
@@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.2s, v0.2s, v8.2s[0]
-       fmul    v29.2s, v1.2s, v9.2s[1]
+       fmul    v16.2s, v0.2s, v8.s[0]
+       fmul    v29.2s, v1.2s, v9.s[1]
 
-       fmul    v20.2s, v0.2s, v8.2s[1]
-       fmul    v25.2s, v1.2s, v9.2s[0]
+       fmul    v20.2s, v0.2s, v8.s[1]
+       fmul    v25.2s, v1.2s, v9.s[0]
 
-       fmul    v24.2s, v0.2s, v9.2s[0]
-       fmul    v21.2s, v1.2s, v8.2s[1]
+       fmul    v24.2s, v0.2s, v9.s[0]
+       fmul    v21.2s, v1.2s, v8.s[1]
 
-       fmul    v28.2s, v0.2s, v9.2s[1]
-       fmul    v17.2s, v1.2s, v8.2s[0]
+       fmul    v28.2s, v0.2s, v9.s[1]
+       fmul    v17.2s, v1.2s, v8.s[0]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]          // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
        ld1     {v4.2s, v5.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro KERNEL4x4_M2
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]            // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
        ld1     {v0.2s, v1.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_E
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x4
@@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v28.2s, v0.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
 .endm
 
 .macro SAVE2x4
@@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v3.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v18.4s, v2.4s, v8.2s[0]
-       fmla    v19.4s, v3.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v18.4s, v2.4s, v8.s[0]
+       fmla    v19.4s, v3.4s, v8.s[0]
 
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v22.4s, v2.4s, v8.2s[1]
-       fmla    v23.4s, v3.4s, v8.2s[1]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v22.4s, v2.4s, v8.s[1]
+       fmla    v23.4s, v3.4s, v8.s[1]
 .endm
 
 .macro SAVE16x2
@@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
 
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
 .endm
 
 .macro SAVE8x2
@@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     s0 , [pA]
        add     pA, pA, #4
 
-       fmla    v16.2s, v8.2s, v0.2s[0]
+       fmla    v16.2s, v8.2s, v0.s[0]
 .endm
 
 .macro SAVE1x2
@@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v3.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v18.4s, v2.4s, v8.2s[0]
-       fmla    v19.4s, v3.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v18.4s, v2.4s, v8.s[0]
+       fmla    v19.4s, v3.4s, v8.s[0]
 .endm
 
 .macro SAVE16x1
@@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
 .endm
 
 .macro SAVE8x1
@@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x1
@@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA , pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
 .endm
 
 .macro SAVE2x1
index 674e200..eeb3e6e 100644 (file)
@@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.2s, v0.2s, v8.2s[0]
-       fmul    v29.2s, v1.2s, v9.2s[1]
+       fmul    v16.2s, v0.2s, v8.s[0]
+       fmul    v29.2s, v1.2s, v9.s[1]
 
-       fmul    v20.2s, v0.2s, v8.2s[1]
-       fmul    v25.2s, v1.2s, v9.2s[0]
+       fmul    v20.2s, v0.2s, v8.s[1]
+       fmul    v25.2s, v1.2s, v9.s[0]
 
-       fmul    v24.2s, v0.2s, v9.2s[0]
-       fmul    v21.2s, v1.2s, v8.2s[1]
+       fmul    v24.2s, v0.2s, v9.s[0]
+       fmul    v21.2s, v1.2s, v8.s[1]
 
-       fmul    v28.2s, v0.2s, v9.2s[1]
-       fmul    v17.2s, v1.2s, v8.2s[0]
+       fmul    v28.2s, v0.2s, v9.s[1]
+       fmul    v17.2s, v1.2s, v8.s[0]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]          // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
        ld1     {v4.2s, v5.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro KERNEL4x4_M2
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]            // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
        ld1     {v0.2s, v1.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_E
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x4
@@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v28.2s, v0.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
 .endm
 
 .macro SAVE2x4
@@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     s0 , [pA]
        add     pA, pA, #4
 
-       fmla    v16.2s, v8.2s, v0.2s[0]
+       fmla    v16.2s, v8.2s, v0.s[0]
 .endm
 
 .macro SAVE1x2
@@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x1
@@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA , pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
 .endm
 
 .macro SAVE2x1
old mode 100755 (executable)
new mode 100644 (file)
index 98b9129..843f0c8
@@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v4.4s[0]
-       fmul    v17.4s, v1.4s, v4.4s[0]
-       fmul    v18.4s, v0.4s, v4.4s[1]
-       fmul    v19.4s, v1.4s, v4.4s[1]
-       fmul    v20.4s, v0.4s, v4.4s[2]
-       fmul    v21.4s, v1.4s, v4.4s[2]
-       fmul    v22.4s, v0.4s, v4.4s[3]
-       fmul    v23.4s, v1.4s, v4.4s[3]
-       fmul    v24.4s, v0.4s, v5.4s[0]
-       fmul    v25.4s, v1.4s, v5.4s[0]
-       fmul    v26.4s, v0.4s, v5.4s[1]
-       fmul    v27.4s, v1.4s, v5.4s[1]
-       fmul    v28.4s, v0.4s, v5.4s[2]
-       fmul    v29.4s, v1.4s, v5.4s[2]
-       fmul    v30.4s, v0.4s, v5.4s[3]
-       fmul    v31.4s, v1.4s, v5.4s[3]
+       fmul    v16.4s, v0.4s, v4.s[0]
+       fmul    v17.4s, v1.4s, v4.s[0]
+       fmul    v18.4s, v0.4s, v4.s[1]
+       fmul    v19.4s, v1.4s, v4.s[1]
+       fmul    v20.4s, v0.4s, v4.s[2]
+       fmul    v21.4s, v1.4s, v4.s[2]
+       fmul    v22.4s, v0.4s, v4.s[3]
+       fmul    v23.4s, v1.4s, v4.s[3]
+       fmul    v24.4s, v0.4s, v5.s[0]
+       fmul    v25.4s, v1.4s, v5.s[0]
+       fmul    v26.4s, v0.4s, v5.s[1]
+       fmul    v27.4s, v1.4s, v5.s[1]
+       fmul    v28.4s, v0.4s, v5.s[2]
+       fmul    v29.4s, v1.4s, v5.s[2]
+       fmul    v30.4s, v0.4s, v5.s[3]
+       fmul    v31.4s, v1.4s, v5.s[3]
 
        ld1     {v6.4s}, [pB]
        add     pB, pB, #16
@@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x8_M1
-       fmla    v16.4s, v0.4s, v4.4s[0]
-       fmla    v17.4s, v1.4s, v4.4s[0]
-       fmla    v18.4s, v0.4s, v4.4s[1]
-       fmla    v19.4s, v1.4s, v4.4s[1]
-       fmla    v20.4s, v0.4s, v4.4s[2]
-       fmla    v21.4s, v1.4s, v4.4s[2]
-       fmla    v22.4s, v0.4s, v4.4s[3]
-       fmla    v23.4s, v1.4s, v4.4s[3]
-       fmla    v24.4s, v0.4s, v5.4s[0]
-       fmla    v25.4s, v1.4s, v5.4s[0]
-       fmla    v26.4s, v0.4s, v5.4s[1]
-       fmla    v27.4s, v1.4s, v5.4s[1]
-       fmla    v28.4s, v0.4s, v5.4s[2]
-       fmla    v29.4s, v1.4s, v5.4s[2]
-       fmla    v30.4s, v0.4s, v5.4s[3]
-       fmla    v31.4s, v1.4s, v5.4s[3]
+       fmla    v16.4s, v0.4s, v4.s[0]
+       fmla    v17.4s, v1.4s, v4.s[0]
+       fmla    v18.4s, v0.4s, v4.s[1]
+       fmla    v19.4s, v1.4s, v4.s[1]
+       fmla    v20.4s, v0.4s, v4.s[2]
+       fmla    v21.4s, v1.4s, v4.s[2]
+       fmla    v22.4s, v0.4s, v4.s[3]
+       fmla    v23.4s, v1.4s, v4.s[3]
+       fmla    v24.4s, v0.4s, v5.s[0]
+       fmla    v25.4s, v1.4s, v5.s[0]
+       fmla    v26.4s, v0.4s, v5.s[1]
+       fmla    v27.4s, v1.4s, v5.s[1]
+       fmla    v28.4s, v0.4s, v5.s[2]
+       fmla    v29.4s, v1.4s, v5.s[2]
+       fmla    v30.4s, v0.4s, v5.s[3]
+       fmla    v31.4s, v1.4s, v5.s[3]
 
        ld1     {v6.4s}, [pB]
        add     pB, pB, #16
@@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x8_M2
-       fmla    v16.4s, v2.4s, v6.4s[0]
-       fmla    v17.4s, v3.4s, v6.4s[0]
-       fmla    v18.4s, v2.4s, v6.4s[1]
-       fmla    v19.4s, v3.4s, v6.4s[1]
-       fmla    v20.4s, v2.4s, v6.4s[2]
-       fmla    v21.4s, v3.4s, v6.4s[2]
-       fmla    v22.4s, v2.4s, v6.4s[3]
-       fmla    v23.4s, v3.4s, v6.4s[3]
-       fmla    v24.4s, v2.4s, v7.4s[0]
-       fmla    v25.4s, v3.4s, v7.4s[0]
-       fmla    v26.4s, v2.4s, v7.4s[1]
-       fmla    v27.4s, v3.4s, v7.4s[1]
-       fmla    v28.4s, v2.4s, v7.4s[2]
-       fmla    v29.4s, v3.4s, v7.4s[2]
-       fmla    v30.4s, v2.4s, v7.4s[3]
-       fmla    v31.4s, v3.4s, v7.4s[3]
+       fmla    v16.4s, v2.4s, v6.s[0]
+       fmla    v17.4s, v3.4s, v6.s[0]
+       fmla    v18.4s, v2.4s, v6.s[1]
+       fmla    v19.4s, v3.4s, v6.s[1]
+       fmla    v20.4s, v2.4s, v6.s[2]
+       fmla    v21.4s, v3.4s, v6.s[2]
+       fmla    v22.4s, v2.4s, v6.s[3]
+       fmla    v23.4s, v3.4s, v6.s[3]
+       fmla    v24.4s, v2.4s, v7.s[0]
+       fmla    v25.4s, v3.4s, v7.s[0]
+       fmla    v26.4s, v2.4s, v7.s[1]
+       fmla    v27.4s, v3.4s, v7.s[1]
+       fmla    v28.4s, v2.4s, v7.s[2]
+       fmla    v29.4s, v3.4s, v7.s[2]
+       fmla    v30.4s, v2.4s, v7.s[3]
+       fmla    v31.4s, v3.4s, v7.s[3]
 
        ld1     {v4.4s}, [pB]
        add     pB, pB, #16
@@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x8_E
-       fmla    v16.4s, v2.4s, v6.4s[0]
-       fmla    v17.4s, v3.4s, v6.4s[0]
-       fmla    v18.4s, v2.4s, v6.4s[1]
-       fmla    v19.4s, v3.4s, v6.4s[1]
-       fmla    v20.4s, v2.4s, v6.4s[2]
-       fmla    v21.4s, v3.4s, v6.4s[2]
-       fmla    v22.4s, v2.4s, v6.4s[3]
-       fmla    v23.4s, v3.4s, v6.4s[3]
-       fmla    v24.4s, v2.4s, v7.4s[0]
-       fmla    v25.4s, v3.4s, v7.4s[0]
-       fmla    v26.4s, v2.4s, v7.4s[1]
-       fmla    v27.4s, v3.4s, v7.4s[1]
-       fmla    v28.4s, v2.4s, v7.4s[2]
-       fmla    v29.4s, v3.4s, v7.4s[2]
-       fmla    v30.4s, v2.4s, v7.4s[3]
-       fmla    v31.4s, v3.4s, v7.4s[3]
+       fmla    v16.4s, v2.4s, v6.s[0]
+       fmla    v17.4s, v3.4s, v6.s[0]
+       fmla    v18.4s, v2.4s, v6.s[1]
+       fmla    v19.4s, v3.4s, v6.s[1]
+       fmla    v20.4s, v2.4s, v6.s[2]
+       fmla    v21.4s, v3.4s, v6.s[2]
+       fmla    v22.4s, v2.4s, v6.s[3]
+       fmla    v23.4s, v3.4s, v6.s[3]
+       fmla    v24.4s, v2.4s, v7.s[0]
+       fmla    v25.4s, v3.4s, v7.s[0]
+       fmla    v26.4s, v2.4s, v7.s[1]
+       fmla    v27.4s, v3.4s, v7.s[1]
+       fmla    v28.4s, v2.4s, v7.s[2]
+       fmla    v29.4s, v3.4s, v7.s[2]
+       fmla    v30.4s, v2.4s, v7.s[3]
+       fmla    v31.4s, v3.4s, v7.s[3]
 .endm
 
 .macro KERNEL8x8_SUB
@@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v4.4s[0]
-       fmla    v17.4s, v1.4s, v4.4s[0]
-       fmla    v18.4s, v0.4s, v4.4s[1]
-       fmla    v19.4s, v1.4s, v4.4s[1]
-       fmla    v20.4s, v0.4s, v4.4s[2]
-       fmla    v21.4s, v1.4s, v4.4s[2]
-       fmla    v22.4s, v0.4s, v4.4s[3]
-       fmla    v23.4s, v1.4s, v4.4s[3]
-       fmla    v24.4s, v0.4s, v5.4s[0]
-       fmla    v25.4s, v1.4s, v5.4s[0]
-       fmla    v26.4s, v0.4s, v5.4s[1]
-       fmla    v27.4s, v1.4s, v5.4s[1]
-       fmla    v28.4s, v0.4s, v5.4s[2]
-       fmla    v29.4s, v1.4s, v5.4s[2]
-       fmla    v30.4s, v0.4s, v5.4s[3]
-       fmla    v31.4s, v1.4s, v5.4s[3]
+       fmla    v16.4s, v0.4s, v4.s[0]
+       fmla    v17.4s, v1.4s, v4.s[0]
+       fmla    v18.4s, v0.4s, v4.s[1]
+       fmla    v19.4s, v1.4s, v4.s[1]
+       fmla    v20.4s, v0.4s, v4.s[2]
+       fmla    v21.4s, v1.4s, v4.s[2]
+       fmla    v22.4s, v0.4s, v4.s[3]
+       fmla    v23.4s, v1.4s, v4.s[3]
+       fmla    v24.4s, v0.4s, v5.s[0]
+       fmla    v25.4s, v1.4s, v5.s[0]
+       fmla    v26.4s, v0.4s, v5.s[1]
+       fmla    v27.4s, v1.4s, v5.s[1]
+       fmla    v28.4s, v0.4s, v5.s[2]
+       fmla    v29.4s, v1.4s, v5.s[2]
+       fmla    v30.4s, v0.4s, v5.s[3]
+       fmla    v31.4s, v1.4s, v5.s[3]
 .endm
 
 .macro SAVE8x8
@@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v4.4s[0]
-       fmul    v18.4s, v0.4s, v4.4s[1]
-       fmul    v20.4s, v0.4s, v4.4s[2]
-       fmul    v22.4s, v0.4s, v4.4s[3]
-       fmul    v24.4s, v0.4s, v5.4s[0]
-       fmul    v26.4s, v0.4s, v5.4s[1]
-       fmul    v28.4s, v0.4s, v5.4s[2]
-       fmul    v30.4s, v0.4s, v5.4s[3]
+       fmul    v16.4s, v0.4s, v4.s[0]
+       fmul    v18.4s, v0.4s, v4.s[1]
+       fmul    v20.4s, v0.4s, v4.s[2]
+       fmul    v22.4s, v0.4s, v4.s[3]
+       fmul    v24.4s, v0.4s, v5.s[0]
+       fmul    v26.4s, v0.4s, v5.s[1]
+       fmul    v28.4s, v0.4s, v5.s[2]
+       fmul    v30.4s, v0.4s, v5.s[3]
 
        ld1     {v6.4s}, [pB]
        add     pB, pB, #16
@@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_M1
-       fmla    v16.4s, v0.4s, v4.4s[0]
-       fmla    v18.4s, v0.4s, v4.4s[1]
-       fmla    v20.4s, v0.4s, v4.4s[2]
-       fmla    v22.4s, v0.4s, v4.4s[3]
-       fmla    v24.4s, v0.4s, v5.4s[0]
-       fmla    v26.4s, v0.4s, v5.4s[1]
-       fmla    v28.4s, v0.4s, v5.4s[2]
-       fmla    v30.4s, v0.4s, v5.4s[3]
+       fmla    v16.4s, v0.4s, v4.s[0]
+       fmla    v18.4s, v0.4s, v4.s[1]
+       fmla    v20.4s, v0.4s, v4.s[2]
+       fmla    v22.4s, v0.4s, v4.s[3]
+       fmla    v24.4s, v0.4s, v5.s[0]
+       fmla    v26.4s, v0.4s, v5.s[1]
+       fmla    v28.4s, v0.4s, v5.s[2]
+       fmla    v30.4s, v0.4s, v5.s[3]
 
        ld1     {v6.4s}, [pB]
        add     pB, pB, #16
@@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_M2
-       fmla    v16.4s, v2.4s, v6.4s[0]
-       fmla    v18.4s, v2.4s, v6.4s[1]
-       fmla    v20.4s, v2.4s, v6.4s[2]
-       fmla    v22.4s, v2.4s, v6.4s[3]
-       fmla    v24.4s, v2.4s, v7.4s[0]
-       fmla    v26.4s, v2.4s, v7.4s[1]
-       fmla    v28.4s, v2.4s, v7.4s[2]
-       fmla    v30.4s, v2.4s, v7.4s[3]
+       fmla    v16.4s, v2.4s, v6.s[0]
+       fmla    v18.4s, v2.4s, v6.s[1]
+       fmla    v20.4s, v2.4s, v6.s[2]
+       fmla    v22.4s, v2.4s, v6.s[3]
+       fmla    v24.4s, v2.4s, v7.s[0]
+       fmla    v26.4s, v2.4s, v7.s[1]
+       fmla    v28.4s, v2.4s, v7.s[2]
+       fmla    v30.4s, v2.4s, v7.s[3]
 
        ld1     {v4.4s}, [pB]
        add     pB, pB, #16
@@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x8_E
-       fmla    v16.4s, v2.4s, v6.4s[0]
-       fmla    v18.4s, v2.4s, v6.4s[1]
-       fmla    v20.4s, v2.4s, v6.4s[2]
-       fmla    v22.4s, v2.4s, v6.4s[3]
-       fmla    v24.4s, v2.4s, v7.4s[0]
-       fmla    v26.4s, v2.4s, v7.4s[1]
-       fmla    v28.4s, v2.4s, v7.4s[2]
-       fmla    v30.4s, v2.4s, v7.4s[3]
+       fmla    v16.4s, v2.4s, v6.s[0]
+       fmla    v18.4s, v2.4s, v6.s[1]
+       fmla    v20.4s, v2.4s, v6.s[2]
+       fmla    v22.4s, v2.4s, v6.s[3]
+       fmla    v24.4s, v2.4s, v7.s[0]
+       fmla    v26.4s, v2.4s, v7.s[1]
+       fmla    v28.4s, v2.4s, v7.s[2]
+       fmla    v30.4s, v2.4s, v7.s[3]
 .endm
 
 .macro KERNEL4x8_SUB
@@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v4.4s[0]
-       fmla    v18.4s, v0.4s, v4.4s[1]
-       fmla    v20.4s, v0.4s, v4.4s[2]
-       fmla    v22.4s, v0.4s, v4.4s[3]
-       fmla    v24.4s, v0.4s, v5.4s[0]
-       fmla    v26.4s, v0.4s, v5.4s[1]
-       fmla    v28.4s, v0.4s, v5.4s[2]
-       fmla    v30.4s, v0.4s, v5.4s[3]
+       fmla    v16.4s, v0.4s, v4.s[0]
+       fmla    v18.4s, v0.4s, v4.s[1]
+       fmla    v20.4s, v0.4s, v4.s[2]
+       fmla    v22.4s, v0.4s, v4.s[3]
+       fmla    v24.4s, v0.4s, v5.s[0]
+       fmla    v26.4s, v0.4s, v5.s[1]
+       fmla    v28.4s, v0.4s, v5.s[2]
+       fmla    v30.4s, v0.4s, v5.s[3]
 .endm
 
 .macro SAVE4x8
@@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v4.4s[0]
-       fmla    v18.2s, v0.2s, v4.4s[1]
-       fmla    v20.2s, v0.2s, v4.4s[2]
-       fmla    v22.2s, v0.2s, v4.4s[3]
-       fmla    v24.2s, v0.2s, v5.4s[0]
-       fmla    v26.2s, v0.2s, v5.4s[1]
-       fmla    v28.2s, v0.2s, v5.4s[2]
-       fmla    v30.2s, v0.2s, v5.4s[3]
+       fmla    v16.2s, v0.2s, v4.s[0]
+       fmla    v18.2s, v0.2s, v4.s[1]
+       fmla    v20.2s, v0.2s, v4.s[2]
+       fmla    v22.2s, v0.2s, v4.s[3]
+       fmla    v24.2s, v0.2s, v5.s[0]
+       fmla    v26.2s, v0.2s, v5.s[1]
+       fmla    v28.2s, v0.2s, v5.s[2]
+       fmla    v30.2s, v0.2s, v5.s[3]
 .endm
 
 .macro SAVE2x8
@@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     s0, [pA]
        add     pA, pA, #4
 
-       fmla    s16, s0, v4.4s[0]
-       fmla    s18, s0, v4.4s[1]
-       fmla    s20, s0, v4.4s[2]
-       fmla    s22, s0, v4.4s[3]
-       fmla    s24, s0, v5.4s[0]
-       fmla    s26, s0, v5.4s[1]
-       fmla    s28, s0, v5.4s[2]
-       fmla    s30, s0, v5.4s[3]
+       fmla    s16, s0, v4.s[0]
+       fmla    s18, s0, v4.s[1]
+       fmla    s20, s0, v4.s[2]
+       fmla    s22, s0, v4.s[3]
+       fmla    s24, s0, v5.s[0]
+       fmla    s26, s0, v5.s[1]
+       fmla    s28, s0, v5.s[2]
+       fmla    s30, s0, v5.s[3]
 .endm
 
 .macro SAVE1x8
@@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.4s, v0.4s, v8.2s[0]
-       fmul    v17.4s, v1.4s, v8.2s[0]
-       fmul    v20.4s, v0.4s, v8.2s[1]
-       fmul    v21.4s, v1.4s, v8.2s[1]
-       fmul    v24.4s, v0.4s, v9.2s[0]
-       fmul    v25.4s, v1.4s, v9.2s[0]
-       fmul    v28.4s, v0.4s, v9.2s[1]
-       fmul    v29.4s, v1.4s, v9.2s[1]
+       fmul    v16.4s, v0.4s, v8.s[0]
+       fmul    v17.4s, v1.4s, v8.s[0]
+       fmul    v20.4s, v0.4s, v8.s[1]
+       fmul    v21.4s, v1.4s, v8.s[1]
+       fmul    v24.4s, v0.4s, v9.s[0]
+       fmul    v25.4s, v1.4s, v9.s[0]
+       fmul    v28.4s, v0.4s, v9.s[1]
+       fmul    v29.4s, v1.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M1
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_M2
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]
        add     pB, pB, #16
@@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL8x4_E
-       fmla    v16.4s, v4.4s, v12.2s[0]
-       fmla    v17.4s, v5.4s, v12.2s[0]
-       fmla    v20.4s, v4.4s, v12.2s[1]
-       fmla    v21.4s, v5.4s, v12.2s[1]
-       fmla    v24.4s, v4.4s, v13.2s[0]
-       fmla    v25.4s, v5.4s, v13.2s[0]
-       fmla    v28.4s, v4.4s, v13.2s[1]
-       fmla    v29.4s, v5.4s, v13.2s[1]
+       fmla    v16.4s, v4.4s, v12.s[0]
+       fmla    v17.4s, v5.4s, v12.s[0]
+       fmla    v20.4s, v4.4s, v12.s[1]
+       fmla    v21.4s, v5.4s, v12.s[1]
+       fmla    v24.4s, v4.4s, v13.s[0]
+       fmla    v25.4s, v5.4s, v13.s[0]
+       fmla    v28.4s, v4.4s, v13.s[1]
+       fmla    v29.4s, v5.4s, v13.s[1]
 .endm
 
 .macro KERNEL8x4_SUB
@@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
-       fmla    v24.4s, v0.4s, v9.2s[0]
-       fmla    v25.4s, v1.4s, v9.2s[0]
-       fmla    v28.4s, v0.4s, v9.2s[1]
-       fmla    v29.4s, v1.4s, v9.2s[1]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
+       fmla    v24.4s, v0.4s, v9.s[0]
+       fmla    v25.4s, v1.4s, v9.s[0]
+       fmla    v28.4s, v0.4s, v9.s[1]
+       fmla    v29.4s, v1.4s, v9.s[1]
 .endm
 
 .macro SAVE8x4
@@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmul    v16.2s, v0.2s, v8.2s[0]
-       fmul    v29.2s, v1.2s, v9.2s[1]
+       fmul    v16.2s, v0.2s, v8.s[0]
+       fmul    v29.2s, v1.2s, v9.s[1]
 
-       fmul    v20.2s, v0.2s, v8.2s[1]
-       fmul    v25.2s, v1.2s, v9.2s[0]
+       fmul    v20.2s, v0.2s, v8.s[1]
+       fmul    v25.2s, v1.2s, v9.s[0]
 
-       fmul    v24.2s, v0.2s, v9.2s[0]
-       fmul    v21.2s, v1.2s, v8.2s[1]
+       fmul    v24.2s, v0.2s, v9.s[0]
+       fmul    v21.2s, v1.2s, v8.s[1]
 
-       fmul    v28.2s, v0.2s, v9.2s[1]
-       fmul    v17.2s, v1.2s, v8.2s[0]
+       fmul    v28.2s, v0.2s, v9.s[1]
+       fmul    v17.2s, v1.2s, v8.s[0]
 
        ld1     {v12.2s, v13.2s}, [pB]
        add     pB, pB, #16
@@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
        ld1     {v12.2s, v13.2s}, [pB]          // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
        ld1     {v4.2s, v5.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro KERNEL4x4_M2
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
        ld1     {v8.2s, v9.2s}, [pB]            // For next round
        add     pB, pB, #16
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
        ld1     {v0.2s, v1.2s}, [pA]            // For next round
        add     pA, pA, #16
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_E
-       fmla    v16.2s, v4.2s, v12.2s[0]
-       fmla    v29.2s, v5.2s, v13.2s[1]
+       fmla    v16.2s, v4.2s, v12.s[0]
+       fmla    v29.2s, v5.2s, v13.s[1]
 
-       fmla    v20.2s, v4.2s, v12.2s[1]
-       fmla    v25.2s, v5.2s, v13.2s[0]
+       fmla    v20.2s, v4.2s, v12.s[1]
+       fmla    v25.2s, v5.2s, v13.s[0]
 
-       fmla    v24.2s, v4.2s, v13.2s[0]
-       fmla    v21.2s, v5.2s, v12.2s[1]
+       fmla    v24.2s, v4.2s, v13.s[0]
+       fmla    v21.2s, v5.2s, v12.s[1]
 
-       fmla    v28.2s, v4.2s, v13.2s[1]
-       fmla    v17.2s, v5.2s, v12.2s[0]
+       fmla    v28.2s, v4.2s, v13.s[1]
+       fmla    v17.2s, v5.2s, v12.s[0]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v29.2s, v1.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v29.2s, v1.2s, v9.s[1]
 
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v25.2s, v1.2s, v9.2s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v25.2s, v1.2s, v9.s[0]
 
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v21.2s, v1.2s, v8.s[1]
 
-       fmla    v28.2s, v0.2s, v9.2s[1]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x4
@@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v24.2s, v0.2s, v9.2s[0]
-       fmla    v28.2s, v0.2s, v9.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v24.2s, v0.2s, v9.s[0]
+       fmla    v28.2s, v0.2s, v9.s[1]
 .endm
 
 .macro SAVE2x4
@@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
 
-       fmla    v20.4s, v0.4s, v8.2s[1]
-       fmla    v21.4s, v1.4s, v8.2s[1]
+       fmla    v20.4s, v0.4s, v8.s[1]
+       fmla    v21.4s, v1.4s, v8.s[1]
 .endm
 
 .macro SAVE8x2
@@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
-       fmla    v21.2s, v1.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
+       fmla    v21.2s, v1.2s, v8.s[1]
 .endm
 
 .macro SAVE4x2
@@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA, pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v20.2s, v0.2s, v8.2s[1]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v20.2s, v0.2s, v8.s[1]
 .endm
 
 .macro SAVE2x2
@@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ldr     s0 , [pA]
        add     pA, pA, #4
 
-       fmla    v16.2s, v8.2s, v0.2s[0]
+       fmla    v16.2s, v8.2s, v0.s[0]
 .endm
 
 .macro SAVE1x2
@@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v1.4s}, [pA]
        add     pA, pA, #16
 
-       fmla    v16.4s, v0.4s, v8.2s[0]
-       fmla    v17.4s, v1.4s, v8.2s[0]
+       fmla    v16.4s, v0.4s, v8.s[0]
+       fmla    v17.4s, v1.4s, v8.s[0]
 .endm
 
 .macro SAVE8x1
@@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s, v1.2s}, [pA]
        add     pA , pA, #16
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
-       fmla    v17.2s, v1.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
+       fmla    v17.2s, v1.2s, v8.s[0]
 .endm
 
 .macro SAVE4x1
@@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v0.2s}, [pA]
        add     pA , pA, #8
 
-       fmla    v16.2s, v0.2s, v8.2s[0]
+       fmla    v16.2s, v0.2s, v8.s[0]
 .endm
 
 .macro SAVE2x1
index 28ce3de..1cb695e 100644 (file)
@@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
-       fmls    v17.2d, v0.2d, v9.2d[0]
+       fmls    v17.2d, v0.2d, v9.d[0]
 #else
-       fmul    v17.2d, v0.2d, v9.2d[0]
+       fmul    v17.2d, v0.2d, v9.d[0]
 #endif
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
 
-       fmul    v18.2d, v2.2d, v8.2d[0]
-       OP_ii   v18.2d, v3.2d, v9.2d[0]
+       fmul    v18.2d, v2.2d, v8.d[0]
+       OP_ii   v18.2d, v3.2d, v9.d[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v19.16b, v19.16b, v19.16b
-       fmls    v19.2d, v2.2d, v9.2d[0]
+       fmls    v19.2d, v2.2d, v9.d[0]
 #else
-       fmul    v19.2d, v2.2d, v9.2d[0]
+       fmul    v19.2d, v2.2d, v9.d[0]
 #endif
-       OP_ir   v19.2d, v3.2d, v8.2d[0]
+       OP_ir   v19.2d, v3.2d, v8.d[0]
 
-       fmul    v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
+       fmul    v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
-       fmls    v21.2d, v0.2d, v9.2d[1]
+       fmls    v21.2d, v0.2d, v9.d[1]
 #else
-       fmul    v21.2d, v0.2d, v9.2d[1]
+       fmul    v21.2d, v0.2d, v9.d[1]
 #endif
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
 
-       fmul    v22.2d, v2.2d, v8.2d[1]
-       OP_ii   v22.2d, v3.2d, v9.2d[1]
+       fmul    v22.2d, v2.2d, v8.d[1]
+       OP_ii   v22.2d, v3.2d, v9.d[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v23.16b, v23.16b, v23.16b
-       fmls    v23.2d, v2.2d, v9.2d[1]
+       fmls    v23.2d, v2.2d, v9.d[1]
 #else
-       fmul    v23.2d, v2.2d, v9.2d[1]
+       fmul    v23.2d, v2.2d, v9.d[1]
 #endif
-       OP_ir   v23.2d, v3.2d, v8.2d[1]
+       OP_ir   v23.2d, v3.2d, v8.d[1]
 
-       fmul    v24.2d, v0.2d, v10.2d[0]
-       OP_ii   v24.2d, v1.2d, v11.2d[0]
+       fmul    v24.2d, v0.2d, v10.d[0]
+       OP_ii   v24.2d, v1.2d, v11.d[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
-       fmls    v25.2d, v0.2d, v11.2d[0]
+       fmls    v25.2d, v0.2d, v11.d[0]
 #else
-       fmul    v25.2d, v0.2d, v11.2d[0]
+       fmul    v25.2d, v0.2d, v11.d[0]
 #endif
-       OP_ir   v25.2d, v1.2d, v10.2d[0]
+       OP_ir   v25.2d, v1.2d, v10.d[0]
 
-       fmul    v26.2d, v2.2d, v10.2d[0]
-       OP_ii   v26.2d, v3.2d, v11.2d[0]
+       fmul    v26.2d, v2.2d, v10.d[0]
+       OP_ii   v26.2d, v3.2d, v11.d[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v27.16b, v27.16b, v27.16b
-       fmls    v27.2d, v2.2d, v11.2d[0]
+       fmls    v27.2d, v2.2d, v11.d[0]
 #else
-       fmul    v27.2d, v2.2d, v11.2d[0]
+       fmul    v27.2d, v2.2d, v11.d[0]
 #endif
-       OP_ir   v27.2d, v3.2d, v10.2d[0]
+       OP_ir   v27.2d, v3.2d, v10.d[0]
 
-       fmul    v28.2d, v0.2d, v10.2d[1]
-       OP_ii   v28.2d, v1.2d, v11.2d[1]
+       fmul    v28.2d, v0.2d, v10.d[1]
+       OP_ii   v28.2d, v1.2d, v11.d[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
-       fmls    v29.2d, v0.2d, v11.2d[1]
+       fmls    v29.2d, v0.2d, v11.d[1]
 #else
-       fmul    v29.2d, v0.2d, v11.2d[1]
+       fmul    v29.2d, v0.2d, v11.d[1]
 #endif
-       OP_ir   v29.2d, v1.2d, v10.2d[1]
+       OP_ir   v29.2d, v1.2d, v10.d[1]
 
-       fmul    v30.2d, v2.2d, v10.2d[1]
-       OP_ii   v30.2d, v3.2d, v11.2d[1]
+       fmul    v30.2d, v2.2d, v10.d[1]
+       OP_ii   v30.2d, v3.2d, v11.d[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v31.16b, v31.16b, v31.16b
-       fmls    v31.2d, v2.2d, v11.2d[1]
+       fmls    v31.2d, v2.2d, v11.d[1]
 #else
-       fmul    v31.2d, v2.2d, v11.2d[1]
+       fmul    v31.2d, v2.2d, v11.d[1]
 #endif
-       OP_ir   v31.2d, v3.2d, v10.2d[1]
+       OP_ir   v31.2d, v3.2d, v10.d[1]
 
        ld2     {v12.2d, v13.2d}, [pB]
        add     pB, pB, #32
@@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
 
        ld2     {v12.2d, v13.2d}, [pB]          // For next round
        add     pB, pB, #32
 
-       OP_rr   v18.2d, v2.2d, v8.2d[0]
-       OP_ii   v18.2d, v3.2d, v9.2d[0]
-       OP_ri   v19.2d, v2.2d, v9.2d[0]
-       OP_ir   v19.2d, v3.2d, v8.2d[0]
+       OP_rr   v18.2d, v2.2d, v8.d[0]
+       OP_ii   v18.2d, v3.2d, v9.d[0]
+       OP_ri   v19.2d, v2.2d, v9.d[0]
+       OP_ir   v19.2d, v3.2d, v8.d[0]
 
        ld2     {v14.2d, v15.2d}, [pB]          // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
 
        ld2     {v4.2d, v5.2d} , [pA]           // For next round
        add     pA, pA, #32
 
-       OP_rr   v22.2d, v2.2d, v8.2d[1]
-       OP_ii   v22.2d, v3.2d, v9.2d[1]
-       OP_ri   v23.2d, v2.2d, v9.2d[1]
-       OP_ir   v23.2d, v3.2d, v8.2d[1]
+       OP_rr   v22.2d, v2.2d, v8.d[1]
+       OP_ii   v22.2d, v3.2d, v9.d[1]
+       OP_ri   v23.2d, v2.2d, v9.d[1]
+       OP_ir   v23.2d, v3.2d, v8.d[1]
 
        ld2     {v6.2d, v7.2d} , [pA]           // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.2d, v0.2d, v10.2d[0]
-       OP_ii   v24.2d, v1.2d, v11.2d[0]
-       OP_ri   v25.2d, v0.2d, v11.2d[0]
-       OP_ir   v25.2d, v1.2d, v10.2d[0]
+       OP_rr   v24.2d, v0.2d, v10.d[0]
+       OP_ii   v24.2d, v1.2d, v11.d[0]
+       OP_ri   v25.2d, v0.2d, v11.d[0]
+       OP_ir   v25.2d, v1.2d, v10.d[0]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       OP_rr   v26.2d, v2.2d, v10.2d[0]
-       OP_ii   v26.2d, v3.2d, v11.2d[0]
-       OP_ri   v27.2d, v2.2d, v11.2d[0]
-       OP_ir   v27.2d, v3.2d, v10.2d[0]
+       OP_rr   v26.2d, v2.2d, v10.d[0]
+       OP_ii   v26.2d, v3.2d, v11.d[0]
+       OP_ri   v27.2d, v2.2d, v11.d[0]
+       OP_ir   v27.2d, v3.2d, v10.d[0]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       OP_rr   v28.2d, v0.2d, v10.2d[1]
-       OP_ii   v28.2d, v1.2d, v11.2d[1]
-       OP_ri   v29.2d, v0.2d, v11.2d[1]
-       OP_ir   v29.2d, v1.2d, v10.2d[1]
+       OP_rr   v28.2d, v0.2d, v10.d[1]
+       OP_ii   v28.2d, v1.2d, v11.d[1]
+       OP_ri   v29.2d, v0.2d, v11.d[1]
+       OP_ir   v29.2d, v1.2d, v10.d[1]
 
-       OP_rr   v30.2d, v2.2d, v10.2d[1]
-       OP_ii   v30.2d, v3.2d, v11.2d[1]
-       OP_ri   v31.2d, v2.2d, v11.2d[1]
-       OP_ir   v31.2d, v3.2d, v10.2d[1]
+       OP_rr   v30.2d, v2.2d, v10.d[1]
+       OP_ii   v30.2d, v3.2d, v11.d[1]
+       OP_ri   v31.2d, v2.2d, v11.d[1]
+       OP_ir   v31.2d, v3.2d, v10.d[1]
 .endm
 
 .macro KERNEL4x4_M2
-       OP_rr   v16.2d, v4.2d, v12.2d[0]
-       OP_ii   v16.2d, v5.2d, v13.2d[0]
-       OP_ri   v17.2d, v4.2d, v13.2d[0]
-       OP_ir   v17.2d, v5.2d, v12.2d[0]
+       OP_rr   v16.2d, v4.2d, v12.d[0]
+       OP_ii   v16.2d, v5.2d, v13.d[0]
+       OP_ri   v17.2d, v4.2d, v13.d[0]
+       OP_ir   v17.2d, v5.2d, v12.d[0]
 
        ld2     {v8.2d, v9.2d}, [pB]            // For next round
        add     pB, pB, #32
 
-       OP_rr   v18.2d, v6.2d, v12.2d[0]
-       OP_ii   v18.2d, v7.2d, v13.2d[0]
-       OP_ri   v19.2d, v6.2d, v13.2d[0]
-       OP_ir   v19.2d, v7.2d, v12.2d[0]
+       OP_rr   v18.2d, v6.2d, v12.d[0]
+       OP_ii   v18.2d, v7.2d, v13.d[0]
+       OP_ri   v19.2d, v6.2d, v13.d[0]
+       OP_ir   v19.2d, v7.2d, v12.d[0]
 
        ld2     {v10.2d, v11.2d}, [pB]          // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.2d, v4.2d, v12.2d[1]
-       OP_ii   v20.2d, v5.2d, v13.2d[1]
-       OP_ri   v21.2d, v4.2d, v13.2d[1]
-       OP_ir   v21.2d, v5.2d, v12.2d[1]
+       OP_rr   v20.2d, v4.2d, v12.d[1]
+       OP_ii   v20.2d, v5.2d, v13.d[1]
+       OP_ri   v21.2d, v4.2d, v13.d[1]
+       OP_ir   v21.2d, v5.2d, v12.d[1]
 
        ld2     {v0.2d, v1.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v22.2d, v6.2d, v12.2d[1]
-       OP_ii   v22.2d, v7.2d, v13.2d[1]
-       OP_ri   v23.2d, v6.2d, v13.2d[1]
-       OP_ir   v23.2d, v7.2d, v12.2d[1]
+       OP_rr   v22.2d, v6.2d, v12.d[1]
+       OP_ii   v22.2d, v7.2d, v13.d[1]
+       OP_ri   v23.2d, v6.2d, v13.d[1]
+       OP_ir   v23.2d, v7.2d, v12.d[1]
 
        ld2     {v2.2d, v3.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.2d, v4.2d, v14.2d[0]
-       OP_ii   v24.2d, v5.2d, v15.2d[0]
-       OP_ri   v25.2d, v4.2d, v15.2d[0]
-       OP_ir   v25.2d, v5.2d, v14.2d[0]
+       OP_rr   v24.2d, v4.2d, v14.d[0]
+       OP_ii   v24.2d, v5.2d, v15.d[0]
+       OP_ri   v25.2d, v4.2d, v15.d[0]
+       OP_ir   v25.2d, v5.2d, v14.d[0]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       OP_rr   v26.2d, v6.2d, v14.2d[0]
-       OP_ii   v26.2d, v7.2d, v15.2d[0]
-       OP_ri   v27.2d, v6.2d, v15.2d[0]
-       OP_ir   v27.2d, v7.2d, v14.2d[0]
+       OP_rr   v26.2d, v6.2d, v14.d[0]
+       OP_ii   v26.2d, v7.2d, v15.d[0]
+       OP_ri   v27.2d, v6.2d, v15.d[0]
+       OP_ir   v27.2d, v7.2d, v14.d[0]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       OP_rr   v28.2d, v4.2d, v14.2d[1]
-       OP_ii   v28.2d, v5.2d, v15.2d[1]
-       OP_ri   v29.2d, v4.2d, v15.2d[1]
-       OP_ir   v29.2d, v5.2d, v14.2d[1]
+       OP_rr   v28.2d, v4.2d, v14.d[1]
+       OP_ii   v28.2d, v5.2d, v15.d[1]
+       OP_ri   v29.2d, v4.2d, v15.d[1]
+       OP_ir   v29.2d, v5.2d, v14.d[1]
 
-       OP_rr   v30.2d, v6.2d, v14.2d[1]
-       OP_ii   v30.2d, v7.2d, v15.2d[1]
-       OP_ri   v31.2d, v6.2d, v15.2d[1]
-       OP_ir   v31.2d, v7.2d, v14.2d[1]
+       OP_rr   v30.2d, v6.2d, v14.d[1]
+       OP_ii   v30.2d, v7.2d, v15.d[1]
+       OP_ri   v31.2d, v6.2d, v15.d[1]
+       OP_ir   v31.2d, v7.2d, v14.d[1]
 .endm
 
 .macro KERNEL4x4_E
-       OP_rr   v16.2d, v4.2d, v12.2d[0]
-       OP_ii   v16.2d, v5.2d, v13.2d[0]
-       OP_ri   v17.2d, v4.2d, v13.2d[0]
-       OP_ir   v17.2d, v5.2d, v12.2d[0]
-
-       OP_rr   v18.2d, v6.2d, v12.2d[0]
-       OP_ii   v18.2d, v7.2d, v13.2d[0]
-       OP_ri   v19.2d, v6.2d, v13.2d[0]
-       OP_ir   v19.2d, v7.2d, v12.2d[0]
-
-       OP_rr   v20.2d, v4.2d, v12.2d[1]
-       OP_ii   v20.2d, v5.2d, v13.2d[1]
-       OP_ri   v21.2d, v4.2d, v13.2d[1]
-       OP_ir   v21.2d, v5.2d, v12.2d[1]
-
-       OP_rr   v22.2d, v6.2d, v12.2d[1]
-       OP_ii   v22.2d, v7.2d, v13.2d[1]
-       OP_ri   v23.2d, v6.2d, v13.2d[1]
-       OP_ir   v23.2d, v7.2d, v12.2d[1]
-
-       OP_rr   v24.2d, v4.2d, v14.2d[0]
-       OP_ii   v24.2d, v5.2d, v15.2d[0]
-       OP_ri   v25.2d, v4.2d, v15.2d[0]
-       OP_ir   v25.2d, v5.2d, v14.2d[0]
-
-       OP_rr   v26.2d, v6.2d, v14.2d[0]
-       OP_ii   v26.2d, v7.2d, v15.2d[0]
-       OP_ri   v27.2d, v6.2d, v15.2d[0]
-       OP_ir   v27.2d, v7.2d, v14.2d[0]
-
-       OP_rr   v28.2d, v4.2d, v14.2d[1]
-       OP_ii   v28.2d, v5.2d, v15.2d[1]
-       OP_ri   v29.2d, v4.2d, v15.2d[1]
-       OP_ir   v29.2d, v5.2d, v14.2d[1]
-
-       OP_rr   v30.2d, v6.2d, v14.2d[1]
-       OP_ii   v30.2d, v7.2d, v15.2d[1]
-       OP_ri   v31.2d, v6.2d, v15.2d[1]
-       OP_ir   v31.2d, v7.2d, v14.2d[1]
+       OP_rr   v16.2d, v4.2d, v12.d[0]
+       OP_ii   v16.2d, v5.2d, v13.d[0]
+       OP_ri   v17.2d, v4.2d, v13.d[0]
+       OP_ir   v17.2d, v5.2d, v12.d[0]
+
+       OP_rr   v18.2d, v6.2d, v12.d[0]
+       OP_ii   v18.2d, v7.2d, v13.d[0]
+       OP_ri   v19.2d, v6.2d, v13.d[0]
+       OP_ir   v19.2d, v7.2d, v12.d[0]
+
+       OP_rr   v20.2d, v4.2d, v12.d[1]
+       OP_ii   v20.2d, v5.2d, v13.d[1]
+       OP_ri   v21.2d, v4.2d, v13.d[1]
+       OP_ir   v21.2d, v5.2d, v12.d[1]
+
+       OP_rr   v22.2d, v6.2d, v12.d[1]
+       OP_ii   v22.2d, v7.2d, v13.d[1]
+       OP_ri   v23.2d, v6.2d, v13.d[1]
+       OP_ir   v23.2d, v7.2d, v12.d[1]
+
+       OP_rr   v24.2d, v4.2d, v14.d[0]
+       OP_ii   v24.2d, v5.2d, v15.d[0]
+       OP_ri   v25.2d, v4.2d, v15.d[0]
+       OP_ir   v25.2d, v5.2d, v14.d[0]
+
+       OP_rr   v26.2d, v6.2d, v14.d[0]
+       OP_ii   v26.2d, v7.2d, v15.d[0]
+       OP_ri   v27.2d, v6.2d, v15.d[0]
+       OP_ir   v27.2d, v7.2d, v14.d[0]
+
+       OP_rr   v28.2d, v4.2d, v14.d[1]
+       OP_ii   v28.2d, v5.2d, v15.d[1]
+       OP_ri   v29.2d, v4.2d, v15.d[1]
+       OP_ir   v29.2d, v5.2d, v14.d[1]
+
+       OP_rr   v30.2d, v6.2d, v14.d[1]
+       OP_ii   v30.2d, v7.2d, v15.d[1]
+       OP_ri   v31.2d, v6.2d, v15.d[1]
+       OP_ir   v31.2d, v7.2d, v14.d[1]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
-
-       OP_rr   v18.2d, v2.2d, v8.2d[0]
-       OP_ii   v18.2d, v3.2d, v9.2d[0]
-       OP_ri   v19.2d, v2.2d, v9.2d[0]
-       OP_ir   v19.2d, v3.2d, v8.2d[0]
-
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
-
-       OP_rr   v22.2d, v2.2d, v8.2d[1]
-       OP_ii   v22.2d, v3.2d, v9.2d[1]
-       OP_ri   v23.2d, v2.2d, v9.2d[1]
-       OP_ir   v23.2d, v3.2d, v8.2d[1]
-
-       OP_rr   v24.2d, v0.2d, v10.2d[0]
-       OP_ii   v24.2d, v1.2d, v11.2d[0]
-       OP_ri   v25.2d, v0.2d, v11.2d[0]
-       OP_ir   v25.2d, v1.2d, v10.2d[0]
-
-       OP_rr   v26.2d, v2.2d, v10.2d[0]
-       OP_ii   v26.2d, v3.2d, v11.2d[0]
-       OP_ri   v27.2d, v2.2d, v11.2d[0]
-       OP_ir   v27.2d, v3.2d, v10.2d[0]
-
-       OP_rr   v28.2d, v0.2d, v10.2d[1]
-       OP_ii   v28.2d, v1.2d, v11.2d[1]
-       OP_ri   v29.2d, v0.2d, v11.2d[1]
-       OP_ir   v29.2d, v1.2d, v10.2d[1]
-
-       OP_rr   v30.2d, v2.2d, v10.2d[1]
-       OP_ii   v30.2d, v3.2d, v11.2d[1]
-       OP_ri   v31.2d, v2.2d, v11.2d[1]
-       OP_ir   v31.2d, v3.2d, v10.2d[1]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
+
+       OP_rr   v18.2d, v2.2d, v8.d[0]
+       OP_ii   v18.2d, v3.2d, v9.d[0]
+       OP_ri   v19.2d, v2.2d, v9.d[0]
+       OP_ir   v19.2d, v3.2d, v8.d[0]
+
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
+
+       OP_rr   v22.2d, v2.2d, v8.d[1]
+       OP_ii   v22.2d, v3.2d, v9.d[1]
+       OP_ri   v23.2d, v2.2d, v9.d[1]
+       OP_ir   v23.2d, v3.2d, v8.d[1]
+
+       OP_rr   v24.2d, v0.2d, v10.d[0]
+       OP_ii   v24.2d, v1.2d, v11.d[0]
+       OP_ri   v25.2d, v0.2d, v11.d[0]
+       OP_ir   v25.2d, v1.2d, v10.d[0]
+
+       OP_rr   v26.2d, v2.2d, v10.d[0]
+       OP_ii   v26.2d, v3.2d, v11.d[0]
+       OP_ri   v27.2d, v2.2d, v11.d[0]
+       OP_ir   v27.2d, v3.2d, v10.d[0]
+
+       OP_rr   v28.2d, v0.2d, v10.d[1]
+       OP_ii   v28.2d, v1.2d, v11.d[1]
+       OP_ri   v29.2d, v0.2d, v11.d[1]
+       OP_ir   v29.2d, v1.2d, v10.d[1]
+
+       OP_rr   v30.2d, v2.2d, v10.d[1]
+       OP_ii   v30.2d, v3.2d, v11.d[1]
+       OP_ri   v31.2d, v2.2d, v11.d[1]
+       OP_ir   v31.2d, v3.2d, v10.d[1]
 .endm
 
 .macro SAVE4x4
@@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
-
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
-
-       OP_rr   v24.2d, v0.2d, v10.2d[0]
-       OP_ii   v24.2d, v1.2d, v11.2d[0]
-       OP_ri   v25.2d, v0.2d, v11.2d[0]
-       OP_ir   v25.2d, v1.2d, v10.2d[0]
-
-       OP_rr   v28.2d, v0.2d, v10.2d[1]
-       OP_ii   v28.2d, v1.2d, v11.2d[1]
-       OP_ri   v29.2d, v0.2d, v11.2d[1]
-       OP_ir   v29.2d, v1.2d, v10.2d[1]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
+
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
+
+       OP_rr   v24.2d, v0.2d, v10.d[0]
+       OP_ii   v24.2d, v1.2d, v11.d[0]
+       OP_ri   v25.2d, v0.2d, v11.d[0]
+       OP_ir   v25.2d, v1.2d, v10.d[0]
+
+       OP_rr   v28.2d, v0.2d, v10.d[1]
+       OP_ii   v28.2d, v1.2d, v11.d[1]
+       OP_ri   v29.2d, v0.2d, v11.d[1]
+       OP_ir   v29.2d, v1.2d, v10.d[1]
 .endm
 
 .macro SAVE2x4
@@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.d, v1.d}[0], [pA]
        add     pA, pA, #16
 
-       OP_rr   d16, d0, v8.2d[0]
-       OP_ii   d16, d1, v9.2d[0]
-       OP_ri   d17, d0, v9.2d[0]
-       OP_ir   d17, d1, v8.2d[0]
-
-       OP_rr   d20, d0, v8.2d[1]
-       OP_ii   d20, d1, v9.2d[1]
-       OP_ri   d21, d0, v9.2d[1]
-       OP_ir   d21, d1, v8.2d[1]
-
-       OP_rr   d24, d0, v10.2d[0]
-       OP_ii   d24, d1, v11.2d[0]
-       OP_ri   d25, d0, v11.2d[0]
-       OP_ir   d25, d1, v10.2d[0]
-
-       OP_rr   d28, d0, v10.2d[1]
-       OP_ii   d28, d1, v11.2d[1]
-       OP_ri   d29, d0, v11.2d[1]
-       OP_ir   d29, d1, v10.2d[1]
+       OP_rr   d16, d0, v8.d[0]
+       OP_ii   d16, d1, v9.d[0]
+       OP_ri   d17, d0, v9.d[0]
+       OP_ir   d17, d1, v8.d[0]
+
+       OP_rr   d20, d0, v8.d[1]
+       OP_ii   d20, d1, v9.d[1]
+       OP_ri   d21, d0, v9.d[1]
+       OP_ir   d21, d1, v8.d[1]
+
+       OP_rr   d24, d0, v10.d[0]
+       OP_ii   d24, d1, v11.d[0]
+       OP_ri   d25, d0, v11.d[0]
+       OP_ir   d25, d1, v10.d[0]
+
+       OP_rr   d28, d0, v10.d[1]
+       OP_ii   d28, d1, v11.d[1]
+       OP_ri   d29, d0, v11.d[1]
+       OP_ir   d29, d1, v10.d[1]
 .endm
 
 .macro SAVE1x4
@@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
-
-       OP_rr   v18.2d, v2.2d, v8.2d[0]
-       OP_ii   v18.2d, v3.2d, v9.2d[0]
-       OP_ri   v19.2d, v2.2d, v9.2d[0]
-       OP_ir   v19.2d, v3.2d, v8.2d[0]
-
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
-
-       OP_rr   v22.2d, v2.2d, v8.2d[1]
-       OP_ii   v22.2d, v3.2d, v9.2d[1]
-       OP_ri   v23.2d, v2.2d, v9.2d[1]
-       OP_ir   v23.2d, v3.2d, v8.2d[1]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
+
+       OP_rr   v18.2d, v2.2d, v8.d[0]
+       OP_ii   v18.2d, v3.2d, v9.d[0]
+       OP_ri   v19.2d, v2.2d, v9.d[0]
+       OP_ir   v19.2d, v3.2d, v8.d[0]
+
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
+
+       OP_rr   v22.2d, v2.2d, v8.d[1]
+       OP_ii   v22.2d, v3.2d, v9.d[1]
+       OP_ri   v23.2d, v2.2d, v9.d[1]
+       OP_ir   v23.2d, v3.2d, v8.d[1]
 .endm
 
 .macro SAVE4x2
@@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
 
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
 .endm
 
 .macro SAVE2x2
@@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.d, v1.d}[0], [pA]
        add     pA, pA, #16
 
-       OP_rr   d16, d0, v8.2d[0]
-       OP_ii   d16, d1, v9.2d[0]
-       OP_ri   d17, d0, v9.2d[0]
-       OP_ir   d17, d1, v8.2d[0]
+       OP_rr   d16, d0, v8.d[0]
+       OP_ii   d16, d1, v9.d[0]
+       OP_ri   d17, d0, v9.d[0]
+       OP_ir   d17, d1, v8.d[0]
 
-       OP_rr   d20, d0, v8.2d[1]
-       OP_ii   d20, d1, v9.2d[1]
-       OP_ri   d21, d0, v9.2d[1]
-       OP_ir   d21, d1, v8.2d[1]
+       OP_rr   d20, d0, v8.d[1]
+       OP_ii   d20, d1, v9.d[1]
+       OP_ri   d21, d0, v9.d[1]
+       OP_ir   d21, d1, v8.d[1]
 .endm
 
 .macro SAVE1x2
index 3ff8227..7945870 100644 (file)
@@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       fmul    v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
+       fmul    v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v17.16b, v17.16b, v17.16b
-       fmls    v17.2d, v0.2d, v9.2d[0]
+       fmls    v17.2d, v0.2d, v9.d[0]
 #else
-       fmul    v17.2d, v0.2d, v9.2d[0]
+       fmul    v17.2d, v0.2d, v9.d[0]
 #endif
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
 
-       fmul    v18.2d, v2.2d, v8.2d[0]
-       OP_ii   v18.2d, v3.2d, v9.2d[0]
+       fmul    v18.2d, v2.2d, v8.d[0]
+       OP_ii   v18.2d, v3.2d, v9.d[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v19.16b, v19.16b, v19.16b
-       fmls    v19.2d, v2.2d, v9.2d[0]
+       fmls    v19.2d, v2.2d, v9.d[0]
 #else
-       fmul    v19.2d, v2.2d, v9.2d[0]
+       fmul    v19.2d, v2.2d, v9.d[0]
 #endif
-       OP_ir   v19.2d, v3.2d, v8.2d[0]
+       OP_ir   v19.2d, v3.2d, v8.d[0]
 
-       fmul    v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
+       fmul    v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v21.16b, v21.16b, v21.16b
-       fmls    v21.2d, v0.2d, v9.2d[1]
+       fmls    v21.2d, v0.2d, v9.d[1]
 #else
-       fmul    v21.2d, v0.2d, v9.2d[1]
+       fmul    v21.2d, v0.2d, v9.d[1]
 #endif
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
 
-       fmul    v22.2d, v2.2d, v8.2d[1]
-       OP_ii   v22.2d, v3.2d, v9.2d[1]
+       fmul    v22.2d, v2.2d, v8.d[1]
+       OP_ii   v22.2d, v3.2d, v9.d[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v23.16b, v23.16b, v23.16b
-       fmls    v23.2d, v2.2d, v9.2d[1]
+       fmls    v23.2d, v2.2d, v9.d[1]
 #else
-       fmul    v23.2d, v2.2d, v9.2d[1]
+       fmul    v23.2d, v2.2d, v9.d[1]
 #endif
-       OP_ir   v23.2d, v3.2d, v8.2d[1]
+       OP_ir   v23.2d, v3.2d, v8.d[1]
 
-       fmul    v24.2d, v0.2d, v10.2d[0]
-       OP_ii   v24.2d, v1.2d, v11.2d[0]
+       fmul    v24.2d, v0.2d, v10.d[0]
+       OP_ii   v24.2d, v1.2d, v11.d[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v25.16b, v25.16b, v25.16b
-       fmls    v25.2d, v0.2d, v11.2d[0]
+       fmls    v25.2d, v0.2d, v11.d[0]
 #else
-       fmul    v25.2d, v0.2d, v11.2d[0]
+       fmul    v25.2d, v0.2d, v11.d[0]
 #endif
-       OP_ir   v25.2d, v1.2d, v10.2d[0]
+       OP_ir   v25.2d, v1.2d, v10.d[0]
 
-       fmul    v26.2d, v2.2d, v10.2d[0]
-       OP_ii   v26.2d, v3.2d, v11.2d[0]
+       fmul    v26.2d, v2.2d, v10.d[0]
+       OP_ii   v26.2d, v3.2d, v11.d[0]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v27.16b, v27.16b, v27.16b
-       fmls    v27.2d, v2.2d, v11.2d[0]
+       fmls    v27.2d, v2.2d, v11.d[0]
 #else
-       fmul    v27.2d, v2.2d, v11.2d[0]
+       fmul    v27.2d, v2.2d, v11.d[0]
 #endif
-       OP_ir   v27.2d, v3.2d, v10.2d[0]
+       OP_ir   v27.2d, v3.2d, v10.d[0]
 
-       fmul    v28.2d, v0.2d, v10.2d[1]
-       OP_ii   v28.2d, v1.2d, v11.2d[1]
+       fmul    v28.2d, v0.2d, v10.d[1]
+       OP_ii   v28.2d, v1.2d, v11.d[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v29.16b, v29.16b, v29.16b
-       fmls    v29.2d, v0.2d, v11.2d[1]
+       fmls    v29.2d, v0.2d, v11.d[1]
 #else
-       fmul    v29.2d, v0.2d, v11.2d[1]
+       fmul    v29.2d, v0.2d, v11.d[1]
 #endif
-       OP_ir   v29.2d, v1.2d, v10.2d[1]
+       OP_ir   v29.2d, v1.2d, v10.d[1]
 
-       fmul    v30.2d, v2.2d, v10.2d[1]
-       OP_ii   v30.2d, v3.2d, v11.2d[1]
+       fmul    v30.2d, v2.2d, v10.d[1]
+       OP_ii   v30.2d, v3.2d, v11.d[1]
 #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
        eor     v31.16b, v31.16b, v31.16b
-       fmls    v31.2d, v2.2d, v11.2d[1]
+       fmls    v31.2d, v2.2d, v11.d[1]
 #else
-       fmul    v31.2d, v2.2d, v11.2d[1]
+       fmul    v31.2d, v2.2d, v11.d[1]
 #endif
-       OP_ir   v31.2d, v3.2d, v10.2d[1]
+       OP_ir   v31.2d, v3.2d, v10.d[1]
 
        ld2     {v12.2d, v13.2d}, [pB]
        add     pB, pB, #32
@@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro KERNEL4x4_M1
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
 
        ld2     {v12.2d, v13.2d}, [pB]          // For next round
        add     pB, pB, #32
 
-       OP_rr   v18.2d, v2.2d, v8.2d[0]
-       OP_ii   v18.2d, v3.2d, v9.2d[0]
-       OP_ri   v19.2d, v2.2d, v9.2d[0]
-       OP_ir   v19.2d, v3.2d, v8.2d[0]
+       OP_rr   v18.2d, v2.2d, v8.d[0]
+       OP_ii   v18.2d, v3.2d, v9.d[0]
+       OP_ri   v19.2d, v2.2d, v9.d[0]
+       OP_ir   v19.2d, v3.2d, v8.d[0]
 
        ld2     {v14.2d, v15.2d}, [pB]          // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
 
        ld2     {v4.2d, v5.2d} , [pA]           // For next round
        add     pA, pA, #32
 
-       OP_rr   v22.2d, v2.2d, v8.2d[1]
-       OP_ii   v22.2d, v3.2d, v9.2d[1]
-       OP_ri   v23.2d, v2.2d, v9.2d[1]
-       OP_ir   v23.2d, v3.2d, v8.2d[1]
+       OP_rr   v22.2d, v2.2d, v8.d[1]
+       OP_ii   v22.2d, v3.2d, v9.d[1]
+       OP_ri   v23.2d, v2.2d, v9.d[1]
+       OP_ir   v23.2d, v3.2d, v8.d[1]
 
        ld2     {v6.2d, v7.2d} , [pA]           // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.2d, v0.2d, v10.2d[0]
-       OP_ii   v24.2d, v1.2d, v11.2d[0]
-       OP_ri   v25.2d, v0.2d, v11.2d[0]
-       OP_ir   v25.2d, v1.2d, v10.2d[0]
+       OP_rr   v24.2d, v0.2d, v10.d[0]
+       OP_ii   v24.2d, v1.2d, v11.d[0]
+       OP_ri   v25.2d, v0.2d, v11.d[0]
+       OP_ir   v25.2d, v1.2d, v10.d[0]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       OP_rr   v26.2d, v2.2d, v10.2d[0]
-       OP_ii   v26.2d, v3.2d, v11.2d[0]
-       OP_ri   v27.2d, v2.2d, v11.2d[0]
-       OP_ir   v27.2d, v3.2d, v10.2d[0]
+       OP_rr   v26.2d, v2.2d, v10.d[0]
+       OP_ii   v26.2d, v3.2d, v11.d[0]
+       OP_ri   v27.2d, v2.2d, v11.d[0]
+       OP_ir   v27.2d, v3.2d, v10.d[0]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       OP_rr   v28.2d, v0.2d, v10.2d[1]
-       OP_ii   v28.2d, v1.2d, v11.2d[1]
-       OP_ri   v29.2d, v0.2d, v11.2d[1]
-       OP_ir   v29.2d, v1.2d, v10.2d[1]
+       OP_rr   v28.2d, v0.2d, v10.d[1]
+       OP_ii   v28.2d, v1.2d, v11.d[1]
+       OP_ri   v29.2d, v0.2d, v11.d[1]
+       OP_ir   v29.2d, v1.2d, v10.d[1]
 
-       OP_rr   v30.2d, v2.2d, v10.2d[1]
-       OP_ii   v30.2d, v3.2d, v11.2d[1]
-       OP_ri   v31.2d, v2.2d, v11.2d[1]
-       OP_ir   v31.2d, v3.2d, v10.2d[1]
+       OP_rr   v30.2d, v2.2d, v10.d[1]
+       OP_ii   v30.2d, v3.2d, v11.d[1]
+       OP_ri   v31.2d, v2.2d, v11.d[1]
+       OP_ir   v31.2d, v3.2d, v10.d[1]
 .endm
 
 .macro KERNEL4x4_M2
-       OP_rr   v16.2d, v4.2d, v12.2d[0]
-       OP_ii   v16.2d, v5.2d, v13.2d[0]
-       OP_ri   v17.2d, v4.2d, v13.2d[0]
-       OP_ir   v17.2d, v5.2d, v12.2d[0]
+       OP_rr   v16.2d, v4.2d, v12.d[0]
+       OP_ii   v16.2d, v5.2d, v13.d[0]
+       OP_ri   v17.2d, v4.2d, v13.d[0]
+       OP_ir   v17.2d, v5.2d, v12.d[0]
 
        ld2     {v8.2d, v9.2d}, [pB]            // For next round
        add     pB, pB, #32
 
-       OP_rr   v18.2d, v6.2d, v12.2d[0]
-       OP_ii   v18.2d, v7.2d, v13.2d[0]
-       OP_ri   v19.2d, v6.2d, v13.2d[0]
-       OP_ir   v19.2d, v7.2d, v12.2d[0]
+       OP_rr   v18.2d, v6.2d, v12.d[0]
+       OP_ii   v18.2d, v7.2d, v13.d[0]
+       OP_ri   v19.2d, v6.2d, v13.d[0]
+       OP_ir   v19.2d, v7.2d, v12.d[0]
 
        ld2     {v10.2d, v11.2d}, [pB]          // For next round
        add     pB, pB, #32
 
-       OP_rr   v20.2d, v4.2d, v12.2d[1]
-       OP_ii   v20.2d, v5.2d, v13.2d[1]
-       OP_ri   v21.2d, v4.2d, v13.2d[1]
-       OP_ir   v21.2d, v5.2d, v12.2d[1]
+       OP_rr   v20.2d, v4.2d, v12.d[1]
+       OP_ii   v20.2d, v5.2d, v13.d[1]
+       OP_ri   v21.2d, v4.2d, v13.d[1]
+       OP_ir   v21.2d, v5.2d, v12.d[1]
 
        ld2     {v0.2d, v1.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v22.2d, v6.2d, v12.2d[1]
-       OP_ii   v22.2d, v7.2d, v13.2d[1]
-       OP_ri   v23.2d, v6.2d, v13.2d[1]
-       OP_ir   v23.2d, v7.2d, v12.2d[1]
+       OP_rr   v22.2d, v6.2d, v12.d[1]
+       OP_ii   v22.2d, v7.2d, v13.d[1]
+       OP_ri   v23.2d, v6.2d, v13.d[1]
+       OP_ir   v23.2d, v7.2d, v12.d[1]
 
        ld2     {v2.2d, v3.2d}, [pA]            // For next round
        add     pA, pA, #32
 
-       OP_rr   v24.2d, v4.2d, v14.2d[0]
-       OP_ii   v24.2d, v5.2d, v15.2d[0]
-       OP_ri   v25.2d, v4.2d, v15.2d[0]
-       OP_ir   v25.2d, v5.2d, v14.2d[0]
+       OP_rr   v24.2d, v4.2d, v14.d[0]
+       OP_ii   v24.2d, v5.2d, v15.d[0]
+       OP_ri   v25.2d, v4.2d, v15.d[0]
+       OP_ir   v25.2d, v5.2d, v14.d[0]
 
        prfm    PLDL1KEEP, [pA, #512]
 
-       OP_rr   v26.2d, v6.2d, v14.2d[0]
-       OP_ii   v26.2d, v7.2d, v15.2d[0]
-       OP_ri   v27.2d, v6.2d, v15.2d[0]
-       OP_ir   v27.2d, v7.2d, v14.2d[0]
+       OP_rr   v26.2d, v6.2d, v14.d[0]
+       OP_ii   v26.2d, v7.2d, v15.d[0]
+       OP_ri   v27.2d, v6.2d, v15.d[0]
+       OP_ir   v27.2d, v7.2d, v14.d[0]
 
        prfm    PLDL1KEEP, [pB, #512]
 
-       OP_rr   v28.2d, v4.2d, v14.2d[1]
-       OP_ii   v28.2d, v5.2d, v15.2d[1]
-       OP_ri   v29.2d, v4.2d, v15.2d[1]
-       OP_ir   v29.2d, v5.2d, v14.2d[1]
+       OP_rr   v28.2d, v4.2d, v14.d[1]
+       OP_ii   v28.2d, v5.2d, v15.d[1]
+       OP_ri   v29.2d, v4.2d, v15.d[1]
+       OP_ir   v29.2d, v5.2d, v14.d[1]
 
-       OP_rr   v30.2d, v6.2d, v14.2d[1]
-       OP_ii   v30.2d, v7.2d, v15.2d[1]
-       OP_ri   v31.2d, v6.2d, v15.2d[1]
-       OP_ir   v31.2d, v7.2d, v14.2d[1]
+       OP_rr   v30.2d, v6.2d, v14.d[1]
+       OP_ii   v30.2d, v7.2d, v15.d[1]
+       OP_ri   v31.2d, v6.2d, v15.d[1]
+       OP_ir   v31.2d, v7.2d, v14.d[1]
 .endm
 
 .macro KERNEL4x4_E
-       OP_rr   v16.2d, v4.2d, v12.2d[0]
-       OP_ii   v16.2d, v5.2d, v13.2d[0]
-       OP_ri   v17.2d, v4.2d, v13.2d[0]
-       OP_ir   v17.2d, v5.2d, v12.2d[0]
-
-       OP_rr   v18.2d, v6.2d, v12.2d[0]
-       OP_ii   v18.2d, v7.2d, v13.2d[0]
-       OP_ri   v19.2d, v6.2d, v13.2d[0]
-       OP_ir   v19.2d, v7.2d, v12.2d[0]
-
-       OP_rr   v20.2d, v4.2d, v12.2d[1]
-       OP_ii   v20.2d, v5.2d, v13.2d[1]
-       OP_ri   v21.2d, v4.2d, v13.2d[1]
-       OP_ir   v21.2d, v5.2d, v12.2d[1]
-
-       OP_rr   v22.2d, v6.2d, v12.2d[1]
-       OP_ii   v22.2d, v7.2d, v13.2d[1]
-       OP_ri   v23.2d, v6.2d, v13.2d[1]
-       OP_ir   v23.2d, v7.2d, v12.2d[1]
-
-       OP_rr   v24.2d, v4.2d, v14.2d[0]
-       OP_ii   v24.2d, v5.2d, v15.2d[0]
-       OP_ri   v25.2d, v4.2d, v15.2d[0]
-       OP_ir   v25.2d, v5.2d, v14.2d[0]
-
-       OP_rr   v26.2d, v6.2d, v14.2d[0]
-       OP_ii   v26.2d, v7.2d, v15.2d[0]
-       OP_ri   v27.2d, v6.2d, v15.2d[0]
-       OP_ir   v27.2d, v7.2d, v14.2d[0]
-
-       OP_rr   v28.2d, v4.2d, v14.2d[1]
-       OP_ii   v28.2d, v5.2d, v15.2d[1]
-       OP_ri   v29.2d, v4.2d, v15.2d[1]
-       OP_ir   v29.2d, v5.2d, v14.2d[1]
-
-       OP_rr   v30.2d, v6.2d, v14.2d[1]
-       OP_ii   v30.2d, v7.2d, v15.2d[1]
-       OP_ri   v31.2d, v6.2d, v15.2d[1]
-       OP_ir   v31.2d, v7.2d, v14.2d[1]
+       OP_rr   v16.2d, v4.2d, v12.d[0]
+       OP_ii   v16.2d, v5.2d, v13.d[0]
+       OP_ri   v17.2d, v4.2d, v13.d[0]
+       OP_ir   v17.2d, v5.2d, v12.d[0]
+
+       OP_rr   v18.2d, v6.2d, v12.d[0]
+       OP_ii   v18.2d, v7.2d, v13.d[0]
+       OP_ri   v19.2d, v6.2d, v13.d[0]
+       OP_ir   v19.2d, v7.2d, v12.d[0]
+
+       OP_rr   v20.2d, v4.2d, v12.d[1]
+       OP_ii   v20.2d, v5.2d, v13.d[1]
+       OP_ri   v21.2d, v4.2d, v13.d[1]
+       OP_ir   v21.2d, v5.2d, v12.d[1]
+
+       OP_rr   v22.2d, v6.2d, v12.d[1]
+       OP_ii   v22.2d, v7.2d, v13.d[1]
+       OP_ri   v23.2d, v6.2d, v13.d[1]
+       OP_ir   v23.2d, v7.2d, v12.d[1]
+
+       OP_rr   v24.2d, v4.2d, v14.d[0]
+       OP_ii   v24.2d, v5.2d, v15.d[0]
+       OP_ri   v25.2d, v4.2d, v15.d[0]
+       OP_ir   v25.2d, v5.2d, v14.d[0]
+
+       OP_rr   v26.2d, v6.2d, v14.d[0]
+       OP_ii   v26.2d, v7.2d, v15.d[0]
+       OP_ri   v27.2d, v6.2d, v15.d[0]
+       OP_ir   v27.2d, v7.2d, v14.d[0]
+
+       OP_rr   v28.2d, v4.2d, v14.d[1]
+       OP_ii   v28.2d, v5.2d, v15.d[1]
+       OP_ri   v29.2d, v4.2d, v15.d[1]
+       OP_ir   v29.2d, v5.2d, v14.d[1]
+
+       OP_rr   v30.2d, v6.2d, v14.d[1]
+       OP_ii   v30.2d, v7.2d, v15.d[1]
+       OP_ri   v31.2d, v6.2d, v15.d[1]
+       OP_ir   v31.2d, v7.2d, v14.d[1]
 .endm
 
 .macro KERNEL4x4_SUB
@@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
-
-       OP_rr   v18.2d, v2.2d, v8.2d[0]
-       OP_ii   v18.2d, v3.2d, v9.2d[0]
-       OP_ri   v19.2d, v2.2d, v9.2d[0]
-       OP_ir   v19.2d, v3.2d, v8.2d[0]
-
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
-
-       OP_rr   v22.2d, v2.2d, v8.2d[1]
-       OP_ii   v22.2d, v3.2d, v9.2d[1]
-       OP_ri   v23.2d, v2.2d, v9.2d[1]
-       OP_ir   v23.2d, v3.2d, v8.2d[1]
-
-       OP_rr   v24.2d, v0.2d, v10.2d[0]
-       OP_ii   v24.2d, v1.2d, v11.2d[0]
-       OP_ri   v25.2d, v0.2d, v11.2d[0]
-       OP_ir   v25.2d, v1.2d, v10.2d[0]
-
-       OP_rr   v26.2d, v2.2d, v10.2d[0]
-       OP_ii   v26.2d, v3.2d, v11.2d[0]
-       OP_ri   v27.2d, v2.2d, v11.2d[0]
-       OP_ir   v27.2d, v3.2d, v10.2d[0]
-
-       OP_rr   v28.2d, v0.2d, v10.2d[1]
-       OP_ii   v28.2d, v1.2d, v11.2d[1]
-       OP_ri   v29.2d, v0.2d, v11.2d[1]
-       OP_ir   v29.2d, v1.2d, v10.2d[1]
-
-       OP_rr   v30.2d, v2.2d, v10.2d[1]
-       OP_ii   v30.2d, v3.2d, v11.2d[1]
-       OP_ri   v31.2d, v2.2d, v11.2d[1]
-       OP_ir   v31.2d, v3.2d, v10.2d[1]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
+
+       OP_rr   v18.2d, v2.2d, v8.d[0]
+       OP_ii   v18.2d, v3.2d, v9.d[0]
+       OP_ri   v19.2d, v2.2d, v9.d[0]
+       OP_ir   v19.2d, v3.2d, v8.d[0]
+
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
+
+       OP_rr   v22.2d, v2.2d, v8.d[1]
+       OP_ii   v22.2d, v3.2d, v9.d[1]
+       OP_ri   v23.2d, v2.2d, v9.d[1]
+       OP_ir   v23.2d, v3.2d, v8.d[1]
+
+       OP_rr   v24.2d, v0.2d, v10.d[0]
+       OP_ii   v24.2d, v1.2d, v11.d[0]
+       OP_ri   v25.2d, v0.2d, v11.d[0]
+       OP_ir   v25.2d, v1.2d, v10.d[0]
+
+       OP_rr   v26.2d, v2.2d, v10.d[0]
+       OP_ii   v26.2d, v3.2d, v11.d[0]
+       OP_ri   v27.2d, v2.2d, v11.d[0]
+       OP_ir   v27.2d, v3.2d, v10.d[0]
+
+       OP_rr   v28.2d, v0.2d, v10.d[1]
+       OP_ii   v28.2d, v1.2d, v11.d[1]
+       OP_ri   v29.2d, v0.2d, v11.d[1]
+       OP_ir   v29.2d, v1.2d, v10.d[1]
+
+       OP_rr   v30.2d, v2.2d, v10.d[1]
+       OP_ii   v30.2d, v3.2d, v11.d[1]
+       OP_ri   v31.2d, v2.2d, v11.d[1]
+       OP_ir   v31.2d, v3.2d, v10.d[1]
 .endm
 
 .macro SAVE4x4
@@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
-
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
-
-       OP_rr   v24.2d, v0.2d, v10.2d[0]
-       OP_ii   v24.2d, v1.2d, v11.2d[0]
-       OP_ri   v25.2d, v0.2d, v11.2d[0]
-       OP_ir   v25.2d, v1.2d, v10.2d[0]
-
-       OP_rr   v28.2d, v0.2d, v10.2d[1]
-       OP_ii   v28.2d, v1.2d, v11.2d[1]
-       OP_ri   v29.2d, v0.2d, v11.2d[1]
-       OP_ir   v29.2d, v1.2d, v10.2d[1]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
+
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
+
+       OP_rr   v24.2d, v0.2d, v10.d[0]
+       OP_ii   v24.2d, v1.2d, v11.d[0]
+       OP_ri   v25.2d, v0.2d, v11.d[0]
+       OP_ir   v25.2d, v1.2d, v10.d[0]
+
+       OP_rr   v28.2d, v0.2d, v10.d[1]
+       OP_ii   v28.2d, v1.2d, v11.d[1]
+       OP_ri   v29.2d, v0.2d, v11.d[1]
+       OP_ir   v29.2d, v1.2d, v10.d[1]
 .endm
 
 .macro SAVE2x4
@@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.d, v1.d}[0], [pA]
        add     pA, pA, #16
 
-       OP_rr   d16, d0, v8.2d[0]
-       OP_ii   d16, d1, v9.2d[0]
-       OP_ri   d17, d0, v9.2d[0]
-       OP_ir   d17, d1, v8.2d[0]
-
-       OP_rr   d20, d0, v8.2d[1]
-       OP_ii   d20, d1, v9.2d[1]
-       OP_ri   d21, d0, v9.2d[1]
-       OP_ir   d21, d1, v8.2d[1]
-
-       OP_rr   d24, d0, v10.2d[0]
-       OP_ii   d24, d1, v11.2d[0]
-       OP_ri   d25, d0, v11.2d[0]
-       OP_ir   d25, d1, v10.2d[0]
-
-       OP_rr   d28, d0, v10.2d[1]
-       OP_ii   d28, d1, v11.2d[1]
-       OP_ri   d29, d0, v11.2d[1]
-       OP_ir   d29, d1, v10.2d[1]
+       OP_rr   d16, d0, v8.d[0]
+       OP_ii   d16, d1, v9.d[0]
+       OP_ri   d17, d0, v9.d[0]
+       OP_ir   d17, d1, v8.d[0]
+
+       OP_rr   d20, d0, v8.d[1]
+       OP_ii   d20, d1, v9.d[1]
+       OP_ri   d21, d0, v9.d[1]
+       OP_ir   d21, d1, v8.d[1]
+
+       OP_rr   d24, d0, v10.d[0]
+       OP_ii   d24, d1, v11.d[0]
+       OP_ri   d25, d0, v11.d[0]
+       OP_ir   d25, d1, v10.d[0]
+
+       OP_rr   d28, d0, v10.d[1]
+       OP_ii   d28, d1, v11.d[1]
+       OP_ri   d29, d0, v11.d[1]
+       OP_ir   d29, d1, v10.d[1]
 .endm
 
 .macro SAVE1x4
@@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v2.2d, v3.2d}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
-
-       OP_rr   v18.2d, v2.2d, v8.2d[0]
-       OP_ii   v18.2d, v3.2d, v9.2d[0]
-       OP_ri   v19.2d, v2.2d, v9.2d[0]
-       OP_ir   v19.2d, v3.2d, v8.2d[0]
-
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
-
-       OP_rr   v22.2d, v2.2d, v8.2d[1]
-       OP_ii   v22.2d, v3.2d, v9.2d[1]
-       OP_ri   v23.2d, v2.2d, v9.2d[1]
-       OP_ir   v23.2d, v3.2d, v8.2d[1]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
+
+       OP_rr   v18.2d, v2.2d, v8.d[0]
+       OP_ii   v18.2d, v3.2d, v9.d[0]
+       OP_ri   v19.2d, v2.2d, v9.d[0]
+       OP_ir   v19.2d, v3.2d, v8.d[0]
+
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
+
+       OP_rr   v22.2d, v2.2d, v8.d[1]
+       OP_ii   v22.2d, v3.2d, v9.d[1]
+       OP_ri   v23.2d, v2.2d, v9.d[1]
+       OP_ir   v23.2d, v3.2d, v8.d[1]
 .endm
 
 .macro SAVE4x2
@@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.2d, v1.2d}, [pA]
        add     pA, pA, #32
 
-       OP_rr   v16.2d, v0.2d, v8.2d[0]
-       OP_ii   v16.2d, v1.2d, v9.2d[0]
-       OP_ri   v17.2d, v0.2d, v9.2d[0]
-       OP_ir   v17.2d, v1.2d, v8.2d[0]
+       OP_rr   v16.2d, v0.2d, v8.d[0]
+       OP_ii   v16.2d, v1.2d, v9.d[0]
+       OP_ri   v17.2d, v0.2d, v9.d[0]
+       OP_ir   v17.2d, v1.2d, v8.d[0]
 
-       OP_rr   v20.2d, v0.2d, v8.2d[1]
-       OP_ii   v20.2d, v1.2d, v9.2d[1]
-       OP_ri   v21.2d, v0.2d, v9.2d[1]
-       OP_ir   v21.2d, v1.2d, v8.2d[1]
+       OP_rr   v20.2d, v0.2d, v8.d[1]
+       OP_ii   v20.2d, v1.2d, v9.d[1]
+       OP_ri   v21.2d, v0.2d, v9.d[1]
+       OP_ir   v21.2d, v1.2d, v8.d[1]
 .endm
 
 .macro SAVE2x2
@@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld2     {v0.d, v1.d}[0], [pA]
        add     pA, pA, #16
 
-       OP_rr   d16, d0, v8.2d[0]
-       OP_ii   d16, d1, v9.2d[0]
-       OP_ri   d17, d0, v9.2d[0]
-       OP_ir   d17, d1, v8.2d[0]
+       OP_rr   d16, d0, v8.d[0]
+       OP_ii   d16, d1, v9.d[0]
+       OP_ri   d17, d0, v9.d[0]
+       OP_ir   d17, d1, v8.d[0]
 
-       OP_rr   d20, d0, v8.2d[1]
-       OP_ii   d20, d1, v9.2d[1]
-       OP_ri   d21, d0, v9.2d[1]
-       OP_ir   d21, d1, v8.2d[1]
+       OP_rr   d20, d0, v8.d[1]
+       OP_ii   d20, d1, v9.d[1]
+       OP_ri   d21, d0, v9.d[1]
+       OP_ir   d21, d1, v8.d[1]
 .endm
 
 .macro SAVE1x2