ld2 {v2.4s, v3.4s}, [ppA]
add ppA, ppA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- fmul v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.4s, v2.4s, v9.4s[0]
+ fmls v19.4s, v2.4s, v9.s[0]
#else
- fmul v19.4s, v2.4s, v9.4s[0]
+ fmul v19.4s, v2.4s, v9.s[0]
#endif
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- fmul v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.4s, v2.4s, v9.4s[1]
+ fmls v23.4s, v2.4s, v9.s[1]
#else
- fmul v23.4s, v2.4s, v9.4s[1]
+ fmul v23.4s, v2.4s, v9.s[1]
#endif
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- fmul v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
+ fmul v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.4s, v2.4s, v9.4s[2]
+ fmls v27.4s, v2.4s, v9.s[2]
#else
- fmul v27.4s, v2.4s, v9.4s[2]
+ fmul v27.4s, v2.4s, v9.s[2]
#endif
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- fmul v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
+ fmul v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.4s, v2.4s, v9.4s[3]
+ fmls v31.4s, v2.4s, v9.s[3]
#else
- fmul v31.4s, v2.4s, v9.4s[3]
+ fmul v31.4s, v2.4s, v9.s[3]
#endif
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
.endm
.macro KERNEL8x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // for next round
add pB, pB, #32
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
ld2 {v4.4s, v5.4s} , [pA] // for next round
add pA, pA, #32
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
ld2 {v6.4s, v7.4s} , [ppA] // for next round
add ppA, ppA, #32
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
prfm PLDL1KEEP, [ppA, #512]
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
.endm
.macro KERNEL8x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // for next round
add pB, pB, #32
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
ld2 {v0.4s, v1.4s}, [pA] // for next round
add pA, pA, #32
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
prfm PLDL1KEEP, [ppA, #512]
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
ld2 {v2.4s, v3.4s}, [ppA] // for next round
add ppA, ppA, #32
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
.endm
.macro KERNEL8x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
-
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
-
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
-
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
-
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
-
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
-
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
-
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
+
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
+
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
+
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
+
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
+
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
+
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
+
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
.endm
.macro KERNEL8x4_SUB
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v2.4s, v3.4s}, [ppA]
add ppA, ppA, #32
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
-
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
-
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
-
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
-
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
-
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
+
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
+
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
+
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
+
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
+
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
.endm
.macro SAVE8x4
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
-
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
-
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
-
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
+
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro SAVE4x4
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.4s[0]
- OP_ii v16.2s, v1.2s, v9.4s[0]
- OP_ri v17.2s, v0.2s, v9.4s[0]
- OP_ir v17.2s, v1.2s, v8.4s[0]
-
- OP_rr v20.2s, v0.2s, v8.4s[1]
- OP_ii v20.2s, v1.2s, v9.4s[1]
- OP_ri v21.2s, v0.2s, v9.4s[1]
- OP_ir v21.2s, v1.2s, v8.4s[1]
-
- OP_rr v24.2s, v0.2s, v8.4s[2]
- OP_ii v24.2s, v1.2s, v9.4s[2]
- OP_ri v25.2s, v0.2s, v9.4s[2]
- OP_ir v25.2s, v1.2s, v8.4s[2]
-
- OP_rr v28.2s, v0.2s, v8.4s[3]
- OP_ii v28.2s, v1.2s, v9.4s[3]
- OP_ri v29.2s, v0.2s, v9.4s[3]
- OP_ir v29.2s, v1.2s, v8.4s[3]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
+
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
+
+ OP_rr v24.2s, v0.2s, v8.s[2]
+ OP_ii v24.2s, v1.2s, v9.s[2]
+ OP_ri v25.2s, v0.2s, v9.s[2]
+ OP_ir v25.2s, v1.2s, v8.s[2]
+
+ OP_rr v28.2s, v0.2s, v8.s[3]
+ OP_ii v28.2s, v1.2s, v9.s[3]
+ OP_ri v29.2s, v0.2s, v9.s[3]
+ OP_ir v29.2s, v1.2s, v8.s[3]
.endm
.macro SAVE2x4
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.4s[0]
- OP_ii s16, s1, v9.4s[0]
- OP_ri s17, s0, v9.4s[0]
- OP_ir s17, s1, v8.4s[0]
-
- OP_rr s20, s0, v8.4s[1]
- OP_ii s20, s1, v9.4s[1]
- OP_ri s21, s0, v9.4s[1]
- OP_ir s21, s1, v8.4s[1]
-
- OP_rr s24, s0, v8.4s[2]
- OP_ii s24, s1, v9.4s[2]
- OP_ri s25, s0, v9.4s[2]
- OP_ir s25, s1, v8.4s[2]
-
- OP_rr s28, s0, v8.4s[3]
- OP_ii s28, s1, v9.4s[3]
- OP_ri s29, s0, v9.4s[3]
- OP_ir s29, s1, v8.4s[3]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
+
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
+
+ OP_rr s24, s0, v8.s[2]
+ OP_ii s24, s1, v9.s[2]
+ OP_ri s25, s0, v9.s[2]
+ OP_ir s25, s1, v8.s[2]
+
+ OP_rr s28, s0, v8.s[3]
+ OP_ii s28, s1, v9.s[3]
+ OP_ri s29, s0, v9.s[3]
+ OP_ir s29, s1, v8.s[3]
.endm
.macro SAVE1x4
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE4x2
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.2s[0]
- OP_ii v16.2s, v1.2s, v9.2s[0]
- OP_ri v17.2s, v0.2s, v9.2s[0]
- OP_ir v17.2s, v1.2s, v8.2s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.2s[1]
- OP_ii v20.2s, v1.2s, v9.2s[1]
- OP_ri v21.2s, v0.2s, v9.2s[1]
- OP_ir v21.2s, v1.2s, v8.2s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE2x2
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.2s[0]
- OP_ii s16, s1, v9.2s[0]
- OP_ri s17, s0, v9.2s[0]
- OP_ir s17, s1, v8.2s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.2s[1]
- OP_ii s20, s1, v9.2s[1]
- OP_ri s21, s0, v9.2s[1]
- OP_ir s21, s1, v8.2s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
.endm
.macro SAVE1x2
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.4s, v2.4s, v9.4s[0]
+ fmls v19.4s, v2.4s, v9.s[0]
#else
- fmul v19.4s, v2.4s, v9.4s[0]
+ fmul v19.4s, v2.4s, v9.s[0]
#endif
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.4s, v2.4s, v9.4s[1]
+ fmls v23.4s, v2.4s, v9.s[1]
#else
- fmul v23.4s, v2.4s, v9.4s[1]
+ fmul v23.4s, v2.4s, v9.s[1]
#endif
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
+ fmul v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.4s, v2.4s, v9.4s[2]
+ fmls v27.4s, v2.4s, v9.s[2]
#else
- fmul v27.4s, v2.4s, v9.4s[2]
+ fmul v27.4s, v2.4s, v9.s[2]
#endif
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- fmul v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
+ fmul v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.4s, v2.4s, v9.4s[3]
+ fmls v31.4s, v2.4s, v9.s[3]
#else
- fmul v31.4s, v2.4s, v9.4s[3]
+ fmul v31.4s, v2.4s, v9.s[3]
#endif
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
.endm
.macro KERNEL8x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
-
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
-
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
-
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
-
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
-
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
-
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
-
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
+
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
+
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
+
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
+
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
.endm
.macro KERNEL8x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
-
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
-
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
-
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
-
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
-
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
-
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
-
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
+
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
+
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
+
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
+
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
+
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
+
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
+
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
ld2 {v8.4s, v9.4s}, [pB]
add pB, pB, #32
.endm
.macro KERNEL8x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
-
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
-
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
-
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
-
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
-
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
-
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
-
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
+
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
+
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
+
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
+
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
+
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
+
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
+
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
.endm
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
-
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
-
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
-
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
-
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
-
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
-
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
-
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
+
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
+
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
+
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
+
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
.endm
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
.endm
// KERNEL4x4_M1
// Applies OP_rr/OP_ii/OP_ri/OP_ir to the accumulator pairs v16/v17, v20/v21,
// v24/v25 and v28/v29, with v0/v1 as the vector operands and lanes 0..3 of
// v8/v9 as the by-element operands.
// Interleaved with the arithmetic it loads v12/v13 from pB and v4/v5 from pA
// (each pointer advanced by 32 bytes) and issues a prefetch at pA + 512.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL4x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
// KERNEL4x4_M2
// Same accumulator pairs as KERNEL4x4_M1 (v16/v17, v20/v21, v24/v25, v28/v29)
// but the operands are v4/v5 and lanes 0..3 of v12/v13, i.e. the registers
// that KERNEL4x4_M1 loads.
// Interleaved with the arithmetic it reloads v8/v9 from pB and v0/v1 from pA
// (each pointer advanced by 32 bytes) and issues a prefetch at pB + 512.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL4x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
// KERNEL4x4_E
// Applies OP_rr/OP_ii/OP_ri/OP_ir to v16/v17, v20/v21, v24/v25 and v28/v29
// using v4/v5 and lanes 0..3 of v12/v13. Unlike KERNEL4x4_M2 it performs no
// loads, pointer updates or prefetches.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL4x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
-
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
-
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
-
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
+
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
+
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
+
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
// KERNEL4x4_SUB
// Loads v0/v1 from pA with a de-interleaving ld2 (pA advanced by 32 bytes),
// then applies OP_rr/OP_ii/OP_ri/OP_ir to v16/v17, v20/v21, v24/v25 and
// v28/v29 using v0/v1 and lanes 0..3 of v8/v9.
// NOTE(review): no load of v8/v9 is visible in this body; either the caller
// provides them or lines were elided from this excerpt - confirm.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL4x4_SUB
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
-
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
-
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
-
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
+
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
// SAVE4x4 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs multiply-accumulate
// style OP_* steps; it contains no stores. This text looks like a diff with
// elided hunks, so the header and the body may belong to different macros -
// confirm against the full file before relying on the name.
// Visible behaviour: loads v0/v1 as 2-lane vectors from pA (pA advanced by
// 16 bytes), then applies OP_rr/OP_ii/OP_ri/OP_ir to the 2-lane accumulators
// v16/v17, v20/v21, v24/v25 and v28/v29 using lanes 0..3 of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro SAVE4x4
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.4s[0]
- OP_ii v16.2s, v1.2s, v9.4s[0]
- OP_ri v17.2s, v0.2s, v9.4s[0]
- OP_ir v17.2s, v1.2s, v8.4s[0]
-
- OP_rr v20.2s, v0.2s, v8.4s[1]
- OP_ii v20.2s, v1.2s, v9.4s[1]
- OP_ri v21.2s, v0.2s, v9.4s[1]
- OP_ir v21.2s, v1.2s, v8.4s[1]
-
- OP_rr v24.2s, v0.2s, v8.4s[2]
- OP_ii v24.2s, v1.2s, v9.4s[2]
- OP_ri v25.2s, v0.2s, v9.4s[2]
- OP_ir v25.2s, v1.2s, v8.4s[2]
-
- OP_rr v28.2s, v0.2s, v8.4s[3]
- OP_ii v28.2s, v1.2s, v9.4s[3]
- OP_ri v29.2s, v0.2s, v9.4s[3]
- OP_ir v29.2s, v1.2s, v8.4s[3]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
+
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
+
+ OP_rr v24.2s, v0.2s, v8.s[2]
+ OP_ii v24.2s, v1.2s, v9.s[2]
+ OP_ri v25.2s, v0.2s, v9.s[2]
+ OP_ir v25.2s, v1.2s, v8.s[2]
+
+ OP_rr v28.2s, v0.2s, v8.s[3]
+ OP_ii v28.2s, v1.2s, v9.s[3]
+ OP_ri v29.2s, v0.2s, v9.s[3]
+ OP_ir v29.2s, v1.2s, v8.s[3]
.endm
// SAVE2x4 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads one 32-bit element into lane 0 of each of v0 and
// v1 from pA (pA advanced by 8 bytes), then applies OP_rr/OP_ii/OP_ri/OP_ir
// to the scalar accumulators s16/s17, s20/s21, s24/s25 and s28/s29 using
// s0/s1 and lanes 0..3 of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro SAVE2x4
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.4s[0]
- OP_ii s16, s1, v9.4s[0]
- OP_ri s17, s0, v9.4s[0]
- OP_ir s17, s1, v8.4s[0]
-
- OP_rr s20, s0, v8.4s[1]
- OP_ii s20, s1, v9.4s[1]
- OP_ri s21, s0, v9.4s[1]
- OP_ir s21, s1, v8.4s[1]
-
- OP_rr s24, s0, v8.4s[2]
- OP_ii s24, s1, v9.4s[2]
- OP_ri s25, s0, v9.4s[2]
- OP_ir s25, s1, v8.4s[2]
-
- OP_rr s28, s0, v8.4s[3]
- OP_ii s28, s1, v9.4s[3]
- OP_ri s29, s0, v9.4s[3]
- OP_ir s29, s1, v8.4s[3]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
+
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
+
+ OP_rr s24, s0, v8.s[2]
+ OP_ii s24, s1, v9.s[2]
+ OP_ri s25, s0, v9.s[2]
+ OP_ir s25, s1, v8.s[2]
+
+ OP_rr s28, s0, v8.s[3]
+ OP_ii s28, s1, v9.s[3]
+ OP_ri s29, s0, v9.s[3]
+ OP_ir s29, s1, v8.s[3]
.endm
// SAVE1x4 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads v2/v3 from pA (pA advanced by 32 bytes), then
// applies OP_rr/OP_ii/OP_ri/OP_ir to v16/v17 and v20/v21 (operands v0/v1) and
// to v18/v19 and v22/v23 (operands v2/v3) using lanes 0 and 1 of v8/v9.
// No load of v0/v1 is visible here.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.2s[i] -> vN.s[i]).
.macro SAVE1x4
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
-
- OP_rr v18.4s, v2.4s, v8.2s[0]
- OP_ii v18.4s, v3.4s, v9.2s[0]
- OP_ri v19.4s, v2.4s, v9.2s[0]
- OP_ir v19.4s, v3.4s, v8.2s[0]
-
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
-
- OP_rr v22.4s, v2.4s, v8.2s[1]
- OP_ii v22.4s, v3.4s, v9.2s[1]
- OP_ri v23.4s, v2.4s, v9.2s[1]
- OP_ir v23.4s, v3.4s, v8.2s[1]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
.endm
// SAVE8x2 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads v0/v1 from pA (pA advanced by 32 bytes), then
// applies OP_rr/OP_ii/OP_ri/OP_ir to v16/v17 and v20/v21 using lanes 0 and 1
// of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.2s[i] -> vN.s[i]).
.macro SAVE8x2
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
.endm
// SAVE4x2 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads v0/v1 as 2-lane vectors from pA (pA advanced by
// 16 bytes), then applies OP_rr/OP_ii/OP_ri/OP_ir to the 2-lane accumulators
// v16/v17 and v20/v21 using lanes 0 and 1 of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.2s[i] -> vN.s[i]).
.macro SAVE4x2
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.2s[0]
- OP_ii v16.2s, v1.2s, v9.2s[0]
- OP_ri v17.2s, v0.2s, v9.2s[0]
- OP_ir v17.2s, v1.2s, v8.2s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.2s[1]
- OP_ii v20.2s, v1.2s, v9.2s[1]
- OP_ri v21.2s, v0.2s, v9.2s[1]
- OP_ir v21.2s, v1.2s, v8.2s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
.endm
// SAVE2x2 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads one 32-bit element into lane 0 of each of v0 and
// v1 from pA (pA advanced by 8 bytes), then applies OP_rr/OP_ii/OP_ri/OP_ir
// to the scalar accumulators s16/s17 and s20/s21 using s0/s1 and lanes 0 and
// 1 of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.2s[i] -> vN.s[i]).
.macro SAVE2x2
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.2s[0]
- OP_ii s16, s1, v9.2s[0]
- OP_ri s17, s0, v9.2s[0]
- OP_ir s17, s1, v8.2s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.2s[1]
- OP_ii s20, s1, v9.2s[1]
- OP_ri s21, s0, v9.2s[1]
- OP_ir s21, s1, v8.2s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
.endm
// SAVE1x2 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads v2/v3 from pA (pA advanced by 32 bytes), then
// applies OP_rr/OP_ii/OP_ri/OP_ir to v16/v17 (operands v0/v1) and v18/v19
// (operands v2/v3). Both by-element operands come from v8 here: lane 0 for
// OP_rr/OP_ir and lane 1 for OP_ii/OP_ri; v9 is not used.
// No load of v0/v1 is visible here.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro SAVE1x2
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v8.4s[1]
- OP_ri v17.4s, v0.4s, v8.4s[1]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v8.s[1]
+ OP_ri v17.4s, v0.4s, v8.s[1]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v8.4s[1]
- OP_ri v19.4s, v2.4s, v8.4s[1]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v8.s[1]
+ OP_ri v19.4s, v2.4s, v8.s[1]
+ OP_ir v19.4s, v3.4s, v8.s[0]
.endm
// SAVE8x1 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA/pB and initialises accumulators
// with fmul; it contains no stores. This text looks like a diff with elided
// hunks, so the header and the body may belong to different macros - confirm
// against the full file before relying on the name.
// Visible behaviour: loads v0/v1 from pA (pA advanced by 32 bytes). For each
// lane i = 0..3 of v8/v9 and accumulator pair (v16/v17, v20/v21, v24/v25,
// v28/v29):
//   - the first register is overwritten with fmul v0 * v8[i], then OP_ii is
//     applied with v1 and v9[i];
//   - the second register is either cleared with eor and then fmls-ed with
//     v0 * v9[i] (when any of NR/NC/TR/TC/RR/RC/CR/CC is defined), or
//     overwritten with fmul v0 * v9[i]; OP_ir is then applied with v1, v8[i].
// Finally v12/v13 are loaded from pB (pB advanced by 32 bytes).
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro SAVE8x1
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
.endm
// KERNEL4x4_M1
// Applies OP_rr/OP_ii/OP_ri/OP_ir to the accumulator pairs v16/v17, v20/v21,
// v24/v25 and v28/v29, with v0/v1 as the vector operands and lanes 0..3 of
// v8/v9 as the by-element operands.
// Interleaved with the arithmetic it loads v12/v13 from pB and v4/v5 from pA
// (each pointer advanced by 32 bytes) and issues a prefetch at pA + 512.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL4x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
// KERNEL4x4_M2
// Same accumulator pairs as KERNEL4x4_M1 (v16/v17, v20/v21, v24/v25, v28/v29)
// but the operands are v4/v5 and lanes 0..3 of v12/v13, i.e. the registers
// that KERNEL4x4_M1 loads.
// Interleaved with the arithmetic it reloads v8/v9 from pB and v0/v1 from pA
// (each pointer advanced by 32 bytes) and issues a prefetch at pB + 512.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL4x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
// KERNEL4x4_E
// Applies OP_rr/OP_ii/OP_ri/OP_ir to v16/v17, v20/v21, v24/v25 and v28/v29
// using v4/v5 and lanes 0..3 of v12/v13. Unlike KERNEL4x4_M2 it performs no
// loads, pointer updates or prefetches.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL4x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
-
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
-
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
-
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
+
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
+
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
+
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
// KERNEL4x4_SUB
// Loads v0/v1 from pA with a de-interleaving ld2 (pA advanced by 32 bytes),
// then applies OP_rr/OP_ii/OP_ri/OP_ir to v16/v17, v20/v21, v24/v25 and
// v28/v29 using v0/v1 and lanes 0..3 of v8/v9.
// NOTE(review): no load of v8/v9 is visible in this body; either the caller
// provides them or lines were elided from this excerpt - confirm.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL4x4_SUB
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
-
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
-
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
-
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
+
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
// SAVE4x4 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs multiply-accumulate
// style OP_* steps; it contains no stores. This text looks like a diff with
// elided hunks, so the header and the body may belong to different macros -
// confirm against the full file before relying on the name.
// Visible behaviour: loads v0/v1 as 2-lane vectors from pA (pA advanced by
// 16 bytes), then applies OP_rr/OP_ii/OP_ri/OP_ir to the 2-lane accumulators
// v16/v17, v20/v21, v24/v25 and v28/v29 using lanes 0..3 of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro SAVE4x4
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.4s[0]
- OP_ii v16.2s, v1.2s, v9.4s[0]
- OP_ri v17.2s, v0.2s, v9.4s[0]
- OP_ir v17.2s, v1.2s, v8.4s[0]
-
- OP_rr v20.2s, v0.2s, v8.4s[1]
- OP_ii v20.2s, v1.2s, v9.4s[1]
- OP_ri v21.2s, v0.2s, v9.4s[1]
- OP_ir v21.2s, v1.2s, v8.4s[1]
-
- OP_rr v24.2s, v0.2s, v8.4s[2]
- OP_ii v24.2s, v1.2s, v9.4s[2]
- OP_ri v25.2s, v0.2s, v9.4s[2]
- OP_ir v25.2s, v1.2s, v8.4s[2]
-
- OP_rr v28.2s, v0.2s, v8.4s[3]
- OP_ii v28.2s, v1.2s, v9.4s[3]
- OP_ri v29.2s, v0.2s, v9.4s[3]
- OP_ir v29.2s, v1.2s, v8.4s[3]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
+
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
+
+ OP_rr v24.2s, v0.2s, v8.s[2]
+ OP_ii v24.2s, v1.2s, v9.s[2]
+ OP_ri v25.2s, v0.2s, v9.s[2]
+ OP_ir v25.2s, v1.2s, v8.s[2]
+
+ OP_rr v28.2s, v0.2s, v8.s[3]
+ OP_ii v28.2s, v1.2s, v9.s[3]
+ OP_ri v29.2s, v0.2s, v9.s[3]
+ OP_ir v29.2s, v1.2s, v8.s[3]
.endm
// SAVE2x4 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads one 32-bit element into lane 0 of each of v0 and
// v1 from pA (pA advanced by 8 bytes), then applies OP_rr/OP_ii/OP_ri/OP_ir
// to the scalar accumulators s16/s17, s20/s21, s24/s25 and s28/s29 using
// s0/s1 and lanes 0..3 of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro SAVE2x4
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.4s[0]
- OP_ii s16, s1, v9.4s[0]
- OP_ri s17, s0, v9.4s[0]
- OP_ir s17, s1, v8.4s[0]
-
- OP_rr s20, s0, v8.4s[1]
- OP_ii s20, s1, v9.4s[1]
- OP_ri s21, s0, v9.4s[1]
- OP_ir s21, s1, v8.4s[1]
-
- OP_rr s24, s0, v8.4s[2]
- OP_ii s24, s1, v9.4s[2]
- OP_ri s25, s0, v9.4s[2]
- OP_ir s25, s1, v8.4s[2]
-
- OP_rr s28, s0, v8.4s[3]
- OP_ii s28, s1, v9.4s[3]
- OP_ri s29, s0, v9.4s[3]
- OP_ir s29, s1, v8.4s[3]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
+
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
+
+ OP_rr s24, s0, v8.s[2]
+ OP_ii s24, s1, v9.s[2]
+ OP_ri s25, s0, v9.s[2]
+ OP_ir s25, s1, v8.s[2]
+
+ OP_rr s28, s0, v8.s[3]
+ OP_ii s28, s1, v9.s[3]
+ OP_ri s29, s0, v9.s[3]
+ OP_ir s29, s1, v8.s[3]
.endm
// SAVE1x4 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads v0/v1 from pA (pA advanced by 32 bytes), then
// applies OP_rr/OP_ii/OP_ri/OP_ir to v16/v17 and v20/v21 using lanes 0 and 1
// of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.2s[i] -> vN.s[i]).
.macro SAVE1x4
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
.endm
// SAVE4x2 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads v0/v1 as 2-lane vectors from pA (pA advanced by
// 16 bytes), then applies OP_rr/OP_ii/OP_ri/OP_ir to the 2-lane accumulators
// v16/v17 and v20/v21 using lanes 0 and 1 of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.2s[i] -> vN.s[i]).
.macro SAVE4x2
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.2s[0]
- OP_ii v16.2s, v1.2s, v9.2s[0]
- OP_ri v17.2s, v0.2s, v9.2s[0]
- OP_ir v17.2s, v1.2s, v8.2s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.2s[1]
- OP_ii v20.2s, v1.2s, v9.2s[1]
- OP_ri v21.2s, v0.2s, v9.2s[1]
- OP_ir v21.2s, v1.2s, v8.2s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
.endm
// SAVE2x2 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA and runs OP_* arithmetic; it
// contains no stores. This text looks like a diff with elided hunks, so the
// header and the body may belong to different macros - confirm against the
// full file before relying on the name.
// Visible behaviour: loads one 32-bit element into lane 0 of each of v0 and
// v1 from pA (pA advanced by 8 bytes), then applies OP_rr/OP_ii/OP_ri/OP_ir
// to the scalar accumulators s16/s17 and s20/s21 using s0/s1 and lanes 0 and
// 1 of v8/v9.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.2s[i] -> vN.s[i]).
.macro SAVE2x2
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.2s[0]
- OP_ii s16, s1, v9.2s[0]
- OP_ri s17, s0, v9.2s[0]
- OP_ir s17, s1, v8.2s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.2s[1]
- OP_ii s20, s1, v9.2s[1]
- OP_ri s21, s0, v9.2s[1]
- OP_ir s21, s1, v8.2s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
.endm
// SAVE1x2 (header as it appears in this excerpt)
// NOTE(review): the body below loads from pA/pB and initialises accumulators
// with fmul; it contains no stores. This text looks like a diff with elided
// hunks, so the header and the body may belong to different macros - confirm
// against the full file before relying on the name.
// Visible behaviour: loads v2/v3 from pA (pA advanced by 32 bytes). For each
// lane i = 0..3 of v8/v9 it initialises two accumulator pairs, one fed by
// v0/v1 (v16/v17, v20/v21, v24/v25, v28/v29) and one fed by v2/v3 (v18/v19,
// v22/v23, v26/v27, v30/v31):
//   - the first register is overwritten with fmul a_lo * v8[i], then OP_ii
//     is applied with a_hi and v9[i];
//   - the second register is either cleared with eor and then fmls-ed with
//     a_lo * v9[i] (when any of NR/NC/TR/TC/RR/RC/CR/CC is defined), or
//     overwritten with fmul a_lo * v9[i]; OP_ir is then applied with a_hi and
//     v8[i].
// Here a_lo/a_hi stand for v0/v1 or v2/v3. No load of v0/v1 is visible here.
// Finally v12/v13 are loaded from pB (pB advanced by 32 bytes).
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro SAVE1x2
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.4s, v2.4s, v9.4s[0]
+ fmls v19.4s, v2.4s, v9.s[0]
#else
- fmul v19.4s, v2.4s, v9.4s[0]
+ fmul v19.4s, v2.4s, v9.s[0]
#endif
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.4s, v2.4s, v9.4s[1]
+ fmls v23.4s, v2.4s, v9.s[1]
#else
- fmul v23.4s, v2.4s, v9.4s[1]
+ fmul v23.4s, v2.4s, v9.s[1]
#endif
- OP_ir v23.4s, v3.4s, v8.4s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
+ fmul v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.4s, v2.4s, v9.4s[2]
+ fmls v27.4s, v2.4s, v9.s[2]
#else
- fmul v27.4s, v2.4s, v9.4s[2]
+ fmul v27.4s, v2.4s, v9.s[2]
#endif
- OP_ir v27.4s, v3.4s, v8.4s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
- fmul v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
+ fmul v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.4s, v2.4s, v9.4s[3]
+ fmls v31.4s, v2.4s, v9.s[3]
#else
- fmul v31.4s, v2.4s, v9.4s[3]
+ fmul v31.4s, v2.4s, v9.s[3]
#endif
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
.endm
// KERNEL8x4_M1
// Applies OP_rr/OP_ii/OP_ri/OP_ir to the sixteen accumulators v16..v31.
// For each lane i = 0..3 of v8/v9 there are two accumulator pairs: one fed
// by v0/v1 (v16/v17, v20/v21, v24/v25, v28/v29) and one fed by v2/v3
// (v18/v19, v22/v23, v26/v27, v30/v31).
// After the arithmetic it loads v12/v13 from pB (pB advanced by 32 bytes).
// NOTE(review): no load of v4..v7 is visible in this body although
// KERNEL8x4_M2 consumes them; lines may have been elided from this excerpt -
// confirm against the full file.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL8x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
-
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
-
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
-
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
-
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
-
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
-
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
-
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
+
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
+
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
+
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
+
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
.endm
// KERNEL8x4_M2
// Applies OP_rr/OP_ii/OP_ri/OP_ir to the sixteen accumulators v16..v31.
// For each lane i = 0..3 of v12/v13 there are two accumulator pairs: one fed
// by v4/v5 (v16/v17, v20/v21, v24/v25, v28/v29) and one fed by v6/v7
// (v18/v19, v22/v23, v26/v27, v30/v31).
// After the arithmetic it loads v8/v9 from pB (pB advanced by 32 bytes).
// NOTE(review): no load of v0..v3 is visible in this body although
// KERNEL8x4_M1 consumes them; lines may have been elided from this excerpt -
// confirm against the full file.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL8x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
-
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
-
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
-
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
-
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
-
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
-
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
-
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
+
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
+
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
+
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
+
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
+
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
+
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
+
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
ld2 {v8.4s, v9.4s}, [pB]
add pB, pB, #32
.endm
// KERNEL8x4_E
// Applies OP_rr/OP_ii/OP_ri/OP_ir to the sixteen accumulators v16..v31 using
// v4/v5 and v6/v7 as vector operands and lanes 0..3 of v12/v13 as by-element
// operands, in the same pairing as KERNEL8x4_M2. It performs no loads or
// pointer updates.
// The OP_* macros are defined outside this excerpt.
// Lines prefixed with - / + are the old and new spelling of one instruction;
// the only change is the by-element operand (vN.4s[i] -> vN.s[i]).
.macro KERNEL8x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
-
- OP_rr v18.4s, v6.4s, v12.4s[0]
- OP_ii v18.4s, v7.4s, v13.4s[0]
- OP_ri v19.4s, v6.4s, v13.4s[0]
- OP_ir v19.4s, v7.4s, v12.4s[0]
-
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
-
- OP_rr v22.4s, v6.4s, v12.4s[1]
- OP_ii v22.4s, v7.4s, v13.4s[1]
- OP_ri v23.4s, v6.4s, v13.4s[1]
- OP_ir v23.4s, v7.4s, v12.4s[1]
-
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
-
- OP_rr v26.4s, v6.4s, v12.4s[2]
- OP_ii v26.4s, v7.4s, v13.4s[2]
- OP_ri v27.4s, v6.4s, v13.4s[2]
- OP_ir v27.4s, v7.4s, v12.4s[2]
-
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
-
- OP_rr v30.4s, v6.4s, v12.4s[3]
- OP_ii v30.4s, v7.4s, v13.4s[3]
- OP_ri v31.4s, v6.4s, v13.4s[3]
- OP_ir v31.4s, v7.4s, v12.4s[3]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
+
+ OP_rr v18.4s, v6.4s, v12.s[0]
+ OP_ii v18.4s, v7.4s, v13.s[0]
+ OP_ri v19.4s, v6.4s, v13.s[0]
+ OP_ir v19.4s, v7.4s, v12.s[0]
+
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
+
+ OP_rr v22.4s, v6.4s, v12.s[1]
+ OP_ii v22.4s, v7.4s, v13.s[1]
+ OP_ri v23.4s, v6.4s, v13.s[1]
+ OP_ir v23.4s, v7.4s, v12.s[1]
+
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
+
+ OP_rr v26.4s, v6.4s, v12.s[2]
+ OP_ii v26.4s, v7.4s, v13.s[2]
+ OP_ri v27.4s, v6.4s, v13.s[2]
+ OP_ir v27.4s, v7.4s, v12.s[2]
+
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
+
+ OP_rr v30.4s, v6.4s, v12.s[3]
+ OP_ii v30.4s, v7.4s, v13.s[3]
+ OP_ri v31.4s, v6.4s, v13.s[3]
+ OP_ir v31.4s, v7.4s, v12.s[3]
.endm
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
-
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v9.4s[0]
- OP_ri v19.4s, v2.4s, v9.4s[0]
- OP_ir v19.4s, v3.4s, v8.4s[0]
-
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
-
- OP_rr v22.4s, v2.4s, v8.4s[1]
- OP_ii v22.4s, v3.4s, v9.4s[1]
- OP_ri v23.4s, v2.4s, v9.4s[1]
- OP_ir v23.4s, v3.4s, v8.4s[1]
-
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
-
- OP_rr v26.4s, v2.4s, v8.4s[2]
- OP_ii v26.4s, v3.4s, v9.4s[2]
- OP_ri v27.4s, v2.4s, v9.4s[2]
- OP_ir v27.4s, v3.4s, v8.4s[2]
-
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
-
- OP_rr v30.4s, v2.4s, v8.4s[3]
- OP_ii v30.4s, v3.4s, v9.4s[3]
- OP_ri v31.4s, v2.4s, v9.4s[3]
- OP_ir v31.4s, v3.4s, v8.4s[3]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
+
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
+
+ OP_rr v26.4s, v2.4s, v8.s[2]
+ OP_ii v26.4s, v3.4s, v9.s[2]
+ OP_ri v27.4s, v2.4s, v9.s[2]
+ OP_ir v27.4s, v3.4s, v8.s[2]
+
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
+
+ OP_rr v30.4s, v2.4s, v8.s[3]
+ OP_ii v30.4s, v3.4s, v9.s[3]
+ OP_ri v31.4s, v2.4s, v9.s[3]
+ OP_ir v31.4s, v3.4s, v8.s[3]
.endm
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- fmul v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
+ fmul v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.4s, v0.4s, v9.4s[0]
+ fmls v17.4s, v0.4s, v9.s[0]
#else
- fmul v17.4s, v0.4s, v9.4s[0]
+ fmul v17.4s, v0.4s, v9.s[0]
#endif
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
+ fmul v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.4s, v0.4s, v9.4s[1]
+ fmls v21.4s, v0.4s, v9.s[1]
#else
- fmul v21.4s, v0.4s, v9.4s[1]
+ fmul v21.4s, v0.4s, v9.s[1]
#endif
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
- fmul v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
+ fmul v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.4s, v0.4s, v9.4s[2]
+ fmls v25.4s, v0.4s, v9.s[2]
#else
- fmul v25.4s, v0.4s, v9.4s[2]
+ fmul v25.4s, v0.4s, v9.s[2]
#endif
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
+ fmul v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.4s, v0.4s, v9.4s[3]
+ fmls v29.4s, v0.4s, v9.s[3]
#else
- fmul v29.4s, v0.4s, v9.4s[3]
+ fmul v29.4s, v0.4s, v9.s[3]
#endif
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
ld2 {v12.4s, v13.4s}, [pB]
add pB, pB, #32
.endm
.macro KERNEL4x4_M1
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
ld2 {v12.4s, v13.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
ld2 {v4.4s, v5.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro KERNEL4x4_M2
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
ld2 {v8.4s, v9.4s}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
ld2 {v0.4s, v1.4s}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
.macro KERNEL4x4_E
- OP_rr v16.4s, v4.4s, v12.4s[0]
- OP_ii v16.4s, v5.4s, v13.4s[0]
- OP_ri v17.4s, v4.4s, v13.4s[0]
- OP_ir v17.4s, v5.4s, v12.4s[0]
-
- OP_rr v20.4s, v4.4s, v12.4s[1]
- OP_ii v20.4s, v5.4s, v13.4s[1]
- OP_ri v21.4s, v4.4s, v13.4s[1]
- OP_ir v21.4s, v5.4s, v12.4s[1]
-
- OP_rr v24.4s, v4.4s, v12.4s[2]
- OP_ii v24.4s, v5.4s, v13.4s[2]
- OP_ri v25.4s, v4.4s, v13.4s[2]
- OP_ir v25.4s, v5.4s, v12.4s[2]
-
- OP_rr v28.4s, v4.4s, v12.4s[3]
- OP_ii v28.4s, v5.4s, v13.4s[3]
- OP_ri v29.4s, v4.4s, v13.4s[3]
- OP_ir v29.4s, v5.4s, v12.4s[3]
+ OP_rr v16.4s, v4.4s, v12.s[0]
+ OP_ii v16.4s, v5.4s, v13.s[0]
+ OP_ri v17.4s, v4.4s, v13.s[0]
+ OP_ir v17.4s, v5.4s, v12.s[0]
+
+ OP_rr v20.4s, v4.4s, v12.s[1]
+ OP_ii v20.4s, v5.4s, v13.s[1]
+ OP_ri v21.4s, v4.4s, v13.s[1]
+ OP_ir v21.4s, v5.4s, v12.s[1]
+
+ OP_rr v24.4s, v4.4s, v12.s[2]
+ OP_ii v24.4s, v5.4s, v13.s[2]
+ OP_ri v25.4s, v4.4s, v13.s[2]
+ OP_ir v25.4s, v5.4s, v12.s[2]
+
+ OP_rr v28.4s, v4.4s, v12.s[3]
+ OP_ii v28.4s, v5.4s, v13.s[3]
+ OP_ri v29.4s, v4.4s, v13.s[3]
+ OP_ir v29.4s, v5.4s, v12.s[3]
.endm
.macro KERNEL4x4_SUB
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v9.4s[0]
- OP_ri v17.4s, v0.4s, v9.4s[0]
- OP_ir v17.4s, v1.4s, v8.4s[0]
-
- OP_rr v20.4s, v0.4s, v8.4s[1]
- OP_ii v20.4s, v1.4s, v9.4s[1]
- OP_ri v21.4s, v0.4s, v9.4s[1]
- OP_ir v21.4s, v1.4s, v8.4s[1]
-
- OP_rr v24.4s, v0.4s, v8.4s[2]
- OP_ii v24.4s, v1.4s, v9.4s[2]
- OP_ri v25.4s, v0.4s, v9.4s[2]
- OP_ir v25.4s, v1.4s, v8.4s[2]
-
- OP_rr v28.4s, v0.4s, v8.4s[3]
- OP_ii v28.4s, v1.4s, v9.4s[3]
- OP_ri v29.4s, v0.4s, v9.4s[3]
- OP_ir v29.4s, v1.4s, v8.4s[3]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v24.4s, v0.4s, v8.s[2]
+ OP_ii v24.4s, v1.4s, v9.s[2]
+ OP_ri v25.4s, v0.4s, v9.s[2]
+ OP_ir v25.4s, v1.4s, v8.s[2]
+
+ OP_rr v28.4s, v0.4s, v8.s[3]
+ OP_ii v28.4s, v1.4s, v9.s[3]
+ OP_ri v29.4s, v0.4s, v9.s[3]
+ OP_ir v29.4s, v1.4s, v8.s[3]
.endm
.macro SAVE4x4
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.4s[0]
- OP_ii v16.2s, v1.2s, v9.4s[0]
- OP_ri v17.2s, v0.2s, v9.4s[0]
- OP_ir v17.2s, v1.2s, v8.4s[0]
-
- OP_rr v20.2s, v0.2s, v8.4s[1]
- OP_ii v20.2s, v1.2s, v9.4s[1]
- OP_ri v21.2s, v0.2s, v9.4s[1]
- OP_ir v21.2s, v1.2s, v8.4s[1]
-
- OP_rr v24.2s, v0.2s, v8.4s[2]
- OP_ii v24.2s, v1.2s, v9.4s[2]
- OP_ri v25.2s, v0.2s, v9.4s[2]
- OP_ir v25.2s, v1.2s, v8.4s[2]
-
- OP_rr v28.2s, v0.2s, v8.4s[3]
- OP_ii v28.2s, v1.2s, v9.4s[3]
- OP_ri v29.2s, v0.2s, v9.4s[3]
- OP_ir v29.2s, v1.2s, v8.4s[3]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
+
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
+
+ OP_rr v24.2s, v0.2s, v8.s[2]
+ OP_ii v24.2s, v1.2s, v9.s[2]
+ OP_ri v25.2s, v0.2s, v9.s[2]
+ OP_ir v25.2s, v1.2s, v8.s[2]
+
+ OP_rr v28.2s, v0.2s, v8.s[3]
+ OP_ii v28.2s, v1.2s, v9.s[3]
+ OP_ri v29.2s, v0.2s, v9.s[3]
+ OP_ir v29.2s, v1.2s, v8.s[3]
.endm
.macro SAVE2x4
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.4s[0]
- OP_ii s16, s1, v9.4s[0]
- OP_ri s17, s0, v9.4s[0]
- OP_ir s17, s1, v8.4s[0]
-
- OP_rr s20, s0, v8.4s[1]
- OP_ii s20, s1, v9.4s[1]
- OP_ri s21, s0, v9.4s[1]
- OP_ir s21, s1, v8.4s[1]
-
- OP_rr s24, s0, v8.4s[2]
- OP_ii s24, s1, v9.4s[2]
- OP_ri s25, s0, v9.4s[2]
- OP_ir s25, s1, v8.4s[2]
-
- OP_rr s28, s0, v8.4s[3]
- OP_ii s28, s1, v9.4s[3]
- OP_ri s29, s0, v9.4s[3]
- OP_ir s29, s1, v8.4s[3]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
+
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
+
+ OP_rr s24, s0, v8.s[2]
+ OP_ii s24, s1, v9.s[2]
+ OP_ri s25, s0, v9.s[2]
+ OP_ir s25, s1, v8.s[2]
+
+ OP_rr s28, s0, v8.s[3]
+ OP_ii s28, s1, v9.s[3]
+ OP_ri s29, s0, v9.s[3]
+ OP_ir s29, s1, v8.s[3]
.endm
.macro SAVE1x4
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
-
- OP_rr v18.4s, v2.4s, v8.2s[0]
- OP_ii v18.4s, v3.4s, v9.2s[0]
- OP_ri v19.4s, v2.4s, v9.2s[0]
- OP_ir v19.4s, v3.4s, v8.2s[0]
-
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
-
- OP_rr v22.4s, v2.4s, v8.2s[1]
- OP_ii v22.4s, v3.4s, v9.2s[1]
- OP_ri v23.4s, v2.4s, v9.2s[1]
- OP_ir v23.4s, v3.4s, v8.2s[1]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v9.s[0]
+ OP_ri v19.4s, v2.4s, v9.s[0]
+ OP_ir v19.4s, v3.4s, v8.s[0]
+
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
+
+ OP_rr v22.4s, v2.4s, v8.s[1]
+ OP_ii v22.4s, v3.4s, v9.s[1]
+ OP_ri v23.4s, v2.4s, v9.s[1]
+ OP_ir v23.4s, v3.4s, v8.s[1]
.endm
.macro SAVE8x2
ld2 {v0.4s, v1.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.2s[0]
- OP_ii v16.4s, v1.4s, v9.2s[0]
- OP_ri v17.4s, v0.4s, v9.2s[0]
- OP_ir v17.4s, v1.4s, v8.2s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v20.4s, v0.4s, v8.2s[1]
- OP_ii v20.4s, v1.4s, v9.2s[1]
- OP_ri v21.4s, v0.4s, v9.2s[1]
- OP_ir v21.4s, v1.4s, v8.2s[1]
+ OP_rr v20.4s, v0.4s, v8.s[1]
+ OP_ii v20.4s, v1.4s, v9.s[1]
+ OP_ri v21.4s, v0.4s, v9.s[1]
+ OP_ir v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE4x2
ld2 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- OP_rr v16.2s, v0.2s, v8.2s[0]
- OP_ii v16.2s, v1.2s, v9.2s[0]
- OP_ri v17.2s, v0.2s, v9.2s[0]
- OP_ir v17.2s, v1.2s, v8.2s[0]
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
- OP_rr v20.2s, v0.2s, v8.2s[1]
- OP_ii v20.2s, v1.2s, v9.2s[1]
- OP_ri v21.2s, v0.2s, v9.2s[1]
- OP_ir v21.2s, v1.2s, v8.2s[1]
+ OP_rr v20.2s, v0.2s, v8.s[1]
+ OP_ii v20.2s, v1.2s, v9.s[1]
+ OP_ri v21.2s, v0.2s, v9.s[1]
+ OP_ir v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE2x2
ld2 {v0.s, v1.s}[0], [pA]
add pA, pA, #8
- OP_rr s16, s0, v8.2s[0]
- OP_ii s16, s1, v9.2s[0]
- OP_ri s17, s0, v9.2s[0]
- OP_ir s17, s1, v8.2s[0]
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
- OP_rr s20, s0, v8.2s[1]
- OP_ii s20, s1, v9.2s[1]
- OP_ri s21, s0, v9.2s[1]
- OP_ir s21, s1, v8.2s[1]
+ OP_rr s20, s0, v8.s[1]
+ OP_ii s20, s1, v9.s[1]
+ OP_ri s21, s0, v9.s[1]
+ OP_ir s21, s1, v8.s[1]
.endm
.macro SAVE1x2
ld2 {v2.4s, v3.4s}, [pA]
add pA, pA, #32
- OP_rr v16.4s, v0.4s, v8.4s[0]
- OP_ii v16.4s, v1.4s, v8.4s[1]
- OP_ri v17.4s, v0.4s, v8.4s[1]
- OP_ir v17.4s, v1.4s, v8.4s[0]
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v8.s[1]
+ OP_ri v17.4s, v0.4s, v8.s[1]
+ OP_ir v17.4s, v1.4s, v8.s[0]
- OP_rr v18.4s, v2.4s, v8.4s[0]
- OP_ii v18.4s, v3.4s, v8.4s[1]
- OP_ri v19.4s, v2.4s, v8.4s[1]
- OP_ir v19.4s, v3.4s, v8.4s[0]
+ OP_rr v18.4s, v2.4s, v8.s[0]
+ OP_ii v18.4s, v3.4s, v8.s[1]
+ OP_ri v19.4s, v2.4s, v8.s[1]
+ OP_ir v19.4s, v3.4s, v8.s[0]
.endm
.macro SAVE8x1
ldp q0, q1, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v11.2d[0]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v29.2d, v1.2d, v11.d[0]
ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmul v20.2d, v0.2d, v9.2d[0]
- fmul v25.2d, v1.2d, v10.2d[0]
+ fmul v20.2d, v0.2d, v9.d[0]
+ fmul v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmul v18.2d, v2.2d, v8.2d[0]
- fmul v31.2d, v3.2d, v11.2d[0]
+ fmul v18.2d, v2.2d, v8.d[0]
+ fmul v31.2d, v3.2d, v11.d[0]
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
- fmul v22.2d, v2.2d, v9.2d[0]
- fmul v27.2d, v3.2d, v10.2d[0]
+ fmul v22.2d, v2.2d, v9.d[0]
+ fmul v27.2d, v3.2d, v10.d[0]
ldp d12, d13, [pB]
add pB, pB, #16
- fmul v24.2d, v0.2d, v10.2d[0]
- fmul v21.2d, v1.2d, v9.2d[0]
+ fmul v24.2d, v0.2d, v10.d[0]
+ fmul v21.2d, v1.2d, v9.d[0]
ldp q4, q5, [pA] // for next round
add pA, pA, #32
- fmul v26.2d, v2.2d, v10.2d[0]
- fmul v23.2d, v3.2d, v9.2d[0]
+ fmul v26.2d, v2.2d, v10.d[0]
+ fmul v23.2d, v3.2d, v9.d[0]
ldp q6, q7, [ppA] // for next round
add ppA, ppA, #32
- fmul v28.2d, v0.2d, v11.2d[0]
- fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v28.2d, v0.2d, v11.d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
ldp d14, d15, [pB]
add pB, pB, #16
- fmul v30.2d, v2.2d, v11.2d[0]
- fmul v19.2d, v3.2d, v8.2d[0]
+ fmul v30.2d, v2.2d, v11.d[0]
+ fmul v19.2d, v3.2d, v8.d[0]
.endm
.macro KERNEL8x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
ldp d8, d9, [pB]
add pB, pB, #16
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v31.2d, v7.2d, v15.2d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
ldp d10, d11, [pB]
add pB, pB, #16
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v22.2d, v6.2d, v13.2d[0]
- fmla v27.2d, v7.2d, v14.2d[0]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
+ fmla v27.2d, v7.2d, v14.d[0]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
ldp q0, q1, [pA]
add pA, pA, #32
- fmla v26.2d, v6.2d, v14.2d[0]
- fmla v23.2d, v7.2d, v13.2d[0]
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmla v30.2d, v6.2d, v15.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
.endm
.macro KERNEL8x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
ldp d12, d13, [pB]
add pB, pB, #16
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v31.2d, v3.2d, v11.2d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v31.2d, v3.2d, v11.d[0]
ldp d14, d15, [pB]
add pB, pB, #16
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmla v22.2d, v2.2d, v9.2d[0]
- fmla v27.2d, v3.2d, v10.2d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
+ fmla v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
ldp q4, q5, [pA]
add pA, pA, #32
- fmla v26.2d, v2.2d, v10.2d[0]
- fmla v23.2d, v3.2d, v9.2d[0]
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
ldp q6, q7, [ppA]
add ppA, ppA, #32
- fmla v30.2d, v2.2d, v11.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro KERNEL8x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v27.2d, v7.2d, v14.2d[0]
-
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v22.2d, v6.2d, v13.2d[0]
- fmla v31.2d, v7.2d, v15.2d[0]
-
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v26.2d, v6.2d, v14.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
-
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v30.2d, v6.2d, v15.2d[0]
- fmla v23.2d, v7.2d, v13.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v27.2d, v7.2d, v14.d[0]
+
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
+
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
+
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
.endm
.macro KERNEL8x4_SUB
ldp q0, q1, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v31.2d, v3.2d, v11.2d[0]
- fmla v22.2d, v2.2d, v9.2d[0]
- fmla v27.2d, v3.2d, v10.2d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v31.2d, v3.2d, v11.d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
+ fmla v27.2d, v3.2d, v10.d[0]
- fmla v26.2d, v2.2d, v10.2d[0]
- fmla v23.2d, v3.2d, v9.2d[0]
- fmla v30.2d, v2.2d, v11.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x4
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v17.2d, v1.2d, v8.2d[0]
- fmul v18.2d, v0.2d, v8.2d[1]
- fmul v19.2d, v1.2d, v8.2d[1]
-
- fmul v20.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v9.2d[0]
- fmul v22.2d, v0.2d, v9.2d[1]
- fmul v23.2d, v1.2d, v9.2d[1]
-
- fmul v24.2d, v0.2d, v10.2d[0]
- fmul v25.2d, v1.2d, v10.2d[0]
- fmul v26.2d, v0.2d, v10.2d[1]
- fmul v27.2d, v1.2d, v10.2d[1]
-
- fmul v28.2d, v0.2d, v11.2d[0]
- fmul v29.2d, v1.2d, v11.2d[0]
- fmul v30.2d, v0.2d, v11.2d[1]
- fmul v31.2d, v1.2d, v11.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
+ fmul v18.2d, v0.2d, v8.d[1]
+ fmul v19.2d, v1.2d, v8.d[1]
+
+ fmul v20.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v9.d[0]
+ fmul v22.2d, v0.2d, v9.d[1]
+ fmul v23.2d, v1.2d, v9.d[1]
+
+ fmul v24.2d, v0.2d, v10.d[0]
+ fmul v25.2d, v1.2d, v10.d[0]
+ fmul v26.2d, v0.2d, v10.d[1]
+ fmul v27.2d, v1.2d, v10.d[1]
+
+ fmul v28.2d, v0.2d, v11.d[0]
+ fmul v29.2d, v1.2d, v11.d[0]
+ fmul v30.2d, v0.2d, v11.d[1]
+ fmul v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
.endm
.macro KERNEL4x8_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
- fmla v19.2d, v1.2d, v8.2d[1]
-
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
- fmla v23.2d, v1.2d, v9.2d[1]
-
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
- fmla v27.2d, v1.2d, v10.2d[1]
-
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
- fmla v31.2d, v1.2d, v11.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
+ fmla v19.2d, v1.2d, v8.d[1]
+
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
+ fmla v23.2d, v1.2d, v9.d[1]
+
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
+ fmla v27.2d, v1.2d, v10.d[1]
+
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
+ fmla v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
.endm
.macro KERNEL4x8_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v4.2d, v12.2d[1]
- fmla v19.2d, v5.2d, v12.2d[1]
-
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v22.2d, v4.2d, v13.2d[1]
- fmla v23.2d, v5.2d, v13.2d[1]
-
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v26.2d, v4.2d, v14.2d[1]
- fmla v27.2d, v5.2d, v14.2d[1]
-
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v30.2d, v4.2d, v15.2d[1]
- fmla v31.2d, v5.2d, v15.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v4.2d, v12.d[1]
+ fmla v19.2d, v5.2d, v12.d[1]
+
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v22.2d, v4.2d, v13.d[1]
+ fmla v23.2d, v5.2d, v13.d[1]
+
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v26.2d, v4.2d, v14.d[1]
+ fmla v27.2d, v5.2d, v14.d[1]
+
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v30.2d, v4.2d, v15.d[1]
+ fmla v31.2d, v5.2d, v15.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
.endm
.macro KERNEL4x8_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v4.2d, v12.2d[1]
- fmla v19.2d, v5.2d, v12.2d[1]
-
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v22.2d, v4.2d, v13.2d[1]
- fmla v23.2d, v5.2d, v13.2d[1]
-
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v26.2d, v4.2d, v14.2d[1]
- fmla v27.2d, v5.2d, v14.2d[1]
-
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v30.2d, v4.2d, v15.2d[1]
- fmla v31.2d, v5.2d, v15.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v4.2d, v12.d[1]
+ fmla v19.2d, v5.2d, v12.d[1]
+
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v22.2d, v4.2d, v13.d[1]
+ fmla v23.2d, v5.2d, v13.d[1]
+
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v26.2d, v4.2d, v14.d[1]
+ fmla v27.2d, v5.2d, v14.d[1]
+
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v30.2d, v4.2d, v15.d[1]
+ fmla v31.2d, v5.2d, v15.d[1]
.endm
.macro KERNEL4x8_SUB
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
- fmla v19.2d, v1.2d, v8.2d[1]
-
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
- fmla v23.2d, v1.2d, v9.2d[1]
-
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
- fmla v27.2d, v1.2d, v10.2d[1]
-
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
- fmla v31.2d, v1.2d, v11.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
+ fmla v19.2d, v1.2d, v8.d[1]
+
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
+ fmla v23.2d, v1.2d, v9.d[1]
+
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
+ fmla v27.2d, v1.2d, v10.d[1]
+
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
+ fmla v31.2d, v1.2d, v11.d[1]
.endm
.macro SAVE4x8
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
.endm
.macro SAVE2x8
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v29.2d, v1.2d, v9.d[1]
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v20.2d, v0.2d, v8.d[1]
+ fmul v25.2d, v1.2d, v9.d[0]
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v24.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v8.d[1]
- fmul v28.2d, v0.2d, v9.2d[1]
- fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v28.2d, v0.2d, v9.d[1]
+ fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
.endm
.macro KERNEL4x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
ldp d8, d9, [pB], #16
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v20.2d, v0.2d, v9.2d[0]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
- fmul v17.2d, v1.2d, v8.2d[0]
- fmul v21.2d, v1.2d, v9.2d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
+ fmul v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
- fmul v24.2d, v0.2d, v10.2d[0]
- fmul v28.2d, v0.2d, v11.2d[0]
+ fmul v24.2d, v0.2d, v10.d[0]
+ fmul v28.2d, v0.2d, v11.d[0]
ldp q4, q5, [pA], #32
- fmul v25.2d, v1.2d, v10.2d[0]
- fmul v29.2d, v1.2d, v11.2d[0]
+ fmul v25.2d, v1.2d, v10.d[0]
+ fmul v29.2d, v1.2d, v11.d[0]
ldp d12, d13, [pB], #16
- fmul v18.2d, v2.2d, v8.2d[0]
- fmul v22.2d, v2.2d, v9.2d[0]
+ fmul v18.2d, v2.2d, v8.d[0]
+ fmul v22.2d, v2.2d, v9.d[0]
ldp d14, d15, [pB], #16
- fmul v26.2d, v2.2d, v10.2d[0]
- fmul v30.2d, v2.2d, v11.2d[0]
+ fmul v26.2d, v2.2d, v10.d[0]
+ fmul v30.2d, v2.2d, v11.d[0]
ldp q6, q7, [pA], #32
- fmul v19.2d, v3.2d, v8.2d[0]
- fmul v27.2d, v3.2d, v10.2d[0]
+ fmul v19.2d, v3.2d, v8.d[0]
+ fmul v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmul v31.2d, v3.2d, v11.2d[0]
- fmul v23.2d, v3.2d, v9.2d[0]
+ fmul v31.2d, v3.2d, v11.d[0]
+ fmul v23.2d, v3.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL8x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
ldp q4, q5, [pA], #32
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
ldp d12, d13, [pB], #16
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
ldp d14, d15, [pB], #16
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmla v26.2d, v2.2d, v10.2d[0]
- fmla v30.2d, v2.2d, v11.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
- fmla v23.2d, v3.2d, v9.2d[0]
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
ldp q6, q7, [pA], #32
- fmla v27.2d, v3.2d, v10.2d[0]
- fmla v31.2d, v3.2d, v11.2d[0]
+ fmla v27.2d, v3.2d, v10.d[0]
+ fmla v31.2d, v3.2d, v11.d[0]
.endm
.macro KERNEL8x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v28.2d, v4.2d, v15.d[0]
ldp q0, q1, [pA], #32
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
ldp d8, d9, [pB], #16
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
ldp d10, d11, [pB], #16
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v22.2d, v6.2d, v13.2d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v26.2d, v6.2d, v14.2d[0]
- fmla v30.2d, v6.2d, v15.2d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
- fmla v23.2d, v7.2d, v13.2d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
ldp q2, q3, [pA], #32
- fmla v27.2d, v7.2d, v14.2d[0]
- fmla v31.2d, v7.2d, v15.2d[0]
+ fmla v27.2d, v7.2d, v14.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v28.2d, v4.2d, v15.d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v22.2d, v6.2d, v13.2d[0]
- fmla v26.2d, v6.2d, v14.2d[0]
- fmla v30.2d, v6.2d, v15.2d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v22.2d, v6.2d, v13.d[0]
+ fmla v26.2d, v6.2d, v14.d[0]
+ fmla v30.2d, v6.2d, v15.d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
- fmla v23.2d, v7.2d, v13.2d[0]
- fmla v27.2d, v7.2d, v14.2d[0]
- fmla v31.2d, v7.2d, v15.2d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
+ fmla v23.2d, v7.2d, v13.d[0]
+ fmla v27.2d, v7.2d, v14.d[0]
+ fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_SUB
ldp d8, d9, [pB], #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v28.2d, v0.2d, v11.d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
- fmla v26.2d, v2.2d, v10.2d[0]
- fmla v30.2d, v2.2d, v11.2d[0]
+ fmla v26.2d, v2.2d, v10.d[0]
+ fmla v30.2d, v2.2d, v11.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
- fmla v19.2d, v3.2d, v8.2d[0]
- fmla v27.2d, v3.2d, v10.2d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
+ fmla v27.2d, v3.2d, v10.d[0]
- fmla v31.2d, v3.2d, v11.2d[0]
- fmla v23.2d, v3.2d, v9.2d[0]
+ fmla v31.2d, v3.2d, v11.d[0]
+ fmla v23.2d, v3.2d, v9.d[0]
.endm
.macro SAVE8x4
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v23.2d, v3.2d, v8.2d[1]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
+ fmla v22.2d, v2.2d, v8.d[1]
+ fmla v23.2d, v3.2d, v8.d[1]
.endm
.macro SAVE8x2
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x1
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v29.2d, v1.2d, v9.d[1]
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v20.2d, v0.2d, v8.d[1]
+ fmul v25.2d, v1.2d, v9.d[0]
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v24.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v8.d[1]
- fmul v28.2d, v0.2d, v9.2d[1]
- fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v28.2d, v0.2d, v9.d[1]
+ fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
.endm
.macro KERNEL4x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v17.2d, v1.2d, v8.2d[0]
- fmul v18.2d, v0.2d, v8.2d[1]
- fmul v19.2d, v1.2d, v8.2d[1]
-
- fmul v20.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v9.2d[0]
- fmul v22.2d, v0.2d, v9.2d[1]
- fmul v23.2d, v1.2d, v9.2d[1]
-
- fmul v24.2d, v0.2d, v10.2d[0]
- fmul v25.2d, v1.2d, v10.2d[0]
- fmul v26.2d, v0.2d, v10.2d[1]
- fmul v27.2d, v1.2d, v10.2d[1]
-
- fmul v28.2d, v0.2d, v11.2d[0]
- fmul v29.2d, v1.2d, v11.2d[0]
- fmul v30.2d, v0.2d, v11.2d[1]
- fmul v31.2d, v1.2d, v11.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
+ fmul v18.2d, v0.2d, v8.d[1]
+ fmul v19.2d, v1.2d, v8.d[1]
+
+ fmul v20.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v9.d[0]
+ fmul v22.2d, v0.2d, v9.d[1]
+ fmul v23.2d, v1.2d, v9.d[1]
+
+ fmul v24.2d, v0.2d, v10.d[0]
+ fmul v25.2d, v1.2d, v10.d[0]
+ fmul v26.2d, v0.2d, v10.d[1]
+ fmul v27.2d, v1.2d, v10.d[1]
+
+ fmul v28.2d, v0.2d, v11.d[0]
+ fmul v29.2d, v1.2d, v11.d[0]
+ fmul v30.2d, v0.2d, v11.d[1]
+ fmul v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
.endm
.macro KERNEL4x8_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
- fmla v19.2d, v1.2d, v8.2d[1]
-
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
- fmla v23.2d, v1.2d, v9.2d[1]
-
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
- fmla v27.2d, v1.2d, v10.2d[1]
-
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
- fmla v31.2d, v1.2d, v11.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
+ fmla v19.2d, v1.2d, v8.d[1]
+
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
+ fmla v23.2d, v1.2d, v9.d[1]
+
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
+ fmla v27.2d, v1.2d, v10.d[1]
+
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
+ fmla v31.2d, v1.2d, v11.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
.endm
.macro KERNEL4x8_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v4.2d, v12.2d[1]
- fmla v19.2d, v5.2d, v12.2d[1]
-
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v22.2d, v4.2d, v13.2d[1]
- fmla v23.2d, v5.2d, v13.2d[1]
-
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v26.2d, v4.2d, v14.2d[1]
- fmla v27.2d, v5.2d, v14.2d[1]
-
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v30.2d, v4.2d, v15.2d[1]
- fmla v31.2d, v5.2d, v15.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v4.2d, v12.d[1]
+ fmla v19.2d, v5.2d, v12.d[1]
+
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v22.2d, v4.2d, v13.d[1]
+ fmla v23.2d, v5.2d, v13.d[1]
+
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v26.2d, v4.2d, v14.d[1]
+ fmla v27.2d, v5.2d, v14.d[1]
+
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v30.2d, v4.2d, v15.d[1]
+ fmla v31.2d, v5.2d, v15.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
.endm
.macro KERNEL4x8_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v4.2d, v12.2d[1]
- fmla v19.2d, v5.2d, v12.2d[1]
-
- fmla v20.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v13.2d[0]
- fmla v22.2d, v4.2d, v13.2d[1]
- fmla v23.2d, v5.2d, v13.2d[1]
-
- fmla v24.2d, v4.2d, v14.2d[0]
- fmla v25.2d, v5.2d, v14.2d[0]
- fmla v26.2d, v4.2d, v14.2d[1]
- fmla v27.2d, v5.2d, v14.2d[1]
-
- fmla v28.2d, v4.2d, v15.2d[0]
- fmla v29.2d, v5.2d, v15.2d[0]
- fmla v30.2d, v4.2d, v15.2d[1]
- fmla v31.2d, v5.2d, v15.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v4.2d, v12.d[1]
+ fmla v19.2d, v5.2d, v12.d[1]
+
+ fmla v20.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v13.d[0]
+ fmla v22.2d, v4.2d, v13.d[1]
+ fmla v23.2d, v5.2d, v13.d[1]
+
+ fmla v24.2d, v4.2d, v14.d[0]
+ fmla v25.2d, v5.2d, v14.d[0]
+ fmla v26.2d, v4.2d, v14.d[1]
+ fmla v27.2d, v5.2d, v14.d[1]
+
+ fmla v28.2d, v4.2d, v15.d[0]
+ fmla v29.2d, v5.2d, v15.d[0]
+ fmla v30.2d, v4.2d, v15.d[1]
+ fmla v31.2d, v5.2d, v15.d[1]
.endm
.macro KERNEL4x8_SUB
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
- fmla v19.2d, v1.2d, v8.2d[1]
-
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
- fmla v23.2d, v1.2d, v9.2d[1]
-
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v25.2d, v1.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
- fmla v27.2d, v1.2d, v10.2d[1]
-
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v29.2d, v1.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
- fmla v31.2d, v1.2d, v11.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
+ fmla v19.2d, v1.2d, v8.d[1]
+
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
+ fmla v23.2d, v1.2d, v9.d[1]
+
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v25.2d, v1.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
+ fmla v27.2d, v1.2d, v10.d[1]
+
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v29.2d, v1.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
+ fmla v31.2d, v1.2d, v11.d[1]
.endm
.macro SAVE4x8
ld1 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v18.2d, v0.2d, v8.d[1]
- fmla v20.2d, v0.2d, v9.2d[0]
- fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v20.2d, v0.2d, v9.d[0]
+ fmla v22.2d, v0.2d, v9.d[1]
- fmla v24.2d, v0.2d, v10.2d[0]
- fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v24.2d, v0.2d, v10.d[0]
+ fmla v26.2d, v0.2d, v10.d[1]
- fmla v28.2d, v0.2d, v11.2d[0]
- fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v28.2d, v0.2d, v11.d[0]
+ fmla v30.2d, v0.2d, v11.d[1]
.endm
.macro SAVE2x8
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v29.2d, v1.2d, v9.d[1]
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v20.2d, v0.2d, v8.d[1]
+ fmul v25.2d, v1.2d, v9.d[0]
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v24.2d, v0.2d, v9.d[0]
+ fmul v21.2d, v1.2d, v8.d[1]
- fmul v28.2d, v0.2d, v9.2d[1]
- fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v28.2d, v0.2d, v9.d[1]
+ fmul v17.2d, v1.2d, v8.d[0]
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
.endm
.macro KERNEL4x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
ld1 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
ld1 {v4.2d, v5.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
ld1 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v29.2d, v5.2d, v13.d[1]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v25.2d, v5.2d, v13.d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v21.2d, v5.2d, v12.d[1]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v17.2d, v5.2d, v12.d[0]
.endm
.macro KERNEL4x4_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- fmul v17.2d, v1.2d, v8.2d[0]
- fmul v18.2d, v2.2d, v8.2d[0]
- fmul v19.2d, v3.2d, v8.2d[0]
-
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v21.2d, v1.2d, v8.2d[1]
- fmul v22.2d, v2.2d, v8.2d[1]
- fmul v23.2d, v3.2d, v8.2d[1]
-
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v25.2d, v1.2d, v9.2d[0]
- fmul v26.2d, v2.2d, v9.2d[0]
- fmul v27.2d, v3.2d, v9.2d[0]
-
- fmul v28.2d, v0.2d, v9.2d[1]
- fmul v29.2d, v1.2d, v9.2d[1]
- fmul v30.2d, v2.2d, v9.2d[1]
- fmul v31.2d, v3.2d, v9.2d[1]
+ fmul v16.2d, v0.2d, v8.d[0]
+ fmul v17.2d, v1.2d, v8.d[0]
+ fmul v18.2d, v2.2d, v8.d[0]
+ fmul v19.2d, v3.2d, v8.d[0]
+
+ fmul v20.2d, v0.2d, v8.d[1]
+ fmul v21.2d, v1.2d, v8.d[1]
+ fmul v22.2d, v2.2d, v8.d[1]
+ fmul v23.2d, v3.2d, v8.d[1]
+
+ fmul v24.2d, v0.2d, v9.d[0]
+ fmul v25.2d, v1.2d, v9.d[0]
+ fmul v26.2d, v2.2d, v9.d[0]
+ fmul v27.2d, v3.2d, v9.d[0]
+
+ fmul v28.2d, v0.2d, v9.d[1]
+ fmul v29.2d, v1.2d, v9.d[1]
+ fmul v30.2d, v2.2d, v9.d[1]
+ fmul v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
.endm
.macro KERNEL8x4_M1
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
-
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v23.2d, v3.2d, v8.2d[1]
-
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v25.2d, v1.2d, v9.2d[0]
- fmla v26.2d, v2.2d, v9.2d[0]
- fmla v27.2d, v3.2d, v9.2d[0]
-
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v29.2d, v1.2d, v9.2d[1]
- fmla v30.2d, v2.2d, v9.2d[1]
- fmla v31.2d, v3.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
+
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
+ fmla v22.2d, v2.2d, v8.d[1]
+ fmla v23.2d, v3.2d, v8.d[1]
+
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v25.2d, v1.2d, v9.d[0]
+ fmla v26.2d, v2.2d, v9.d[0]
+ fmla v27.2d, v3.2d, v9.d[0]
+
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v29.2d, v1.2d, v9.d[1]
+ fmla v30.2d, v2.2d, v9.d[1]
+ fmla v31.2d, v3.2d, v9.d[1]
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
.endm
.macro KERNEL8x4_M2
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
-
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v21.2d, v5.2d, v12.2d[1]
- fmla v22.2d, v6.2d, v12.2d[1]
- fmla v23.2d, v7.2d, v12.2d[1]
-
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v25.2d, v5.2d, v13.2d[0]
- fmla v26.2d, v6.2d, v13.2d[0]
- fmla v27.2d, v7.2d, v13.2d[0]
-
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v29.2d, v5.2d, v13.2d[1]
- fmla v30.2d, v6.2d, v13.2d[1]
- fmla v31.2d, v7.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
+
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v21.2d, v5.2d, v12.d[1]
+ fmla v22.2d, v6.2d, v12.d[1]
+ fmla v23.2d, v7.2d, v12.d[1]
+
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v25.2d, v5.2d, v13.d[0]
+ fmla v26.2d, v6.2d, v13.d[0]
+ fmla v27.2d, v7.2d, v13.d[0]
+
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v29.2d, v5.2d, v13.d[1]
+ fmla v30.2d, v6.2d, v13.d[1]
+ fmla v31.2d, v7.2d, v13.d[1]
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
.endm
.macro KERNEL8x4_E
- fmla v16.2d, v4.2d, v12.2d[0]
- fmla v17.2d, v5.2d, v12.2d[0]
- fmla v18.2d, v6.2d, v12.2d[0]
- fmla v19.2d, v7.2d, v12.2d[0]
-
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v21.2d, v5.2d, v12.2d[1]
- fmla v22.2d, v6.2d, v12.2d[1]
- fmla v23.2d, v7.2d, v12.2d[1]
-
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v25.2d, v5.2d, v13.2d[0]
- fmla v26.2d, v6.2d, v13.2d[0]
- fmla v27.2d, v7.2d, v13.2d[0]
-
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v29.2d, v5.2d, v13.2d[1]
- fmla v30.2d, v6.2d, v13.2d[1]
- fmla v31.2d, v7.2d, v13.2d[1]
+ fmla v16.2d, v4.2d, v12.d[0]
+ fmla v17.2d, v5.2d, v12.d[0]
+ fmla v18.2d, v6.2d, v12.d[0]
+ fmla v19.2d, v7.2d, v12.d[0]
+
+ fmla v20.2d, v4.2d, v12.d[1]
+ fmla v21.2d, v5.2d, v12.d[1]
+ fmla v22.2d, v6.2d, v12.d[1]
+ fmla v23.2d, v7.2d, v12.d[1]
+
+ fmla v24.2d, v4.2d, v13.d[0]
+ fmla v25.2d, v5.2d, v13.d[0]
+ fmla v26.2d, v6.2d, v13.d[0]
+ fmla v27.2d, v7.2d, v13.d[0]
+
+ fmla v28.2d, v4.2d, v13.d[1]
+ fmla v29.2d, v5.2d, v13.d[1]
+ fmla v30.2d, v6.2d, v13.d[1]
+ fmla v31.2d, v7.2d, v13.d[1]
.endm
.macro KERNEL8x4_SUB
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
-
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v23.2d, v3.2d, v8.2d[1]
-
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v25.2d, v1.2d, v9.2d[0]
- fmla v26.2d, v2.2d, v9.2d[0]
- fmla v27.2d, v3.2d, v9.2d[0]
-
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v29.2d, v1.2d, v9.2d[1]
- fmla v30.2d, v2.2d, v9.2d[1]
- fmla v31.2d, v3.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
+
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
+ fmla v22.2d, v2.2d, v8.d[1]
+ fmla v23.2d, v3.2d, v8.d[1]
+
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v25.2d, v1.2d, v9.d[0]
+ fmla v26.2d, v2.2d, v9.d[0]
+ fmla v27.2d, v3.2d, v9.d[0]
+
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v29.2d, v1.2d, v9.d[1]
+ fmla v30.2d, v2.2d, v9.d[1]
+ fmla v31.2d, v3.2d, v9.d[1]
.endm
.macro SAVE8x4
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v29.2d, v1.2d, v9.d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v25.2d, v1.2d, v9.d[0]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v21.2d, v1.2d, v8.d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x4
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v24.2d, v0.2d, v9.d[0]
+ fmla v28.2d, v0.2d, v9.d[1]
.endm
.macro SAVE2x4
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v23.2d, v3.2d, v8.2d[1]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
+ fmla v22.2d, v2.2d, v8.d[1]
+ fmla v23.2d, v3.2d, v8.d[1]
.endm
.macro SAVE8x2
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
+ fmla v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE4x2
ld1 {v0.2d}, [pA]
add pA, pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v20.2d, v0.2d, v8.d[1]
.endm
.macro SAVE2x2
ldr d0 , [pA]
add pA, pA, #8
- fmla v16.2d, v8.2d, v0.2d[0]
+ fmla v16.2d, v8.2d, v0.d[0]
.endm
.macro SAVE1x2
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
- fmla v18.2d, v2.2d, v8.2d[0]
- fmla v19.2d, v3.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
+ fmla v18.2d, v2.2d, v8.d[0]
+ fmla v19.2d, v3.2d, v8.d[0]
.endm
.macro SAVE8x1
ld1 {v0.2d, v1.2d}, [pA]
add pA , pA, #32
- fmla v16.2d, v0.2d, v8.2d[0]
- fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
+ fmla v17.2d, v1.2d, v8.d[0]
.endm
.macro SAVE4x1
ld1 {v0.2d}, [pA]
add pA , pA, #16
- fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v16.2d, v0.2d, v8.d[0]
.endm
.macro SAVE2x1
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v18.4s, v2.4s, v8.2s[0]
- fmul v19.4s, v3.4s, v8.2s[0]
-
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v22.4s, v2.4s, v8.2s[1]
- fmul v23.4s, v3.4s, v8.2s[1]
-
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v26.4s, v2.4s, v9.2s[0]
- fmul v27.4s, v3.4s, v9.2s[0]
-
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
- fmul v30.4s, v2.4s, v9.2s[1]
- fmul v31.4s, v3.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ fmul v19.4s, v3.4s, v8.s[0]
+
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ fmul v23.4s, v3.4s, v8.s[1]
+
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v26.4s, v2.4s, v9.s[0]
+ fmul v27.4s, v3.4s, v9.s[0]
+
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
+ fmul v30.4s, v2.4s, v9.s[1]
+ fmul v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL16x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
-
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
-
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v26.4s, v2.4s, v9.2s[0]
- fmla v27.4s, v3.4s, v9.2s[0]
-
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
- fmla v30.4s, v2.4s, v9.2s[1]
- fmla v31.4s, v3.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
+
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
+
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v26.4s, v2.4s, v9.s[0]
+ fmla v27.4s, v3.4s, v9.s[0]
+
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
+ fmla v30.4s, v2.4s, v9.s[1]
+ fmla v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL16x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v18.4s, v6.4s, v12.2s[0]
- fmla v19.4s, v7.4s, v12.2s[0]
-
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v22.4s, v6.4s, v12.2s[1]
- fmla v23.4s, v7.4s, v12.2s[1]
-
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v26.4s, v6.4s, v13.2s[0]
- fmla v27.4s, v7.4s, v13.2s[0]
-
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
- fmla v30.4s, v6.4s, v13.2s[1]
- fmla v31.4s, v7.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
+
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v22.4s, v6.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
+
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v13.s[0]
+
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
+ fmla v30.4s, v6.4s, v13.s[1]
+ fmla v31.4s, v7.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL16x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v18.4s, v6.4s, v12.2s[0]
- fmla v19.4s, v7.4s, v12.2s[0]
-
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v22.4s, v6.4s, v12.2s[1]
- fmla v23.4s, v7.4s, v12.2s[1]
-
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v26.4s, v6.4s, v13.2s[0]
- fmla v27.4s, v7.4s, v13.2s[0]
-
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
- fmla v30.4s, v6.4s, v13.2s[1]
- fmla v31.4s, v7.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
+
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v22.4s, v6.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
+
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v13.s[0]
+
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
+ fmla v30.4s, v6.4s, v13.s[1]
+ fmla v31.4s, v7.4s, v13.s[1]
.endm
.macro KERNEL16x4_SUB
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
-
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
-
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v26.4s, v2.4s, v9.2s[0]
- fmla v27.4s, v3.4s, v9.2s[0]
-
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
- fmla v30.4s, v2.4s, v9.2s[1]
- fmla v31.4s, v3.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
+
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
+
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v26.4s, v2.4s, v9.s[0]
+ fmla v27.4s, v3.4s, v9.s[0]
+
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
+ fmla v30.4s, v2.4s, v9.s[1]
+ fmla v31.4s, v3.4s, v9.s[1]
.endm
.macro SAVE16x4
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
.endm
.macro KERNEL8x4_SUB
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
.endm
.macro SAVE8x4
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
.endm
.macro SAVE16x2
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE8x2
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
.endm
.macro SAVE16x1
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
.endm
.macro SAVE8x1
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
ld1 {v0.4s}, [pA_0]
add pA_0, pA_0, #16
- fmul v16.4s, v0.4s, v8.4s[0]
- fmul v20.4s, v0.4s, v8.4s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
ld1 {v2.4s}, [pA_1]
add pA_1, pA_1, #16
- fmul v24.4s, v0.4s, v8.4s[2]
- fmul v28.4s, v0.4s, v8.4s[3]
+ fmul v24.4s, v0.4s, v8.s[2]
+ fmul v28.4s, v0.4s, v8.s[3]
ld1 {v4.4s}, [pA_2]
add pA_2, pA_2, #16
- fmul v17.4s, v2.4s, v8.4s[0]
- fmul v21.4s, v2.4s, v8.4s[1]
+ fmul v17.4s, v2.4s, v8.s[0]
+ fmul v21.4s, v2.4s, v8.s[1]
ld1 {v6.4s}, [pA_3]
add pA_3, pA_3, #16
- fmul v25.4s, v2.4s, v8.4s[2]
- fmul v29.4s, v2.4s, v8.4s[3]
+ fmul v25.4s, v2.4s, v8.s[2]
+ fmul v29.4s, v2.4s, v8.s[3]
ld1 {v12.4s}, [pB] // for next round
add pB, pB, #16
- fmul v18.4s, v4.4s, v8.4s[0]
- fmul v19.4s, v6.4s, v8.4s[0]
+ fmul v18.4s, v4.4s, v8.s[0]
+ fmul v19.4s, v6.4s, v8.s[0]
ld1 {v1.4s}, [pA_0] // for next round
add pA_0, pA_0, #16
- fmul v22.4s, v4.4s, v8.4s[1]
- fmul v23.4s, v6.4s, v8.4s[1]
+ fmul v22.4s, v4.4s, v8.s[1]
+ fmul v23.4s, v6.4s, v8.s[1]
ld1 {v3.4s}, [pA_1] // for next round
add pA_1, pA_1, #16
- fmul v26.4s, v4.4s, v8.4s[2]
- fmul v27.4s, v6.4s, v8.4s[2]
+ fmul v26.4s, v4.4s, v8.s[2]
+ fmul v27.4s, v6.4s, v8.s[2]
ld1 {v5.4s}, [pA_2] // for next round
add pA_2, pA_2, #16
- fmul v30.4s, v4.4s, v8.4s[3]
- fmul v31.4s, v6.4s, v8.4s[3]
+ fmul v30.4s, v4.4s, v8.s[3]
+ fmul v31.4s, v6.4s, v8.s[3]
ld1 {v7.4s}, [pA_3] // for next round
add pA_3, pA_3, #16
.endm
.macro KERNEL16x4_M2
- fmla v16.4s, v1.4s, v12.4s[0]
- fmla v17.4s, v3.4s, v12.4s[0]
+ fmla v16.4s, v1.4s, v12.s[0]
+ fmla v17.4s, v3.4s, v12.s[0]
ld1 {v8.4s}, [pB] // for next round
add pB, pB, #16
- fmla v18.4s, v5.4s, v12.4s[0]
- fmla v19.4s, v7.4s, v12.4s[0]
+ fmla v18.4s, v5.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
ld1 {v0.4s}, [pA_0] // for next round
add pA_0, pA_0, #16
- fmla v20.4s, v1.4s, v12.4s[1]
- fmla v21.4s, v3.4s, v12.4s[1]
+ fmla v20.4s, v1.4s, v12.s[1]
+ fmla v21.4s, v3.4s, v12.s[1]
ld1 {v2.4s}, [pA_1] // for next round
add pA_1, pA_1, #16
- fmla v22.4s, v5.4s, v12.4s[1]
- fmla v23.4s, v7.4s, v12.4s[1]
+ fmla v22.4s, v5.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
ld1 {v4.4s}, [pA_2] // for next round
add pA_2, pA_2, #16
- fmla v24.4s, v1.4s, v12.4s[2]
- fmla v25.4s, v3.4s, v12.4s[2]
+ fmla v24.4s, v1.4s, v12.s[2]
+ fmla v25.4s, v3.4s, v12.s[2]
ld1 {v6.4s}, [pA_3] // for next round
add pA_3, pA_3, #16
- fmla v26.4s, v5.4s, v12.4s[2]
- fmla v27.4s, v7.4s, v12.4s[2]
+ fmla v26.4s, v5.4s, v12.s[2]
+ fmla v27.4s, v7.4s, v12.s[2]
prfm PLDL1KEEP, [pA_2, #512]
- fmla v28.4s, v1.4s, v12.4s[3]
- fmla v29.4s, v3.4s, v12.4s[3]
+ fmla v28.4s, v1.4s, v12.s[3]
+ fmla v29.4s, v3.4s, v12.s[3]
prfm PLDL1KEEP, [pA_3, #512]
- fmla v30.4s, v5.4s, v12.4s[3]
- fmla v31.4s, v7.4s, v12.4s[3]
+ fmla v30.4s, v5.4s, v12.s[3]
+ fmla v31.4s, v7.4s, v12.s[3]
prfm PLDL1KEEP, [pB, #512]
.endm
.macro KERNEL16x4_M1
- fmla v16.4s, v0.4s, v8.4s[0]
- fmla v17.4s, v2.4s, v8.4s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v2.4s, v8.s[0]
ld1 {v12.4s}, [pB] // for next round
add pB, pB, #16
- fmla v18.4s, v4.4s, v8.4s[0]
- fmla v19.4s, v6.4s, v8.4s[0]
+ fmla v18.4s, v4.4s, v8.s[0]
+ fmla v19.4s, v6.4s, v8.s[0]
ld1 {v1.4s}, [pA_0] // for next round
add pA_0, pA_0, #16
- fmla v20.4s, v0.4s, v8.4s[1]
- fmla v21.4s, v2.4s, v8.4s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v2.4s, v8.s[1]
ld1 {v3.4s}, [pA_1] // for next round
add pA_1, pA_1, #16
- fmla v22.4s, v4.4s, v8.4s[1]
- fmla v23.4s, v6.4s, v8.4s[1]
+ fmla v22.4s, v4.4s, v8.s[1]
+ fmla v23.4s, v6.4s, v8.s[1]
ld1 {v5.4s}, [pA_2] // for next round
add pA_2, pA_2, #16
- fmla v24.4s, v0.4s, v8.4s[2]
- fmla v25.4s, v2.4s, v8.4s[2]
+ fmla v24.4s, v0.4s, v8.s[2]
+ fmla v25.4s, v2.4s, v8.s[2]
ld1 {v7.4s}, [pA_3] // for next round
add pA_3, pA_3, #16
- fmla v26.4s, v4.4s, v8.4s[2]
- fmla v27.4s, v6.4s, v8.4s[2]
+ fmla v26.4s, v4.4s, v8.s[2]
+ fmla v27.4s, v6.4s, v8.s[2]
prfm PLDL1KEEP, [pA_0, #512]
- fmla v28.4s, v0.4s, v8.4s[3]
- fmla v29.4s, v2.4s, v8.4s[3]
+ fmla v28.4s, v0.4s, v8.s[3]
+ fmla v29.4s, v2.4s, v8.s[3]
prfm PLDL1KEEP, [pA_1, #512]
- fmla v30.4s, v4.4s, v8.4s[3]
- fmla v31.4s, v6.4s, v8.4s[3]
+ fmla v30.4s, v4.4s, v8.s[3]
+ fmla v31.4s, v6.4s, v8.s[3]
.endm
.macro KERNEL16x4_E
- fmla v16.4s, v1.4s, v12.4s[0]
- fmla v17.4s, v3.4s, v12.4s[0]
- fmla v18.4s, v5.4s, v12.4s[0]
- fmla v19.4s, v7.4s, v12.4s[0]
- fmla v20.4s, v1.4s, v12.4s[1]
- fmla v21.4s, v3.4s, v12.4s[1]
- fmla v22.4s, v5.4s, v12.4s[1]
- fmla v23.4s, v7.4s, v12.4s[1]
- fmla v24.4s, v1.4s, v12.4s[2]
- fmla v25.4s, v3.4s, v12.4s[2]
- fmla v26.4s, v5.4s, v12.4s[2]
- fmla v27.4s, v7.4s, v12.4s[2]
- fmla v28.4s, v1.4s, v12.4s[3]
- fmla v29.4s, v3.4s, v12.4s[3]
- fmla v30.4s, v5.4s, v12.4s[3]
- fmla v31.4s, v7.4s, v12.4s[3]
+ fmla v16.4s, v1.4s, v12.s[0]
+ fmla v17.4s, v3.4s, v12.s[0]
+ fmla v18.4s, v5.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
+ fmla v20.4s, v1.4s, v12.s[1]
+ fmla v21.4s, v3.4s, v12.s[1]
+ fmla v22.4s, v5.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
+ fmla v24.4s, v1.4s, v12.s[2]
+ fmla v25.4s, v3.4s, v12.s[2]
+ fmla v26.4s, v5.4s, v12.s[2]
+ fmla v27.4s, v7.4s, v12.s[2]
+ fmla v28.4s, v1.4s, v12.s[3]
+ fmla v29.4s, v3.4s, v12.s[3]
+ fmla v30.4s, v5.4s, v12.s[3]
+ fmla v31.4s, v7.4s, v12.s[3]
.endm
.macro KERNEL16x4_SUB
ld1 {v0.4s}, [pA_0]
add pA_0, pA_0, #16
- fmla v16.4s, v0.4s, v8.4s[0]
- fmla v20.4s, v0.4s, v8.4s[1]
- fmla v24.4s, v0.4s, v8.4s[2]
- fmla v28.4s, v0.4s, v8.4s[3]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v8.s[2]
+ fmla v28.4s, v0.4s, v8.s[3]
ld1 {v2.4s}, [pA_1]
add pA_1, pA_1, #16
- fmla v17.4s, v2.4s, v8.4s[0]
- fmla v21.4s, v2.4s, v8.4s[1]
- fmla v25.4s, v2.4s, v8.4s[2]
- fmla v29.4s, v2.4s, v8.4s[3]
+ fmla v17.4s, v2.4s, v8.s[0]
+ fmla v21.4s, v2.4s, v8.s[1]
+ fmla v25.4s, v2.4s, v8.s[2]
+ fmla v29.4s, v2.4s, v8.s[3]
ld1 {v4.4s}, [pA_2]
add pA_2, pA_2, #16
- fmla v18.4s, v4.4s, v8.4s[0]
- fmla v22.4s, v4.4s, v8.4s[1]
- fmla v26.4s, v4.4s, v8.4s[2]
- fmla v30.4s, v4.4s, v8.4s[3]
+ fmla v18.4s, v4.4s, v8.s[0]
+ fmla v22.4s, v4.4s, v8.s[1]
+ fmla v26.4s, v4.4s, v8.s[2]
+ fmla v30.4s, v4.4s, v8.s[3]
ld1 {v6.4s}, [pA_3]
add pA_3, pA_3, #16
- fmla v19.4s, v6.4s, v8.4s[0]
- fmla v23.4s, v6.4s, v8.4s[1]
- fmla v27.4s, v6.4s, v8.4s[2]
- fmla v31.4s, v6.4s, v8.4s[3]
+ fmla v19.4s, v6.4s, v8.s[0]
+ fmla v23.4s, v6.4s, v8.s[1]
+ fmla v27.4s, v6.4s, v8.s[2]
+ fmla v31.4s, v6.4s, v8.s[3]
.endm
.macro SAVE16x4
ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v2.2s, v3.2s}, [pA_1]
add pA_1, pA_1, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
- fmla v18.2s, v2.2s, v8.2s[0]
- fmla v31.2s, v3.2s, v9.2s[1]
- fmla v22.2s, v2.2s, v8.2s[1]
- fmla v27.2s, v3.2s, v9.2s[0]
+ fmla v18.2s, v2.2s, v8.s[0]
+ fmla v31.2s, v3.2s, v9.s[1]
+ fmla v22.2s, v2.2s, v8.s[1]
+ fmla v27.2s, v3.2s, v9.s[0]
- fmla v26.2s, v2.2s, v9.2s[0]
- fmla v23.2s, v3.2s, v8.2s[1]
- fmla v30.2s, v2.2s, v9.2s[1]
- fmla v19.2s, v3.2s, v8.2s[0]
+ fmla v26.2s, v2.2s, v9.s[0]
+ fmla v23.2s, v3.2s, v8.s[1]
+ fmla v30.2s, v2.2s, v9.s[1]
+ fmla v19.2s, v3.2s, v8.s[0]
.endm
.macro SAVE8x4
ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
ld1 {v0.2s}, [pA_0]
add pA_0, pA_0, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0, pA_0, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
ld1 {v0.2s}, [pA_0]
add pA_0, pA_0, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
ldr s0 , [pA_0]
add pA_0, pA_0, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
ld1 {v0.2s, v1.2s}, [pA_0]
add pA_0 , pA_0, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
ld1 {v0.2s}, [pA_0]
add pA_0 , pA_0, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v4.4s[0]
- fmul v17.4s, v1.4s, v4.4s[0]
- fmul v18.4s, v0.4s, v4.4s[1]
- fmul v19.4s, v1.4s, v4.4s[1]
- fmul v20.4s, v0.4s, v4.4s[2]
- fmul v21.4s, v1.4s, v4.4s[2]
- fmul v22.4s, v0.4s, v4.4s[3]
- fmul v23.4s, v1.4s, v4.4s[3]
- fmul v24.4s, v0.4s, v5.4s[0]
- fmul v25.4s, v1.4s, v5.4s[0]
- fmul v26.4s, v0.4s, v5.4s[1]
- fmul v27.4s, v1.4s, v5.4s[1]
- fmul v28.4s, v0.4s, v5.4s[2]
- fmul v29.4s, v1.4s, v5.4s[2]
- fmul v30.4s, v0.4s, v5.4s[3]
- fmul v31.4s, v1.4s, v5.4s[3]
+ fmul v16.4s, v0.4s, v4.s[0]
+ fmul v17.4s, v1.4s, v4.s[0]
+ fmul v18.4s, v0.4s, v4.s[1]
+ fmul v19.4s, v1.4s, v4.s[1]
+ fmul v20.4s, v0.4s, v4.s[2]
+ fmul v21.4s, v1.4s, v4.s[2]
+ fmul v22.4s, v0.4s, v4.s[3]
+ fmul v23.4s, v1.4s, v4.s[3]
+ fmul v24.4s, v0.4s, v5.s[0]
+ fmul v25.4s, v1.4s, v5.s[0]
+ fmul v26.4s, v0.4s, v5.s[1]
+ fmul v27.4s, v1.4s, v5.s[1]
+ fmul v28.4s, v0.4s, v5.s[2]
+ fmul v29.4s, v1.4s, v5.s[2]
+ fmul v30.4s, v0.4s, v5.s[3]
+ fmul v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x8_M1
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v17.4s, v1.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v19.4s, v1.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v21.4s, v1.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v23.4s, v1.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v25.4s, v1.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v27.4s, v1.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v29.4s, v1.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
- fmla v31.4s, v1.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v17.4s, v1.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v19.4s, v1.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v21.4s, v1.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v23.4s, v1.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v25.4s, v1.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v27.4s, v1.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v29.4s, v1.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
+ fmla v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x8_M2
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v17.4s, v3.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v19.4s, v3.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v21.4s, v3.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v23.4s, v3.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v25.4s, v3.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v27.4s, v3.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v29.4s, v3.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
- fmla v31.4s, v3.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v17.4s, v3.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v19.4s, v3.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v21.4s, v3.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v23.4s, v3.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v25.4s, v3.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v27.4s, v3.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v29.4s, v3.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
+ fmla v31.4s, v3.4s, v7.s[3]
ld1 {v4.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x8_E
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v17.4s, v3.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v19.4s, v3.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v21.4s, v3.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v23.4s, v3.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v25.4s, v3.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v27.4s, v3.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v29.4s, v3.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
- fmla v31.4s, v3.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v17.4s, v3.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v19.4s, v3.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v21.4s, v3.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v23.4s, v3.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v25.4s, v3.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v27.4s, v3.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v29.4s, v3.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
+ fmla v31.4s, v3.4s, v7.s[3]
.endm
.macro KERNEL8x8_SUB
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v17.4s, v1.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v19.4s, v1.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v21.4s, v1.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v23.4s, v1.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v25.4s, v1.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v27.4s, v1.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v29.4s, v1.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
- fmla v31.4s, v1.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v17.4s, v1.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v19.4s, v1.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v21.4s, v1.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v23.4s, v1.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v25.4s, v1.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v27.4s, v1.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v29.4s, v1.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
+ fmla v31.4s, v1.4s, v5.s[3]
.endm
.macro SAVE8x8
ld1 {v0.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v4.4s[0]
- fmul v18.4s, v0.4s, v4.4s[1]
- fmul v20.4s, v0.4s, v4.4s[2]
- fmul v22.4s, v0.4s, v4.4s[3]
- fmul v24.4s, v0.4s, v5.4s[0]
- fmul v26.4s, v0.4s, v5.4s[1]
- fmul v28.4s, v0.4s, v5.4s[2]
- fmul v30.4s, v0.4s, v5.4s[3]
+ fmul v16.4s, v0.4s, v4.s[0]
+ fmul v18.4s, v0.4s, v4.s[1]
+ fmul v20.4s, v0.4s, v4.s[2]
+ fmul v22.4s, v0.4s, v4.s[3]
+ fmul v24.4s, v0.4s, v5.s[0]
+ fmul v26.4s, v0.4s, v5.s[1]
+ fmul v28.4s, v0.4s, v5.s[2]
+ fmul v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x8_M1
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x8_M2
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
ld1 {v4.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x8_E
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
.endm
.macro KERNEL4x8_SUB
ld1 {v0.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
.endm
.macro SAVE4x8
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v4.4s[0]
- fmla v18.2s, v0.2s, v4.4s[1]
- fmla v20.2s, v0.2s, v4.4s[2]
- fmla v22.2s, v0.2s, v4.4s[3]
- fmla v24.2s, v0.2s, v5.4s[0]
- fmla v26.2s, v0.2s, v5.4s[1]
- fmla v28.2s, v0.2s, v5.4s[2]
- fmla v30.2s, v0.2s, v5.4s[3]
+ fmla v16.2s, v0.2s, v4.s[0]
+ fmla v18.2s, v0.2s, v4.s[1]
+ fmla v20.2s, v0.2s, v4.s[2]
+ fmla v22.2s, v0.2s, v4.s[3]
+ fmla v24.2s, v0.2s, v5.s[0]
+ fmla v26.2s, v0.2s, v5.s[1]
+ fmla v28.2s, v0.2s, v5.s[2]
+ fmla v30.2s, v0.2s, v5.s[3]
.endm
.macro SAVE2x8
ldr s0, [pA]
add pA, pA, #4
- fmla s16, s0, v4.4s[0]
- fmla s18, s0, v4.4s[1]
- fmla s20, s0, v4.4s[2]
- fmla s22, s0, v4.4s[3]
- fmla s24, s0, v5.4s[0]
- fmla s26, s0, v5.4s[1]
- fmla s28, s0, v5.4s[2]
- fmla s30, s0, v5.4s[3]
+ fmla s16, s0, v4.s[0]
+ fmla s18, s0, v4.s[1]
+ fmla s20, s0, v4.s[2]
+ fmla s22, s0, v4.s[3]
+ fmla s24, s0, v5.s[0]
+ fmla s26, s0, v5.s[1]
+ fmla s28, s0, v5.s[2]
+ fmla s30, s0, v5.s[3]
.endm
.macro SAVE1x8
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
.endm
.macro KERNEL8x4_SUB
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
.endm
.macro SAVE8x4
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE8x2
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
.endm
.macro SAVE8x1
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v18.4s, v2.4s, v8.2s[0]
- fmul v19.4s, v3.4s, v8.2s[0]
-
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v22.4s, v2.4s, v8.2s[1]
- fmul v23.4s, v3.4s, v8.2s[1]
-
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v26.4s, v2.4s, v9.2s[0]
- fmul v27.4s, v3.4s, v9.2s[0]
-
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
- fmul v30.4s, v2.4s, v9.2s[1]
- fmul v31.4s, v3.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v18.4s, v2.4s, v8.s[0]
+ fmul v19.4s, v3.4s, v8.s[0]
+
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v22.4s, v2.4s, v8.s[1]
+ fmul v23.4s, v3.4s, v8.s[1]
+
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v26.4s, v2.4s, v9.s[0]
+ fmul v27.4s, v3.4s, v9.s[0]
+
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
+ fmul v30.4s, v2.4s, v9.s[1]
+ fmul v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL16x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
-
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
-
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v26.4s, v2.4s, v9.2s[0]
- fmla v27.4s, v3.4s, v9.2s[0]
-
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
- fmla v30.4s, v2.4s, v9.2s[1]
- fmla v31.4s, v3.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
+
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
+
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v26.4s, v2.4s, v9.s[0]
+ fmla v27.4s, v3.4s, v9.s[0]
+
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
+ fmla v30.4s, v2.4s, v9.s[1]
+ fmla v31.4s, v3.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL16x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v18.4s, v6.4s, v12.2s[0]
- fmla v19.4s, v7.4s, v12.2s[0]
-
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v22.4s, v6.4s, v12.2s[1]
- fmla v23.4s, v7.4s, v12.2s[1]
-
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v26.4s, v6.4s, v13.2s[0]
- fmla v27.4s, v7.4s, v13.2s[0]
-
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
- fmla v30.4s, v6.4s, v13.2s[1]
- fmla v31.4s, v7.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
+
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v22.4s, v6.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
+
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v13.s[0]
+
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
+ fmla v30.4s, v6.4s, v13.s[1]
+ fmla v31.4s, v7.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL16x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v18.4s, v6.4s, v12.2s[0]
- fmla v19.4s, v7.4s, v12.2s[0]
-
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v22.4s, v6.4s, v12.2s[1]
- fmla v23.4s, v7.4s, v12.2s[1]
-
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v26.4s, v6.4s, v13.2s[0]
- fmla v27.4s, v7.4s, v13.2s[0]
-
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
- fmla v30.4s, v6.4s, v13.2s[1]
- fmla v31.4s, v7.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v18.4s, v6.4s, v12.s[0]
+ fmla v19.4s, v7.4s, v12.s[0]
+
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v22.4s, v6.4s, v12.s[1]
+ fmla v23.4s, v7.4s, v12.s[1]
+
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v26.4s, v6.4s, v13.s[0]
+ fmla v27.4s, v7.4s, v13.s[0]
+
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
+ fmla v30.4s, v6.4s, v13.s[1]
+ fmla v31.4s, v7.4s, v13.s[1]
.endm
.macro KERNEL16x4_SUB
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
-
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
-
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v26.4s, v2.4s, v9.2s[0]
- fmla v27.4s, v3.4s, v9.2s[0]
-
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
- fmla v30.4s, v2.4s, v9.2s[1]
- fmla v31.4s, v3.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
+
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
+
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v26.4s, v2.4s, v9.s[0]
+ fmla v27.4s, v3.4s, v9.s[0]
+
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
+ fmla v30.4s, v2.4s, v9.s[1]
+ fmla v31.4s, v3.4s, v9.s[1]
.endm
.macro SAVE16x4
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
.endm
.macro KERNEL8x4_SUB
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
.endm
.macro SAVE8x4
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v22.4s, v2.4s, v8.2s[1]
- fmla v23.4s, v3.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v22.4s, v2.4s, v8.s[1]
+ fmla v23.4s, v3.4s, v8.s[1]
.endm
.macro SAVE16x2
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE8x2
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
ld1 {v3.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v18.4s, v2.4s, v8.2s[0]
- fmla v19.4s, v3.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v18.4s, v2.4s, v8.s[0]
+ fmla v19.4s, v3.4s, v8.s[0]
.endm
.macro SAVE16x1
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
.endm
.macro SAVE8x1
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v4.4s[0]
- fmul v17.4s, v1.4s, v4.4s[0]
- fmul v18.4s, v0.4s, v4.4s[1]
- fmul v19.4s, v1.4s, v4.4s[1]
- fmul v20.4s, v0.4s, v4.4s[2]
- fmul v21.4s, v1.4s, v4.4s[2]
- fmul v22.4s, v0.4s, v4.4s[3]
- fmul v23.4s, v1.4s, v4.4s[3]
- fmul v24.4s, v0.4s, v5.4s[0]
- fmul v25.4s, v1.4s, v5.4s[0]
- fmul v26.4s, v0.4s, v5.4s[1]
- fmul v27.4s, v1.4s, v5.4s[1]
- fmul v28.4s, v0.4s, v5.4s[2]
- fmul v29.4s, v1.4s, v5.4s[2]
- fmul v30.4s, v0.4s, v5.4s[3]
- fmul v31.4s, v1.4s, v5.4s[3]
+ fmul v16.4s, v0.4s, v4.s[0]
+ fmul v17.4s, v1.4s, v4.s[0]
+ fmul v18.4s, v0.4s, v4.s[1]
+ fmul v19.4s, v1.4s, v4.s[1]
+ fmul v20.4s, v0.4s, v4.s[2]
+ fmul v21.4s, v1.4s, v4.s[2]
+ fmul v22.4s, v0.4s, v4.s[3]
+ fmul v23.4s, v1.4s, v4.s[3]
+ fmul v24.4s, v0.4s, v5.s[0]
+ fmul v25.4s, v1.4s, v5.s[0]
+ fmul v26.4s, v0.4s, v5.s[1]
+ fmul v27.4s, v1.4s, v5.s[1]
+ fmul v28.4s, v0.4s, v5.s[2]
+ fmul v29.4s, v1.4s, v5.s[2]
+ fmul v30.4s, v0.4s, v5.s[3]
+ fmul v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x8_M1
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v17.4s, v1.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v19.4s, v1.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v21.4s, v1.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v23.4s, v1.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v25.4s, v1.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v27.4s, v1.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v29.4s, v1.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
- fmla v31.4s, v1.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v17.4s, v1.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v19.4s, v1.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v21.4s, v1.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v23.4s, v1.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v25.4s, v1.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v27.4s, v1.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v29.4s, v1.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
+ fmla v31.4s, v1.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x8_M2
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v17.4s, v3.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v19.4s, v3.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v21.4s, v3.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v23.4s, v3.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v25.4s, v3.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v27.4s, v3.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v29.4s, v3.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
- fmla v31.4s, v3.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v17.4s, v3.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v19.4s, v3.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v21.4s, v3.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v23.4s, v3.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v25.4s, v3.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v27.4s, v3.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v29.4s, v3.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
+ fmla v31.4s, v3.4s, v7.s[3]
ld1 {v4.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x8_E
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v17.4s, v3.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v19.4s, v3.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v21.4s, v3.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v23.4s, v3.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v25.4s, v3.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v27.4s, v3.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v29.4s, v3.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
- fmla v31.4s, v3.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v17.4s, v3.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v19.4s, v3.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v21.4s, v3.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v23.4s, v3.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v25.4s, v3.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v27.4s, v3.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v29.4s, v3.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
+ fmla v31.4s, v3.4s, v7.s[3]
.endm
.macro KERNEL8x8_SUB
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v17.4s, v1.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v19.4s, v1.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v21.4s, v1.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v23.4s, v1.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v25.4s, v1.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v27.4s, v1.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v29.4s, v1.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
- fmla v31.4s, v1.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v17.4s, v1.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v19.4s, v1.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v21.4s, v1.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v23.4s, v1.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v25.4s, v1.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v27.4s, v1.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v29.4s, v1.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
+ fmla v31.4s, v1.4s, v5.s[3]
.endm
.macro SAVE8x8
ld1 {v0.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v4.4s[0]
- fmul v18.4s, v0.4s, v4.4s[1]
- fmul v20.4s, v0.4s, v4.4s[2]
- fmul v22.4s, v0.4s, v4.4s[3]
- fmul v24.4s, v0.4s, v5.4s[0]
- fmul v26.4s, v0.4s, v5.4s[1]
- fmul v28.4s, v0.4s, v5.4s[2]
- fmul v30.4s, v0.4s, v5.4s[3]
+ fmul v16.4s, v0.4s, v4.s[0]
+ fmul v18.4s, v0.4s, v4.s[1]
+ fmul v20.4s, v0.4s, v4.s[2]
+ fmul v22.4s, v0.4s, v4.s[3]
+ fmul v24.4s, v0.4s, v5.s[0]
+ fmul v26.4s, v0.4s, v5.s[1]
+ fmul v28.4s, v0.4s, v5.s[2]
+ fmul v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x8_M1
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
ld1 {v6.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x8_M2
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
ld1 {v4.4s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x8_E
- fmla v16.4s, v2.4s, v6.4s[0]
- fmla v18.4s, v2.4s, v6.4s[1]
- fmla v20.4s, v2.4s, v6.4s[2]
- fmla v22.4s, v2.4s, v6.4s[3]
- fmla v24.4s, v2.4s, v7.4s[0]
- fmla v26.4s, v2.4s, v7.4s[1]
- fmla v28.4s, v2.4s, v7.4s[2]
- fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v16.4s, v2.4s, v6.s[0]
+ fmla v18.4s, v2.4s, v6.s[1]
+ fmla v20.4s, v2.4s, v6.s[2]
+ fmla v22.4s, v2.4s, v6.s[3]
+ fmla v24.4s, v2.4s, v7.s[0]
+ fmla v26.4s, v2.4s, v7.s[1]
+ fmla v28.4s, v2.4s, v7.s[2]
+ fmla v30.4s, v2.4s, v7.s[3]
.endm
.macro KERNEL4x8_SUB
ld1 {v0.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v4.4s[0]
- fmla v18.4s, v0.4s, v4.4s[1]
- fmla v20.4s, v0.4s, v4.4s[2]
- fmla v22.4s, v0.4s, v4.4s[3]
- fmla v24.4s, v0.4s, v5.4s[0]
- fmla v26.4s, v0.4s, v5.4s[1]
- fmla v28.4s, v0.4s, v5.4s[2]
- fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v16.4s, v0.4s, v4.s[0]
+ fmla v18.4s, v0.4s, v4.s[1]
+ fmla v20.4s, v0.4s, v4.s[2]
+ fmla v22.4s, v0.4s, v4.s[3]
+ fmla v24.4s, v0.4s, v5.s[0]
+ fmla v26.4s, v0.4s, v5.s[1]
+ fmla v28.4s, v0.4s, v5.s[2]
+ fmla v30.4s, v0.4s, v5.s[3]
.endm
.macro SAVE4x8
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v4.4s[0]
- fmla v18.2s, v0.2s, v4.4s[1]
- fmla v20.2s, v0.2s, v4.4s[2]
- fmla v22.2s, v0.2s, v4.4s[3]
- fmla v24.2s, v0.2s, v5.4s[0]
- fmla v26.2s, v0.2s, v5.4s[1]
- fmla v28.2s, v0.2s, v5.4s[2]
- fmla v30.2s, v0.2s, v5.4s[3]
+ fmla v16.2s, v0.2s, v4.s[0]
+ fmla v18.2s, v0.2s, v4.s[1]
+ fmla v20.2s, v0.2s, v4.s[2]
+ fmla v22.2s, v0.2s, v4.s[3]
+ fmla v24.2s, v0.2s, v5.s[0]
+ fmla v26.2s, v0.2s, v5.s[1]
+ fmla v28.2s, v0.2s, v5.s[2]
+ fmla v30.2s, v0.2s, v5.s[3]
.endm
.macro SAVE2x8
ldr s0, [pA]
add pA, pA, #4
- fmla s16, s0, v4.4s[0]
- fmla s18, s0, v4.4s[1]
- fmla s20, s0, v4.4s[2]
- fmla s22, s0, v4.4s[3]
- fmla s24, s0, v5.4s[0]
- fmla s26, s0, v5.4s[1]
- fmla s28, s0, v5.4s[2]
- fmla s30, s0, v5.4s[3]
+ fmla s16, s0, v4.s[0]
+ fmla s18, s0, v4.s[1]
+ fmla s20, s0, v4.s[2]
+ fmla s22, s0, v4.s[3]
+ fmla s24, s0, v5.s[0]
+ fmla s26, s0, v5.s[1]
+ fmla s28, s0, v5.s[2]
+ fmla s30, s0, v5.s[3]
.endm
.macro SAVE1x8
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmul v16.4s, v0.4s, v8.2s[0]
- fmul v17.4s, v1.4s, v8.2s[0]
- fmul v20.4s, v0.4s, v8.2s[1]
- fmul v21.4s, v1.4s, v8.2s[1]
- fmul v24.4s, v0.4s, v9.2s[0]
- fmul v25.4s, v1.4s, v9.2s[0]
- fmul v28.4s, v0.4s, v9.2s[1]
- fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v16.4s, v0.4s, v8.s[0]
+ fmul v17.4s, v1.4s, v8.s[0]
+ fmul v20.4s, v0.4s, v8.s[1]
+ fmul v21.4s, v1.4s, v8.s[1]
+ fmul v24.4s, v0.4s, v9.s[0]
+ fmul v25.4s, v1.4s, v9.s[0]
+ fmul v28.4s, v0.4s, v9.s[1]
+ fmul v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_M1
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_M2
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL8x4_E
- fmla v16.4s, v4.4s, v12.2s[0]
- fmla v17.4s, v5.4s, v12.2s[0]
- fmla v20.4s, v4.4s, v12.2s[1]
- fmla v21.4s, v5.4s, v12.2s[1]
- fmla v24.4s, v4.4s, v13.2s[0]
- fmla v25.4s, v5.4s, v13.2s[0]
- fmla v28.4s, v4.4s, v13.2s[1]
- fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v16.4s, v4.4s, v12.s[0]
+ fmla v17.4s, v5.4s, v12.s[0]
+ fmla v20.4s, v4.4s, v12.s[1]
+ fmla v21.4s, v5.4s, v12.s[1]
+ fmla v24.4s, v4.4s, v13.s[0]
+ fmla v25.4s, v5.4s, v13.s[0]
+ fmla v28.4s, v4.4s, v13.s[1]
+ fmla v29.4s, v5.4s, v13.s[1]
.endm
.macro KERNEL8x4_SUB
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
- fmla v24.4s, v0.4s, v9.2s[0]
- fmla v25.4s, v1.4s, v9.2s[0]
- fmla v28.4s, v0.4s, v9.2s[1]
- fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
+ fmla v24.4s, v0.4s, v9.s[0]
+ fmla v25.4s, v1.4s, v9.s[0]
+ fmla v28.4s, v0.4s, v9.s[1]
+ fmla v29.4s, v1.4s, v9.s[1]
.endm
.macro SAVE8x4
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmul v16.2s, v0.2s, v8.2s[0]
- fmul v29.2s, v1.2s, v9.2s[1]
+ fmul v16.2s, v0.2s, v8.s[0]
+ fmul v29.2s, v1.2s, v9.s[1]
- fmul v20.2s, v0.2s, v8.2s[1]
- fmul v25.2s, v1.2s, v9.2s[0]
+ fmul v20.2s, v0.2s, v8.s[1]
+ fmul v25.2s, v1.2s, v9.s[0]
- fmul v24.2s, v0.2s, v9.2s[0]
- fmul v21.2s, v1.2s, v8.2s[1]
+ fmul v24.2s, v0.2s, v9.s[0]
+ fmul v21.2s, v1.2s, v8.s[1]
- fmul v28.2s, v0.2s, v9.2s[1]
- fmul v17.2s, v1.2s, v8.2s[0]
+ fmul v28.2s, v0.2s, v9.s[1]
+ fmul v17.2s, v1.2s, v8.s[0]
ld1 {v12.2s, v13.2s}, [pB]
add pB, pB, #16
.endm
.macro KERNEL4x4_M1
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
ld1 {v12.2s, v13.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
ld1 {v4.2s, v5.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
prfm PLDL1KEEP, [pB, #512]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro KERNEL4x4_M2
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
ld1 {v8.2s, v9.2s}, [pB] // For next round
add pB, pB, #16
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
ld1 {v0.2s, v1.2s}, [pA] // For next round
add pA, pA, #16
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
prfm PLDL1KEEP, [pA, #512]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_E
- fmla v16.2s, v4.2s, v12.2s[0]
- fmla v29.2s, v5.2s, v13.2s[1]
+ fmla v16.2s, v4.2s, v12.s[0]
+ fmla v29.2s, v5.2s, v13.s[1]
- fmla v20.2s, v4.2s, v12.2s[1]
- fmla v25.2s, v5.2s, v13.2s[0]
+ fmla v20.2s, v4.2s, v12.s[1]
+ fmla v25.2s, v5.2s, v13.s[0]
- fmla v24.2s, v4.2s, v13.2s[0]
- fmla v21.2s, v5.2s, v12.2s[1]
+ fmla v24.2s, v4.2s, v13.s[0]
+ fmla v21.2s, v5.2s, v12.s[1]
- fmla v28.2s, v4.2s, v13.2s[1]
- fmla v17.2s, v5.2s, v12.2s[0]
+ fmla v28.2s, v4.2s, v13.s[1]
+ fmla v17.2s, v5.2s, v12.s[0]
.endm
.macro KERNEL4x4_SUB
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v29.2s, v1.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v29.2s, v1.2s, v9.s[1]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v25.2s, v1.2s, v9.2s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v25.2s, v1.2s, v9.s[0]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v21.2s, v1.2s, v8.s[1]
- fmla v28.2s, v0.2s, v9.2s[1]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x4
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v24.2s, v0.2s, v9.2s[0]
- fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v24.2s, v0.2s, v9.s[0]
+ fmla v28.2s, v0.2s, v9.s[1]
.endm
.macro SAVE2x4
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
- fmla v20.4s, v0.4s, v8.2s[1]
- fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v20.4s, v0.4s, v8.s[1]
+ fmla v21.4s, v1.4s, v8.s[1]
.endm
.macro SAVE8x2
ld1 {v0.2s, v1.2s}, [pA]
add pA, pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
- fmla v21.2s, v1.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
+ fmla v21.2s, v1.2s, v8.s[1]
.endm
.macro SAVE4x2
ld1 {v0.2s}, [pA]
add pA, pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v20.2s, v0.2s, v8.s[1]
.endm
.macro SAVE2x2
ldr s0 , [pA]
add pA, pA, #4
- fmla v16.2s, v8.2s, v0.2s[0]
+ fmla v16.2s, v8.2s, v0.s[0]
.endm
.macro SAVE1x2
ld1 {v1.4s}, [pA]
add pA, pA, #16
- fmla v16.4s, v0.4s, v8.2s[0]
- fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v16.4s, v0.4s, v8.s[0]
+ fmla v17.4s, v1.4s, v8.s[0]
.endm
.macro SAVE8x1
ld1 {v0.2s, v1.2s}, [pA]
add pA , pA, #16
- fmla v16.2s, v0.2s, v8.2s[0]
- fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
+ fmla v17.2s, v1.2s, v8.s[0]
.endm
.macro SAVE4x1
ld1 {v0.2s}, [pA]
add pA , pA, #8
- fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v16.2s, v0.2s, v8.s[0]
.endm
.macro SAVE2x1
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
+ fmul v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.2d, v0.2d, v9.2d[0]
+ fmls v17.2d, v0.2d, v9.d[0]
#else
- fmul v17.2d, v0.2d, v9.2d[0]
+ fmul v17.2d, v0.2d, v9.d[0]
#endif
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- fmul v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
+ fmul v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.2d, v2.2d, v9.2d[0]
+ fmls v19.2d, v2.2d, v9.d[0]
#else
- fmul v19.2d, v2.2d, v9.2d[0]
+ fmul v19.2d, v2.2d, v9.d[0]
#endif
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
- fmul v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
+ fmul v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.2d, v0.2d, v9.2d[1]
+ fmls v21.2d, v0.2d, v9.d[1]
#else
- fmul v21.2d, v0.2d, v9.2d[1]
+ fmul v21.2d, v0.2d, v9.d[1]
#endif
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- fmul v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
+ fmul v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.2d, v2.2d, v9.2d[1]
+ fmls v23.2d, v2.2d, v9.d[1]
#else
- fmul v23.2d, v2.2d, v9.2d[1]
+ fmul v23.2d, v2.2d, v9.d[1]
#endif
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
- fmul v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
+ fmul v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.2d, v0.2d, v11.2d[0]
+ fmls v25.2d, v0.2d, v11.d[0]
#else
- fmul v25.2d, v0.2d, v11.2d[0]
+ fmul v25.2d, v0.2d, v11.d[0]
#endif
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
- fmul v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
+ fmul v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.2d, v2.2d, v11.2d[0]
+ fmls v27.2d, v2.2d, v11.d[0]
#else
- fmul v27.2d, v2.2d, v11.2d[0]
+ fmul v27.2d, v2.2d, v11.d[0]
#endif
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
- fmul v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
+ fmul v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.2d, v0.2d, v11.2d[1]
+ fmls v29.2d, v0.2d, v11.d[1]
#else
- fmul v29.2d, v0.2d, v11.2d[1]
+ fmul v29.2d, v0.2d, v11.d[1]
#endif
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- fmul v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
+ fmul v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.2d, v2.2d, v11.2d[1]
+ fmls v31.2d, v2.2d, v11.d[1]
#else
- fmul v31.2d, v2.2d, v11.2d[1]
+ fmul v31.2d, v2.2d, v11.d[1]
#endif
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
.endm
.macro KERNEL4x4_M1
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
add pA, pA, #32
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
- OP_ri v27.2d, v2.2d, v11.2d[0]
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_rr v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
+ OP_ri v27.2d, v2.2d, v11.d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- OP_rr v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
- OP_ri v31.2d, v2.2d, v11.2d[1]
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_rr v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
+ OP_ri v31.2d, v2.2d, v11.d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
.endm
.macro KERNEL4x4_M2
- OP_rr v16.2d, v4.2d, v12.2d[0]
- OP_ii v16.2d, v5.2d, v13.2d[0]
- OP_ri v17.2d, v4.2d, v13.2d[0]
- OP_ir v17.2d, v5.2d, v12.2d[0]
+ OP_rr v16.2d, v4.2d, v12.d[0]
+ OP_ii v16.2d, v5.2d, v13.d[0]
+ OP_ri v17.2d, v4.2d, v13.d[0]
+ OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v18.2d, v6.2d, v12.2d[0]
- OP_ii v18.2d, v7.2d, v13.2d[0]
- OP_ri v19.2d, v6.2d, v13.2d[0]
- OP_ir v19.2d, v7.2d, v12.2d[0]
+ OP_rr v18.2d, v6.2d, v12.d[0]
+ OP_ii v18.2d, v7.2d, v13.d[0]
+ OP_ri v19.2d, v6.2d, v13.d[0]
+ OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.2d, v4.2d, v12.2d[1]
- OP_ii v20.2d, v5.2d, v13.2d[1]
- OP_ri v21.2d, v4.2d, v13.2d[1]
- OP_ir v21.2d, v5.2d, v12.2d[1]
+ OP_rr v20.2d, v4.2d, v12.d[1]
+ OP_ii v20.2d, v5.2d, v13.d[1]
+ OP_ri v21.2d, v4.2d, v13.d[1]
+ OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- OP_rr v22.2d, v6.2d, v12.2d[1]
- OP_ii v22.2d, v7.2d, v13.2d[1]
- OP_ri v23.2d, v6.2d, v13.2d[1]
- OP_ir v23.2d, v7.2d, v12.2d[1]
+ OP_rr v22.2d, v6.2d, v12.d[1]
+ OP_ii v22.2d, v7.2d, v13.d[1]
+ OP_ri v23.2d, v6.2d, v13.d[1]
+ OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.2d, v4.2d, v14.2d[0]
- OP_ii v24.2d, v5.2d, v15.2d[0]
- OP_ri v25.2d, v4.2d, v15.2d[0]
- OP_ir v25.2d, v5.2d, v14.2d[0]
+ OP_rr v24.2d, v4.2d, v14.d[0]
+ OP_ii v24.2d, v5.2d, v15.d[0]
+ OP_ri v25.2d, v4.2d, v15.d[0]
+ OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v26.2d, v6.2d, v14.2d[0]
- OP_ii v26.2d, v7.2d, v15.2d[0]
- OP_ri v27.2d, v6.2d, v15.2d[0]
- OP_ir v27.2d, v7.2d, v14.2d[0]
+ OP_rr v26.2d, v6.2d, v14.d[0]
+ OP_ii v26.2d, v7.2d, v15.d[0]
+ OP_ri v27.2d, v6.2d, v15.d[0]
+ OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.2d, v4.2d, v14.2d[1]
- OP_ii v28.2d, v5.2d, v15.2d[1]
- OP_ri v29.2d, v4.2d, v15.2d[1]
- OP_ir v29.2d, v5.2d, v14.2d[1]
+ OP_rr v28.2d, v4.2d, v14.d[1]
+ OP_ii v28.2d, v5.2d, v15.d[1]
+ OP_ri v29.2d, v4.2d, v15.d[1]
+ OP_ir v29.2d, v5.2d, v14.d[1]
- OP_rr v30.2d, v6.2d, v14.2d[1]
- OP_ii v30.2d, v7.2d, v15.2d[1]
- OP_ri v31.2d, v6.2d, v15.2d[1]
- OP_ir v31.2d, v7.2d, v14.2d[1]
+ OP_rr v30.2d, v6.2d, v14.d[1]
+ OP_ii v30.2d, v7.2d, v15.d[1]
+ OP_ri v31.2d, v6.2d, v15.d[1]
+ OP_ir v31.2d, v7.2d, v14.d[1]
.endm
.macro KERNEL4x4_E
- OP_rr v16.2d, v4.2d, v12.2d[0]
- OP_ii v16.2d, v5.2d, v13.2d[0]
- OP_ri v17.2d, v4.2d, v13.2d[0]
- OP_ir v17.2d, v5.2d, v12.2d[0]
-
- OP_rr v18.2d, v6.2d, v12.2d[0]
- OP_ii v18.2d, v7.2d, v13.2d[0]
- OP_ri v19.2d, v6.2d, v13.2d[0]
- OP_ir v19.2d, v7.2d, v12.2d[0]
-
- OP_rr v20.2d, v4.2d, v12.2d[1]
- OP_ii v20.2d, v5.2d, v13.2d[1]
- OP_ri v21.2d, v4.2d, v13.2d[1]
- OP_ir v21.2d, v5.2d, v12.2d[1]
-
- OP_rr v22.2d, v6.2d, v12.2d[1]
- OP_ii v22.2d, v7.2d, v13.2d[1]
- OP_ri v23.2d, v6.2d, v13.2d[1]
- OP_ir v23.2d, v7.2d, v12.2d[1]
-
- OP_rr v24.2d, v4.2d, v14.2d[0]
- OP_ii v24.2d, v5.2d, v15.2d[0]
- OP_ri v25.2d, v4.2d, v15.2d[0]
- OP_ir v25.2d, v5.2d, v14.2d[0]
-
- OP_rr v26.2d, v6.2d, v14.2d[0]
- OP_ii v26.2d, v7.2d, v15.2d[0]
- OP_ri v27.2d, v6.2d, v15.2d[0]
- OP_ir v27.2d, v7.2d, v14.2d[0]
-
- OP_rr v28.2d, v4.2d, v14.2d[1]
- OP_ii v28.2d, v5.2d, v15.2d[1]
- OP_ri v29.2d, v4.2d, v15.2d[1]
- OP_ir v29.2d, v5.2d, v14.2d[1]
-
- OP_rr v30.2d, v6.2d, v14.2d[1]
- OP_ii v30.2d, v7.2d, v15.2d[1]
- OP_ri v31.2d, v6.2d, v15.2d[1]
- OP_ir v31.2d, v7.2d, v14.2d[1]
+ OP_rr v16.2d, v4.2d, v12.d[0]
+ OP_ii v16.2d, v5.2d, v13.d[0]
+ OP_ri v17.2d, v4.2d, v13.d[0]
+ OP_ir v17.2d, v5.2d, v12.d[0]
+
+ OP_rr v18.2d, v6.2d, v12.d[0]
+ OP_ii v18.2d, v7.2d, v13.d[0]
+ OP_ri v19.2d, v6.2d, v13.d[0]
+ OP_ir v19.2d, v7.2d, v12.d[0]
+
+ OP_rr v20.2d, v4.2d, v12.d[1]
+ OP_ii v20.2d, v5.2d, v13.d[1]
+ OP_ri v21.2d, v4.2d, v13.d[1]
+ OP_ir v21.2d, v5.2d, v12.d[1]
+
+ OP_rr v22.2d, v6.2d, v12.d[1]
+ OP_ii v22.2d, v7.2d, v13.d[1]
+ OP_ri v23.2d, v6.2d, v13.d[1]
+ OP_ir v23.2d, v7.2d, v12.d[1]
+
+ OP_rr v24.2d, v4.2d, v14.d[0]
+ OP_ii v24.2d, v5.2d, v15.d[0]
+ OP_ri v25.2d, v4.2d, v15.d[0]
+ OP_ir v25.2d, v5.2d, v14.d[0]
+
+ OP_rr v26.2d, v6.2d, v14.d[0]
+ OP_ii v26.2d, v7.2d, v15.d[0]
+ OP_ri v27.2d, v6.2d, v15.d[0]
+ OP_ir v27.2d, v7.2d, v14.d[0]
+
+ OP_rr v28.2d, v4.2d, v14.d[1]
+ OP_ii v28.2d, v5.2d, v15.d[1]
+ OP_ri v29.2d, v4.2d, v15.d[1]
+ OP_ir v29.2d, v5.2d, v14.d[1]
+
+ OP_rr v30.2d, v6.2d, v14.d[1]
+ OP_ii v30.2d, v7.2d, v15.d[1]
+ OP_ri v31.2d, v6.2d, v15.d[1]
+ OP_ir v31.2d, v7.2d, v14.d[1]
.endm
.macro KERNEL4x4_SUB
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
-
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
-
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
-
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
-
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
-
- OP_rr v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
- OP_ri v27.2d, v2.2d, v11.2d[0]
- OP_ir v27.2d, v3.2d, v10.2d[0]
-
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
-
- OP_rr v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
- OP_ri v31.2d, v2.2d, v11.2d[1]
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
+
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
+
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
+
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
+
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
+
+ OP_rr v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
+ OP_ri v27.2d, v2.2d, v11.d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
+
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
+
+ OP_rr v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
+ OP_ri v31.2d, v2.2d, v11.d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
.endm
.macro SAVE4x4
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
-
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
-
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
-
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
+
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
+
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
+
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
.endm
.macro SAVE2x4
ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16
- OP_rr d16, d0, v8.2d[0]
- OP_ii d16, d1, v9.2d[0]
- OP_ri d17, d0, v9.2d[0]
- OP_ir d17, d1, v8.2d[0]
-
- OP_rr d20, d0, v8.2d[1]
- OP_ii d20, d1, v9.2d[1]
- OP_ri d21, d0, v9.2d[1]
- OP_ir d21, d1, v8.2d[1]
-
- OP_rr d24, d0, v10.2d[0]
- OP_ii d24, d1, v11.2d[0]
- OP_ri d25, d0, v11.2d[0]
- OP_ir d25, d1, v10.2d[0]
-
- OP_rr d28, d0, v10.2d[1]
- OP_ii d28, d1, v11.2d[1]
- OP_ri d29, d0, v11.2d[1]
- OP_ir d29, d1, v10.2d[1]
+ OP_rr d16, d0, v8.d[0]
+ OP_ii d16, d1, v9.d[0]
+ OP_ri d17, d0, v9.d[0]
+ OP_ir d17, d1, v8.d[0]
+
+ OP_rr d20, d0, v8.d[1]
+ OP_ii d20, d1, v9.d[1]
+ OP_ri d21, d0, v9.d[1]
+ OP_ir d21, d1, v8.d[1]
+
+ OP_rr d24, d0, v10.d[0]
+ OP_ii d24, d1, v11.d[0]
+ OP_ri d25, d0, v11.d[0]
+ OP_ir d25, d1, v10.d[0]
+
+ OP_rr d28, d0, v10.d[1]
+ OP_ii d28, d1, v11.d[1]
+ OP_ri d29, d0, v11.d[1]
+ OP_ir d29, d1, v10.d[1]
.endm
.macro SAVE1x4
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
-
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
-
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
-
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
+
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
+
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
+
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
.endm
.macro SAVE4x2
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE2x2
ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16
- OP_rr d16, d0, v8.2d[0]
- OP_ii d16, d1, v9.2d[0]
- OP_ri d17, d0, v9.2d[0]
- OP_ir d17, d1, v8.2d[0]
+ OP_rr d16, d0, v8.d[0]
+ OP_ii d16, d1, v9.d[0]
+ OP_ri d17, d0, v9.d[0]
+ OP_ir d17, d1, v8.d[0]
- OP_rr d20, d0, v8.2d[1]
- OP_ii d20, d1, v9.2d[1]
- OP_ri d21, d0, v9.2d[1]
- OP_ir d21, d1, v8.2d[1]
+ OP_rr d20, d0, v8.d[1]
+ OP_ii d20, d1, v9.d[1]
+ OP_ri d21, d0, v9.d[1]
+ OP_ir d21, d1, v8.d[1]
.endm
.macro SAVE1x2
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- fmul v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
+ fmul v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v17.16b, v17.16b, v17.16b
- fmls v17.2d, v0.2d, v9.2d[0]
+ fmls v17.2d, v0.2d, v9.d[0]
#else
- fmul v17.2d, v0.2d, v9.2d[0]
+ fmul v17.2d, v0.2d, v9.d[0]
#endif
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- fmul v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
+ fmul v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
- fmls v19.2d, v2.2d, v9.2d[0]
+ fmls v19.2d, v2.2d, v9.d[0]
#else
- fmul v19.2d, v2.2d, v9.2d[0]
+ fmul v19.2d, v2.2d, v9.d[0]
#endif
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
- fmul v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
+ fmul v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v21.16b, v21.16b, v21.16b
- fmls v21.2d, v0.2d, v9.2d[1]
+ fmls v21.2d, v0.2d, v9.d[1]
#else
- fmul v21.2d, v0.2d, v9.2d[1]
+ fmul v21.2d, v0.2d, v9.d[1]
#endif
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
- fmul v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
+ fmul v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v23.16b, v23.16b, v23.16b
- fmls v23.2d, v2.2d, v9.2d[1]
+ fmls v23.2d, v2.2d, v9.d[1]
#else
- fmul v23.2d, v2.2d, v9.2d[1]
+ fmul v23.2d, v2.2d, v9.d[1]
#endif
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
- fmul v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
+ fmul v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v25.16b, v25.16b, v25.16b
- fmls v25.2d, v0.2d, v11.2d[0]
+ fmls v25.2d, v0.2d, v11.d[0]
#else
- fmul v25.2d, v0.2d, v11.2d[0]
+ fmul v25.2d, v0.2d, v11.d[0]
#endif
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
- fmul v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
+ fmul v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v27.16b, v27.16b, v27.16b
- fmls v27.2d, v2.2d, v11.2d[0]
+ fmls v27.2d, v2.2d, v11.d[0]
#else
- fmul v27.2d, v2.2d, v11.2d[0]
+ fmul v27.2d, v2.2d, v11.d[0]
#endif
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
- fmul v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
+ fmul v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v29.16b, v29.16b, v29.16b
- fmls v29.2d, v0.2d, v11.2d[1]
+ fmls v29.2d, v0.2d, v11.d[1]
#else
- fmul v29.2d, v0.2d, v11.2d[1]
+ fmul v29.2d, v0.2d, v11.d[1]
#endif
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- fmul v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
+ fmul v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v31.16b, v31.16b, v31.16b
- fmls v31.2d, v2.2d, v11.2d[1]
+ fmls v31.2d, v2.2d, v11.d[1]
#else
- fmul v31.2d, v2.2d, v11.2d[1]
+ fmul v31.2d, v2.2d, v11.d[1]
#endif
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
.endm
.macro KERNEL4x4_M1
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
add pA, pA, #32
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
- OP_ri v27.2d, v2.2d, v11.2d[0]
- OP_ir v27.2d, v3.2d, v10.2d[0]
+ OP_rr v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
+ OP_ri v27.2d, v2.2d, v11.d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
- OP_rr v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
- OP_ri v31.2d, v2.2d, v11.2d[1]
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_rr v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
+ OP_ri v31.2d, v2.2d, v11.d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
.endm
.macro KERNEL4x4_M2
- OP_rr v16.2d, v4.2d, v12.2d[0]
- OP_ii v16.2d, v5.2d, v13.2d[0]
- OP_ri v17.2d, v4.2d, v13.2d[0]
- OP_ir v17.2d, v5.2d, v12.2d[0]
+ OP_rr v16.2d, v4.2d, v12.d[0]
+ OP_ii v16.2d, v5.2d, v13.d[0]
+ OP_ri v17.2d, v4.2d, v13.d[0]
+ OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v18.2d, v6.2d, v12.2d[0]
- OP_ii v18.2d, v7.2d, v13.2d[0]
- OP_ri v19.2d, v6.2d, v13.2d[0]
- OP_ir v19.2d, v7.2d, v12.2d[0]
+ OP_rr v18.2d, v6.2d, v12.d[0]
+ OP_ii v18.2d, v7.2d, v13.d[0]
+ OP_ri v19.2d, v6.2d, v13.d[0]
+ OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
- OP_rr v20.2d, v4.2d, v12.2d[1]
- OP_ii v20.2d, v5.2d, v13.2d[1]
- OP_ri v21.2d, v4.2d, v13.2d[1]
- OP_ir v21.2d, v5.2d, v12.2d[1]
+ OP_rr v20.2d, v4.2d, v12.d[1]
+ OP_ii v20.2d, v5.2d, v13.d[1]
+ OP_ri v21.2d, v4.2d, v13.d[1]
+ OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
add pA, pA, #32
- OP_rr v22.2d, v6.2d, v12.2d[1]
- OP_ii v22.2d, v7.2d, v13.2d[1]
- OP_ri v23.2d, v6.2d, v13.2d[1]
- OP_ir v23.2d, v7.2d, v12.2d[1]
+ OP_rr v22.2d, v6.2d, v12.d[1]
+ OP_ii v22.2d, v7.2d, v13.d[1]
+ OP_ri v23.2d, v6.2d, v13.d[1]
+ OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
- OP_rr v24.2d, v4.2d, v14.2d[0]
- OP_ii v24.2d, v5.2d, v15.2d[0]
- OP_ri v25.2d, v4.2d, v15.2d[0]
- OP_ir v25.2d, v5.2d, v14.2d[0]
+ OP_rr v24.2d, v4.2d, v14.d[0]
+ OP_ii v24.2d, v5.2d, v15.d[0]
+ OP_ri v25.2d, v4.2d, v15.d[0]
+ OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
- OP_rr v26.2d, v6.2d, v14.2d[0]
- OP_ii v26.2d, v7.2d, v15.2d[0]
- OP_ri v27.2d, v6.2d, v15.2d[0]
- OP_ir v27.2d, v7.2d, v14.2d[0]
+ OP_rr v26.2d, v6.2d, v14.d[0]
+ OP_ii v26.2d, v7.2d, v15.d[0]
+ OP_ri v27.2d, v6.2d, v15.d[0]
+ OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
- OP_rr v28.2d, v4.2d, v14.2d[1]
- OP_ii v28.2d, v5.2d, v15.2d[1]
- OP_ri v29.2d, v4.2d, v15.2d[1]
- OP_ir v29.2d, v5.2d, v14.2d[1]
+ OP_rr v28.2d, v4.2d, v14.d[1]
+ OP_ii v28.2d, v5.2d, v15.d[1]
+ OP_ri v29.2d, v4.2d, v15.d[1]
+ OP_ir v29.2d, v5.2d, v14.d[1]
- OP_rr v30.2d, v6.2d, v14.2d[1]
- OP_ii v30.2d, v7.2d, v15.2d[1]
- OP_ri v31.2d, v6.2d, v15.2d[1]
- OP_ir v31.2d, v7.2d, v14.2d[1]
+ OP_rr v30.2d, v6.2d, v14.d[1]
+ OP_ii v30.2d, v7.2d, v15.d[1]
+ OP_ri v31.2d, v6.2d, v15.d[1]
+ OP_ir v31.2d, v7.2d, v14.d[1]
.endm
.macro KERNEL4x4_E
- OP_rr v16.2d, v4.2d, v12.2d[0]
- OP_ii v16.2d, v5.2d, v13.2d[0]
- OP_ri v17.2d, v4.2d, v13.2d[0]
- OP_ir v17.2d, v5.2d, v12.2d[0]
-
- OP_rr v18.2d, v6.2d, v12.2d[0]
- OP_ii v18.2d, v7.2d, v13.2d[0]
- OP_ri v19.2d, v6.2d, v13.2d[0]
- OP_ir v19.2d, v7.2d, v12.2d[0]
-
- OP_rr v20.2d, v4.2d, v12.2d[1]
- OP_ii v20.2d, v5.2d, v13.2d[1]
- OP_ri v21.2d, v4.2d, v13.2d[1]
- OP_ir v21.2d, v5.2d, v12.2d[1]
-
- OP_rr v22.2d, v6.2d, v12.2d[1]
- OP_ii v22.2d, v7.2d, v13.2d[1]
- OP_ri v23.2d, v6.2d, v13.2d[1]
- OP_ir v23.2d, v7.2d, v12.2d[1]
-
- OP_rr v24.2d, v4.2d, v14.2d[0]
- OP_ii v24.2d, v5.2d, v15.2d[0]
- OP_ri v25.2d, v4.2d, v15.2d[0]
- OP_ir v25.2d, v5.2d, v14.2d[0]
-
- OP_rr v26.2d, v6.2d, v14.2d[0]
- OP_ii v26.2d, v7.2d, v15.2d[0]
- OP_ri v27.2d, v6.2d, v15.2d[0]
- OP_ir v27.2d, v7.2d, v14.2d[0]
-
- OP_rr v28.2d, v4.2d, v14.2d[1]
- OP_ii v28.2d, v5.2d, v15.2d[1]
- OP_ri v29.2d, v4.2d, v15.2d[1]
- OP_ir v29.2d, v5.2d, v14.2d[1]
-
- OP_rr v30.2d, v6.2d, v14.2d[1]
- OP_ii v30.2d, v7.2d, v15.2d[1]
- OP_ri v31.2d, v6.2d, v15.2d[1]
- OP_ir v31.2d, v7.2d, v14.2d[1]
+ OP_rr v16.2d, v4.2d, v12.d[0]
+ OP_ii v16.2d, v5.2d, v13.d[0]
+ OP_ri v17.2d, v4.2d, v13.d[0]
+ OP_ir v17.2d, v5.2d, v12.d[0]
+
+ OP_rr v18.2d, v6.2d, v12.d[0]
+ OP_ii v18.2d, v7.2d, v13.d[0]
+ OP_ri v19.2d, v6.2d, v13.d[0]
+ OP_ir v19.2d, v7.2d, v12.d[0]
+
+ OP_rr v20.2d, v4.2d, v12.d[1]
+ OP_ii v20.2d, v5.2d, v13.d[1]
+ OP_ri v21.2d, v4.2d, v13.d[1]
+ OP_ir v21.2d, v5.2d, v12.d[1]
+
+ OP_rr v22.2d, v6.2d, v12.d[1]
+ OP_ii v22.2d, v7.2d, v13.d[1]
+ OP_ri v23.2d, v6.2d, v13.d[1]
+ OP_ir v23.2d, v7.2d, v12.d[1]
+
+ OP_rr v24.2d, v4.2d, v14.d[0]
+ OP_ii v24.2d, v5.2d, v15.d[0]
+ OP_ri v25.2d, v4.2d, v15.d[0]
+ OP_ir v25.2d, v5.2d, v14.d[0]
+
+ OP_rr v26.2d, v6.2d, v14.d[0]
+ OP_ii v26.2d, v7.2d, v15.d[0]
+ OP_ri v27.2d, v6.2d, v15.d[0]
+ OP_ir v27.2d, v7.2d, v14.d[0]
+
+ OP_rr v28.2d, v4.2d, v14.d[1]
+ OP_ii v28.2d, v5.2d, v15.d[1]
+ OP_ri v29.2d, v4.2d, v15.d[1]
+ OP_ir v29.2d, v5.2d, v14.d[1]
+
+ OP_rr v30.2d, v6.2d, v14.d[1]
+ OP_ii v30.2d, v7.2d, v15.d[1]
+ OP_ri v31.2d, v6.2d, v15.d[1]
+ OP_ir v31.2d, v7.2d, v14.d[1]
.endm
.macro KERNEL4x4_SUB
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
-
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
-
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
-
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
-
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
-
- OP_rr v26.2d, v2.2d, v10.2d[0]
- OP_ii v26.2d, v3.2d, v11.2d[0]
- OP_ri v27.2d, v2.2d, v11.2d[0]
- OP_ir v27.2d, v3.2d, v10.2d[0]
-
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
-
- OP_rr v30.2d, v2.2d, v10.2d[1]
- OP_ii v30.2d, v3.2d, v11.2d[1]
- OP_ri v31.2d, v2.2d, v11.2d[1]
- OP_ir v31.2d, v3.2d, v10.2d[1]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
+
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
+
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
+
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
+
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
+
+ OP_rr v26.2d, v2.2d, v10.d[0]
+ OP_ii v26.2d, v3.2d, v11.d[0]
+ OP_ri v27.2d, v2.2d, v11.d[0]
+ OP_ir v27.2d, v3.2d, v10.d[0]
+
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
+
+ OP_rr v30.2d, v2.2d, v10.d[1]
+ OP_ii v30.2d, v3.2d, v11.d[1]
+ OP_ri v31.2d, v2.2d, v11.d[1]
+ OP_ir v31.2d, v3.2d, v10.d[1]
.endm
.macro SAVE4x4
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
-
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
-
- OP_rr v24.2d, v0.2d, v10.2d[0]
- OP_ii v24.2d, v1.2d, v11.2d[0]
- OP_ri v25.2d, v0.2d, v11.2d[0]
- OP_ir v25.2d, v1.2d, v10.2d[0]
-
- OP_rr v28.2d, v0.2d, v10.2d[1]
- OP_ii v28.2d, v1.2d, v11.2d[1]
- OP_ri v29.2d, v0.2d, v11.2d[1]
- OP_ir v29.2d, v1.2d, v10.2d[1]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
+
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
+
+ OP_rr v24.2d, v0.2d, v10.d[0]
+ OP_ii v24.2d, v1.2d, v11.d[0]
+ OP_ri v25.2d, v0.2d, v11.d[0]
+ OP_ir v25.2d, v1.2d, v10.d[0]
+
+ OP_rr v28.2d, v0.2d, v10.d[1]
+ OP_ii v28.2d, v1.2d, v11.d[1]
+ OP_ri v29.2d, v0.2d, v11.d[1]
+ OP_ir v29.2d, v1.2d, v10.d[1]
.endm
.macro SAVE2x4
ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16
- OP_rr d16, d0, v8.2d[0]
- OP_ii d16, d1, v9.2d[0]
- OP_ri d17, d0, v9.2d[0]
- OP_ir d17, d1, v8.2d[0]
-
- OP_rr d20, d0, v8.2d[1]
- OP_ii d20, d1, v9.2d[1]
- OP_ri d21, d0, v9.2d[1]
- OP_ir d21, d1, v8.2d[1]
-
- OP_rr d24, d0, v10.2d[0]
- OP_ii d24, d1, v11.2d[0]
- OP_ri d25, d0, v11.2d[0]
- OP_ir d25, d1, v10.2d[0]
-
- OP_rr d28, d0, v10.2d[1]
- OP_ii d28, d1, v11.2d[1]
- OP_ri d29, d0, v11.2d[1]
- OP_ir d29, d1, v10.2d[1]
+ OP_rr d16, d0, v8.d[0]
+ OP_ii d16, d1, v9.d[0]
+ OP_ri d17, d0, v9.d[0]
+ OP_ir d17, d1, v8.d[0]
+
+ OP_rr d20, d0, v8.d[1]
+ OP_ii d20, d1, v9.d[1]
+ OP_ri d21, d0, v9.d[1]
+ OP_ir d21, d1, v8.d[1]
+
+ OP_rr d24, d0, v10.d[0]
+ OP_ii d24, d1, v11.d[0]
+ OP_ri d25, d0, v11.d[0]
+ OP_ir d25, d1, v10.d[0]
+
+ OP_rr d28, d0, v10.d[1]
+ OP_ii d28, d1, v11.d[1]
+ OP_ri d29, d0, v11.d[1]
+ OP_ir d29, d1, v10.d[1]
.endm
.macro SAVE1x4
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
-
- OP_rr v18.2d, v2.2d, v8.2d[0]
- OP_ii v18.2d, v3.2d, v9.2d[0]
- OP_ri v19.2d, v2.2d, v9.2d[0]
- OP_ir v19.2d, v3.2d, v8.2d[0]
-
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
-
- OP_rr v22.2d, v2.2d, v8.2d[1]
- OP_ii v22.2d, v3.2d, v9.2d[1]
- OP_ri v23.2d, v2.2d, v9.2d[1]
- OP_ir v23.2d, v3.2d, v8.2d[1]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
+
+ OP_rr v18.2d, v2.2d, v8.d[0]
+ OP_ii v18.2d, v3.2d, v9.d[0]
+ OP_ri v19.2d, v2.2d, v9.d[0]
+ OP_ir v19.2d, v3.2d, v8.d[0]
+
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
+
+ OP_rr v22.2d, v2.2d, v8.d[1]
+ OP_ii v22.2d, v3.2d, v9.d[1]
+ OP_ri v23.2d, v2.2d, v9.d[1]
+ OP_ir v23.2d, v3.2d, v8.d[1]
.endm
.macro SAVE4x2
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
- OP_rr v16.2d, v0.2d, v8.2d[0]
- OP_ii v16.2d, v1.2d, v9.2d[0]
- OP_ri v17.2d, v0.2d, v9.2d[0]
- OP_ir v17.2d, v1.2d, v8.2d[0]
+ OP_rr v16.2d, v0.2d, v8.d[0]
+ OP_ii v16.2d, v1.2d, v9.d[0]
+ OP_ri v17.2d, v0.2d, v9.d[0]
+ OP_ir v17.2d, v1.2d, v8.d[0]
- OP_rr v20.2d, v0.2d, v8.2d[1]
- OP_ii v20.2d, v1.2d, v9.2d[1]
- OP_ri v21.2d, v0.2d, v9.2d[1]
- OP_ir v21.2d, v1.2d, v8.2d[1]
+ OP_rr v20.2d, v0.2d, v8.d[1]
+ OP_ii v20.2d, v1.2d, v9.d[1]
+ OP_ri v21.2d, v0.2d, v9.d[1]
+ OP_ir v21.2d, v1.2d, v8.d[1]
.endm
.macro SAVE2x2
ld2 {v0.d, v1.d}[0], [pA]
add pA, pA, #16
- OP_rr d16, d0, v8.2d[0]
- OP_ii d16, d1, v9.2d[0]
- OP_ri d17, d0, v9.2d[0]
- OP_ir d17, d1, v8.2d[0]
+ OP_rr d16, d0, v8.d[0]
+ OP_ii d16, d1, v9.d[0]
+ OP_ri d17, d0, v9.d[0]
+ OP_ir d17, d1, v8.d[0]
- OP_rr d20, d0, v8.2d[1]
- OP_ii d20, d1, v9.2d[1]
- OP_ri d21, d0, v9.2d[1]
- OP_ir d21, d1, v8.2d[1]
+ OP_rr d20, d0, v8.d[1]
+ OP_ii d20, d1, v9.d[1]
+ OP_ri d21, d0, v9.d[1]
+ OP_ir d21, d1, v8.d[1]
.endm
.macro SAVE1x2