From 8a40f1355e9711ce3d661c214f1644075c1e497b Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Thu, 14 Jul 2016 13:50:38 +0530
Subject: [PATCH] Improvements to GEMV kernels

---
 kernel/arm64/gemv_n.S  |   9 ++
 kernel/arm64/gemv_t.S  |  17 +++-
 kernel/arm64/zgemv_n.S | 249 +++++++++++++++++++------------------------
 kernel/arm64/zgemv_t.S |   9 +-
 4 files changed, 128 insertions(+), 156 deletions(-)

diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S
index 6279c22..162f721 100644
--- a/kernel/arm64/gemv_n.S
+++ b/kernel/arm64/gemv_n.S
@@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SHZ	3
 #endif
 
+#define A_PRE_SIZE	768
+#define Y_PRE_SIZE	768
+
 /******************************************************************************/
 
 .macro SAVE_REGS
@@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ld1	{v2.4s, v3.4s}, [A_PTR], #32
 	ld1	{v4.4s, v5.4s}, [Y_IPTR], #32
 	fmla	v4.4s, v1.4s, v2.4s
+	prfm	PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
 	fmla	v5.4s, v1.4s, v3.4s
 	st1	{v4.4s, v5.4s}, [Y_OPTR], #32
 
 	ld1	{v6.4s, v7.4s}, [A_PTR], #32
 	ld1	{v8.4s, v9.4s}, [Y_IPTR], #32
 	fmla	v8.4s, v1.4s, v6.4s
+	prfm	PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
 	fmla	v9.4s, v1.4s, v7.4s
 	st1	{v8.4s, v9.4s}, [Y_OPTR], #32
 #else //DOUBLE
 	ld1	{v2.2d, v3.2d}, [A_PTR], #32
 	ld1	{v4.2d, v5.2d}, [Y_IPTR], #32
 	fmla	v4.2d, v1.2d, v2.2d
+	prfm	PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
 	fmla	v5.2d, v1.2d, v3.2d
 	st1	{v4.2d, v5.2d}, [Y_OPTR], #32
 
 	ld1	{v6.2d, v7.2d}, [A_PTR], #32
 	ld1	{v8.2d, v9.2d}, [Y_IPTR], #32
 	fmla	v8.2d, v1.2d, v6.2d
+	prfm	PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
 	fmla	v9.2d, v1.2d, v7.2d
 	st1	{v8.2d, v9.2d}, [Y_OPTR], #32
 
 	ld1	{v10.2d, v11.2d}, [A_PTR], #32
 	ld1	{v12.2d, v13.2d}, [Y_IPTR], #32
 	fmla	v12.2d, v1.2d, v10.2d
+	prfm	PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
 	fmla	v13.2d, v1.2d, v11.2d
 	st1	{v12.2d, v13.2d}, [Y_OPTR], #32
 
 	ld1	{v14.2d, v15.2d}, [A_PTR], #32
 	ld1	{v16.2d, v17.2d}, [Y_IPTR], #32
 	fmla	v16.2d, v1.2d, v14.2d
+	prfm	PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
 	fmla	v17.2d, v1.2d, v15.2d
 	st1	{v16.2d, v17.2d}, [Y_OPTR], #32
 #endif
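For readers tracking the gemv_n hunks above: the new prfm PLDL1KEEP hints are interleaved with the multiply-accumulates so that cache lines 768 bytes ahead of the streaming A and Y pointers are requested while the current block computes. A rough scalar C analogue of the pattern (the saxpy-style loop and all names here are illustrative, not OpenBLAS code):

    #include <stddef.h>

    #define PRE_BYTES 768  /* same lookahead distance the patch chooses */

    /* y += alpha * a, prefetching a fixed byte distance ahead of both streams */
    void saxpy_prefetch(const float *a, float *y, float alpha, size_t n)
    {
        for (size_t i = 0; i + 8 <= n; i += 8) {
            __builtin_prefetch(a + i + PRE_BYTES / sizeof(*a), 0, 3); /* read  */
            __builtin_prefetch(y + i + PRE_BYTES / sizeof(*y), 1, 3); /* write */
            for (size_t k = i; k < i + 8; k++)
                y[k] += alpha * a[k];
        }
    }

The distance is a tuning choice: far enough ahead that the line arrives before it is needed, near enough not to evict data still in use.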
diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S
index 0145af6..28325f7 100644
--- a/kernel/arm64/gemv_t.S
+++ b/kernel/arm64/gemv_t.S
@@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define J	x11	/* loop variable */
 #define I	x12	/* loop variable */
 
+#define X_PREFETCH_SIZE	768
+#define A_PREFETCH_SIZE	768
+
 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/
@@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ld1	{v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
 	ld1	{v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
 	fmla	v1.4s, v5.4s, v9.4s
+	prfm	PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
 	fmla	v2.4s, v6.4s, v10.4s
+	prfm	PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
 	fmla	v3.4s, v7.4s, v11.4s
+	ld1	{v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
 	fmla	v4.4s, v8.4s, v12.4s
 
-	ld1	{v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
 	ld1	{v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
 	fmla	v1.4s, v13.4s, v17.4s
+	prfm	PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
 	fmla	v2.4s, v14.4s, v18.4s
+	prfm	PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
 	fmla	v3.4s, v15.4s, v19.4s
 	fmla	v4.4s, v16.4s, v20.4s
 #else
 	ld1	{v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
 	ld1	{v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
 	fmla	v1.2d, v5.2d, v9.2d
+	prfm	PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
 	fmla	v2.2d, v6.2d, v10.2d
+	prfm	PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
 	fmla	v3.2d, v7.2d, v11.2d
 	fmla	v4.2d, v8.2d, v12.2d
 
 	ld1	{v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
 	ld1	{v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
 	fmla	v1.2d, v13.2d, v17.2d
+	prfm	PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
 	fmla	v2.2d, v14.2d, v18.2d
+	prfm	PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
 	fmla	v3.2d, v15.2d, v19.2d
 	fmla	v4.2d, v16.2d, v20.2d
 
 	ld1	{v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
 	ld1	{v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
 	fmla	v1.2d, v5.2d, v9.2d
+	prfm	PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
 	fmla	v2.2d, v6.2d, v10.2d
+	prfm	PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
 	fmla	v3.2d, v7.2d, v11.2d
 	fmla	v4.2d, v8.2d, v12.2d
 
 	ld1	{v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
 	ld1	{v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
 	fmla	v1.2d, v13.2d, v17.2d
+	prfm	PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
 	fmla	v2.2d, v14.2d, v18.2d
+	prfm	PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
 	fmla	v3.2d, v15.2d, v19.2d
 	fmla	v4.2d, v16.2d, v20.2d
 #endif
diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S
index 9e285e2..a28d1b0 100644
--- a/kernel/arm64/zgemv_n.S
+++ b/kernel/arm64/zgemv_n.S
@@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define Y_OPTR	x13	/* loop Y vector address */
 #define X_PTR	x14	/* loop X vector address */
 
+#define A_PRE_SIZE	768
+#define Y_PRE_SIZE	768
+
 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/
@@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define ALPHA_R	s0
 #define ALPHA_I	s1
-#define ALPHA_R_COPY	s7
-#define ALPHA_I_COPY	s8
 #define SHZ	3
 #else
 #define ALPHA_R	d0
 #define ALPHA_I	d1
-#define ALPHA_R_COPY	d7
-#define ALPHA_I_COPY	d8
 #define SHZ	4
 #endif
 
@@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .macro INIT
 
-	/********** INIT FOR F4 LOOP **********/
-	fmov	ALPHA_R_COPY, ALPHA_R
-	fmov	ALPHA_I_COPY, ALPHA_I
-#if !defined(DOUBLE)
-	ins	v7.s[1], v7.s[0]	// R(ALPHA), R(ALPHA)
-	ins	v8.s[1], v8.s[0]	// I(ALPHA), I(ALPHA)
-	ins	v7.d[1], v7.d[0]
-	ins	v8.d[1], v8.d[0]
-#else
-	ins	v7.d[1], v7.d[0]	// R(ALPHA), R(ALPHA)
-	ins	v8.d[1], v8.d[0]	// I(ALPHA), I(ALPHA)
-#endif
-
-	/******* INIT FOR F1 AND S1 LOOP ******/
 #if !defined(DOUBLE)
 	ins	v0.s[1], v0.s[0]	// R(ALPHA), R(ALPHA)
 	eor	v2.16b, v2.16b, v2.16b
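The INIT hunk above can drop the early broadcast of alpha into v7/v8 (and the ALPHA_R_COPY/ALPHA_I_COPY aliases) because the reworked INIT_LOOP that follows computes the product temp = alpha * x[j] once per column and broadcasts that instead, so alpha by itself is no longer needed in vector form. In scalar terms the per-column setup amounts to the following (a sketch; the function name and the use of C99 complex are illustrative):

    #include <complex.h>

    /* One complex multiply per column of A; the real and imaginary parts
       of the result are what the patch broadcasts into v21..v24. */
    float complex column_temp(float complex alpha, float complex xj)
    {
        /* plain (!CONJ && !XCONJ) case; XCONJ would conjugate xj here,
           and CONJ flips signs later, when temp is broadcast */
        return alpha * xj;
    }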
@@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro INIT_LOOP
 
-	/********** INIT_LOOP FOR F4 LOOP **********/
 #if !defined(DOUBLE)
-	ld1	{v9.2s}, [X_PTR]	// [I(X), R(X)]
-	ins	v10.s[0], v9.s[1]
-	ins	v9.s[1], v9.s[0]	// [R(X), R(X)]
-	ins	v10.s[1], v10.s[0]	// [I(X), I(X)]
-	ins	v9.d[1], v9.d[0]
-	ins	v10.d[1], v10.d[0]
+	ld1	{v2.2s}, [X_PTR]	// [I(X), R(X)]
+	ext	v3.8b, v2.8b, v2.8b, #4	// [R(X), I(X)]
+	fmul	v2.2s, v0.2s, v2.2s
+	fmla	v2.2s, v1.2s, v3.2s	// [I(TEMP), R(TEMP)]
+	ins	v3.s[0], v2.s[1]
+
+	/********** INIT_LOOP FOR F4 LOOP **********/
 #if !defined(CONJ)
 #if !defined(XCONJ)
-	fmul	v11.4s, v9.4s, v7.4s	// [+ R(X) * R(ALPHA)]
-	fmls	v11.4s, v10.4s, v8.4s	// [- I(X) * I(ALPHA)]
-	fmul	v12.4s, v9.4s, v8.4s	// [+ R(X) * I(ALPHA)]
-	fmla	v12.4s, v10.4s, v7.4s	// [+ I(X) * R(ALPHA)]
+	dup	v21.4s, v2.s[0]		// R[TEMP]
+	dup	v22.4s, v2.s[0]		// R[TEMP]
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	s25, s25, s3
+	dup	v23.4s, v25.s[0]	// -I[TEMP]
+	dup	v24.4s, v3.s[0]		// I[TEMP]
 #else
-	fmul	v11.4s, v9.4s, v7.4s	// [+ R(X) * R(ALPHA)]
-	fmla	v11.4s, v10.4s, v8.4s	// [+ I(X) * I(ALPHA)]
-	fmul	v12.4s, v9.4s, v8.4s	// [+ R(X) * I(ALPHA)]
-	fmls	v12.4s, v10.4s, v7.4s	// [- I(X) * R(ALPHA)]
+	dup	v21.4s, v2.s[0]		// R[TEMP]
+	dup	v22.4s, v2.s[0]		// R[TEMP]
+	dup	v23.4s, v3.s[0]		// I[TEMP]
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	s25, s25, s3
+	dup	v24.4s, v25.s[0]	// -I[TEMP]
 #endif
 #else // CONJ
 #if !defined(XCONJ)
-	fmul	v11.4s, v9.4s, v7.4s	// [+ R(X) * R(ALPHA)]
-	fmls	v11.4s, v10.4s, v8.4s	// [+ I(X) * I(ALPHA)]
-	fmul	v12.4s, v10.4s, v7.4s	// [+ I(X) * R(ALPHA)]
-	fmls	v12.4s, v9.4s, v8.4s	// [- R(X) * I(ALPHA)]
+	dup	v21.4s, v2.s[0]		// R[TEMP]
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	s25, s25, s2
+	dup	v22.4s, v25.s[0]	// -R[TEMP]
+	dup	v23.4s, v3.s[0]		// I[TEMP]
+	dup	v24.4s, v3.s[0]		// I[TEMP]
 #else
-	fmul	v11.4s, v9.4s, v7.4s	// [+ R(X) * R(ALPHA)]
-	fmls	v11.4s, v10.4s, v8.4s	// [- I(X) * I(ALPHA)]
-	eor	v12.16b, v12.16b, v12.16b
-	fmls	v12.4s, v9.4s, v8.4s	// [- R(X) * I(ALPHA)]
-	fmla	v12.4s, v10.4s, v7.4s	// [- I(X) * R(ALPHA)]
+	dup	v21.4s, v2.s[0]		// R[TEMP]
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	s25, s25, s2
+	dup	v22.4s, v25.s[0]	// -R[TEMP]
+
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	s25, s25, s3
+	dup	v23.4s, v25.s[0]	// -I[TEMP]
+	dup	v24.4s, v25.s[0]	// -I[TEMP]
 #endif
 #endif // CONJ
 
+	/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
-	ld1	{v2.2s}, [X_PTR]	// [I(X), R(X)]
-	ext	v3.8b, v2.8b, v2.8b, #4	// [R(X), I(X)]
-	fmul	v2.2s, v0.2s, v2.2s
-	fmla	v2.2s, v1.2s, v3.2s	// [I(TEMP), R(TEMP)]
-	ins	v3.s[0], v2.s[1]
 #if !defined(CONJ)
 #if !defined(XCONJ)
 	eor	v4.16b, v4.16b, v4.16b
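Two idioms in the hunk above are worth spelling out: ext rotates the two lanes of v2 so that one fmul/fmla pair yields the full complex product, and the eor/fsub pair materialises a negated lane (0 - x) for the sign-flipped broadcasts. An ACLE-intrinsics sketch of the single-precision TEMP computation, assuming v0 and v1 hold the alpha factors that INIT prepared (lane layout as in the asm comments; names are illustrative):

    #include <arm_neon.h>

    /* temp = alpha * x for one column, mirroring the ld1/ext/fmul/fmla above */
    float32x2_t column_temp(float32x2_t v0, float32x2_t v1, const float *x_ptr)
    {
        float32x2_t v2 = vld1_f32(x_ptr);     /* [R(X), I(X)] in lanes 0,1 */
        float32x2_t v3 = vext_f32(v2, v2, 1); /* lanes swapped             */
        v2 = vmul_f32(v0, v2);                /* R(alpha) * [R(X), I(X)]   */
        return vfma_f32(v2, v1, v3);          /* +/- I(alpha) * swapped X  */
    }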
@@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif // CONJ
 
 #else // DOUBLE
 
+	ld1	{v2.2d}, [X_PTR]	// [I(X), R(X)]
+	ext	v3.16b, v2.16b, v2.16b, #8	// [R(X), I(X)]
+	fmul	v2.2d, v0.2d, v2.2d
+	fmla	v2.2d, v1.2d, v3.2d	// [I(TEMP), R(TEMP)]
+	ins	v3.d[0], v2.d[1]	// I(TEMP)
 
-	/********** INIT_LOOP FOR F4 LOOP **********/
-	ld1	{v9.2d}, [X_PTR]	// [I(X), R(X)]
-	ins	v10.d[0], v9.d[1]
-	ins	v9.d[1], v9.d[0]	// [R(X), R(X)]
-	ins	v10.d[1], v10.d[0]	// [I(X), I(X)]
+	/****** INIT_LOOP FOR F4 LOOP ******/
 #if !defined(CONJ)
 #if !defined(XCONJ)
-	fmul	v11.2d, v9.2d, v7.2d	// [+ R(X) * R(ALPHA)]
-	fmls	v11.2d, v10.2d, v8.2d	// [- I(X) * I(ALPHA)]
-	fmul	v12.2d, v9.2d, v8.2d	// [+ R(X) * I(ALPHA)]
-	fmla	v12.2d, v10.2d, v7.2d	// [+ I(X) * R(ALPHA)]
+	dup	v21.2d, v2.d[0]		// R[TEMP]
+	dup	v22.2d, v2.d[0]		// R[TEMP]
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	d25, d25, d3
+	dup	v23.2d, v25.d[0]	// -I[TEMP]
+	dup	v24.2d, v3.d[0]		// I[TEMP]
 #else
-	fmul	v11.2d, v9.2d, v7.2d	// [+ R(X) * R(ALPHA)]
-	fmla	v11.2d, v10.2d, v8.2d	// [+ I(X) * I(ALPHA)]
-	fmul	v12.2d, v9.2d, v8.2d	// [+ R(X) * I(ALPHA)]
-	fmls	v12.2d, v10.2d, v7.2d	// [- I(X) * R(ALPHA)]
+	dup	v21.2d, v2.d[0]		// R[TEMP]
+	dup	v22.2d, v2.d[0]		// R[TEMP]
+	dup	v23.2d, v3.d[0]		// I[TEMP]
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	d25, d25, d3
+	dup	v24.2d, v25.d[0]	// -I[TEMP]
 #endif
 #else // CONJ
 #if !defined(XCONJ)
-	fmul	v11.2d, v9.2d, v7.2d	// [+ R(X) * R(ALPHA)]
-	fmls	v11.2d, v10.2d, v8.2d	// [+ I(X) * I(ALPHA)]
-	fmul	v12.2d, v10.2d, v7.2d	// [+ I(X) * R(ALPHA)]
-	fmls	v12.2d, v9.2d, v8.2d	// [- R(X) * I(ALPHA)]
+	dup	v21.2d, v2.d[0]		// R[TEMP]
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	d25, d25, d2
+	dup	v22.2d, v25.d[0]	// -R[TEMP]
+	dup	v23.2d, v3.d[0]		// I[TEMP]
+	dup	v24.2d, v3.d[0]		// I[TEMP]
 #else
-	fmul	v11.2d, v9.2d, v7.2d	// [+ R(X) * R(ALPHA)]
-	fmls	v11.2d, v10.2d, v8.2d	// [- I(X) * I(ALPHA)]
-	eor	v12.16b, v12.16b, v12.16b
-	fmls	v12.2d, v9.2d, v8.2d	// [- R(X) * I(ALPHA)]
-	fmla	v12.2d, v10.2d, v7.2d	// [- I(X) * R(ALPHA)]
+	dup	v21.2d, v2.d[0]		// R[TEMP]
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	d25, d25, d2
+	dup	v22.2d, v25.d[0]	// -R[TEMP]
+
+	eor	v25.16b, v25.16b, v25.16b
+	fsub	d25, d25, d3
+	dup	v23.2d, v25.d[0]	// -I[TEMP]
+	dup	v24.2d, v25.d[0]	// -I[TEMP]
 #endif
 #endif // CONJ
 
+	/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
-	ld1	{v2.2d}, [X_PTR]	// [I(X), R(X)]
-	ext	v3.16b, v2.16b, v2.16b, #8	// [R(X), I(X)]
-	fmul	v2.2d, v0.2d, v2.2d
-	fmla	v2.2d, v1.2d, v3.2d	// [I(TEMP), R(TEMP)]
-	ins	v3.d[0], v2.d[1]	// I(TEMP)
 #if !defined(CONJ)
 #if !defined(XCONJ)
 	eor	v4.16b, v4.16b, v4.16b
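With the setup above, every CONJ/XCONJ combination reduces to the same four unconditional fmla instructions in the KERNEL_F4 hunk below; all sign handling lives in the broadcast registers v21..v24. A scalar model of one element of the update (names are illustrative):

    /* t21..t24 model the patch's v21..v24.  In the plain (!CONJ && !XCONJ)
       case t21 = t22 = R(temp), t23 = -I(temp), t24 = I(temp), i.e. the
       standard complex multiply-accumulate y += temp * a. */
    void zaxpy_element(float *y_r, float *y_i, float a_r, float a_i,
                       float t21, float t22, float t23, float t24)
    {
        *y_r += t21 * a_r + t23 * a_i;  /* fmla v15, v21/v23 */
        *y_i += t22 * a_i + t24 * a_r;  /* fmla v16, v22/v24 */
    }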
@@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ld2	{v13.4s, v14.4s}, [A_PTR], #32
 	ld2	{v15.4s, v16.4s}, [Y_IPTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
-	fmla	v15.4s, v11.4s, v13.4s	// [+ R(ALPHA * X) * A_R]
-	fmls	v15.4s, v12.4s, v14.4s	// [- I(ALPHA * X) * A_I]
-	fmla	v16.4s, v11.4s, v14.4s	// [+ R(ALPHA * X) * A_I]
-	fmla	v16.4s, v12.4s, v13.4s	// [+ I(ALPHA * X) * A_R]
-#else
-	fmla	v15.4s, v11.4s, v13.4s	// [+ R(ALPHA * X) * A_R]
-	fmla	v15.4s, v12.4s, v14.4s	// [+ I(ALPHA * X) * A_I]
-	fmla	v16.4s, v11.4s, v14.4s	// [+ R(ALPHA * X) * A_I]
-	fmls	v16.4s, v12.4s, v13.4s	// [- I(ALPHA * X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
-	fmla	v15.4s, v11.4s, v13.4s	// [+ R(ALPHA * X) * A_R]
-	fmla	v15.4s, v12.4s, v14.4s	// [+ I(ALPHA * X) * A_I]
-	fmls	v16.4s, v11.4s, v14.4s	// [- R(ALPHA * X) * A_I]
-	fmla	v16.4s, v12.4s, v13.4s	// [+ I(ALPHA * X) * A_R]
-#else
-	fmla	v15.4s, v11.4s, v13.4s	// [+ R(ALPHA * X) * A_R]
-	fmls	v15.4s, v12.4s, v14.4s	// [- I(ALPHA * X) * A_I]
-	fmls	v16.4s, v11.4s, v14.4s	// [- R(ALPHA * X) * A_I]
-	fmls	v16.4s, v12.4s, v13.4s	// [- I(ALPHA * X) * A_R]
-#endif
-#endif // CONJ
+
+	prfm	PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
+	prfm	PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
+
+	fmla	v15.4s, v21.4s, v13.4s
+	fmla	v15.4s, v23.4s, v14.4s
+	fmla	v16.4s, v22.4s, v14.4s
+	fmla	v16.4s, v24.4s, v13.4s
+
 	st2	{v15.4s, v16.4s}, [Y_OPTR], #32
 #else // DOUBLE
 	ld2	{v13.2d, v14.2d}, [A_PTR], #32
 	ld2	{v15.2d, v16.2d}, [Y_IPTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
-	fmla	v15.2d, v11.2d, v13.2d	// [+ R(ALPHA * X) * A_R]
-	fmls	v15.2d, v12.2d, v14.2d	// [- I(ALPHA * X) * A_I]
-	fmla	v16.2d, v11.2d, v14.2d	// [+ R(ALPHA * X) * A_I]
-	fmla	v16.2d, v12.2d, v13.2d	// [+ I(ALPHA * X) * A_R]
-#else
-	fmla	v15.2d, v11.2d, v13.2d	// [+ R(ALPHA * X) * A_R]
-	fmla	v15.2d, v12.2d, v14.2d	// [+ I(ALPHA * X) * A_I]
-	fmla	v16.2d, v11.2d, v14.2d	// [+ R(ALPHA * X) * A_I]
-	fmls	v16.2d, v12.2d, v13.2d	// [- I(ALPHA * X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
-	fmla	v15.2d, v11.2d, v13.2d	// [+ R(ALPHA * X) * A_R]
-	fmla	v15.2d, v12.2d, v14.2d	// [+ I(ALPHA * X) * A_I]
-	fmls	v16.2d, v11.2d, v14.2d	// [- R(ALPHA * X) * A_I]
-	fmla	v16.2d, v12.2d, v13.2d	// [+ I(ALPHA * X) * A_R]
-#else
-	fmla	v15.2d, v11.2d, v13.2d	// [+ R(ALPHA * X) * A_R]
-	fmls	v15.2d, v12.2d, v14.2d	// [- I(ALPHA * X) * A_I]
-	fmls	v16.2d, v11.2d, v14.2d	// [- R(ALPHA * X) * A_I]
-	fmls	v16.2d, v12.2d, v13.2d	// [- I(ALPHA * X) * A_R]
-#endif
-#endif // CONJ
+	prfm	PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
+
+	fmla	v15.2d, v21.2d, v13.2d
+	fmla	v15.2d, v23.2d, v14.2d
+	fmla	v16.2d, v22.2d, v14.2d
+	fmla	v16.2d, v24.2d, v13.2d
+
 	st2	{v15.2d, v16.2d}, [Y_OPTR], #32
 
 	ld2	{v17.2d, v18.2d}, [A_PTR], #32
 	ld2	{v19.2d, v20.2d}, [Y_IPTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
-	fmla	v19.2d, v11.2d, v17.2d	// [+ R(ALPHA * X) * A_R]
-	fmls	v19.2d, v12.2d, v18.2d	// [- I(ALPHA * X) * A_I]
-	fmla	v20.2d, v11.2d, v18.2d	// [+ R(ALPHA * X) * A_I]
-	fmla	v20.2d, v12.2d, v17.2d	// [+ I(ALPHA * X) * A_R]
-#else
-	fmla	v19.2d, v11.2d, v17.2d	// [+ R(ALPHA * X) * A_R]
-	fmla	v19.2d, v12.2d, v18.2d	// [- I(ALPHA * X) * A_I]
-	fmla	v20.2d, v11.2d, v18.2d	// [+ R(ALPHA * X) * A_I]
-	fmls	v20.2d, v12.2d, v17.2d	// [+ I(ALPHA * X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
-	fmla	v19.2d, v11.2d, v17.2d	// [+ R(ALPHA * X) * A_R]
-	fmla	v19.2d, v12.2d, v18.2d	// [- I(ALPHA * X) * A_I]
-	fmls	v20.2d, v11.2d, v18.2d	// [+ R(ALPHA * X) * A_I]
-	fmla	v20.2d, v12.2d, v17.2d	// [+ I(ALPHA * X) * A_R]
-#else
-	fmla	v19.2d, v11.2d, v17.2d	// [+ R(ALPHA * X) * A_R]
-	fmls	v19.2d, v12.2d, v18.2d	// [- I(ALPHA * X) * A_I]
-	fmls	v20.2d, v11.2d, v18.2d	// [+ R(ALPHA * X) * A_I]
-	fmls	v20.2d, v12.2d, v17.2d	// [+ I(ALPHA * X) * A_R]
-#endif
-#endif // CONJ
+	prfm	PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
+
+	fmla	v19.2d, v21.2d, v17.2d
+	fmla	v19.2d, v23.2d, v18.2d
+	fmla	v20.2d, v22.2d, v18.2d
+	fmla	v20.2d, v24.2d, v17.2d
+
 	st2	{v19.2d, v20.2d}, [Y_OPTR], #32
 #endif
@@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP:
 
 zgemv_n_kernel_F4:
 
-	KERNEL_F1
-	KERNEL_F1
-	KERNEL_F1
-	KERNEL_F1
+	KERNEL_F4
 
 	subs	I, I, #1
 	bne	zgemv_n_kernel_F4
diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S
index e61c171..79ce9bc 100644
--- a/kernel/arm64/zgemv_t.S
+++ b/kernel/arm64/zgemv_t.S
@@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define J	x11	/* loop variable */
 #define I	x12	/* loop variable */
 
+#define A_PRE_SIZE	768
+#define X_PRE_SIZE	768
+
 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/
@@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	ld2	{v11.4s, v12.4s}, [X_PTR], #32
 	ld2	{v13.4s, v14.4s}, [A_PTR], #32
+	prfm	PLDL1STRM, [X_PTR, #X_PRE_SIZE]
+	prfm	PLDL1STRM, [A_PTR, #A_PRE_SIZE]
 
 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
 	fmla	v9.4s, v11.4s, v13.4s	// [+ R(X) * A_R]
@@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else // DOUBLE
 	ld2	{v11.2d, v12.2d}, [X_PTR], #32
 	ld2	{v13.2d, v14.2d}, [A_PTR], #32
-	prfm	PLDL1STRM, [X_PTR, #512]
+	prfm	PLDL1STRM, [X_PTR, #X_PRE_SIZE]
 
 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
 	fmla	v9.2d, v11.2d, v13.2d	// [+ R(X) * A_R]
@@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	ld2	{v17.2d, v18.2d}, [X_PTR], #32
 	ld2	{v19.2d, v20.2d}, [A_PTR], #32
-	prfm	PLDL1STRM, [A_PTR, #512]
+	prfm	PLDL1STRM, [A_PTR, #A_PRE_SIZE]
 
 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
 	fmla	v15.2d, v17.2d, v19.2d	// [+ R(X) * A_R]
-- 
2.7.4
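One detail that is easy to miss in the zgemv_t hunks: the kernel keeps the PLDL1STRM (streaming) hint, now at a named 768-byte distance instead of the hard-coded 512, while gemv_n and zgemv_n use PLDL1KEEP, which asks the cache to retain the line for reuse. A loose C analogy for the two flavours (GCC's locality argument is only an approximation of the PLDL1KEEP/PLDL1STRM distinction, not an exact encoding; the helper is hypothetical):

    /* hypothetical helper showing the two prefetch flavours side by side */
    void prefetch_hints(const char *a_ptr, const char *x_ptr)
    {
        __builtin_prefetch(a_ptr + 768, 0, 3); /* ~PLDL1KEEP: expect reuse       */
        __builtin_prefetch(x_ptr + 768, 0, 0); /* ~PLDL1STRM: streamed, use once */
    }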