Improvements to GEMV kernels
author: Ashwin Sekhar T K <ashwin@broadcom.com>
Thu, 14 Jul 2016 08:20:38 +0000 (13:50 +0530)
committer: Ashwin Sekhar T K <ashwin@broadcom.com>
Thu, 14 Jul 2016 08:20:38 +0000 (13:50 +0530)
kernel/arm64/gemv_n.S
kernel/arm64/gemv_t.S
kernel/arm64/zgemv_n.S
kernel/arm64/zgemv_t.S

index 6279c22..162f721 100644 (file)
@@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SHZ    3
 #endif
 
+#define A_PRE_SIZE 768
+#define Y_PRE_SIZE 768
+
 /******************************************************************************/
 
 .macro SAVE_REGS
@@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v2.4s, v3.4s}, [A_PTR], #32
        ld1     {v4.4s, v5.4s}, [Y_IPTR], #32
        fmla    v4.4s, v1.4s, v2.4s
+       prfm    PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
        fmla    v5.4s, v1.4s, v3.4s
        st1     {v4.4s, v5.4s}, [Y_OPTR], #32
 
        ld1     {v6.4s, v7.4s}, [A_PTR], #32
        ld1     {v8.4s, v9.4s}, [Y_IPTR], #32
        fmla    v8.4s, v1.4s, v6.4s
+       prfm    PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
        fmla    v9.4s, v1.4s, v7.4s
        st1     {v8.4s, v9.4s}, [Y_OPTR], #32
 #else //DOUBLE
        ld1     {v2.2d, v3.2d}, [A_PTR], #32
        ld1     {v4.2d, v5.2d}, [Y_IPTR], #32
        fmla    v4.2d, v1.2d, v2.2d
+       prfm    PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
        fmla    v5.2d, v1.2d, v3.2d
        st1     {v4.2d, v5.2d}, [Y_OPTR], #32
 
        ld1     {v6.2d, v7.2d}, [A_PTR], #32
        ld1     {v8.2d, v9.2d}, [Y_IPTR], #32
        fmla    v8.2d, v1.2d, v6.2d
+       prfm    PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
        fmla    v9.2d, v1.2d, v7.2d
        st1     {v8.2d, v9.2d}, [Y_OPTR], #32
 
        ld1     {v10.2d, v11.2d}, [A_PTR], #32
        ld1     {v12.2d, v13.2d}, [Y_IPTR], #32
        fmla    v12.2d, v1.2d, v10.2d
+       prfm    PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
        fmla    v13.2d, v1.2d, v11.2d
        st1     {v12.2d, v13.2d}, [Y_OPTR], #32
 
        ld1     {v14.2d, v15.2d}, [A_PTR], #32
        ld1     {v16.2d, v17.2d}, [Y_IPTR], #32
        fmla    v16.2d, v1.2d, v14.2d
+       prfm    PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
        fmla    v17.2d, v1.2d, v15.2d
        st1     {v16.2d, v17.2d}, [Y_OPTR], #32
 #endif
index 0145af6..28325f7 100644 (file)
@@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define J      x11     /* loop variable */
 #define I      x12     /* loop variable */
 
+#define X_PREFETCH_SIZE        768
+#define A_PREFETCH_SIZE        768
+
 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/
@@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
        ld1     {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
        ld1     {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
        fmla    v1.4s, v5.4s, v9.4s
+       prfm    PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
        fmla    v2.4s, v6.4s, v10.4s
+       prfm    PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
        fmla    v3.4s, v7.4s, v11.4s
+       ld1     {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
        fmla    v4.4s, v8.4s, v12.4s
 
-       ld1     {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
        ld1     {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
        fmla    v1.4s, v13.4s, v17.4s
+       prfm    PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
        fmla    v2.4s, v14.4s, v18.4s
+       prfm    PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
        fmla    v3.4s, v15.4s, v19.4s
        fmla    v4.4s, v16.4s, v20.4s
 #else
        ld1     {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
        ld1     {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
        fmla    v1.2d, v5.2d, v9.2d
+       prfm    PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
        fmla    v2.2d, v6.2d, v10.2d
+       prfm    PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
        fmla    v3.2d, v7.2d, v11.2d
        fmla    v4.2d, v8.2d, v12.2d
 
        ld1     {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
        ld1     {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
        fmla    v1.2d, v13.2d, v17.2d
+       prfm    PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
        fmla    v2.2d, v14.2d, v18.2d
+       prfm    PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
        fmla    v3.2d, v15.2d, v19.2d
        fmla    v4.2d, v16.2d, v20.2d
 
        ld1     {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
        ld1     {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
        fmla    v1.2d, v5.2d, v9.2d
+       prfm    PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
        fmla    v2.2d, v6.2d, v10.2d
+       prfm    PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
        fmla    v3.2d, v7.2d, v11.2d
        fmla    v4.2d, v8.2d, v12.2d
 
        ld1     {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
        ld1     {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
        fmla    v1.2d, v13.2d, v17.2d
+       prfm    PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE]
        fmla    v2.2d, v14.2d, v18.2d
+       prfm    PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE]
        fmla    v3.2d, v15.2d, v19.2d
        fmla    v4.2d, v16.2d, v20.2d
 #endif
index 9e285e2..a28d1b0 100644 (file)
@@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define        Y_OPTR  x13     /* loop Y vector address */
 #define        X_PTR   x14     /* loop X vector address */
 
+#define A_PRE_SIZE     768
+#define Y_PRE_SIZE     768
+
 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/
@@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define ALPHA_R                s0
 #define ALPHA_I                s1
-#define ALPHA_R_COPY   s7
-#define ALPHA_I_COPY   s8
 #define SHZ            3
 #else
 #define ALPHA_R                d0
 #define ALPHA_I                d1
-#define ALPHA_R_COPY   d7
-#define ALPHA_I_COPY   d8
 #define SHZ            4
 #endif
 
@@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 .macro INIT
-       /********** INIT FOR F4 LOOP **********/
-       fmov    ALPHA_R_COPY, ALPHA_R
-       fmov    ALPHA_I_COPY, ALPHA_I
-#if !defined(DOUBLE)
-       ins     v7.s[1], v7.s[0]                // R(ALPHA), R(ALPHA) 
-       ins     v8.s[1], v8.s[0]                // I(ALPHA), I(ALPHA) 
-       ins     v7.d[1], v7.d[0]
-       ins     v8.d[1], v8.d[0]
-#else
-       ins     v7.d[1], v7.d[0]                // R(ALPHA), R(ALPHA)
-       ins     v8.d[1], v8.d[0]                // I(ALPHA), I(ALPHA)
-#endif
-
-       /******* INIT FOR F1 AND S1 LOOP ******/
 #if !defined(DOUBLE)
        ins     v0.s[1], v0.s[0]                // R(ALPHA), R(ALPHA)
        eor     v2.16b, v2.16b, v2.16b
@@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro INIT_LOOP
-       /********** INIT_LOOP FOR F4 LOOP **********/
 #if !defined(DOUBLE)
-       ld1     {v9.2s}, [X_PTR]                // [I(X), R(X)]
-       ins     v10.s[0], v9.s[1]
-       ins     v9.s[1], v9.s[0]                // [R(X), R(X)]
-       ins     v10.s[1], v10.s[0]              // [I(X), I(X)]
-       ins     v9.d[1], v9.d[0]
-       ins     v10.d[1], v10.d[0]
+       ld1     {v2.2s}, [X_PTR]                // [I(X), R(X)]
+       ext     v3.8b, v2.8b, v2.8b, #4         // [R(X), I(X)]
+       fmul    v2.2s, v0.2s, v2.2s
+       fmla    v2.2s, v1.2s, v3.2s             // [I(TEMP), R(TEMP)]
+       ins     v3.s[0], v2.s[1]
+
+       /********** INIT_LOOP FOR F4 LOOP **********/
 #if !defined(CONJ)
 #if !defined(XCONJ)
-       fmul    v11.4s, v9.4s, v7.4s            // [+ R(X) * R(ALPHA)]
-       fmls    v11.4s, v10.4s, v8.4s           // [- I(X) * I(ALPHA)]
-       fmul    v12.4s, v9.4s, v8.4s            // [+ R(X) * I(ALPHA)]
-       fmla    v12.4s, v10.4s, v7.4s           // [+ I(X) * R(ALPHA)]
+       dup     v21.4s, v2.s[0]                 // R[TEMP]
+       dup     v22.4s, v2.s[0]                 // R[TEMP]
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    s25, s25, s3
+       dup     v23.4s, v25.s[0]                // -I[TEMP]
+       dup     v24.4s, v3.s[0]                 // I[TEMP]
 #else
-       fmul    v11.4s, v9.4s, v7.4s            // [+ R(X) * R(ALPHA)]
-       fmla    v11.4s, v10.4s, v8.4s           // [+ I(X) * I(ALPHA)]
-       fmul    v12.4s, v9.4s, v8.4s            // [+ R(X) * I(ALPHA)]
-       fmls    v12.4s, v10.4s, v7.4s           // [- I(X) * R(ALPHA)]
+       dup     v21.4s, v2.s[0]                 // R[TEMP]
+       dup     v22.4s, v2.s[0]                 // R[TEMP]
+       dup     v23.4s, v3.s[0]                 // I[TEMP]
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    s25, s25, s3
+       dup     v24.4s, v25.s[0]                // -I[TEMP]
 #endif
 #else // CONJ
 #if !defined(XCONJ)
-       fmul    v11.4s, v9.4s, v7.4s            // [+ R(X) * R(ALPHA)]
-       fmls    v11.4s, v10.4s, v8.4s           // [+ I(X) * I(ALPHA)]
-       fmul    v12.4s, v10.4s, v7.4s           // [+ I(X) * R(ALPHA)]
-       fmls    v12.4s, v9.4s, v8.4s            // [- R(X) * I(ALPHA)]
+       dup     v21.4s, v2.s[0]                 // R[TEMP]
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    s25, s25, s2
+       dup     v22.4s, v25.s[0]                // R[TEMP]
+       dup     v23.4s, v3.s[0]                 // I[TEMP]
+       dup     v24.4s, v3.s[0]                 // I[TEMP]
 #else
-       fmul    v11.4s, v9.4s, v7.4s            // [+ R(X) * R(ALPHA)]
-       fmls    v11.4s, v10.4s, v8.4s           // [- I(X) * I(ALPHA)]
-       eor     v12.16b, v12.16b, v12.16b
-       fmls    v12.4s, v9.4s, v8.4s            // [- R(X) * I(ALPHA)]
-       fmla    v12.4s, v10.4s, v7.4s           // [- I(X) * R(ALPHA)]
+       dup     v21.4s, v2.s[0]                 // R[TEMP]
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    s25, s25, s2
+       dup     v22.4s, v25.s[0]                // R[TEMP]
+
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    s25, s25, s3
+       dup     v23.4s, v25.s[0]                // I[TEMP]
+       dup     v24.4s, v25.s[0]                // I[TEMP]
 #endif
 #endif // CONJ
 
+
        /****** INIT_LOOP FOR F1 AND S1 LOOP ******/
-       ld1     {v2.2s}, [X_PTR]                // [I(X), R(X)]
-       ext     v3.8b, v2.8b, v2.8b, #4         // [R(X), I(X)]
-       fmul    v2.2s, v0.2s, v2.2s
-       fmla    v2.2s, v1.2s, v3.2s             // [I(TEMP), R(TEMP)]
-       ins     v3.s[0], v2.s[1]
 #if !defined(CONJ)
 #if !defined(XCONJ)
        eor     v4.16b, v4.16b, v4.16b
@@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif // CONJ
 
 #else // DOUBLE
+       ld1     {v2.2d}, [X_PTR]                // [I(X), R(X)]
+       ext     v3.16b, v2.16b, v2.16b, #8      // [R(X), I(X)]
+       fmul    v2.2d, v0.2d, v2.2d
+       fmla    v2.2d, v1.2d, v3.2d             // [I(TEMP), R(TEMP)]
+       ins     v3.d[0], v2.d[1]                // I(TEMP)
 
-       /********** INIT_LOOP FOR F4 LOOP **********/
-       ld1     {v9.2d}, [X_PTR]                // [I(X), R(X)]
-       ins     v10.d[0], v9.d[1]
-       ins     v9.d[1], v9.d[0]                // [R(X), R(X)]
-       ins     v10.d[1], v10.d[0]              // [I(X), I(X)]
+       /****** INIT_LOOP FOR F4 LOOP ******/
 #if !defined(CONJ)
 #if !defined(XCONJ)
-       fmul    v11.2d, v9.2d, v7.2d            // [+ R(X) * R(ALPHA)]
-       fmls    v11.2d, v10.2d, v8.2d           // [- I(X) * I(ALPHA)]
-       fmul    v12.2d, v9.2d, v8.2d            // [+ R(X) * I(ALPHA)]
-       fmla    v12.2d, v10.2d, v7.2d           // [+ I(X) * R(ALPHA)]
+       dup     v21.2d, v2.d[0]                 // R[TEMP]
+       dup     v22.2d, v2.d[0]                 // R[TEMP]
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    d25, d25, d3
+       dup     v23.2d, v25.d[0]                // -I[TEMP]
+       dup     v24.2d, v3.d[0]                 // I[TEMP]
 #else
-       fmul    v11.2d, v9.2d, v7.2d            // [+ R(X) * R(ALPHA)]
-       fmla    v11.2d, v10.2d, v8.2d           // [+ I(X) * I(ALPHA)]
-       fmul    v12.2d, v9.2d, v8.2d            // [+ R(X) * I(ALPHA)]
-       fmls    v12.2d, v10.2d, v7.2d           // [- I(X) * R(ALPHA)]
+       dup     v21.2d, v2.d[0]                 // R[TEMP]
+       dup     v22.2d, v2.d[0]                 // R[TEMP]
+       dup     v23.2d, v3.d[0]                 // I[TEMP]
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    d25, d25, d3
+       dup     v24.2d, v25.d[0]                // -I[TEMP]
 #endif
 #else // CONJ
 #if !defined(XCONJ)
-       fmul    v11.2d, v9.2d, v7.2d            // [+ R(X) * R(ALPHA)]
-       fmls    v11.2d, v10.2d, v8.2d           // [+ I(X) * I(ALPHA)]
-       fmul    v12.2d, v10.2d, v7.2d           // [+ I(X) * R(ALPHA)]
-       fmls    v12.2d, v9.2d, v8.2d            // [- R(X) * I(ALPHA)]
+       dup     v21.2d, v2.d[0]                 // R[TEMP]
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    d25, d25, d2
+       dup     v22.2d, v25.d[0]                // R[TEMP]
+       dup     v23.2d, v3.d[0]                 // I[TEMP]
+       dup     v24.2d, v3.d[0]                 // I[TEMP]
 #else
-       fmul    v11.2d, v9.2d, v7.2d            // [+ R(X) * R(ALPHA)]
-       fmls    v11.2d, v10.2d, v8.2d           // [- I(X) * I(ALPHA)]
-       eor     v12.16b, v12.16b, v12.16b
-       fmls    v12.2d, v9.2d, v8.2d            // [- R(X) * I(ALPHA)]
-       fmla    v12.2d, v10.2d, v7.2d           // [- I(X) * R(ALPHA)]
+       dup     v21.2d, v2.d[0]                 // R[TEMP]
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    d25, d25, d2
+       dup     v22.2d, v25.d[0]                // R[TEMP]
+
+       eor     v25.16b, v25.16b, v25.16b
+       fsub    d25, d25, d3
+       dup     v23.2d, v25.d[0]                // I[TEMP]
+       dup     v24.2d, v25.d[0]                // I[TEMP]
 #endif
 #endif // CONJ
 
+
        /****** INIT_LOOP FOR F1 AND S1 LOOP ******/
-       ld1     {v2.2d}, [X_PTR]                // [I(X), R(X)]
-       ext     v3.16b, v2.16b, v2.16b, #8      // [R(X), I(X)]
-       fmul    v2.2d, v0.2d, v2.2d
-       fmla    v2.2d, v1.2d, v3.2d             // [I(TEMP), R(TEMP)]
-       ins     v3.d[0], v2.d[1]                // I(TEMP)
 #if !defined(CONJ)
 #if !defined(XCONJ)
        eor     v4.16b, v4.16b, v4.16b
@@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        ld2     {v13.4s, v14.4s}, [A_PTR], #32
        ld2     {v15.4s, v16.4s}, [Y_IPTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
-       fmla    v15.4s, v11.4s, v13.4s          // [+ R(ALPHA * X) * A_R]
-       fmls    v15.4s, v12.4s, v14.4s          // [- I(ALPHA * X) * A_I]
-       fmla    v16.4s, v11.4s, v14.4s          // [+ R(ALPHA * X) * A_I]
-       fmla    v16.4s, v12.4s, v13.4s          // [+ I(ALPHA * X) * A_R]
-#else
-       fmla    v15.4s, v11.4s, v13.4s          // [+ R(ALPHA * X) * A_R]
-       fmla    v15.4s, v12.4s, v14.4s          // [+ I(ALPHA * X) * A_I]
-       fmla    v16.4s, v11.4s, v14.4s          // [+ R(ALPHA * X) * A_I]
-       fmls    v16.4s, v12.4s, v13.4s          // [- I(ALPHA * X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
-       fmla    v15.4s, v11.4s, v13.4s          // [+ R(ALPHA * X) * A_R]
-       fmla    v15.4s, v12.4s, v14.4s          // [+ I(ALPHA * X) * A_I]
-       fmls    v16.4s, v11.4s, v14.4s          // [- R(ALPHA * X) * A_I]
-       fmla    v16.4s, v12.4s, v13.4s          // [+ I(ALPHA * X) * A_R]
-#else
-       fmla    v15.4s, v11.4s, v13.4s          // [+ R(ALPHA * X) * A_R]
-       fmls    v15.4s, v12.4s, v14.4s          // [- I(ALPHA * X) * A_I]
-       fmls    v16.4s, v11.4s, v14.4s          // [- R(ALPHA * X) * A_I]
-       fmls    v16.4s, v12.4s, v13.4s          // [- I(ALPHA * X) * A_R]
-#endif
-#endif // CONJ
+
+       prfm    PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
+       prfm    PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
+
+       fmla    v15.4s, v21.4s, v13.4s
+       fmla    v15.4s, v23.4s, v14.4s
+       fmla    v16.4s, v22.4s, v14.4s
+       fmla    v16.4s, v24.4s, v13.4s
+
        st2     {v15.4s, v16.4s}, [Y_OPTR], #32
 
 #else // DOUBLE
 
        ld2     {v13.2d, v14.2d}, [A_PTR], #32
        ld2     {v15.2d, v16.2d}, [Y_IPTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
-       fmla    v15.2d, v11.2d, v13.2d          // [+ R(ALPHA * X) * A_R]
-       fmls    v15.2d, v12.2d, v14.2d          // [- I(ALPHA * X) * A_I]
-       fmla    v16.2d, v11.2d, v14.2d          // [+ R(ALPHA * X) * A_I]
-       fmla    v16.2d, v12.2d, v13.2d          // [+ I(ALPHA * X) * A_R]
-#else
-       fmla    v15.2d, v11.2d, v13.2d          // [+ R(ALPHA * X) * A_R]
-       fmla    v15.2d, v12.2d, v14.2d          // [+ I(ALPHA * X) * A_I]
-       fmla    v16.2d, v11.2d, v14.2d          // [+ R(ALPHA * X) * A_I]
-       fmls    v16.2d, v12.2d, v13.2d          // [- I(ALPHA * X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
-       fmla    v15.2d, v11.2d, v13.2d          // [+ R(ALPHA * X) * A_R]
-       fmla    v15.2d, v12.2d, v14.2d          // [+ I(ALPHA * X) * A_I]
-       fmls    v16.2d, v11.2d, v14.2d          // [- R(ALPHA * X) * A_I]
-       fmla    v16.2d, v12.2d, v13.2d          // [+ I(ALPHA * X) * A_R]
-#else
-       fmla    v15.2d, v11.2d, v13.2d          // [+ R(ALPHA * X) * A_R]
-       fmls    v15.2d, v12.2d, v14.2d          // [- I(ALPHA * X) * A_I]
-       fmls    v16.2d, v11.2d, v14.2d          // [- R(ALPHA * X) * A_I]
-       fmls    v16.2d, v12.2d, v13.2d          // [- I(ALPHA * X) * A_R]
-#endif
-#endif // CONJ
+       prfm    PLDL1KEEP, [A_PTR, #A_PRE_SIZE]
+
+       fmla    v15.2d, v21.2d, v13.2d
+       fmla    v15.2d, v23.2d, v14.2d
+       fmla    v16.2d, v22.2d, v14.2d
+       fmla    v16.2d, v24.2d, v13.2d
+
        st2     {v15.2d, v16.2d}, [Y_OPTR], #32
 
        ld2     {v17.2d, v18.2d}, [A_PTR], #32
        ld2     {v19.2d, v20.2d}, [Y_IPTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
-       fmla    v19.2d, v11.2d, v17.2d          // [+ R(ALPHA * X) * A_R]
-       fmls    v19.2d, v12.2d, v18.2d          // [- I(ALPHA * X) * A_I]
-       fmla    v20.2d, v11.2d, v18.2d          // [+ R(ALPHA * X) * A_I]
-       fmla    v20.2d, v12.2d, v17.2d          // [+ I(ALPHA * X) * A_R]
-#else
-       fmla    v19.2d, v11.2d, v17.2d          // [+ R(ALPHA * X) * A_R]
-       fmla    v19.2d, v12.2d, v18.2d          // [- I(ALPHA * X) * A_I]
-       fmla    v20.2d, v11.2d, v18.2d          // [+ R(ALPHA * X) * A_I]
-       fmls    v20.2d, v12.2d, v17.2d          // [+ I(ALPHA * X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
-       fmla    v19.2d, v11.2d, v17.2d          // [+ R(ALPHA * X) * A_R]
-       fmla    v19.2d, v12.2d, v18.2d          // [- I(ALPHA * X) * A_I]
-       fmls    v20.2d, v11.2d, v18.2d          // [+ R(ALPHA * X) * A_I]
-       fmla    v20.2d, v12.2d, v17.2d          // [+ I(ALPHA * X) * A_R]
-#else
-       fmla    v19.2d, v11.2d, v17.2d          // [+ R(ALPHA * X) * A_R]
-       fmls    v19.2d, v12.2d, v18.2d          // [- I(ALPHA * X) * A_I]
-       fmls    v20.2d, v11.2d, v18.2d          // [+ R(ALPHA * X) * A_I]
-       fmls    v20.2d, v12.2d, v17.2d          // [+ I(ALPHA * X) * A_R]
-#endif
-#endif // CONJ
+       prfm    PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE]
+
+       fmla    v19.2d, v21.2d, v17.2d
+       fmla    v19.2d, v23.2d, v18.2d
+       fmla    v20.2d, v22.2d, v18.2d
+       fmla    v20.2d, v24.2d, v17.2d
+
        st2     {v19.2d, v20.2d}, [Y_OPTR], #32
 
 #endif
@@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP:
 
 zgemv_n_kernel_F4:
 
-       KERNEL_F1
-       KERNEL_F1
-       KERNEL_F1
-       KERNEL_F1
+       KERNEL_F4
 
        subs    I, I, #1
        bne     zgemv_n_kernel_F4
index e61c171..79ce9bc 100644 (file)
@@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define J      x11     /* loop variable */
 #define I      x12     /* loop variable */
 
+#define A_PRE_SIZE     768
+#define X_PRE_SIZE     768
+
 /*******************************************************************************
 * Macro definitions
 *******************************************************************************/
@@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        ld2     {v11.4s, v12.4s}, [X_PTR], #32
        ld2     {v13.4s, v14.4s}, [A_PTR], #32
+       prfm    PLDL1STRM, [X_PTR, #X_PRE_SIZE]
+       prfm    PLDL1STRM, [A_PTR, #A_PRE_SIZE]
 
 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
        fmla    v9.4s,  v11.4s, v13.4s          // [+ R(X) * A_R]
@@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else // DOUBLE
        ld2     {v11.2d, v12.2d}, [X_PTR], #32
        ld2     {v13.2d, v14.2d}, [A_PTR], #32
-       prfm    PLDL1STRM, [X_PTR, #512]
+       prfm    PLDL1STRM, [X_PTR, #X_PRE_SIZE]
 
 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
        fmla    v9.2d,  v11.2d, v13.2d          // [+ R(X) * A_R]
@@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
        ld2     {v17.2d, v18.2d}, [X_PTR], #32
        ld2     {v19.2d, v20.2d}, [A_PTR], #32
-       prfm    PLDL1STRM, [A_PTR, #512]
+       prfm    PLDL1STRM, [A_PTR, #A_PRE_SIZE]
 
 #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
        fmla    v15.2d, v17.2d, v19.2d          // [+ R(X) * A_R]