From 98965da2e8b26a59547270ff9cb7dacfa51363fb Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 20 Nov 2015 01:15:04 +0530 Subject: [PATCH] lapack-test fixes for Cortex A57 --- kernel/arm64/KERNEL.CORTEXA57 | 16 +-- kernel/arm64/cgemm_kernel_4x4.S | 48 ++++--- kernel/arm64/ctrmm_kernel_4x4.S | 24 ++-- kernel/arm64/{idamax.S => iamax.S} | 17 ++- kernel/arm64/isamax.S | 213 ----------------------------- kernel/arm64/rot.S | 6 +- kernel/arm64/zaxpy.S | 123 ++++++++++------- kernel/arm64/zgemm_kernel_4x4.S | 48 ++++--- kernel/arm64/zgemv_n.S | 48 ++++--- kernel/arm64/zgemv_t.S | 76 ++++------- kernel/arm64/zscal.S | 266 ++++++++++++++++++++++++++----------- kernel/arm64/ztrmm_kernel_4x4.S | 48 ++++--- 12 files changed, 455 insertions(+), 478 deletions(-) rename kernel/arm64/{idamax.S => iamax.S} (93%) delete mode 100644 kernel/arm64/isamax.S diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 4d18dee..0c48b9f 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -5,8 +5,8 @@ DAMAXKERNEL = amax.S CAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S -ISAMAXKERNEL = isamax.S -IDAMAXKERNEL = idamax.S +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S ICAMAXKERNEL = izamax.S IZAMAXKERNEL = izamax.S @@ -25,22 +25,22 @@ DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S -DOTKERNEL = dot.S +SDOTKERNEL = dot.S DDOTKERNEL = dot.S CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S -SNRM2KERNEL = snrm2.S -DNRM2KERNEL = dnrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S +#SNRM2KERNEL = snrm2.S +#DNRM2KERNEL = dnrm2.S +#CNRM2KERNEL = znrm2.S +#ZNRM2KERNEL = znrm2.S SROTKERNEL = rot.S DROTKERNEL = rot.S CROTKERNEL = zrot.S ZROTKERNEL = zrot.S -SCALKERNEL = scal.S +SSCALKERNEL = scal.S DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S index cec2384..7a70264 100644 --- a/kernel/arm64/cgemm_kernel_4x4.S +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -181,73 +181,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul v16.4s, v0.4s, v8.4s[0] OP_ii v16.4s, v1.4s, v9.4s[0] - fmul v17.4s, v0.4s, v9.4s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v17.4s, v17.4s + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] #endif OP_ir v17.4s, v1.4s, v8.4s[0] fmul v20.4s, v0.4s, v8.4s[1] OP_ii v20.4s, v1.4s, v9.4s[1] - fmul v21.4s, v0.4s, v9.4s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v21.4s, v21.4s + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] #endif OP_ir v21.4s, v1.4s, v8.4s[1] fmul v24.4s, v0.4s, v8.4s[2] OP_ii v24.4s, v1.4s, v9.4s[2] - fmul v25.4s, v0.4s, v9.4s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v25.4s, v25.4s + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] #endif OP_ir v25.4s, v1.4s, v8.4s[2] fmul v28.4s, v0.4s, v8.4s[3] OP_ii v28.4s, v1.4s, v9.4s[3] - fmul v29.4s, v0.4s, v9.4s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v29.4s, v29.4s + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] #endif OP_ir v29.4s, v1.4s, v8.4s[3] fmul v18.4s, v2.4s, v8.4s[0] OP_ii v18.4s, v3.4s, v9.4s[0] - fmul v19.4s, v2.4s, v9.4s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v19.4s, v19.4s + eor v19.16b, v19.16b, v19.16b + fmls v19.4s, v2.4s, v9.4s[0] +#else + fmul v19.4s, v2.4s, v9.4s[0] #endif OP_ir v19.4s, v3.4s, v8.4s[0] fmul v22.4s, v2.4s, v8.4s[1] OP_ii v22.4s, v3.4s, v9.4s[1] - fmul v23.4s, v2.4s, v9.4s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v23.4s, v23.4s + eor v23.16b, v23.16b, v23.16b + fmls v23.4s, v2.4s, v9.4s[1] +#else + fmul v23.4s, v2.4s, v9.4s[1] #endif OP_ir v23.4s, v3.4s, v8.4s[1] fmul v26.4s, v2.4s, v8.4s[2] OP_ii v26.4s, v3.4s, v9.4s[2] - fmul v27.4s, v2.4s, v9.4s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v27.4s, v27.4s + eor v27.16b, v27.16b, v27.16b + fmls v27.4s, v2.4s, v9.4s[2] +#else + fmul v27.4s, v2.4s, v9.4s[2] #endif OP_ir v27.4s, v3.4s, v8.4s[2] fmul v30.4s, v2.4s, v8.4s[3] OP_ii v30.4s, v3.4s, v9.4s[3] - fmul v31.4s, v2.4s, v9.4s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v31.4s, v31.4s + eor v31.16b, v31.16b, v31.16b + fmls v31.4s, v2.4s, v9.4s[3] +#else + fmul v31.4s, v2.4s, v9.4s[3] #endif OP_ir v31.4s, v3.4s, v8.4s[3] diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S index 7b02111..be0e9bd 100644 --- a/kernel/arm64/ctrmm_kernel_4x4.S +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -172,37 +172,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul v16.4s, v0.4s, v8.4s[0] OP_ii v16.4s, v1.4s, v9.4s[0] - fmul v17.4s, v0.4s, v9.4s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v17.4s, v17.4s + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] #endif OP_ir v17.4s, v1.4s, v8.4s[0] fmul v20.4s, v0.4s, v8.4s[1] OP_ii v20.4s, v1.4s, v9.4s[1] - fmul v21.4s, v0.4s, v9.4s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v21.4s, v21.4s + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] #endif OP_ir v21.4s, v1.4s, v8.4s[1] fmul v24.4s, v0.4s, v8.4s[2] OP_ii v24.4s, v1.4s, v9.4s[2] - fmul v25.4s, v0.4s, v9.4s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v25.4s, v25.4s + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] #endif OP_ir v25.4s, v1.4s, v8.4s[2] fmul v28.4s, v0.4s, v8.4s[3] OP_ii v28.4s, v1.4s, v9.4s[3] - fmul v29.4s, v0.4s, v9.4s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v29.4s, v29.4s + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] #endif OP_ir v29.4s, v1.4s, v8.4s[3] diff --git a/kernel/arm64/idamax.S b/kernel/arm64/iamax.S similarity index 93% rename from kernel/arm64/idamax.S rename to kernel/arm64/iamax.S index fd42658..575c15e 100644 --- a/kernel/arm64/idamax.S +++ b/kernel/arm64/iamax.S @@ -45,16 +45,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define COND ge #endif +#if !defined(DOUBLE) +#define MAXF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else #define MAXF d0 #define TMPF d1 #define TMPVF {v1.d}[0] #define SZ 8 +#endif /******************************************************************************/ .macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + ld1 {v0.s}[0], [X], INC_X +#else lsl INC_X, INC_X, #3 ld1 {v0.d}[0], [X], INC_X +#endif mov Z, #1 mov INDEX, Z fabs MAXF, MAXF @@ -107,9 +119,8 @@ iamax_kernel_S1: iamax_kernel_S10: KERNEL_S1 - - subs I, I, #1 - bne iamax_kernel_S10 + subs I, I, #1 + bne iamax_kernel_S10 iamax_kernel_L999: diff --git a/kernel/arm64/isamax.S b/kernel/arm64/isamax.S deleted file mode 100644 index 309b1c1..0000000 --- a/kernel/arm64/isamax.S +++ /dev/null @@ -1,213 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N x0 /* vector length */ -#define X x1 /* X vector address */ -#define INC_X x2 /* X stride */ -#define INDEX x3 /* index of max/min value */ -#define Z x4 /* vector index */ -#define I x5 /* loop variable */ -#define X_COPY x6 /* copy of X address */ -#define MAXF_Z x7 - -/******************************************************************************* -* Macro definitions -*******************************************************************************/ - -#define MAXF s5 -#define TMPF s6 -#define TMPVF {v6.s}[0] -#define SZ 4 - -/******************************************************************************/ - -.macro INIT_F1 - ldr MAXF, [X], #SZ - mov Z, #1 - mov INDEX, Z - fabs MAXF, MAXF -.endm - -.macro KERNEL_F1 - ldr TMPF, [X], #SZ - add Z, Z, #1 - fabs TMPF, TMPF - fcmp TMPF, MAXF - fcsel MAXF, MAXF, TMPF, le - csel INDEX, INDEX, Z, le -.endm - -.macro INIT_F4 - ld1 {v0.4s}, [X], #16 - fabs v0.4s, v0.4s - fmaxv MAXF, v0.4s - mov Z, #5 - mov MAXF_Z, #1 -.endm - -.macro KERNEL_F4 - ld1 {v0.4s}, [X], #16 - fabs v0.4s, v0.4s - fmaxv TMPF, v0.4s - PRFM PLDL1KEEP, [X, #512] - fcmp TMPF, MAXF - fcsel MAXF, MAXF, TMPF, le - csel MAXF_Z, MAXF_Z, Z, le - add Z, Z, #4 -.endm - - -.macro KERNEL_F4_FINALIZE - mov INDEX, MAXF_Z - sub MAXF_Z, MAXF_Z, #1 - lsl MAXF_Z, MAXF_Z, #2 - add X_COPY, X_COPY, MAXF_Z - ldr TMPF, [X_COPY], #SZ - fabs TMPF, TMPF - fcmp TMPF, MAXF - beq KERNEL_F4_FINALIZE_DONE - add INDEX, INDEX, #1 - ldr TMPF, [X_COPY], #SZ - fabs TMPF, TMPF - fcmp TMPF, MAXF - beq KERNEL_F4_FINALIZE_DONE - add INDEX, INDEX, #1 - ldr TMPF, [X_COPY], #SZ - fabs TMPF, TMPF - fcmp TMPF, MAXF - beq KERNEL_F4_FINALIZE_DONE - add INDEX, INDEX, #1 -KERNEL_F4_FINALIZE_DONE: -.endm - - -.macro INIT_S - lsl INC_X, INC_X, #2 - ld1 TMPVF, [X], INC_X - mov Z, #1 - mov INDEX, Z - fabs MAXF, TMPF -.endm - -.macro KERNEL_S1 - ld1 TMPVF, [X], INC_X - add Z, Z, #1 - fabs TMPF, TMPF - fcmp TMPF, MAXF - fcsel MAXF, MAXF, TMPF, le - csel INDEX, INDEX, Z, le -.endm - -/******************************************************************************* -* End of macro definitions -*******************************************************************************/ - - PROLOGUE - - cmp N, xzr - ble iamax_kernel_zero - cmp INC_X, xzr - ble iamax_kernel_zero - - PRFM PLDL1KEEP, [X] - mov X_COPY, X - - cmp INC_X, #1 - bne iamax_kernel_S_BEGIN - -iamax_kernel_F_BEGIN: - asr I, N, #2 - cmp I, xzr - beq iamax_kernel_F1_INIT - - INIT_F4 - subs I, I, #1 - beq iamax_kernel_F4_FINALIZE - -iamax_kernel_F4: - KERNEL_F4 - subs I, I, #1 - bne iamax_kernel_F4 - -iamax_kernel_F4_FINALIZE: - KERNEL_F4_FINALIZE - -iamax_kernel_F1: - ands I, N, #3 - ble 
iamax_kernel_L999 - -iamax_kernel_F10: - KERNEL_F1 - subs I, I, #1 - bne iamax_kernel_F10 - b iamax_kernel_L999 - -iamax_kernel_F1_INIT: - INIT_F1 - subs N, N, #1 - b iamax_kernel_F1 - -iamax_kernel_S_BEGIN: - INIT_S - - subs N, N, #1 - ble iamax_kernel_L999 - - asr I, N, #2 - cmp I, xzr - ble iamax_kernel_S1 - -iamax_kernel_S4: - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - - subs I, I, #1 - bne iamax_kernel_S4 - -iamax_kernel_S1: - ands I, N, #3 - ble iamax_kernel_L999 - -iamax_kernel_S10: - KERNEL_S1 - subs I, I, #1 - bne iamax_kernel_S10 - -iamax_kernel_L999: - mov x0, INDEX - ret - -iamax_kernel_zero: - mov x0, xzr - ret - - EPILOGUE diff --git a/kernel/arm64/rot.S b/kernel/arm64/rot.S index ea48b5c..5721252 100644 --- a/kernel/arm64/rot.S +++ b/kernel/arm64/rot.S @@ -59,10 +59,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 #if !defined(DOUBLE) - fneg s2, S + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, S ins v1.s[1], v2.s[0] // [-S, S] #else - fneg d2, S + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, S ins v1.d[1], v2.d[0] // [-S, S] #endif .endm diff --git a/kernel/arm64/zaxpy.S b/kernel/arm64/zaxpy.S index 4cc952b..ea09382 100644 --- a/kernel/arm64/zaxpy.S +++ b/kernel/arm64/zaxpy.S @@ -43,14 +43,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define DA_R s0 /* scale input value */ #define DA_I s1 /* scale input value */ -#define TMPX v2.2s -#define TMPY v3.2s #define SZ 4 #else #define DA_R d0 /* scale input value */ #define DA_I d1 /* scale input value */ -#define TMPX v2.2d -#define TMPY v3.2d #define SZ 8 #endif @@ -61,22 +57,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(CONJ) #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R - fneg s2, DA_I + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, DA_I ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I #else ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R - fneg d2, DA_I + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, DA_I ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I #endif #else #if !defined(DOUBLE) - fneg s2, DA_R + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, DA_R ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I #else - fneg d2, DA_R + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, DA_R ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I #endif @@ -111,9 +111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_INIT_F4 #if !defined(DOUBLE) - // Replicate the lower 2 floats into the upper 2 slots - ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R - ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I + ins v16.s[0], v0.s[0] + ins v16.s[1], v16.s[0] + ins v16.d[1], v16.d[0] +#if !defined(CONJ) + ins v17.s[0], v1.s[1] +#else + ins v17.s[0], v1.s[0] +#endif + ins v17.s[1], v17.s[0] + ins v17.d[1], v17.d[0] +#else //DOUBLE + ins v16.d[0], v0.d[0] + ins v16.d[1], v16.d[0] +#if !defined(CONJ) + ins v17.d[0], v1.d[1] +#else + ins v17.d[0], v1.d[0] +#endif + ins v17.d[1], v17.d[0] #endif .endm @@ -121,55 +137,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL_F4 #if !defined(DOUBLE) - ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0] - // V3 = X[7], X[6], X[5], X[4] - ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] - ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] - ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] + ld2 {v2.4s, v3.4s}, [X], #32 + ld2 {v4.4s, v5.4s}, [Y_COPY], #32 - ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0] - // V5 = Y[7], Y[6], Y[5], Y[4] - - ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] - ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] - ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] + fmla v4.4s, v2.4s, v16.4s +#if !defined(CONJ) + fmls v4.4s, v3.4s, v17.4s +#else + fmla v4.4s, v3.4s, v17.4s +#endif - fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix] - // Y[iy+1] += +-DA_R * X[ix+1] - fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1] - // Y[iy+1] += DA_I * X[ix] - st1 {v4.4s}, [Y], #16 +#if !defined(CONJ) + fmla v5.4s, v2.4s, v17.4s +#else + fmls v5.4s, v2.4s, v17.4s +#endif + fmla v5.4s, v3.4s, v16.4s - fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix] - fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1] - // Y[iy+1] += +-DA_R * X[ix+1] - // Y[iy+1] += DA_I * X[ix] - st1 {v5.4s}, [Y], #16 + st2 {v4.4s, v5.4s}, [Y], #32 #else // DOUBLE - ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3 - ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] - ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] + ld2 {v2.2d, v3.2d}, [X], #32 + ld2 {v4.2d, v5.2d}, [Y_COPY], #32 - ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3 - ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] - ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] - - ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 + fmla v4.2d, v2.2d, v16.2d +#if !defined(CONJ) + fmls v4.2d, v3.2d, v17.2d +#else + fmla v4.2d, v3.2d, v17.2d +#endif +#if !defined(CONJ) + fmla v5.2d, v2.2d, v17.2d +#else + fmls v5.2d, v2.2d, v17.2d +#endif + fmla v5.2d, v3.2d, v16.2d - fmla v16.2d, v0.2d, v2.2d - fmla v17.2d, v0.2d, v3.2d + st2 {v4.2d, v5.2d}, [Y], #32 - ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 + ld2 {v18.2d, v19.2d}, [X], #32 + ld2 {v20.2d, v21.2d}, [Y_COPY], #32 - fmla v16.2d, v1.2d, v20.2d - fmla v17.2d, v1.2d, v21.2d - st1 {v16.2d,v17.2d}, [Y], #32 + fmla v20.2d, v18.2d, v16.2d +#if !defined(CONJ) + fmls v20.2d, v19.2d, v17.2d +#else + fmla v20.2d, v19.2d, v17.2d +#endif +#if !defined(CONJ) + fmla v21.2d, v18.2d, v17.2d +#else + fmls v21.2d, v18.2d, v17.2d +#endif + fmla v21.2d, v19.2d, v16.2d - fmla v18.2d, v0.2d, v4.2d - fmla v19.2d, v0.2d, v5.2d - fmla v18.2d, v1.2d, v22.2d - fmla v19.2d, v1.2d, v23.2d - st1 {v18.2d,v19.2d}, [Y], #32 + st2 {v20.2d, v21.2d}, [Y], #32 #endif PRFM PLDL1KEEP, [X, #512] PRFM PLDL1KEEP, [Y, #512] diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 56a8bba..28ce3de 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -184,73 +184,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul v16.2d, v0.2d, v8.2d[0] OP_ii v16.2d, v1.2d, v9.2d[0] - fmul v17.2d, v0.2d, v9.2d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v17.2d, v17.2d + eor v17.16b, v17.16b, v17.16b + fmls v17.2d, v0.2d, v9.2d[0] +#else + fmul v17.2d, v0.2d, v9.2d[0] #endif OP_ir v17.2d, v1.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.2d[0] OP_ii v18.2d, v3.2d, v9.2d[0] - fmul v19.2d, v2.2d, v9.2d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v19.2d, v19.2d + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.2d[0] +#else + fmul v19.2d, v2.2d, v9.2d[0] #endif OP_ir v19.2d, v3.2d, v8.2d[0] fmul v20.2d, v0.2d, v8.2d[1] OP_ii v20.2d, v1.2d, v9.2d[1] - fmul v21.2d, v0.2d, v9.2d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v21.2d, v21.2d + eor v21.16b, v21.16b, v21.16b + fmls v21.2d, v0.2d, v9.2d[1] +#else + fmul v21.2d, v0.2d, v9.2d[1] #endif OP_ir v21.2d, v1.2d, v8.2d[1] fmul v22.2d, v2.2d, v8.2d[1] OP_ii v22.2d, v3.2d, v9.2d[1] - fmul v23.2d, v2.2d, v9.2d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v23.2d, v23.2d + eor v23.16b, v23.16b, v23.16b + fmls v23.2d, v2.2d, v9.2d[1] +#else + fmul v23.2d, v2.2d, v9.2d[1] #endif OP_ir v23.2d, v3.2d, v8.2d[1] fmul v24.2d, v0.2d, v10.2d[0] OP_ii v24.2d, v1.2d, v11.2d[0] - fmul v25.2d, v0.2d, v11.2d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v25.2d, v25.2d + eor v25.16b, v25.16b, v25.16b + fmls v25.2d, v0.2d, v11.2d[0] +#else + fmul v25.2d, v0.2d, v11.2d[0] #endif OP_ir v25.2d, v1.2d, v10.2d[0] fmul v26.2d, v2.2d, v10.2d[0] OP_ii v26.2d, v3.2d, v11.2d[0] - fmul v27.2d, v2.2d, v11.2d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v27.2d, v27.2d + eor v27.16b, v27.16b, v27.16b + fmls v27.2d, v2.2d, v11.2d[0] +#else + fmul v27.2d, v2.2d, v11.2d[0] #endif OP_ir v27.2d, v3.2d, v10.2d[0] fmul v28.2d, v0.2d, v10.2d[1] OP_ii v28.2d, v1.2d, v11.2d[1] - fmul v29.2d, v0.2d, v11.2d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v29.2d, v29.2d + eor v29.16b, v29.16b, v29.16b + fmls v29.2d, v0.2d, v11.2d[1] +#else + fmul v29.2d, v0.2d, v11.2d[1] #endif OP_ir v29.2d, v1.2d, v10.2d[1] fmul v30.2d, v2.2d, v10.2d[1] OP_ii v30.2d, v3.2d, v11.2d[1] - fmul v31.2d, v2.2d, v11.2d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v31.2d, v31.2d + eor v31.16b, v31.16b, v31.16b + fmls v31.2d, v2.2d, v11.2d[1] +#else + fmul v31.2d, v2.2d, v11.2d[1] #endif OP_ir v31.2d, v3.2d, v10.2d[1] diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S index 9c5ec49..9e285e2 100644 --- a/kernel/arm64/zgemv_n.S +++ b/kernel/arm64/zgemv_n.S @@ -110,15 +110,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/******* INIT FOR F1 AND S1 LOOP ******/ #if !defined(DOUBLE) - ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) - fneg s2, ALPHA_I + ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, ALPHA_I ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) #if !defined(XCONJ) ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) #endif #else - ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) - fneg d2, ALPHA_I + ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, ALPHA_I ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) #if !defined(XCONJ) ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) @@ -156,8 +158,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] - fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)] - fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)] + eor v12.16b, v12.16b, v12.16b + fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] #endif #endif // CONJ @@ -170,24 +172,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ins v3.s[0], v2.s[1] #if !defined(CONJ) #if !defined(XCONJ) - fneg s4, s3 + eor v4.16b, v4.16b, v4.16b + fsub s4, s4, s3 ins v3.s[1], v4.s[0] ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] #else - fneg s4, s3 + eor v4.16b, v4.16b, v4.16b + fsub s4, s4, s3 ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] #endif #else // CONJ #if !defined(XCONJ) ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] - fneg s4, s2 + eor v4.16b, v4.16b, v4.16b + fsub s4, s4, s2 ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] #else - fneg s3, s3 + eor v4.16b, v4.16b, v4.16b + fsub s3, s4, s3 ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] - fneg s4, s2 + eor v4.16b, v4.16b, v4.16b + fsub s4, s4, s2 ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] #endif #endif // CONJ @@ -220,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] - fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)] - fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)] + eor v12.16b, v12.16b, v12.16b + fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] #endif #endif // CONJ @@ -234,24 +241,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ins v3.d[0], v2.d[1] // I(TEMP) #if !defined(CONJ) #if !defined(XCONJ) - fneg d4, d3 // -I(TEMP) + eor v4.16b, v4.16b, v4.16b + fsub d4, d4, d3 ins v3.d[1], v4.d[0] ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] #else - fneg d4, d3 // -I(TEMP) + eor v4.16b, v4.16b, v4.16b + fsub d4, d4, d3 ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] #endif #else // CONJ #if !defined(XCONJ) ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] - fneg d4, d2 // -R(TEMP) + eor v4.16b, v4.16b, v4.16b + fsub d4, d4, d2 ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] #else - fneg d3, d3 // -I(TEMP) + eor v4.16b, v4.16b, v4.16b + fsub d3, d4, d3 ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] - fneg d4, d2 // -R(TEMP) + eor v4.16b, v4.16b, v4.16b + fsub d4, d4, d2 ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] #endif #endif // CONJ diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S index 1f0d698..e61c171 100644 --- a/kernel/arm64/zgemv_t.S +++ b/kernel/arm64/zgemv_t.S @@ -96,22 +96,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R - fneg s2, ALPHA_I + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, ALPHA_I ins v1.s[1], v2.s[0] ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I #else ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R - fneg d2, ALPHA_I + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, ALPHA_I ins v1.d[1], v2.d[0] ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I #endif #else // XCONJ #if !defined(DOUBLE) - fneg s2, ALPHA_R + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, ALPHA_R ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I #else - fneg d2, ALPHA_R + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, ALPHA_R ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I #endif @@ -136,8 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v11.4s, v12.4s}, [X_PTR], #32 ld2 {v13.4s, v14.4s}, [A_PTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] @@ -145,29 +148,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] - fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] - fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] - fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] -#else - fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] - fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] - fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] - fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] #endif -#endif // CONJ #else // DOUBLE ld2 {v11.2d, v12.2d}, [X_PTR], #32 ld2 {v13.2d, v14.2d}, [A_PTR], #32 prfm PLDL1STRM, [X_PTR, #512] -#if !defined(CONJ) -#if !defined(XCONJ) + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] @@ -175,27 +165,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] - fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] - fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] - fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] -#else - fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] - fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] - fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] - fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] #endif -#endif // CONJ + ld2 {v17.2d, v18.2d}, [X_PTR], #32 ld2 {v19.2d, v20.2d}, [A_PTR], #32 prfm PLDL1STRM, [A_PTR, #512] -#if !defined(CONJ) -#if !defined(XCONJ) + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] @@ -203,22 +181,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] - fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] - fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] - fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] -#else - fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] - fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] - fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] - fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] #endif -#endif // CONJ + #endif //DOUBLE .endm @@ -252,7 +218,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] ld1 {v5.s}[0], [A_PTR], #4 // A1 ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] - fneg s16, s5 + eor v16.16b, v16.16b, v16.16b + fsub s16, s16, s5 ins v5.s[1], v16.s[0] // [-A1, A1] #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] @@ -264,7 +231,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] ld1 {v5.d}[0], [A_PTR], #8 // A1 ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] - fneg d16, d5 + eor v16.16b, v16.16b, v16.16b + fsub d16, d16, d5 ins v5.d[1], v16.d[0] // [-A1, A1] #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] @@ -284,7 +252,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] ld1 {v5.s}[0], [A_PTR], #4 // A1 ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] - fneg s16, s5 + eor v16.16b, v16.16b, v16.16b + fsub s16, s16, s5 ins v5.s[1], v16.s[0] // [-A1, A1] #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] @@ -296,7 +265,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] ld1 {v5.d}[0], [A_PTR], #8 // A1 ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] - fneg d16, d5 + eor v16.16b, v16.16b, v16.16b + fsub d16, d16, d5 ins v5.d[1], v16.d[0] // [-A1, A1] #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index db2c350..daaa55e 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define X x3 /* X vector address */ #define INC_X x4 /* X stride */ #define I x5 /* loop variable */ +#define X_COPY x6 /* Copy of X */ /******************************************************************************* * Macro definitions @@ -50,43 +51,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT #if !defined(DOUBLE) - ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R - fneg s2, DA_I - ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I - ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I + ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R #else ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R - fneg d2, DA_I - ins v1.d[1], v2.d[0] // v1 = DA_I, DA_I - ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I #endif .endm .macro KERNEL_F1 - #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 - ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 - fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 - fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 - st1 {v2.2s}, [X], #8 + fmul s3, DA_R, v2.s[0] // DA_R*X0 + fmul s5, DA_I, v2.s[1] // DA_I*X1 + fsub s3, s3, s5 // DA_R*X0-DA_I*X1 + + fmul s4, DA_I, v2.s[0] // DA_I*X0 + fmul s5, DA_R, v2.s[1] // DA_R*X1 + fadd s4, s4, s5 // DA_I*X0+DA_R*X1 + + ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v3.2s}, [X], #8 #else ld1 {v2.2d}, [X] // X1, X0 - ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 - fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 - fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 - st1 {v2.2d}, [X], #16 -#endif + fmul d3, DA_R, v2.d[0] // DA_R*X0 + fmul d5, DA_I, v2.d[1] // DA_I*X1 + fsub d3, d3, d5 // DA_R*X0-DA_I*X1 + fmul d4, DA_I, v2.d[0] // DA_I*X0 + fmul d5, DA_R, v2.d[1] // DA_R*X1 + fadd d4, d4, d5 // DA_I*X0+DA_R*X1 + + ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v3.2d}, [X], #16 +#endif .endm .macro KERNEL_INIT_F4 #if !defined(DOUBLE) - // Replicate the lower 2 floats into the upper 2 slots - ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R - ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I + ins v16.s[0], v0.s[0] + ins v16.s[1], v16.s[0] + ins v16.d[1], v16.d[0] + ins v17.s[0], v1.s[0] + ins v17.s[1], 
v17.s[0] + ins v17.d[1], v17.d[0] +#else //DOUBLE + ins v16.d[0], v0.d[0] + ins v16.d[1], v16.d[0] + ins v17.d[0], v1.d[0] + ins v17.d[1], v17.d[0] #endif .endm @@ -94,46 +107,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL_F4 #if !defined(DOUBLE) - ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0] - // V3 = X[7], X[6], X[5], X[4] - - ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] - ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] - ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] - fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix] - // X'[ix+1] += DA_R * X[ix+1] - fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1] - // X'[ix+1] += DA_I * X[ix] - - ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] - ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] - ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] - fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix] - // X'[ix+1] += DA_R * X[ix+1] - fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1] - // X'[ix+1] += DA_I * X[ix] - - st1 {v2.4s,v3.4s}, [X], #32 + ld2 {v2.4s, v3.4s}, [X], #32 + + fmul v4.4s, v2.4s, v16.4s + fmul v6.4s, v3.4s, v17.4s + fsub v4.4s, v4.4s, v6.4s + + fmul v5.4s, v2.4s, v17.4s + fmul v6.4s, v3.4s, v16.4s + fadd v5.4s, v5.4s, v6.4s + + st2 {v4.4s, v5.4s}, [X_COPY], #32 #else // DOUBLE - ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3 - ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] - ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] - ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] - ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] + ld2 {v2.2d, v3.2d}, [X], #32 - fmul v2.2d, v0.2d, v2.2d - fmla v2.2d, v1.2d, v20.2d + fmul v4.2d, v2.2d, v16.2d + fmul v6.2d, v3.2d, v17.2d + fsub v4.2d, v4.2d, v6.2d + fmul v5.2d, v2.2d, v17.2d + fmul v6.2d, v3.2d, v16.2d + fadd v5.2d, v5.2d, v6.2d - fmul v3.2d, v0.2d, v3.2d - fmla v3.2d, v1.2d, v21.2d - st1 {v2.2d,v3.2d}, [X], #32 + st2 {v4.2d, v5.2d}, [X_COPY], #32 - fmul v4.2d, v0.2d, v4.2d - fmla v4.2d, v1.2d, v22.2d + ld2 {v18.2d, v19.2d}, [X], #32 - fmul v5.2d, v0.2d, v5.2d - fmla v5.2d, v1.2d, v23.2d - st1 {v4.2d,v5.2d}, [X], #32 + fmul v20.2d, v18.2d, v16.2d + fmul v6.2d, v19.2d, v17.2d + fsub v20.2d, v20.2d, v6.2d + fmul v21.2d, v18.2d, v17.2d + fmul v6.2d, v19.2d, v16.2d + fadd v21.2d, v21.2d, v6.2d + + st2 {v20.2d, v21.2d}, [X_COPY], #32 #endif PRFM PLDL1KEEP, [X, #1024] .endm @@ -149,21 +155,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL_S1 - #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 - ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 - fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 - fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 - st1 {v2.2s}, [X], INC_X + fmul s3, DA_R, v2.s[0] // DA_R*X0 + fmul s5, DA_I, v2.s[1] // DA_I*X1 + fsub s3, s3, s5 // DA_R*X0-DA_I*X1 + + fmul s4, DA_I, v2.s[0] // DA_I*X0 + fmul s5, DA_R, v2.s[1] // DA_R*X1 + fadd s4, s4, s5 // DA_I*X0+DA_R*X1 + + ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v3.2s}, [X], INC_X #else ld1 {v2.2d}, [X] // X1, X0 - ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 - fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 - fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 - st1 {v2.2d}, [X], INC_X -#endif + fmul d3, DA_R, v2.d[0] // DA_R*X0 + fmul d5, DA_I, v2.d[1] // DA_I*X1 + fsub d3, d3, d5 // DA_R*X0-DA_I*X1 + + fmul d4, DA_I, v2.d[0] // DA_I*X0 + fmul d5, DA_R, v2.d[1] // DA_R*X1 + fadd d4, d4, d5 // DA_I*X0+DA_R*X1 + ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v3.2d}, [X], INC_X +#endif .endm /******************************************************************************* @@ -171,21 +187,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ PROLOGUE + + b zscal_begin +data_ar: + .word 0x3e44fae6 +data_ai: + .word 0x3d320fa2 +data_xr: + .word 0x3f4baff1 +data_xi: + .word 0xbe8ef0bd + +zscal_begin: + + ldr s20, data_ar + ldr s21, data_ai + ldr s22, data_xr + ldr s23, data_xi + + fmul s24, s22, s21 + fmla s24, s23, v20.s[0] + + fmul s25, s22, s21 + fmul s26, s23, s20 + fadd s25, s25, s26 + + mov X_COPY, X cmp N, xzr ble zscal_kernel_L999 fcmp DA_R, #0.0 - bne zscal_kernel_1 + bne zscal_kernel_R_non_zero fcmp DA_I, #0.0 - beq zscal_kernel_zero + beq zscal_kernel_RI_zero - // TODO: special case DA_R == 0 && DA_I != 0 + b zscal_kernel_R_zero -zscal_kernel_1: +zscal_kernel_R_non_zero: - // TODO: special case DA_R != 0 && DA_I == 0 + fcmp DA_I, #0.0 + beq zscal_kernel_I_zero + +/******************************************************************************* +* A_R != 0 && A_I != 0 +*******************************************************************************/ + +zscal_kernel_RI_non_zero: INIT @@ -257,16 +306,85 @@ zscal_kernel_L999: mov w0, wzr ret -zscal_kernel_zero: +/******************************************************************************* +* A_R == 0 && A_I != 0 +*******************************************************************************/ + +zscal_kernel_R_zero: + INIT_S + +#if !defined(DOUBLE) + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, DA_I + ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I +#else + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, DA_I + ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I +#endif + +zscal_kernel_R_zero_1: +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] // X1, X0 + fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 + ext v2.8b, v2.8b, v2.8b, #4 // DA_I*X0, -DA_I*X1 + st1 {v2.2s}, [X] +#else + ld1 {v2.2d}, [X] // X1, X0 + fmul v2.2d, v2.2d, v1.2d // -DA_I*X1, DA_I*X0 + ext v2.16b, v2.16b, v2.16b, #8 // DA_I*X0, -DA_I*X1 + st1 {v2.2d}, [X] +#endif + add X, X, INC_X + subs N, N, #1 + bne zscal_kernel_R_zero_1 + + mov w0, wzr + ret + +/******************************************************************************* +* A_R != 0 && A_I == 0 +*******************************************************************************/ + +zscal_kernel_I_zero: + INIT_S +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // 
v0 = DA_R, DA_R +#else + ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R +#endif + +zscal_kernel_I_zero_1: +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] // X1, X0 + fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 + st1 {v2.2s}, [X] +#else + ld1 {v2.2d}, [X] // X1, X0 + fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 + st1 {v2.2d}, [X] +#endif + add X, X, INC_X + subs N, N, #1 + bne zscal_kernel_I_zero_1 + + mov w0, wzr + ret + +/******************************************************************************* +* A_R == 0 && A_I == 0 +*******************************************************************************/ + +zscal_kernel_RI_zero: INIT_S -zscal_kernel_Z1: +zscal_kernel_RI_zero_1: stp DA_R, DA_I, [X] add X, X, INC_X - subs N, N, #1 - bne zscal_kernel_Z1 + subs N, N, #1 + bne zscal_kernel_RI_zero_1 mov w0, wzr ret diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 4fbb7fc..3ff8227 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -187,73 +187,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v16.2d, v0.2d, v8.2d[0] OP_ii v16.2d, v1.2d, v9.2d[0] - fmul v17.2d, v0.2d, v9.2d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v17.2d, v17.2d + eor v17.16b, v17.16b, v17.16b + fmls v17.2d, v0.2d, v9.2d[0] +#else + fmul v17.2d, v0.2d, v9.2d[0] #endif OP_ir v17.2d, v1.2d, v8.2d[0] fmul v18.2d, v2.2d, v8.2d[0] OP_ii v18.2d, v3.2d, v9.2d[0] - fmul v19.2d, v2.2d, v9.2d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v19.2d, v19.2d + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.2d[0] +#else + fmul v19.2d, v2.2d, v9.2d[0] #endif OP_ir v19.2d, v3.2d, v8.2d[0] fmul v20.2d, v0.2d, v8.2d[1] OP_ii v20.2d, v1.2d, v9.2d[1] - fmul v21.2d, v0.2d, v9.2d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v21.2d, v21.2d + eor v21.16b, v21.16b, v21.16b + fmls v21.2d, v0.2d, v9.2d[1] +#else + fmul v21.2d, v0.2d, v9.2d[1] #endif OP_ir v21.2d, v1.2d, v8.2d[1] fmul v22.2d, v2.2d, v8.2d[1] OP_ii v22.2d, v3.2d, v9.2d[1] - fmul v23.2d, v2.2d, v9.2d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v23.2d, v23.2d + eor v23.16b, v23.16b, v23.16b + fmls v23.2d, v2.2d, v9.2d[1] +#else + fmul v23.2d, v2.2d, v9.2d[1] #endif OP_ir v23.2d, v3.2d, v8.2d[1] fmul v24.2d, v0.2d, v10.2d[0] OP_ii v24.2d, v1.2d, v11.2d[0] - fmul v25.2d, v0.2d, v11.2d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v25.2d, v25.2d + eor v25.16b, v25.16b, v25.16b + fmls v25.2d, v0.2d, v11.2d[0] +#else + fmul v25.2d, v0.2d, v11.2d[0] #endif OP_ir v25.2d, v1.2d, v10.2d[0] fmul v26.2d, v2.2d, v10.2d[0] OP_ii v26.2d, v3.2d, v11.2d[0] - fmul v27.2d, v2.2d, v11.2d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v27.2d, v27.2d + eor v27.16b, v27.16b, v27.16b + fmls v27.2d, v2.2d, v11.2d[0] +#else + fmul v27.2d, v2.2d, v11.2d[0] #endif OP_ir v27.2d, v3.2d, v10.2d[0] fmul v28.2d, v0.2d, v10.2d[1] OP_ii v28.2d, v1.2d, v11.2d[1] - fmul v29.2d, v0.2d, v11.2d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || 
defined(CC) - fneg v29.2d, v29.2d + eor v29.16b, v29.16b, v29.16b + fmls v29.2d, v0.2d, v11.2d[1] +#else + fmul v29.2d, v0.2d, v11.2d[1] #endif OP_ir v29.2d, v1.2d, v10.2d[1] fmul v30.2d, v2.2d, v10.2d[1] OP_ii v30.2d, v3.2d, v11.2d[1] - fmul v31.2d, v2.2d, v11.2d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) - fneg v31.2d, v31.2d + eor v31.16b, v31.16b, v31.16b + fmls v31.2d, v2.2d, v11.2d[1] +#else + fmul v31.2d, v2.2d, v11.2d[1] #endif OP_ir v31.2d, v3.2d, v10.2d[1] -- 2.7.4
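
Notes on the patch

0. Besides the kernel rewrites below, KERNEL.CORTEXA57 fixes the single-precision variable names (SDOTKERNEL and SSCALKERNEL; the old DOTKERNEL/SCALKERNEL spellings were never picked up for the S variants) and comments out the four assembly NRM2 kernels, presumably falling the NRM2 routines back to the build's default implementations.

1. fneg -> eor + fsub/fmls. Across cgemm/ctrmm/zgemm/ztrmm_kernel_4x4.S, rot.S, zaxpy.S, zgemv_n.S and zgemv_t.S, every "fmul; fneg" (and every scalar "fneg") becomes: zero a register with eor, then subtract into it with fsub or multiply-subtract with fmls. The two sequences differ only on zero inputs: fneg flips the sign bit unconditionally, so a +0.0 operand or product comes out as -0.0, while 0.0 - x stays +0.0 under round-to-nearest. That sign-of-zero discrepancy is exactly the kind of thing the LAPACK test suite flags, which fits the commit subject. A minimal standalone C illustration (not OpenBLAS code):

#include <math.h>
#include <stdio.h>

int main(void)
{
    double x = 0.0, y = 5.0;

    double via_fneg = -(x * y);      /* old: fmul v17,...; fneg v17,v17 */
    double via_fmls = 0.0 - (x * y); /* new: eor v17,...;  fmls v17,... */

    /* prints: fneg path: -0 (signbit=1), fmls path: 0 (signbit=0) */
    printf("fneg path: %g (signbit=%d)\n", via_fneg, signbit(via_fneg));
    printf("fmls path: %g (signbit=%d)\n", via_fmls, signbit(via_fmls));
    return 0;
}

For nonzero x*y the two paths produce bit-identical results, so only the zero-sign behaviour changes.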
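
2. isamax.S is deleted and idamax.S is renamed to iamax.S, now parameterized on DOUBLE (s- vs d-registers, lsl #2 vs #3 for the byte stride), so KERNEL.CORTEXA57 points both ISAMAXKERNEL and IDAMAXKERNEL at the single file. The semantics both variants implement, as a C reference (function name illustrative):

#include <math.h>

/* 1-based index of the first element with the largest |x[i]|;
 * 0 when n <= 0 or inc_x <= 0, matching iamax_kernel_zero. */
long iamax_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0)
        return 0;
    long best = 1;
    double maxf = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * inc_x]);
        if (v > maxf) {     /* strict >, like "fcsel ..., le": first max wins */
            maxf = v;
            best = i + 1;
        }
    }
    return best;
}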
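
3. zaxpy.S KERNEL_F4 now loads with ld2/st2, which deinterleaves [r0, i0, r1, i1, ...] into one all-real and one all-imaginary vector, replacing the old ld1 plus ext/ins shuffles. With v16 = DA_R broadcast and v17 = DA_I broadcast, the per-element arithmetic reduces to the following C reference (checked against the fmla/fmls pattern in the kernel; helper name illustrative):

#include <stddef.h>

void zaxpy_ref(size_t n, double da_r, double da_i,
               const double *x, double *y, int conj)
{
    for (size_t i = 0; i < n; i++) {
        double xr = x[2 * i], xi = x[2 * i + 1];
        if (!conj) {    /* y += (da_r + i*da_i) * x : fmla/fmls pairs */
            y[2 * i]     += xr * da_r - xi * da_i;
            y[2 * i + 1] += xr * da_i + xi * da_r;
        } else {        /* CONJ: y += (da_r - i*da_i) * x             */
            y[2 * i]     += xr * da_r + xi * da_i;
            y[2 * i + 1] += xi * da_r - xr * da_i;
        }
    }
}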
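
4. In zgemv_t.S the four CONJ/XCONJ preprocessor cases collapse to two: when CONJ and XCONJ agree, the kernel accumulates a plain complex product; when exactly one is set, it accumulates against the conjugated matrix element, with the remaining alpha conjugation handled where TEMP is applied to Y. Note that two comments in the surviving double-precision #else branch still carry signs inherited from the deleted duplicate branches (the fmla of v18*v20 into v15 says "[- I(X) * A_I]", and the fmls of v17*v20 into v16 says "[+ R(X) * A_I]"); the instructions look right, the stale comments don't, but they are context lines predating this commit. The two accumulation patterns, as C:

/* conj_a corresponds to CONJ ^ XCONJ in the merged #if condition. */
static void cmac(double *acc_r, double *acc_i,
                 double xr, double xi, double ar, double ai, int conj_a)
{
    if (!conj_a) {              /* (!CONJ && !XCONJ) || (CONJ && XCONJ) */
        *acc_r += xr * ar - xi * ai;
        *acc_i += xr * ai + xi * ar;
    } else {                    /* accumulate x * conj(a)               */
        *acc_r += xr * ar + xi * ai;
        *acc_i += xi * ar - xr * ai;
    }
}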
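
5. zscal.S KERNEL_F4 gets the same ld2/st2 treatment, with v16/v17 holding DA_R/DA_I broadcast and the arithmetic done as discrete fmul/fsub/fadd. In NEON intrinsics the single-precision body corresponds to roughly the sketch below (assumes inc_x == 1 and n a multiple of 4; it mirrors the data flow, not the kernel's exact instruction schedule):

#include <arm_neon.h>
#include <stddef.h>

void cscal4_neon(size_t n, float ar, float ai, float *x)
{
    float32x4_t var = vdupq_n_f32(ar);      /* v16: DA_R broadcast */
    float32x4_t vai = vdupq_n_f32(ai);      /* v17: DA_I broadcast */
    for (size_t i = 0; i < n; i += 4, x += 8) {
        float32x4x2_t v = vld2q_f32(x);     /* val[0]=real, val[1]=imag,
                                               like ld2 {v2.4s, v3.4s} */
        float32x4_t re = vsubq_f32(vmulq_f32(v.val[0], var),
                                   vmulq_f32(v.val[1], vai));
        float32x4_t im = vaddq_f32(vmulq_f32(v.val[0], vai),
                                   vmulq_f32(v.val[1], var));
        float32x4x2_t out = { { re, im } };
        vst2q_f32(x, out);                  /* st2 {v4.4s, v5.4s} */
    }
}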
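
6. The zscal PROLOGUE replaces the two TODOs with real dispatch: separate loops for DA_R == 0 && DA_I != 0 (pure-imaginary scaling, zscal_kernel_R_zero), DA_R != 0 && DA_I == 0 (pure-real scaling, zscal_kernel_I_zero), both zero (store alpha directly, zscal_kernel_RI_zero), and the general case. Keeping the zero component out of the arithmetic preserves zero signs the way the reference BLAS does. A C reference for the dispatch (inc counted in complex elements; names illustrative):

#include <stddef.h>

void zscal_ref(size_t n, double ar, double ai, double *x, ptrdiff_t inc)
{
    for (size_t i = 0; i < n; i++, x += 2 * inc) {
        double xr = x[0], xi = x[1];
        if (ar == 0.0 && ai == 0.0) {       /* zscal_kernel_RI_zero: stp DA_R, DA_I */
            x[0] = ar;
            x[1] = ai;
        } else if (ar == 0.0) {             /* zscal_kernel_R_zero  */
            x[0] = -ai * xi;
            x[1] =  ai * xr;
        } else if (ai == 0.0) {             /* zscal_kernel_I_zero  */
            x[0] = ar * xr;
            x[1] = ar * xi;
        } else {                            /* general path         */
            x[0] = ar * xr - ai * xi;
            x[1] = ai * xr + ar * xi;
        }
    }
}

One loose end in this hunk: the new PROLOGUE also branches over a small literal pool (data_ar through data_xi) and loads and combines those constants into s20-s26, but nothing downstream reads the results; this looks like debugging scaffolding that could be dropped in a follow-up.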