CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
-ISAMAXKERNEL = isamax.S
-IDAMAXKERNEL = idamax.S
+ISAMAXKERNEL = iamax.S
+IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
-DOTKERNEL = dot.S
+SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
-SNRM2KERNEL = snrm2.S
-DNRM2KERNEL = dnrm2.S
-CNRM2KERNEL = znrm2.S
-ZNRM2KERNEL = znrm2.S
+#SNRM2KERNEL = snrm2.S
+#DNRM2KERNEL = dnrm2.S
+#CNRM2KERNEL = znrm2.S
+#ZNRM2KERNEL = znrm2.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
-SCALKERNEL = scal.S
+SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
fmul v16.4s, v0.4s, v8.4s[0]
OP_ii v16.4s, v1.4s, v9.4s[0]
- fmul v17.4s, v0.4s, v9.4s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v17.4s, v17.4s
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
#endif
OP_ir v17.4s, v1.4s, v8.4s[0]
fmul v20.4s, v0.4s, v8.4s[1]
OP_ii v20.4s, v1.4s, v9.4s[1]
- fmul v21.4s, v0.4s, v9.4s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v21.4s, v21.4s
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
#endif
OP_ir v21.4s, v1.4s, v8.4s[1]
fmul v24.4s, v0.4s, v8.4s[2]
OP_ii v24.4s, v1.4s, v9.4s[2]
- fmul v25.4s, v0.4s, v9.4s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v25.4s, v25.4s
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
#endif
OP_ir v25.4s, v1.4s, v8.4s[2]
fmul v28.4s, v0.4s, v8.4s[3]
OP_ii v28.4s, v1.4s, v9.4s[3]
- fmul v29.4s, v0.4s, v9.4s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v29.4s, v29.4s
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
#endif
OP_ir v29.4s, v1.4s, v8.4s[3]
fmul v18.4s, v2.4s, v8.4s[0]
OP_ii v18.4s, v3.4s, v9.4s[0]
- fmul v19.4s, v2.4s, v9.4s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v19.4s, v19.4s
+ eor v19.16b, v19.16b, v19.16b
+ fmls v19.4s, v2.4s, v9.4s[0]
+#else
+ fmul v19.4s, v2.4s, v9.4s[0]
#endif
OP_ir v19.4s, v3.4s, v8.4s[0]
fmul v22.4s, v2.4s, v8.4s[1]
OP_ii v22.4s, v3.4s, v9.4s[1]
- fmul v23.4s, v2.4s, v9.4s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v23.4s, v23.4s
+ eor v23.16b, v23.16b, v23.16b
+ fmls v23.4s, v2.4s, v9.4s[1]
+#else
+ fmul v23.4s, v2.4s, v9.4s[1]
#endif
OP_ir v23.4s, v3.4s, v8.4s[1]
fmul v26.4s, v2.4s, v8.4s[2]
OP_ii v26.4s, v3.4s, v9.4s[2]
- fmul v27.4s, v2.4s, v9.4s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v27.4s, v27.4s
+ eor v27.16b, v27.16b, v27.16b
+ fmls v27.4s, v2.4s, v9.4s[2]
+#else
+ fmul v27.4s, v2.4s, v9.4s[2]
#endif
OP_ir v27.4s, v3.4s, v8.4s[2]
fmul v30.4s, v2.4s, v8.4s[3]
OP_ii v30.4s, v3.4s, v9.4s[3]
- fmul v31.4s, v2.4s, v9.4s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v31.4s, v31.4s
+ eor v31.16b, v31.16b, v31.16b
+ fmls v31.4s, v2.4s, v9.4s[3]
+#else
+ fmul v31.4s, v2.4s, v9.4s[3]
#endif
OP_ir v31.4s, v3.4s, v8.4s[3]
fmul v16.4s, v0.4s, v8.4s[0]
OP_ii v16.4s, v1.4s, v9.4s[0]
- fmul v17.4s, v0.4s, v9.4s[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v17.4s, v17.4s
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
#endif
OP_ir v17.4s, v1.4s, v8.4s[0]
fmul v20.4s, v0.4s, v8.4s[1]
OP_ii v20.4s, v1.4s, v9.4s[1]
- fmul v21.4s, v0.4s, v9.4s[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v21.4s, v21.4s
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
#endif
OP_ir v21.4s, v1.4s, v8.4s[1]
fmul v24.4s, v0.4s, v8.4s[2]
OP_ii v24.4s, v1.4s, v9.4s[2]
- fmul v25.4s, v0.4s, v9.4s[2]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v25.4s, v25.4s
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
#endif
OP_ir v25.4s, v1.4s, v8.4s[2]
fmul v28.4s, v0.4s, v8.4s[3]
OP_ii v28.4s, v1.4s, v9.4s[3]
- fmul v29.4s, v0.4s, v9.4s[3]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v29.4s, v29.4s
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
#endif
OP_ir v29.4s, v1.4s, v8.4s[3]
#define COND ge
#endif
+#if !defined(DOUBLE)
+#define MAXF s0
+#define TMPF s1
+#define TMPVF {v1.s}[0]
+#define SZ 4
+#else
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
+#endif
/******************************************************************************/
.macro INIT_S
+#if !defined(DOUBLE)
+ lsl INC_X, INC_X, #2
+ ld1 {v0.s}[0], [X], INC_X
+#else
lsl INC_X, INC_X, #3
ld1 {v0.d}[0], [X], INC_X
+#endif
mov Z, #1
mov INDEX, Z
fabs MAXF, MAXF
iamax_kernel_S10:
KERNEL_S1
-
- subs I, I, #1
- bne iamax_kernel_S10
+ subs I, I, #1
+ bne iamax_kernel_S10
iamax_kernel_L999:
+++ /dev/null
-/*******************************************************************************
-Copyright (c) 2015, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*******************************************************************************/
-
-#define ASSEMBLER
-#include "common.h"
-
-#define N x0 /* vector length */
-#define X x1 /* X vector address */
-#define INC_X x2 /* X stride */
-#define INDEX x3 /* index of max/min value */
-#define Z x4 /* vector index */
-#define I x5 /* loop variable */
-#define X_COPY x6 /* copy of X address */
-#define MAXF_Z x7
-
-/*******************************************************************************
-* Macro definitions
-*******************************************************************************/
-
-#define MAXF s5
-#define TMPF s6
-#define TMPVF {v6.s}[0]
-#define SZ 4
-
-/******************************************************************************/
-
-.macro INIT_F1
- ldr MAXF, [X], #SZ
- mov Z, #1
- mov INDEX, Z
- fabs MAXF, MAXF
-.endm
-
-.macro KERNEL_F1
- ldr TMPF, [X], #SZ
- add Z, Z, #1
- fabs TMPF, TMPF
- fcmp TMPF, MAXF
- fcsel MAXF, MAXF, TMPF, le
- csel INDEX, INDEX, Z, le
-.endm
-
-.macro INIT_F4
- ld1 {v0.4s}, [X], #16
- fabs v0.4s, v0.4s
- fmaxv MAXF, v0.4s
- mov Z, #5
- mov MAXF_Z, #1
-.endm
-
-.macro KERNEL_F4
- ld1 {v0.4s}, [X], #16
- fabs v0.4s, v0.4s
- fmaxv TMPF, v0.4s
- PRFM PLDL1KEEP, [X, #512]
- fcmp TMPF, MAXF
- fcsel MAXF, MAXF, TMPF, le
- csel MAXF_Z, MAXF_Z, Z, le
- add Z, Z, #4
-.endm
-
-
-.macro KERNEL_F4_FINALIZE
- mov INDEX, MAXF_Z
- sub MAXF_Z, MAXF_Z, #1
- lsl MAXF_Z, MAXF_Z, #2
- add X_COPY, X_COPY, MAXF_Z
- ldr TMPF, [X_COPY], #SZ
- fabs TMPF, TMPF
- fcmp TMPF, MAXF
- beq KERNEL_F4_FINALIZE_DONE
- add INDEX, INDEX, #1
- ldr TMPF, [X_COPY], #SZ
- fabs TMPF, TMPF
- fcmp TMPF, MAXF
- beq KERNEL_F4_FINALIZE_DONE
- add INDEX, INDEX, #1
- ldr TMPF, [X_COPY], #SZ
- fabs TMPF, TMPF
- fcmp TMPF, MAXF
- beq KERNEL_F4_FINALIZE_DONE
- add INDEX, INDEX, #1
-KERNEL_F4_FINALIZE_DONE:
-.endm
-
-
-.macro INIT_S
- lsl INC_X, INC_X, #2
- ld1 TMPVF, [X], INC_X
- mov Z, #1
- mov INDEX, Z
- fabs MAXF, TMPF
-.endm
-
-.macro KERNEL_S1
- ld1 TMPVF, [X], INC_X
- add Z, Z, #1
- fabs TMPF, TMPF
- fcmp TMPF, MAXF
- fcsel MAXF, MAXF, TMPF, le
- csel INDEX, INDEX, Z, le
-.endm
-
-/*******************************************************************************
-* End of macro definitions
-*******************************************************************************/
-
- PROLOGUE
-
- cmp N, xzr
- ble iamax_kernel_zero
- cmp INC_X, xzr
- ble iamax_kernel_zero
-
- PRFM PLDL1KEEP, [X]
- mov X_COPY, X
-
- cmp INC_X, #1
- bne iamax_kernel_S_BEGIN
-
-iamax_kernel_F_BEGIN:
- asr I, N, #2
- cmp I, xzr
- beq iamax_kernel_F1_INIT
-
- INIT_F4
- subs I, I, #1
- beq iamax_kernel_F4_FINALIZE
-
-iamax_kernel_F4:
- KERNEL_F4
- subs I, I, #1
- bne iamax_kernel_F4
-
-iamax_kernel_F4_FINALIZE:
- KERNEL_F4_FINALIZE
-
-iamax_kernel_F1:
- ands I, N, #3
- ble iamax_kernel_L999
-
-iamax_kernel_F10:
- KERNEL_F1
- subs I, I, #1
- bne iamax_kernel_F10
- b iamax_kernel_L999
-
-iamax_kernel_F1_INIT:
- INIT_F1
- subs N, N, #1
- b iamax_kernel_F1
-
-iamax_kernel_S_BEGIN:
- INIT_S
-
- subs N, N, #1
- ble iamax_kernel_L999
-
- asr I, N, #2
- cmp I, xzr
- ble iamax_kernel_S1
-
-iamax_kernel_S4:
- KERNEL_S1
- KERNEL_S1
- KERNEL_S1
- KERNEL_S1
-
- subs I, I, #1
- bne iamax_kernel_S4
-
-iamax_kernel_S1:
- ands I, N, #3
- ble iamax_kernel_L999
-
-iamax_kernel_S10:
- KERNEL_S1
- subs I, I, #1
- bne iamax_kernel_S10
-
-iamax_kernel_L999:
- mov x0, INDEX
- ret
-
-iamax_kernel_zero:
- mov x0, xzr
- ret
-
- EPILOGUE
.macro INIT_F1
#if !defined(DOUBLE)
- fneg s2, S
+ eor v2.16b, v2.16b, v2.16b
+ fsub s2, s2, S
ins v1.s[1], v2.s[0] // [-S, S]
#else
- fneg d2, S
+ eor v2.16b, v2.16b, v2.16b
+ fsub d2, d2, S
ins v1.d[1], v2.d[0] // [-S, S]
#endif
.endm
#if !defined(DOUBLE)
#define DA_R s0 /* scale input value */
#define DA_I s1 /* scale input value */
-#define TMPX v2.2s
-#define TMPY v3.2s
#define SZ 4
#else
#define DA_R d0 /* scale input value */
#define DA_I d1 /* scale input value */
-#define TMPX v2.2d
-#define TMPY v3.2d
#define SZ 8
#endif
#if !defined(CONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
- fneg s2, DA_I
+ eor v2.16b, v2.16b, v2.16b
+ fsub s2, s2, DA_I
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
- fneg d2, DA_I
+ eor v2.16b, v2.16b, v2.16b
+ fsub d2, d2, DA_I
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif
#else
#if !defined(DOUBLE)
- fneg s2, DA_R
+ eor v2.16b, v2.16b, v2.16b
+ fsub s2, s2, DA_R
ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R
ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I
#else
- fneg d2, DA_R
+ eor v2.16b, v2.16b, v2.16b
+ fsub d2, d2, DA_R
ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I
#endif
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
- // Replicate the lower 2 floats into the upper 2 slots
- ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
- ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
+ ins v16.s[0], v0.s[0]
+ ins v16.s[1], v16.s[0]
+ ins v16.d[1], v16.d[0]
+#if !defined(CONJ)
+ ins v17.s[0], v1.s[1]
+#else
+ ins v17.s[0], v1.s[0]
+#endif
+ ins v17.s[1], v17.s[0]
+ ins v17.d[1], v17.d[0]
+#else //DOUBLE
+ ins v16.d[0], v0.d[0]
+ ins v16.d[1], v16.d[0]
+#if !defined(CONJ)
+ ins v17.d[0], v1.d[1]
+#else
+ ins v17.d[0], v1.d[0]
+#endif
+ ins v17.d[1], v17.d[0]
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
- ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0]
- // V3 = X[7], X[6], X[5], X[4]
- ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
- ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
- ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
+ ld2 {v2.4s, v3.4s}, [X], #32
+ ld2 {v4.4s, v5.4s}, [Y_COPY], #32
- ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0]
- // V5 = Y[7], Y[6], Y[5], Y[4]
-
- ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
- ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
- ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
+ fmla v4.4s, v2.4s, v16.4s
+#if !defined(CONJ)
+ fmls v4.4s, v3.4s, v17.4s
+#else
+ fmla v4.4s, v3.4s, v17.4s
+#endif
- fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix]
- // Y[iy+1] += +-DA_R * X[ix+1]
- fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1]
- // Y[iy+1] += DA_I * X[ix]
- st1 {v4.4s}, [Y], #16
+#if !defined(CONJ)
+ fmla v5.4s, v2.4s, v17.4s
+#else
+ fmls v5.4s, v2.4s, v17.4s
+#endif
+ fmla v5.4s, v3.4s, v16.4s
- fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix]
- fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1]
- // Y[iy+1] += +-DA_R * X[ix+1]
- // Y[iy+1] += DA_I * X[ix]
- st1 {v5.4s}, [Y], #16
+ st2 {v4.4s, v5.4s}, [Y], #32
#else // DOUBLE
- ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3
- ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
- ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
+ ld2 {v2.2d, v3.2d}, [X], #32
+ ld2 {v4.2d, v5.2d}, [Y_COPY], #32
- ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3
- ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
- ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]
-
- ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3
+ fmla v4.2d, v2.2d, v16.2d
+#if !defined(CONJ)
+ fmls v4.2d, v3.2d, v17.2d
+#else
+ fmla v4.2d, v3.2d, v17.2d
+#endif
+#if !defined(CONJ)
+ fmla v5.2d, v2.2d, v17.2d
+#else
+ fmls v5.2d, v2.2d, v17.2d
+#endif
+ fmla v5.2d, v3.2d, v16.2d
- fmla v16.2d, v0.2d, v2.2d
- fmla v17.2d, v0.2d, v3.2d
+ st2 {v4.2d, v5.2d}, [Y], #32
- ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3
+ ld2 {v18.2d, v19.2d}, [X], #32
+ ld2 {v20.2d, v21.2d}, [Y_COPY], #32
- fmla v16.2d, v1.2d, v20.2d
- fmla v17.2d, v1.2d, v21.2d
- st1 {v16.2d,v17.2d}, [Y], #32
+ fmla v20.2d, v18.2d, v16.2d
+#if !defined(CONJ)
+ fmls v20.2d, v19.2d, v17.2d
+#else
+ fmla v20.2d, v19.2d, v17.2d
+#endif
+#if !defined(CONJ)
+ fmla v21.2d, v18.2d, v17.2d
+#else
+ fmls v21.2d, v18.2d, v17.2d
+#endif
+ fmla v21.2d, v19.2d, v16.2d
- fmla v18.2d, v0.2d, v4.2d
- fmla v19.2d, v0.2d, v5.2d
- fmla v18.2d, v1.2d, v22.2d
- fmla v19.2d, v1.2d, v23.2d
- st1 {v18.2d,v19.2d}, [Y], #32
+ st2 {v20.2d, v21.2d}, [Y], #32
#endif
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]
fmul v16.2d, v0.2d, v8.2d[0]
OP_ii v16.2d, v1.2d, v9.2d[0]
- fmul v17.2d, v0.2d, v9.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v17.2d, v17.2d
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.2d, v0.2d, v9.2d[0]
+#else
+ fmul v17.2d, v0.2d, v9.2d[0]
#endif
OP_ir v17.2d, v1.2d, v8.2d[0]
fmul v18.2d, v2.2d, v8.2d[0]
OP_ii v18.2d, v3.2d, v9.2d[0]
- fmul v19.2d, v2.2d, v9.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v19.2d, v19.2d
+ eor v19.16b, v19.16b, v19.16b
+ fmls v19.2d, v2.2d, v9.2d[0]
+#else
+ fmul v19.2d, v2.2d, v9.2d[0]
#endif
OP_ir v19.2d, v3.2d, v8.2d[0]
fmul v20.2d, v0.2d, v8.2d[1]
OP_ii v20.2d, v1.2d, v9.2d[1]
- fmul v21.2d, v0.2d, v9.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v21.2d, v21.2d
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.2d, v0.2d, v9.2d[1]
+#else
+ fmul v21.2d, v0.2d, v9.2d[1]
#endif
OP_ir v21.2d, v1.2d, v8.2d[1]
fmul v22.2d, v2.2d, v8.2d[1]
OP_ii v22.2d, v3.2d, v9.2d[1]
- fmul v23.2d, v2.2d, v9.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v23.2d, v23.2d
+ eor v23.16b, v23.16b, v23.16b
+ fmls v23.2d, v2.2d, v9.2d[1]
+#else
+ fmul v23.2d, v2.2d, v9.2d[1]
#endif
OP_ir v23.2d, v3.2d, v8.2d[1]
fmul v24.2d, v0.2d, v10.2d[0]
OP_ii v24.2d, v1.2d, v11.2d[0]
- fmul v25.2d, v0.2d, v11.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v25.2d, v25.2d
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.2d, v0.2d, v11.2d[0]
+#else
+ fmul v25.2d, v0.2d, v11.2d[0]
#endif
OP_ir v25.2d, v1.2d, v10.2d[0]
fmul v26.2d, v2.2d, v10.2d[0]
OP_ii v26.2d, v3.2d, v11.2d[0]
- fmul v27.2d, v2.2d, v11.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v27.2d, v27.2d
+ eor v27.16b, v27.16b, v27.16b
+ fmls v27.2d, v2.2d, v11.2d[0]
+#else
+ fmul v27.2d, v2.2d, v11.2d[0]
#endif
OP_ir v27.2d, v3.2d, v10.2d[0]
fmul v28.2d, v0.2d, v10.2d[1]
OP_ii v28.2d, v1.2d, v11.2d[1]
- fmul v29.2d, v0.2d, v11.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v29.2d, v29.2d
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.2d, v0.2d, v11.2d[1]
+#else
+ fmul v29.2d, v0.2d, v11.2d[1]
#endif
OP_ir v29.2d, v1.2d, v10.2d[1]
fmul v30.2d, v2.2d, v10.2d[1]
OP_ii v30.2d, v3.2d, v11.2d[1]
- fmul v31.2d, v2.2d, v11.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v31.2d, v31.2d
+ eor v31.16b, v31.16b, v31.16b
+ fmls v31.2d, v2.2d, v11.2d[1]
+#else
+ fmul v31.2d, v2.2d, v11.2d[1]
#endif
OP_ir v31.2d, v3.2d, v10.2d[1]
/******* INIT FOR F1 AND S1 LOOP ******/
#if !defined(DOUBLE)
- ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
- fneg s2, ALPHA_I
+ ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
+ eor v2.16b, v2.16b, v2.16b
+ fsub s2, s2, ALPHA_I
ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA)
#endif
#else
- ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
- fneg d2, ALPHA_I
+ ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
+ eor v2.16b, v2.16b, v2.16b
+ fsub d2, d2, ALPHA_I
ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA)
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
- fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)]
- fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)]
+ eor v12.16b, v12.16b, v12.16b
+ fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
ins v3.s[0], v2.s[1]
#if !defined(CONJ)
#if !defined(XCONJ)
- fneg s4, s3
+ eor v4.16b, v4.16b, v4.16b
+ fsub s4, s4, s3
ins v3.s[1], v4.s[0]
ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#else
- fneg s4, s3
+ eor v4.16b, v4.16b, v4.16b
+ fsub s4, s4, s3
ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)]
- fneg s4, s2
+ eor v4.16b, v4.16b, v4.16b
+ fsub s4, s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#else
- fneg s3, s3
+ eor v4.16b, v4.16b, v4.16b
+ fsub s3, s4, s3
ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)]
- fneg s4, s2
+ eor v4.16b, v4.16b, v4.16b
+ fsub s4, s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
- fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)]
- fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)]
+ eor v12.16b, v12.16b, v12.16b
+ fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
ins v3.d[0], v2.d[1] // I(TEMP)
#if !defined(CONJ)
#if !defined(XCONJ)
- fneg d4, d3 // -I(TEMP)
+ eor v4.16b, v4.16b, v4.16b
+ fsub d4, d4, d3
ins v3.d[1], v4.d[0]
ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#else
- fneg d4, d3 // -I(TEMP)
+ eor v4.16b, v4.16b, v4.16b
+ fsub d4, d4, d3
ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)]
- fneg d4, d2 // -R(TEMP)
+ eor v4.16b, v4.16b, v4.16b
+ fsub d4, d4, d2
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#else
- fneg d3, d3 // -I(TEMP)
+ eor v4.16b, v4.16b, v4.16b
+ fsub d3, d4, d3
ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)]
- fneg d4, d2 // -R(TEMP)
+ eor v4.16b, v4.16b, v4.16b
+ fsub d4, d4, d2
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ
#if !defined(XCONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R
- fneg s2, ALPHA_I
+ eor v2.16b, v2.16b, v2.16b
+ fsub s2, s2, ALPHA_I
ins v1.s[1], v2.s[0]
ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I
#else
ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R
- fneg d2, ALPHA_I
+ eor v2.16b, v2.16b, v2.16b
+ fsub d2, d2, ALPHA_I
ins v1.d[1], v2.d[0]
ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I
#endif
#else // XCONJ
#if !defined(DOUBLE)
- fneg s2, ALPHA_R
+ eor v2.16b, v2.16b, v2.16b
+ fsub s2, s2, ALPHA_R
ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I
#else
- fneg d2, ALPHA_R
+ eor v2.16b, v2.16b, v2.16b
+ fsub d2, d2, ALPHA_R
ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I
#endif
ld2 {v11.4s, v12.4s}, [X_PTR], #32
ld2 {v13.4s, v14.4s}, [A_PTR], #32
-#if !defined(CONJ)
-#if !defined(XCONJ)
+#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
#else
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
- fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
- fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
- fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
- fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
-#else
- fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
- fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
- fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
- fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
#endif
-#endif // CONJ
#else // DOUBLE
ld2 {v11.2d, v12.2d}, [X_PTR], #32
ld2 {v13.2d, v14.2d}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #512]
-#if !defined(CONJ)
-#if !defined(XCONJ)
+
+#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
#else
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
- fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
- fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
- fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
- fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
-#else
- fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
- fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
- fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
- fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
#endif
-#endif // CONJ
+
ld2 {v17.2d, v18.2d}, [X_PTR], #32
ld2 {v19.2d, v20.2d}, [A_PTR], #32
prfm PLDL1STRM, [A_PTR, #512]
-#if !defined(CONJ)
-#if !defined(XCONJ)
+
+#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
#else
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
	fmla	v15.2d, v18.2d, v20.2d		// [+ I(X) * A_I]
- fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
- fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
-#endif
-#else // CONJ
-#if !defined(XCONJ)
- fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
- fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
	fmls	v16.2d, v17.2d, v20.2d		// [- R(X) * A_I]
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
-#else
- fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
- fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
- fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
- fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#endif
-#endif // CONJ
+
#endif //DOUBLE
.endm
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
ld1 {v5.s}[0], [A_PTR], #4 // A1
ld1 {v6.2s}, [X_PTR], #8 // [X1, X0]
- fneg s16, s5
+ eor v16.16b, v16.16b, v16.16b
+ fsub s16, s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], #16 // [X1, X0]
- fneg d16, d5
+ eor v16.16b, v16.16b, v16.16b
+ fsub d16, d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
ld1 {v5.s}[0], [A_PTR], #4 // A1
ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0]
- fneg s16, s5
+ eor v16.16b, v16.16b, v16.16b
+ fsub s16, s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0]
- fneg d16, d5
+ eor v16.16b, v16.16b, v16.16b
+ fsub d16, d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define I x5 /* loop variable */
+#define X_COPY x6 /* Copy of X */
/*******************************************************************************
* Macro definitions
.macro INIT
#if !defined(DOUBLE)
- ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
- fneg s2, DA_I
- ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
- ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
+ ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
- fneg d2, DA_I
- ins v1.d[1], v2.d[0] // v1 = DA_I, DA_I
- ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif
.endm
.macro KERNEL_F1
-
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
- ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
- fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
- fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
- st1 {v2.2s}, [X], #8
+ fmul s3, DA_R, v2.s[0] // DA_R*X0
+ fmul s5, DA_I, v2.s[1] // DA_I*X1
+ fsub s3, s3, s5 // DA_R*X0-DA_I*X1
+
+ fmul s4, DA_I, v2.s[0] // DA_I*X0
+ fmul s5, DA_R, v2.s[1] // DA_R*X1
+ fadd s4, s4, s5 // DA_I*X0+DA_R*X1
+
+ ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
+ st1 {v3.2s}, [X], #8
#else
ld1 {v2.2d}, [X] // X1, X0
- ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
- fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
- fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
- st1 {v2.2d}, [X], #16
-#endif
+ fmul d3, DA_R, v2.d[0] // DA_R*X0
+ fmul d5, DA_I, v2.d[1] // DA_I*X1
+ fsub d3, d3, d5 // DA_R*X0-DA_I*X1
+ fmul d4, DA_I, v2.d[0] // DA_I*X0
+ fmul d5, DA_R, v2.d[1] // DA_R*X1
+ fadd d4, d4, d5 // DA_I*X0+DA_R*X1
+
+ ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
+ st1 {v3.2d}, [X], #16
+#endif
.endm
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
- // Replicate the lower 2 floats into the upper 2 slots
- ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
- ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
+ ins v16.s[0], v0.s[0]
+ ins v16.s[1], v16.s[0]
+ ins v16.d[1], v16.d[0]
+ ins v17.s[0], v1.s[0]
+ ins v17.s[1], v17.s[0]
+ ins v17.d[1], v17.d[0]
+#else //DOUBLE
+ ins v16.d[0], v0.d[0]
+ ins v16.d[1], v16.d[0]
+ ins v17.d[0], v1.d[0]
+ ins v17.d[1], v17.d[0]
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
- ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0]
- // V3 = X[7], X[6], X[5], X[4]
-
- ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
- ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
- ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
- fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix]
- // X'[ix+1] += DA_R * X[ix+1]
- fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1]
- // X'[ix+1] += DA_I * X[ix]
-
- ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
- ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
- ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
- fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix]
- // X'[ix+1] += DA_R * X[ix+1]
- fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1]
- // X'[ix+1] += DA_I * X[ix]
-
- st1 {v2.4s,v3.4s}, [X], #32
+ ld2 {v2.4s, v3.4s}, [X], #32
+
+ fmul v4.4s, v2.4s, v16.4s
+ fmul v6.4s, v3.4s, v17.4s
+ fsub v4.4s, v4.4s, v6.4s
+
+ fmul v5.4s, v2.4s, v17.4s
+ fmul v6.4s, v3.4s, v16.4s
+ fadd v5.4s, v5.4s, v6.4s
+
+ st2 {v4.4s, v5.4s}, [X_COPY], #32
#else // DOUBLE
- ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3
- ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
- ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
- ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
- ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]
+ ld2 {v2.2d, v3.2d}, [X], #32
- fmul v2.2d, v0.2d, v2.2d
- fmla v2.2d, v1.2d, v20.2d
+ fmul v4.2d, v2.2d, v16.2d
+ fmul v6.2d, v3.2d, v17.2d
+ fsub v4.2d, v4.2d, v6.2d
+ fmul v5.2d, v2.2d, v17.2d
+ fmul v6.2d, v3.2d, v16.2d
+ fadd v5.2d, v5.2d, v6.2d
- fmul v3.2d, v0.2d, v3.2d
- fmla v3.2d, v1.2d, v21.2d
- st1 {v2.2d,v3.2d}, [X], #32
+ st2 {v4.2d, v5.2d}, [X_COPY], #32
- fmul v4.2d, v0.2d, v4.2d
- fmla v4.2d, v1.2d, v22.2d
+ ld2 {v18.2d, v19.2d}, [X], #32
- fmul v5.2d, v0.2d, v5.2d
- fmla v5.2d, v1.2d, v23.2d
- st1 {v4.2d,v5.2d}, [X], #32
+ fmul v20.2d, v18.2d, v16.2d
+ fmul v6.2d, v19.2d, v17.2d
+ fsub v20.2d, v20.2d, v6.2d
+ fmul v21.2d, v18.2d, v17.2d
+ fmul v6.2d, v19.2d, v16.2d
+ fadd v21.2d, v21.2d, v6.2d
+
+ st2 {v20.2d, v21.2d}, [X_COPY], #32
#endif
PRFM PLDL1KEEP, [X, #1024]
.endm
.endm
.macro KERNEL_S1
-
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
- ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
- fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
- fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
- st1 {v2.2s}, [X], INC_X
+ // Scalar complex multiply of one element: with v2.s[0] = X0 (real part)
+ // and v2.s[1] = X1 (imaginary part), compute (X0 + i*X1)*(DA_R + i*DA_I).
+ fmul s3, DA_R, v2.s[0] // DA_R*X0
+ fmul s5, DA_I, v2.s[1] // DA_I*X1
+ fsub s3, s3, s5 // DA_R*X0-DA_I*X1 (real result, v3.s[0])
+
+ fmul s4, DA_I, v2.s[0] // DA_I*X0
+ fmul s5, DA_R, v2.s[1] // DA_R*X1
+ fadd s4, s4, s5 // DA_I*X0+DA_R*X1 (imaginary result)
+
+ // Pack imag into lane 1 next to the real result already in v3.s[0],
+ // then store back and advance X by INC_X bytes (post-indexed store).
+ ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
+ st1 {v3.2s}, [X], INC_X
#else
ld1 {v2.2d}, [X] // X1, X0
- ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
- fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
- fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
- st1 {v2.2d}, [X], INC_X
-#endif
+ // Double-precision variant of the same scalar complex multiply:
+ // v2.d[0] = X0 (real), v2.d[1] = X1 (imag).
+ fmul d3, DA_R, v2.d[0] // DA_R*X0
+ fmul d5, DA_I, v2.d[1] // DA_I*X1
+ fsub d3, d3, d5 // DA_R*X0-DA_I*X1 (real result, v3.d[0])
+
+ fmul d4, DA_I, v2.d[0] // DA_I*X0
+ fmul d5, DA_R, v2.d[1] // DA_R*X1
+ fadd d4, d4, d5 // DA_I*X0+DA_R*X1 (imaginary result)
+ ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
+ st1 {v3.2d}, [X], INC_X
+#endif
.endm
/*******************************************************************************
*******************************************************************************/
PROLOGUE
+
+ b zscal_begin
+
+ // NOTE(review): everything from data_ar through the s24/s25/s26 arithmetic
+ // below looks like leftover debug scaffolding — the branch above exists
+ // only to skip the .word data, the constants are test values, and
+ // s24/s25/s26 are never read again anywhere in this routine. Also,
+ // "fmul s25, s22, s21" recomputes the very product already placed in s24
+ // (s22*s20, i.e. xr*ar, would be the expected real-part term).
+ // TODO: confirm dead and delete the whole block.
+data_ar:
+ .word 0x3e44fae6
+data_ai:
+ .word 0x3d320fa2
+data_xr:
+ .word 0x3f4baff1
+data_xi:
+ .word 0xbe8ef0bd
+
+zscal_begin:
+
+ ldr s20, data_ar
+ ldr s21, data_ai
+ ldr s22, data_xr
+ ldr s23, data_xi
+
+ fmul s24, s22, s21
+ fmla s24, s23, v20.s[0]
+
+ fmul s25, s22, s21
+ fmul s26, s23, s20
+ fadd s25, s25, s26
+
+ // Second cursor for the vector kernels: they load through X and store
+ // the scaled elements through X_COPY at the same offsets (see the
+ // ld2 [X] / st2 [X_COPY] pairs in the KERNEL_F macros).
+ mov X_COPY, X
cmp N, xzr
ble zscal_kernel_L999
fcmp DA_R, #0.0
- bne zscal_kernel_1
+ // Four-way dispatch on (DA_R == 0, DA_I == 0): each combination gets a
+ // specialized loop below instead of the old TODO placeholders.
+ bne zscal_kernel_R_non_zero
fcmp DA_I, #0.0
- beq zscal_kernel_zero
+ beq zscal_kernel_RI_zero
- // TODO: special case DA_R == 0 && DA_I != 0
+ b zscal_kernel_R_zero
-zscal_kernel_1:
+zscal_kernel_R_non_zero:
- // TODO: special case DA_R != 0 && DA_I == 0
+ fcmp DA_I, #0.0
+ beq zscal_kernel_I_zero
+
+/*******************************************************************************
+* A_R != 0 && A_I != 0
+*******************************************************************************/
+
+zscal_kernel_RI_non_zero:
INIT
mov w0, wzr
ret
-zscal_kernel_zero:
+/*******************************************************************************
+* A_R == 0 && A_I != 0
+*******************************************************************************/
+
+zscal_kernel_R_zero:
+ INIT_S
+
+#if !defined(DOUBLE)
+ // Build v1 = (-DA_I in lane 1, DA_I in lane 0): zero v2, negate DA_I via
+ // fsub, and insert. Lane 0 presumably holds DA_I from INIT_S — confirm
+ // against the INIT_S macro (not visible in this hunk).
+ eor v2.16b, v2.16b, v2.16b
+ fsub s2, s2, DA_I
+ ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
+#else
+ eor v2.16b, v2.16b, v2.16b
+ fsub d2, d2, DA_I
+ ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
+#endif
+
+zscal_kernel_R_zero_1:
+#if !defined(DOUBLE)
+ // (X0 + i*X1) * (i*DA_I) = -DA_I*X1 + i*DA_I*X0: elementwise multiply by
+ // (DA_I, -DA_I), then swap the two lanes with ext to restore (re, im).
+ ld1 {v2.2s}, [X] // X1, X0
+ fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0
+ ext v2.8b, v2.8b, v2.8b, #4 // DA_I*X0, -DA_I*X1
+ st1 {v2.2s}, [X]
+#else
+ ld1 {v2.2d}, [X] // X1, X0
+ fmul v2.2d, v2.2d, v1.2d // -DA_I*X1, DA_I*X0
+ ext v2.16b, v2.16b, v2.16b, #8 // DA_I*X0, -DA_I*X1
+ st1 {v2.2d}, [X]
+#endif
+ add X, X, INC_X
+ subs N, N, #1
+ bne zscal_kernel_R_zero_1
+
+ mov w0, wzr
+ ret
+
+/*******************************************************************************
+* A_R != 0 && A_I == 0
+*******************************************************************************/
+
+zscal_kernel_I_zero:
+ INIT_S
+#if !defined(DOUBLE)
+ // Broadcast DA_R into both lanes so a single fmul scales the real and
+ // imaginary components of each element.
+ ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
+#else
+ ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
+#endif
+
+zscal_kernel_I_zero_1:
+#if !defined(DOUBLE)
+ ld1 {v2.2s}, [X] // X1, X0
+ fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
+ st1 {v2.2s}, [X]
+#else
+ ld1 {v2.2d}, [X] // X1, X0
+ fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
+ st1 {v2.2d}, [X]
+#endif
+ add X, X, INC_X
+ subs N, N, #1
+ bne zscal_kernel_I_zero_1
+
+ mov w0, wzr
+ ret
+
+/*******************************************************************************
+* A_R == 0 && A_I == 0
+*******************************************************************************/
+
+zscal_kernel_RI_zero:
INIT_S
-zscal_kernel_Z1:
+zscal_kernel_RI_zero_1:
+ // Both scale components are zero on this path, so the stp simply writes
+ // (0.0, 0.0) over each complex element.
stp DA_R, DA_I, [X]
add X, X, INC_X
- subs N, N, #1
- bne zscal_kernel_Z1
+ subs N, N, #1
+ bne zscal_kernel_RI_zero_1
mov w0, wzr
ret
fmul v16.2d, v0.2d, v8.2d[0]
OP_ii v16.2d, v1.2d, v9.2d[0]
- fmul v17.2d, v0.2d, v9.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v17.2d, v17.2d
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.2d, v0.2d, v9.2d[0]
+#else
+ fmul v17.2d, v0.2d, v9.2d[0]
#endif
OP_ir v17.2d, v1.2d, v8.2d[0]
fmul v18.2d, v2.2d, v8.2d[0]
OP_ii v18.2d, v3.2d, v9.2d[0]
- fmul v19.2d, v2.2d, v9.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v19.2d, v19.2d
+ eor v19.16b, v19.16b, v19.16b
+ fmls v19.2d, v2.2d, v9.2d[0]
+#else
+ fmul v19.2d, v2.2d, v9.2d[0]
#endif
OP_ir v19.2d, v3.2d, v8.2d[0]
fmul v20.2d, v0.2d, v8.2d[1]
OP_ii v20.2d, v1.2d, v9.2d[1]
- fmul v21.2d, v0.2d, v9.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v21.2d, v21.2d
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.2d, v0.2d, v9.2d[1]
+#else
+ fmul v21.2d, v0.2d, v9.2d[1]
#endif
OP_ir v21.2d, v1.2d, v8.2d[1]
fmul v22.2d, v2.2d, v8.2d[1]
OP_ii v22.2d, v3.2d, v9.2d[1]
- fmul v23.2d, v2.2d, v9.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v23.2d, v23.2d
+ eor v23.16b, v23.16b, v23.16b
+ fmls v23.2d, v2.2d, v9.2d[1]
+#else
+ fmul v23.2d, v2.2d, v9.2d[1]
#endif
OP_ir v23.2d, v3.2d, v8.2d[1]
fmul v24.2d, v0.2d, v10.2d[0]
OP_ii v24.2d, v1.2d, v11.2d[0]
- fmul v25.2d, v0.2d, v11.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v25.2d, v25.2d
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.2d, v0.2d, v11.2d[0]
+#else
+ fmul v25.2d, v0.2d, v11.2d[0]
#endif
OP_ir v25.2d, v1.2d, v10.2d[0]
fmul v26.2d, v2.2d, v10.2d[0]
OP_ii v26.2d, v3.2d, v11.2d[0]
- fmul v27.2d, v2.2d, v11.2d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v27.2d, v27.2d
+ eor v27.16b, v27.16b, v27.16b
+ fmls v27.2d, v2.2d, v11.2d[0]
+#else
+ fmul v27.2d, v2.2d, v11.2d[0]
#endif
OP_ir v27.2d, v3.2d, v10.2d[0]
fmul v28.2d, v0.2d, v10.2d[1]
OP_ii v28.2d, v1.2d, v11.2d[1]
- fmul v29.2d, v0.2d, v11.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v29.2d, v29.2d
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.2d, v0.2d, v11.2d[1]
+#else
+ fmul v29.2d, v0.2d, v11.2d[1]
#endif
OP_ir v29.2d, v1.2d, v10.2d[1]
fmul v30.2d, v2.2d, v10.2d[1]
OP_ii v30.2d, v3.2d, v11.2d[1]
- fmul v31.2d, v2.2d, v11.2d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
- fneg v31.2d, v31.2d
+ eor v31.16b, v31.16b, v31.16b
+ fmls v31.2d, v2.2d, v11.2d[1]
+#else
+ fmul v31.2d, v2.2d, v11.2d[1]
#endif
OP_ir v31.2d, v3.2d, v10.2d[1]