CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
+SGEMVNKERNEL = gemv_n.S
+DGEMVNKERNEL = gemv_n.S
+CGEMVNKERNEL = zgemv_n.S
+ZGEMVNKERNEL = zgemv_n.S
+
+SGEMVTKERNEL = gemv_t.S
+DGEMVTKERNEL = gemv_t.S
+CGEMVTKERNEL = zgemv_t.S
+ZGEMVTKERNEL = zgemv_t.S
--- /dev/null
+++ b/kernel/arm64/gemv_n.S
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M x0 /* Y vector length */
+#define N x1 /* X vector length */
+#define A x3 /* A matrix address */
+#define LDA x4 /* A stride */
+#define X x5 /* X vector address */
+#define INC_X x6 /* X stride */
+#define Y x7 /* Y vector address */
+#define INC_Y x2 /* Y stride */
+#define A_PTR x9 /* loop A vector address */
+#define Y_IPTR x10 /* loop Y vector address */
+#define J x11 /* loop variable */
+#define I x12 /* loop variable */
+#define Y_OPTR x13 /* loop Y vector address */
+
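+/* GEMV N kernel: y := alpha * A * x + y, with A stored column-major.
+   A sketch of the assumed C-level prototype behind the register map
+   above (argument names are illustrative), and of the computation:
+
+       int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+                 FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x,
+                 FLOAT *y, BLASLONG inc_y, FLOAT *buffer);
+
+       for (j = 0; j < n; j++) {
+           FLOAT temp = alpha * x[j * inc_x];
+           for (i = 0; i < m; i++)
+               y[i * inc_y] += temp * a[j * lda + i];
+       }
+
+   m, n and dummy1 occupy x0-x2 and alpha travels in s0/d0, so a, lda,
+   x, inc_x and y fill x3-x7 and inc_y spills to the stack; that is why
+   the PROLOGUE starts with "ldr INC_Y, [sp]" and reuses x2 for it. */
+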
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+#if !defined(DOUBLE)
+#define ALPHA s0
+#define TEMP s1
+#define TEMPV {v1.s}[0]
+#define TMP1 s2
+#define TMPV1 {v2.s}[0]
+#define TMP2 s3
+#define TMPV2 {v3.s}[0]
+#define SZ 4
+#define SHZ 2
+#else
+#define ALPHA d0
+#define TEMP d1
+#define TEMPV {v1.d}[0]
+#define TMP1 d2
+#define TMPV1 {v2.d}[0]
+#define TMP2 d3
+#define TMPV2 {v3.d}[0]
+#define SZ 8
+#define SHZ 3
+#endif
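+
+/* SZ is the element size in bytes and SHZ its log2; integer strides
+   (LDA, INC_X, INC_Y) are converted to byte offsets via lsl #SHZ. */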
+
+/******************************************************************************/
+
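+/* AAPCS64 only requires preserving d8-d15 and x19-x28 here; d16/d17
+   and the platform register x18 are saved as well, which is redundant
+   but harmless. */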
+.macro SAVE_REGS
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+.endm
+
+.macro KERNEL_F16
+#if !defined(DOUBLE)
+ ld1 {v2.4s, v3.4s}, [A_PTR], #32
+ ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
+ fmla v4.4s, v1.4s, v2.4s
+ fmla v5.4s, v1.4s, v3.4s
+ st1 {v4.4s, v5.4s}, [Y_OPTR], #32
+
+ ld1 {v6.4s, v7.4s}, [A_PTR], #32
+ ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
+ fmla v8.4s, v1.4s, v6.4s
+ fmla v9.4s, v1.4s, v7.4s
+ st1 {v8.4s, v9.4s}, [Y_OPTR], #32
+#else //DOUBLE
+ ld1 {v2.2d, v3.2d}, [A_PTR], #32
+ ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
+ fmla v4.2d, v1.2d, v2.2d
+ fmla v5.2d, v1.2d, v3.2d
+ st1 {v4.2d, v5.2d}, [Y_OPTR], #32
+
+ ld1 {v6.2d, v7.2d}, [A_PTR], #32
+ ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
+ fmla v8.2d, v1.2d, v6.2d
+ fmla v9.2d, v1.2d, v7.2d
+ st1 {v8.2d, v9.2d}, [Y_OPTR], #32
+
+ ld1 {v10.2d, v11.2d}, [A_PTR], #32
+ ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
+ fmla v12.2d, v1.2d, v10.2d
+ fmla v13.2d, v1.2d, v11.2d
+ st1 {v12.2d, v13.2d}, [Y_OPTR], #32
+
+ ld1 {v14.2d, v15.2d}, [A_PTR], #32
+ ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
+ fmla v16.2d, v1.2d, v14.2d
+ fmla v17.2d, v1.2d, v15.2d
+ st1 {v16.2d, v17.2d}, [Y_OPTR], #32
+#endif
+.endm
+
+.macro KERNEL_F4
+#if !defined(DOUBLE)
+ ld1 {v2.4s}, [A_PTR], #16
+ ld1 {v3.4s}, [Y_IPTR], #16
+ fmla v3.4s, v1.4s, v2.4s
+ st1 {v3.4s}, [Y_OPTR], #16
+#else
+ ld1 {v2.2d}, [A_PTR], #16
+ ld1 {v3.2d}, [Y_IPTR], #16
+ fmla v3.2d, v1.2d, v2.2d
+ st1 {v3.2d}, [Y_OPTR], #16
+
+ ld1 {v4.2d}, [A_PTR], #16
+ ld1 {v5.2d}, [Y_IPTR], #16
+ fmla v5.2d, v1.2d, v4.2d
+ st1 {v5.2d}, [Y_OPTR], #16
+#endif
+.endm
+
+.macro KERNEL_F1
+
+ ld1 TMPV1, [A_PTR], #SZ
+ ld1 TMPV2, [Y_IPTR]
+ fmadd TMP2, TEMP, TMP1, TMP2
+ st1 TMPV2, [Y_IPTR], #SZ
+
+.endm
+
+.macro INIT_S
+
+ lsl INC_Y, INC_Y, #SHZ
+
+.endm
+
+.macro KERNEL_S1
+
+ ld1 TMPV1, [A_PTR], #SZ
+ ld1 TMPV2, [Y_IPTR]
+ fmadd TMP2, TEMP, TMP1, TMP2
+ st1 TMPV2, [Y_IPTR], INC_Y
+
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ ldr INC_Y, [sp]
+
+ SAVE_REGS
+
+ cmp N, xzr
+ ble gemv_n_kernel_L999
+ cmp M, xzr
+ ble gemv_n_kernel_L999
+
+ lsl LDA, LDA, #SHZ
+ lsl INC_X, INC_X, #SHZ
+ mov J, N
+
+ cmp INC_Y, #1
+ bne gemv_n_kernel_S_BEGIN
+
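+/* Fast path for inc_y == 1: each column j broadcasts temp = alpha *
+   x[j] into v1, then y += temp * A(:,j) proceeds in blocks of 32
+   elements (F320, two KERNEL_F16 calls), 4 (F40) and 1 (F10). */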
+gemv_n_kernel_F_LOOP:
+
+ ld1 TEMPV, [X], INC_X
+ fmul TEMP, ALPHA, TEMP
+#if !defined(DOUBLE)
+ ins v1.s[1], v1.s[0]
+ ins v1.s[2], v1.s[0]
+ ins v1.s[3], v1.s[0]
+#else
+ ins v1.d[1], v1.d[0]
+#endif
+ mov A_PTR, A
+ mov Y_IPTR, Y
+ mov Y_OPTR, Y
+
+gemv_n_kernel_F32:
+
+ asr I, M, #5
+ cmp I, xzr
+ beq gemv_n_kernel_F4
+
+gemv_n_kernel_F320:
+
+ KERNEL_F16
+ KERNEL_F16
+
+ subs I, I, #1
+ bne gemv_n_kernel_F320
+
+gemv_n_kernel_F4:
+ ands I, M, #31
+ asr I, I, #2
+ cmp I, xzr
+ beq gemv_n_kernel_F1
+
+gemv_n_kernel_F40:
+
+ KERNEL_F4
+
+ subs I, I, #1
+ bne gemv_n_kernel_F40
+
+gemv_n_kernel_F1:
+ ands I, M, #3
+ ble gemv_n_kernel_F_END
+
+gemv_n_kernel_F10:
+
+ KERNEL_F1
+
+ subs I, I, #1
+ bne gemv_n_kernel_F10
+
+gemv_n_kernel_F_END:
+
+ add A, A, LDA
+ subs J, J, #1
+ bne gemv_n_kernel_F_LOOP
+
+ b gemv_n_kernel_L999
+
+gemv_n_kernel_S_BEGIN:
+
+ INIT_S
+
+gemv_n_kernel_S_LOOP:
+
+ ld1 TEMPV, [X], INC_X
+ fmul TEMP, ALPHA, TEMP
+ mov A_PTR, A
+ mov Y_IPTR, Y
+
+ asr I, M, #2
+ cmp I, xzr
+ ble gemv_n_kernel_S1
+
+gemv_n_kernel_S4:
+
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+
+ subs I, I, #1
+ bne gemv_n_kernel_S4
+
+gemv_n_kernel_S1:
+
+ ands I, M, #3
+ ble gemv_n_kernel_S_END
+
+gemv_n_kernel_S10:
+
+ KERNEL_S1
+
+ subs I, I, #1
+ bne gemv_n_kernel_S10
+
+gemv_n_kernel_S_END:
+
+ add A, A, LDA
+ subs J, J, #1
+ bne gemv_n_kernel_S_LOOP
+
+gemv_n_kernel_L999:
+
+ mov w0, wzr
+
+ RESTORE_REGS
+
+ ret
+
+ EPILOGUE
--- /dev/null
+++ b/kernel/arm64/gemv_t.S
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M x0 /* X vector length */
+#define N x1 /* Y vector length */
+#define A x3 /* A matrix address */
+#define LDA x4 /* A stride */
+#define X x5 /* X vector address */
+#define INC_X x6 /* X stride */
+#define Y x7 /* Y vector address */
+#define INC_Y x2 /* Y stride */
+#define A_PTR x9 /* loop A vector address */
+#define X_PTR x10 /* loop X vector address */
+#define J x11 /* loop variable */
+#define I x12 /* loop variable */
+
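+/* GEMV T kernel: y := alpha * A**T * x + y, A column-major. Scalar
+   sketch of the computation, assuming the same argument layout as
+   gemv_n.S:
+
+       for (j = 0; j < n; j++) {
+           FLOAT temp = 0.0;
+           for (i = 0; i < m; i++)
+               temp += a[j * lda + i] * x[i * inc_x];
+           y[j * inc_y] += alpha * temp;
+       }
+*/
+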
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+#if !defined(DOUBLE)
+#define REG0 wzr
+#define ALPHA s0
+#define TEMP s1
+#define TEMP1 s2
+#define TEMP2 s3
+#define TEMP3 s4
+#define TEMPV {v1.s}[0]
+#define TMP1 s2
+#define TMPV1 {v2.s}[0]
+#define TMP2 s3
+#define TMPV2 {v3.s}[0]
+#define SZ 4
+#define SHZ 2
+#else
+#define REG0 xzr
+#define ALPHA d0
+#define TEMP d1
+#define TEMP1 d2
+#define TEMP2 d3
+#define TEMP3 d4
+#define TEMPV {v1.d}[0]
+#define TMP1 d2
+#define TMPV1 {v2.d}[0]
+#define TMP2 d3
+#define TMPV2 {v3.d}[0]
+#define SZ 8
+#define SHZ 3
+#endif
+
+/******************************************************************************/
+
+.macro SAVE_REGS
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+.endm
+
+.macro KERNEL_F32
+#if !defined(DOUBLE)
+ ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
+ ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
+ fmla v1.4s, v5.4s, v9.4s
+ fmla v2.4s, v6.4s, v10.4s
+ fmla v3.4s, v7.4s, v11.4s
+ fmla v4.4s, v8.4s, v12.4s
+
+ ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
+ ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
+ fmla v1.4s, v13.4s, v17.4s
+ fmla v2.4s, v14.4s, v18.4s
+ fmla v3.4s, v15.4s, v19.4s
+ fmla v4.4s, v16.4s, v20.4s
+#else
+ ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
+ ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
+ fmla v1.2d, v5.2d, v9.2d
+ fmla v2.2d, v6.2d, v10.2d
+ fmla v3.2d, v7.2d, v11.2d
+ fmla v4.2d, v8.2d, v12.2d
+
+ ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
+ ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
+ fmla v1.2d, v13.2d, v17.2d
+ fmla v2.2d, v14.2d, v18.2d
+ fmla v3.2d, v15.2d, v19.2d
+ fmla v4.2d, v16.2d, v20.2d
+
+ ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
+ ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
+ fmla v1.2d, v5.2d, v9.2d
+ fmla v2.2d, v6.2d, v10.2d
+ fmla v3.2d, v7.2d, v11.2d
+ fmla v4.2d, v8.2d, v12.2d
+
+ ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
+ ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
+ fmla v1.2d, v13.2d, v17.2d
+ fmla v2.2d, v14.2d, v18.2d
+ fmla v3.2d, v15.2d, v19.2d
+ fmla v4.2d, v16.2d, v20.2d
+#endif
+.endm
+
+.macro KERNEL_F32_FINALIZE
+#if !defined(DOUBLE)
+ fadd v1.4s, v1.4s, v2.4s
+ fadd v1.4s, v1.4s, v3.4s
+ fadd v1.4s, v1.4s, v4.4s
+#else
+ fadd v1.2d, v1.2d, v2.2d
+ fadd v1.2d, v1.2d, v3.2d
+ fadd v1.2d, v1.2d, v4.2d
+#endif
+.endm
+
+.macro KERNEL_F4
+#if !defined(DOUBLE)
+ ld1 {v2.4s}, [A_PTR], #16
+ ld1 {v3.4s}, [X_PTR], #16
+ fmla v1.4s, v2.4s, v3.4s
+#else
+ ld1 {v2.2d}, [A_PTR], #16
+ ld1 {v3.2d}, [X_PTR], #16
+ fmla v1.2d, v2.2d, v3.2d
+
+ ld1 {v4.2d}, [A_PTR], #16
+ ld1 {v5.2d}, [X_PTR], #16
+ fmla v1.2d, v4.2d, v5.2d
+#endif
+.endm
+
+.macro KERNEL_F4_FINALIZE
+#if !defined(DOUBLE)
+ ext v2.16b, v1.16b, v1.16b, #8
+ fadd v1.2s, v1.2s, v2.2s
+ faddp TEMP, v1.2s
+#else
+ faddp TEMP, v1.2d
+#endif
+.endm
+
+.macro KERNEL_F1
+ ld1 TMPV1, [A_PTR], #SZ
+ ld1 TMPV2, [X_PTR], #SZ
+ fmadd TEMP, TMP1, TMP2, TEMP
+.endm
+
+.macro INIT_S
+ lsl INC_X, INC_X, #SHZ
+.endm
+
+.macro KERNEL_S1
+ ld1 TMPV1, [A_PTR], #SZ
+ ld1 TMPV2, [X_PTR], INC_X
+ fmadd TEMP, TMP1, TMP2, TEMP
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ ldr INC_Y, [sp]
+
+ SAVE_REGS
+
+ cmp N, xzr
+ ble gemv_t_kernel_L999
+ cmp M, xzr
+ ble gemv_t_kernel_L999
+
+ lsl LDA, LDA, #SHZ
+ lsl INC_Y, INC_Y, #SHZ
+ mov J, N
+
+ cmp INC_X, #1
+ bne gemv_t_kernel_S_BEGIN
+
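+/* Fast path for inc_x == 1: per column, partial dot products of
+   A(:,j) with x accumulate in v1-v4 over 32-element blocks (F320),
+   are folded into v1 by KERNEL_F32_FINALIZE, and the 4-element and
+   scalar tails reduce into TEMP before y[j] += alpha * TEMP. */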
+gemv_t_kernel_F_LOOP:
+
+ fmov TEMP, REG0
+ fmov TEMP1, REG0
+ fmov TEMP2, REG0
+ fmov TEMP3, REG0
+
+ mov A_PTR, A
+ mov X_PTR, X
+
+gemv_t_kernel_F32:
+
+ asr I, M, #5
+ cmp I, xzr
+ beq gemv_t_kernel_F4
+
+gemv_t_kernel_F320:
+
+ KERNEL_F32
+
+ subs I, I, #1
+ bne gemv_t_kernel_F320
+
+ KERNEL_F32_FINALIZE
+
+gemv_t_kernel_F4:
+ ands I, M, #31
+ asr I, I, #2
+ cmp I, xzr
+ beq gemv_t_kernel_F1
+
+gemv_t_kernel_F40:
+
+ KERNEL_F4
+
+ subs I, I, #1
+ bne gemv_t_kernel_F40
+
+gemv_t_kernel_F1:
+
+ KERNEL_F4_FINALIZE
+
+ ands I, M, #3
+ ble gemv_t_kernel_F_END
+
+gemv_t_kernel_F10:
+
+ KERNEL_F1
+
+ subs I, I, #1
+ bne gemv_t_kernel_F10
+
+gemv_t_kernel_F_END:
+
+ ld1 TMPV1, [Y]
+ add A, A, LDA
+ subs J, J, #1
+ fmadd TMP1, ALPHA, TEMP, TMP1
+ st1 TMPV1, [Y], INC_Y
+ bne gemv_t_kernel_F_LOOP
+
+ b gemv_t_kernel_L999
+
+gemv_t_kernel_S_BEGIN:
+
+ INIT_S
+
+gemv_t_kernel_S_LOOP:
+
+ fmov TEMP, REG0
+ mov A_PTR, A
+ mov X_PTR, X
+
+ asr I, M, #2
+ cmp I, xzr
+ ble gemv_t_kernel_S1
+
+gemv_t_kernel_S4:
+
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+
+ subs I, I, #1
+ bne gemv_t_kernel_S4
+
+gemv_t_kernel_S1:
+
+ ands I, M, #3
+ ble gemv_t_kernel_S_END
+
+gemv_t_kernel_S10:
+
+ KERNEL_S1
+
+ subs I, I, #1
+ bne gemv_t_kernel_S10
+
+gemv_t_kernel_S_END:
+
+ ld1 TMPV1, [Y]
+ add A, A, LDA
+ subs J, J, #1
+ fmadd TMP1, ALPHA, TEMP, TMP1
+ st1 TMPV1, [Y], INC_Y
+ bne gemv_t_kernel_S_LOOP
+
+gemv_t_kernel_L999:
+
+ RESTORE_REGS
+
+ mov w0, wzr
+ ret
+
+ EPILOGUE
--- /dev/null
+++ b/kernel/arm64/zgemv_n.S
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M x0 /* Y vector length */
+#define N x1 /* X vector length */
+#define A x3 /* A matrix address */
+#define LDA x4 /* A stride */
+#define X x5 /* X vector address */
+#define INC_X x6 /* X stride */
+#define Y x7 /* Y vector address */
+#define INC_Y x2 /* Y stride */
+#define A_PTR x9 /* loop A vector address */
+#define Y_IPTR x10 /* loop Y vector address */
+#define J x11 /* loop variable */
+#define I x12 /* loop variable */
+#define Y_OPTR x13 /* loop Y vector address */
+#define X_PTR x14 /* loop X vector address */
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+#if !defined(DOUBLE)
+#define ALPHA_R s0
+#define ALPHA_I s1
+#define ALPHA_R_COPY s7
+#define ALPHA_I_COPY s8
+#define SHZ 3
+#else
+#define ALPHA_R d0
+#define ALPHA_I d1
+#define ALPHA_R_COPY d7
+#define ALPHA_I_COPY d8
+#define SHZ 4
+#endif
+
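+/* ZGEMV N kernel: y := alpha * op(A) * x + y on complex elements,
+   where op(A) is conj(A) when CONJ is defined and x is conjugated
+   when XCONJ is defined. Scalar sketch of the plain case (unit
+   strides; ar/ai, xr/xi, yr/yi denote real and imaginary parts):
+
+       for (j = 0; j < n; j++) {
+           temp_r = alpha_r * xr[j] - alpha_i * xi[j];
+           temp_i = alpha_r * xi[j] + alpha_i * xr[j];
+           for (i = 0; i < m; i++) {
+               yr[i] += temp_r * ar(i,j) - temp_i * ai(i,j);
+               yi[i] += temp_r * ai(i,j) + temp_i * ar(i,j);
+           }
+       }
+*/
+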
+/******************************************************************************/
+
+.macro SAVE_REGS
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+.endm
+
+
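+/* INIT arranges alpha once per call: v7/v8 get alpha_r/alpha_i
+   broadcast across all lanes for the vectorized F4 path, and v0/v1
+   get [alpha_r, alpha_r] plus a +/-alpha_i pair (sign layout chosen
+   by XCONJ) so the F1/S1 path can form alpha*x with two fmla/fmls. */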
+.macro INIT
+ /********** INIT FOR F4 LOOP **********/
+ fmov ALPHA_R_COPY, ALPHA_R
+ fmov ALPHA_I_COPY, ALPHA_I
+#if !defined(DOUBLE)
+ ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
+ ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
+ ins v7.d[1], v7.d[0]
+ ins v8.d[1], v8.d[0]
+#else
+ ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
+ ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
+#endif
+
+ /******* INIT FOR F1 AND S1 LOOP ******/
+#if !defined(DOUBLE)
+ ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
+ fneg s2, ALPHA_I
+ ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA)
+#if !defined(XCONJ)
+ ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA)
+#endif
+#else
+ ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
+ fneg d2, ALPHA_I
+ ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA)
+#if !defined(XCONJ)
+ ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA)
+#endif
+#endif
+.endm
+
+.macro INIT_LOOP
+ /********** INIT_LOOP FOR F4 LOOP **********/
+#if !defined(DOUBLE)
+ ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
+ ins v10.s[0], v9.s[1]
+ ins v9.s[1], v9.s[0] // [R(X), R(X)]
+ ins v10.s[1], v10.s[0] // [I(X), I(X)]
+ ins v9.d[1], v9.d[0]
+ ins v10.d[1], v10.d[0]
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
+ fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
+ fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
+ fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
+#else
+ fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
+ fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
+ fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
+ fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
+ fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
+ fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
+ fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
+#else
+ fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
+ fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
+ fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)]
+ fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)]
+ fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
+#endif
+#endif // CONJ
+
+ /****** INIT_LOOP FOR F1 AND S1 LOOP ******/
+ ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
+ ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
+ fmul v2.2s, v0.2s, v2.2s
+ fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
+ ins v3.s[0], v2.s[1]
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fneg s4, s3
+ ins v3.s[1], v4.s[0]
+ ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)]
+ ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
+#else
+ fneg s4, s3
+ ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)]
+ ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)]
+ fneg s4, s2
+ ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
+#else
+ fneg s3, s3
+ ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)]
+ fneg s4, s2
+ ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
+#endif
+#endif // CONJ
+
+#else // DOUBLE
+
+ /********** INIT_LOOP FOR F4 LOOP **********/
+ ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
+ ins v10.d[0], v9.d[1]
+ ins v9.d[1], v9.d[0] // [R(X), R(X)]
+ ins v10.d[1], v10.d[0] // [I(X), I(X)]
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
+ fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
+ fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
+ fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
+#else
+ fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
+ fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
+ fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
+ fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
+ fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
+ fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
+ fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
+#else
+ fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
+ fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
+ fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)]
+ fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)]
+ fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
+#endif
+#endif // CONJ
+
+ /****** INIT_LOOP FOR F1 AND S1 LOOP ******/
+ ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
+ ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
+ fmul v2.2d, v0.2d, v2.2d
+ fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
+ ins v3.d[0], v2.d[1] // I(TEMP)
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fneg d4, d3 // -I(TEMP)
+ ins v3.d[1], v4.d[0]
+ ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)]
+ ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
+#else
+ fneg d4, d3 // -I(TEMP)
+ ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)]
+ ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)]
+ fneg d4, d2 // -R(TEMP)
+ ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
+#else
+ fneg d3, d3 // -I(TEMP)
+ ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)]
+ fneg d4, d2 // -R(TEMP)
+ ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
+#endif
+#endif // CONJ
+
+#endif // DOUBLE
+.endm
+
+.macro KERNEL_F4
+#if !defined(DOUBLE)
+
+ ld2 {v13.4s, v14.4s}, [A_PTR], #32
+ ld2 {v15.4s, v16.4s}, [Y_IPTR], #32
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
+ fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
+ fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
+ fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
+#else
+ fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
+ fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
+ fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
+ fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
+ fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
+ fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
+ fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
+#else
+ fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
+ fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
+ fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
+ fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
+#endif
+#endif // CONJ
+ st2 {v15.4s, v16.4s}, [Y_OPTR], #32
+
+#else // DOUBLE
+
+ ld2 {v13.2d, v14.2d}, [A_PTR], #32
+ ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
+ fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
+ fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
+ fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
+#else
+ fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
+ fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
+ fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
+ fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
+ fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
+ fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
+ fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
+#else
+ fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
+ fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
+ fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
+ fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
+#endif
+#endif // CONJ
+ st2 {v15.2d, v16.2d}, [Y_OPTR], #32
+
+ ld2 {v17.2d, v18.2d}, [A_PTR], #32
+ ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
+ fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
+ fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
+ fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
+#else
+ fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
+ fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
+ fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
+ fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
+ fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
+ fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
+ fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
+#else
+ fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
+ fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
+ fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
+ fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
+#endif
+#endif // CONJ
+ st2 {v19.2d, v20.2d}, [Y_OPTR], #32
+
+#endif
+
+.endm
+
+.macro KERNEL_F1
+#if !defined(DOUBLE)
+ ld1 {v4.2s}, [A_PTR], #8
+ ld1 {v5.2s}, [Y_IPTR], #8
+ ext v6.8b, v4.8b, v4.8b, #4
+ fmla v5.2s, v2.2s, v4.2s
+ fmla v5.2s, v3.2s, v6.2s
+ st1 {v5.2s}, [Y_OPTR], #8
+#else // DOUBLE
+ ld1 {v4.2d}, [A_PTR], #16
+ ld1 {v5.2d}, [Y_IPTR], #16
+ ext v6.16b, v4.16b, v4.16b, #8
+ fmla v5.2d, v2.2d, v4.2d
+ fmla v5.2d, v3.2d, v6.2d
+ st1 {v5.2d}, [Y_OPTR], #16
+#endif
+.endm
+
+.macro INIT_S
+ lsl INC_Y, INC_Y, #SHZ
+.endm
+
+.macro KERNEL_S1
+#if !defined(DOUBLE)
+ ld1 {v4.2s}, [A_PTR], #8
+ ld1 {v5.2s}, [Y_IPTR], INC_Y
+ ext v6.8b, v4.8b, v4.8b, #4
+ fmla v5.2s, v2.2s, v4.2s
+ fmla v5.2s, v3.2s, v6.2s
+ st1 {v5.2s}, [Y_OPTR], INC_Y
+#else // DOUBLE
+ ld1 {v4.2d}, [A_PTR], #16
+ ld1 {v5.2d}, [Y_IPTR], INC_Y
+ ext v6.16b, v4.16b, v4.16b, #8
+ fmla v5.2d, v2.2d, v4.2d
+ fmla v5.2d, v3.2d, v6.2d
+ st1 {v5.2d}, [Y_OPTR], INC_Y
+#endif
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ ldr INC_Y, [sp]
+
+ SAVE_REGS
+
+ cmp N, xzr
+ ble zgemv_n_kernel_L999
+ cmp M, xzr
+ ble zgemv_n_kernel_L999
+
+ lsl LDA, LDA, #SHZ
+ lsl INC_X, INC_X, #SHZ
+ mov J, N
+
+ INIT
+
+ cmp INC_Y, #1
+ bne zgemv_n_kernel_S_BEGIN
+
+zgemv_n_kernel_F_LOOP:
+ mov A_PTR, A
+ mov Y_IPTR, Y
+ mov Y_OPTR, Y
+ mov X_PTR, X
+ add X, X, INC_X
+ INIT_LOOP
+
+ asr I, M, #2
+ cmp I, xzr
+ beq zgemv_n_kernel_F1
+
+zgemv_n_kernel_F4:
+
+ KERNEL_F1
+ KERNEL_F1
+ KERNEL_F1
+ KERNEL_F1
+
+ subs I, I, #1
+ bne zgemv_n_kernel_F4
+
+zgemv_n_kernel_F1:
+
+ ands I, M, #3
+ ble zgemv_n_kernel_F_END
+
+zgemv_n_kernel_F10:
+
+ KERNEL_F1
+
+ subs I, I, #1
+ bne zgemv_n_kernel_F10
+
+zgemv_n_kernel_F_END:
+
+ add A, A, LDA
+ subs J, J, #1
+ bne zgemv_n_kernel_F_LOOP
+
+ b zgemv_n_kernel_L999
+
+zgemv_n_kernel_S_BEGIN:
+
+ INIT_S
+
+zgemv_n_kernel_S_LOOP:
+ mov A_PTR, A
+ mov Y_IPTR, Y
+ mov Y_OPTR, Y
+ mov X_PTR, X
+ add X, X, INC_X
+ INIT_LOOP
+
+ asr I, M, #2
+ cmp I, xzr
+ ble zgemv_n_kernel_S1
+
+zgemv_n_kernel_S4:
+
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+
+ subs I, I, #1
+ bne zgemv_n_kernel_S4
+
+zgemv_n_kernel_S1:
+
+ ands I, M, #3
+ ble zgemv_n_kernel_S_END
+
+zgemv_n_kernel_S10:
+
+ KERNEL_S1
+
+ subs I, I, #1
+ bne zgemv_n_kernel_S10
+
+zgemv_n_kernel_S_END:
+
+ add A, A, LDA
+ subs J, J, #1
+ bne zgemv_n_kernel_S_LOOP
+
+zgemv_n_kernel_L999:
+ RESTORE_REGS
+
+ mov w0, wzr
+ ret
+
+ EPILOGUE
--- /dev/null
+++ b/kernel/arm64/zgemv_t.S
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M x0 /* X vector length */
+#define N x1 /* Y vector length */
+#define A x3 /* A matrix address */
+#define LDA x4 /* A stride */
+#define X x5 /* X vector address */
+#define INC_X x6 /* X stride */
+#define Y x7 /* Y vector address */
+#define INC_Y x2 /* Y stride */
+#define A_PTR x9 /* loop A vector address */
+#define X_PTR x10 /* loop X vector address */
+#define J x11 /* loop variable */
+#define I x12 /* loop variable */
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+#if !defined(DOUBLE)
+#define ALPHA_R s0
+#define ALPHA_I s1
+#define ALPHA_R_COPY s7
+#define ALPHA_I_COPY s8
+#define SHZ 3
+#else
+#define ALPHA_R d0
+#define ALPHA_I d1
+#define ALPHA_R_COPY d7
+#define ALPHA_I_COPY d8
+#define SHZ 4
+#endif
+
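+/* ZGEMV T kernel: y := alpha * op(A)**T * x + y on complex elements
+   (A conjugated under CONJ, x under XCONJ). Scalar sketch of the
+   plain case (unit strides; ar/ai, xr/xi, yr/yi are real and
+   imaginary parts):
+
+       for (j = 0; j < n; j++) {
+           temp_r = 0.0; temp_i = 0.0;
+           for (i = 0; i < m; i++) {
+               temp_r += ar(i,j) * xr[i] - ai(i,j) * xi[i];
+               temp_i += ar(i,j) * xi[i] + ai(i,j) * xr[i];
+           }
+           yr[j] += alpha_r * temp_r - alpha_i * temp_i;
+           yi[j] += alpha_r * temp_i + alpha_i * temp_r;
+       }
+*/
+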
+/******************************************************************************/
+
+
+.macro SAVE_REGS
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+.endm
+
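+/* INIT packs alpha once for the final y[j] update: in the plain case
+   v0 = [alpha_r, alpha_r] and v1 = [alpha_i, -alpha_i] (signs move
+   under XCONJ), so y += alpha * TEMP at F_END/S_END is just two fmla
+   instructions on the packed complex TEMP. */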
+.macro INIT
+#if !defined(XCONJ)
+#if !defined(DOUBLE)
+ ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R
+ fneg s2, ALPHA_I
+ ins v1.s[1], v2.s[0]
+ ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I
+#else
+ ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R
+ fneg d2, ALPHA_I
+ ins v1.d[1], v2.d[0]
+ ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I
+#endif
+#else // XCONJ
+#if !defined(DOUBLE)
+ fneg s2, ALPHA_R
+ ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R
+ ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I
+#else
+ fneg d2, ALPHA_R
+ ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R
+ ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I
+#endif
+#endif
+.endm
+
+.macro INIT_LOOP
+ fmov d9, xzr // TEMP_R = [0, 0]
+ fmov d10, xzr // TEMP_I = [0, 0]
+#if defined(DOUBLE)
+ fmov d15, xzr // second TEMP_R accumulator = [0, 0]
+ fmov d16, xzr // second TEMP_I accumulator = [0, 0]
+#endif
+
+ fmov d2, xzr // TEMP = [0, 0]
+.endm
+
+.macro KERNEL_F4
+#if !defined(DOUBLE)
+
+ ld2 {v11.4s, v12.4s}, [X_PTR], #32
+ ld2 {v13.4s, v14.4s}, [A_PTR], #32
+
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
+ fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
+ fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
+ fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
+#else
+ fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
+ fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
+ fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
+ fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
+ fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
+ fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
+ fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
+#else
+ fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
+ fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
+ fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
+ fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
+#endif
+#endif // CONJ
+
+#else // DOUBLE
+ ld2 {v11.2d, v12.2d}, [X_PTR], #32
+ ld2 {v13.2d, v14.2d}, [A_PTR], #32
+ prfm PLDL1STRM, [X_PTR, #512]
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
+ fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
+ fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
+ fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
+#else
+ fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
+ fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
+ fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
+ fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
+ fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
+ fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
+ fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
+#else
+ fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
+ fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
+ fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
+ fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
+#endif
+#endif // CONJ
+ ld2 {v17.2d, v18.2d}, [X_PTR], #32
+ ld2 {v19.2d, v20.2d}, [A_PTR], #32
+ prfm PLDL1STRM, [A_PTR, #512]
+#if !defined(CONJ)
+#if !defined(XCONJ)
+ fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
+ fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
+ fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
+ fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
+#else
+ fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
+ fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
+ fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
+ fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
+#endif
+#else // CONJ
+#if !defined(XCONJ)
+ fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
+ fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
+ fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
+ fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
+#else
+ fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
+ fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
+ fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
+ fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
+#endif
+#endif // CONJ
+#endif //DOUBLE
+.endm
+
+.macro KERNEL_F4_FINALIZE
+#if !defined(DOUBLE)
+ ext v21.16b, v9.16b, v9.16b, #8
+ fadd v9.2s, v9.2s, v21.2s
+ faddp s9, v9.2s
+
+ ext v21.16b, v10.16b, v10.16b, #8
+ fadd v10.2s, v10.2s, v21.2s
+ faddp s10, v10.2s
+
+ ins v2.s[0], v9.s[0]
+ ins v2.s[1], v10.s[0]
+#else
+ fadd v9.2d, v9.2d, v15.2d
+ fadd v10.2d, v10.2d, v16.2d
+
+ faddp d9, v9.2d
+ faddp d10, v10.2d
+
+ ins v2.d[0], v9.d[0]
+ ins v2.d[1], v10.d[0]
+#endif
+.endm
+
+
+.macro KERNEL_F1
+#if !defined(DOUBLE)
+ ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
+ ld1 {v5.s}[0], [A_PTR], #4 // A1
+ ld1 {v6.2s}, [X_PTR], #8 // [X1, X0]
+ fneg s16, s5
+ ins v5.s[1], v16.s[0] // [-A1, A1]
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
+#endif
+ ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
+ fmla v2.2s, v4.2s, v6.2s
+ fmla v2.2s, v5.2s, v7.2s
+#else // DOUBLE
+ ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
+ ld1 {v5.d}[0], [A_PTR], #8 // A1
+ ld1 {v6.2d}, [X_PTR], #16 // [X1, X0]
+ fneg d16, d5
+ ins v5.d[1], v16.d[0] // [-A1, A1]
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
+#endif
+ ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
+ fmla v2.2d, v4.2d, v6.2d
+ fmla v2.2d, v5.2d, v7.2d
+#endif
+.endm
+
+.macro INIT_S
+ lsl INC_X, INC_X, #SHZ
+.endm
+
+.macro KERNEL_S1
+#if !defined(DOUBLE)
+ ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
+ ld1 {v5.s}[0], [A_PTR], #4 // A1
+ ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0]
+ fneg s16, s5
+ ins v5.s[1], v16.s[0] // [-A1, A1]
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
+#endif
+ ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
+ fmla v2.2s, v4.2s, v6.2s
+ fmla v2.2s, v5.2s, v7.2s
+#else // DOUBLE
+ ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
+ ld1 {v5.d}[0], [A_PTR], #8 // A1
+ ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0]
+ fneg d16, d5
+ ins v5.d[1], v16.d[0] // [-A1, A1]
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
+#endif
+ ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
+ fmla v2.2d, v4.2d, v6.2d
+ fmla v2.2d, v5.2d, v7.2d
+#endif
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ ldr INC_Y, [sp]
+ SAVE_REGS
+
+ cmp N, xzr
+ ble zgemv_t_kernel_L999
+ cmp M, xzr
+ ble zgemv_t_kernel_L999
+
+ lsl LDA, LDA, #SHZ
+ lsl INC_Y, INC_Y, #SHZ
+ mov J, N
+
+ INIT
+
+ cmp INC_X, #1
+ bne zgemv_t_kernel_S_BEGIN
+
+zgemv_t_kernel_F_LOOP:
+
+ mov A_PTR, A
+ mov X_PTR, X
+
+ INIT_LOOP
+
+ asr I, M, #2
+ cmp I, xzr
+ beq zgemv_t_kernel_F1
+
+zgemv_t_kernel_F4:
+
+ KERNEL_F4
+
+ subs I, I, #1
+ bne zgemv_t_kernel_F4
+
+ KERNEL_F4_FINALIZE
+
+zgemv_t_kernel_F1:
+
+ ands I, M, #3
+ ble zgemv_t_kernel_F_END
+
+zgemv_t_kernel_F10:
+
+ KERNEL_F1
+
+ subs I, I, #1
+ bne zgemv_t_kernel_F10
+
+zgemv_t_kernel_F_END:
+
+#if !defined(DOUBLE)
+ ld1 {v4.2s}, [Y]
+ ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
+ fmla v4.2s, v0.2s, v2.2s
+ fmla v4.2s, v1.2s, v3.2s
+ st1 {v4.2s}, [Y], INC_Y
+#else // DOUBLE
+ ld1 {v4.2d}, [Y]
+ ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
+ fmla v4.2d, v0.2d, v2.2d
+ fmla v4.2d, v1.2d, v3.2d
+ st1 {v4.2d}, [Y], INC_Y
+#endif
+
+ add A, A, LDA
+ subs J, J, #1
+ bne zgemv_t_kernel_F_LOOP
+
+ b zgemv_t_kernel_L999
+
+zgemv_t_kernel_S_BEGIN:
+
+ INIT_S
+
+zgemv_t_kernel_S_LOOP:
+
+ mov A_PTR, A
+ mov X_PTR, X
+ INIT_LOOP
+
+ asr I, M, #2
+ cmp I, xzr
+ ble zgemv_t_kernel_S1
+
+zgemv_t_kernel_S4:
+
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+ KERNEL_S1
+
+ subs I, I, #1
+ bne zgemv_t_kernel_S4
+
+zgemv_t_kernel_S1:
+
+ ands I, M, #3
+ ble zgemv_t_kernel_S_END
+
+zgemv_t_kernel_S10:
+
+ KERNEL_S1
+
+ subs I, I, #1
+ bne zgemv_t_kernel_S10
+
+zgemv_t_kernel_S_END:
+
+#if !defined(DOUBLE)
+ ld1 {v4.2s}, [Y]
+ ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
+ fmla v4.2s, v0.2s, v2.2s
+ fmla v4.2s, v1.2s, v3.2s
+ st1 {v4.2s}, [Y], INC_Y
+#else // DOUBLE
+ ld1 {v4.2d}, [Y]
+ ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
+ fmla v4.2d, v0.2d, v2.2d
+ fmla v4.2d, v1.2d, v3.2d
+ st1 {v4.2d}, [Y], INC_Y
+#endif
+
+ add A, A, LDA
+ subs J, J, #1
+ bne zgemv_t_kernel_S_LOOP
+
+zgemv_t_kernel_L999:
+ RESTORE_REGS
+ mov w0, wzr
+ ret
+
+ EPILOGUE