SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
-CGEMVNKERNEL = zgemv_n.c
-ZGEMVNKERNEL = zgemv_n.c
+CGEMVNKERNEL = cgemv_n_vfp.S
+ZGEMVNKERNEL = zgemv_n_vfp.S
SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
-CGEMVTKERNEL = zgemv_t.c
-ZGEMVTKERNEL = zgemv_t.c
+CGEMVTKERNEL = cgemv_t_vfp.S
+ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CSCALKERNEL = scal_vfp.S
ZSCALKERNEL = scal_vfp.S
-SGEMVNKERNEL = gemv_n_vfpv3.S
-DGEMVNKERNEL = gemv_n_vfpv3.S
-CGEMVNKERNEL = zgemv_n.c
-ZGEMVNKERNEL = zgemv_n.c
-
-SGEMVTKERNEL = gemv_t_vfpv3.S
-DGEMVTKERNEL = gemv_t_vfpv3.S
-CGEMVTKERNEL = zgemv_t.c
-ZGEMVTKERNEL = zgemv_t.c
+SGEMVNKERNEL = gemv_n_vfp.S
+DGEMVNKERNEL = gemv_n_vfp.S
+CGEMVNKERNEL = cgemv_n_vfp.S
+ZGEMVNKERNEL = zgemv_n_vfp.S
+
+SGEMVTKERNEL = gemv_t_vfp.S
+DGEMVTKERNEL = gemv_t_vfp.S
+CGEMVTKERNEL = cgemv_t_vfp.S
+ZGEMVTKERNEL = zgemv_t_vfp.S
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/29 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
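+
+/**************************************************************************************
+* CGEMV kernel (no transpose) for ARM VFP: y := alpha * A * x + y for
+* single-precision complex data. CONJ / XCONJ select the conjugated variants
+* through the KMAC_* / FMAC_* sign macros defined below.
+**************************************************************************************/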
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#define OLD_A r3
+#define OLD_M r0
+
+#define AO1 r0
+#define N r1
+#define J r2
+
+#define AO2 r4
+#define XO r5
+#define YO r6
+#define LDA r7
+#define INC_X r8
+#define INC_Y r9
+
+#define I r12
+
+#define ALPHA_I [fp, #-236]
+#define ALPHA_R [fp, #-244]
+
+#define M [fp, #-252 ]
+#define A [fp, #-256 ]
+
+
+#define X_PRE 64
+#define Y_PRE 0
+#define A_PRE 0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#if !defined(CONJ) && !defined(XCONJ)
+
+ #define KMAC_R fnmacs
+ #define KMAC_I fmacs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fnmacs
+ #define FMAC_I1 fmacs
+ #define FMAC_I2 fmacs
+
+#elif defined(CONJ) && !defined(XCONJ)
+
+ #define KMAC_R fmacs
+ #define KMAC_I fnmacs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fnmacs
+ #define FMAC_I1 fmacs
+ #define FMAC_I2 fmacs
+
+#elif !defined(CONJ) && defined(XCONJ)
+
+ #define KMAC_R fmacs
+ #define KMAC_I fnmacs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fmacs
+ #define FMAC_I1 fnmacs
+ #define FMAC_I2 fmacs
+
+#else
+
+ #define KMAC_R fnmacs
+ #define KMAC_I fmacs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fmacs
+ #define FMAC_I1 fnmacs
+ #define FMAC_I2 fmacs
+
+#endif
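+
+// Complex multiply-accumulate layout (non-conjugated case shown):
+//   acc_r += a_r*x_r ; KMAC_R subtracts a_i*x_i   (fnmacs)
+//   acc_i += a_r*x_i ; KMAC_I adds      a_i*x_r   (fmacs)
+// The FMAC_* macros apply the complex alpha to the accumulators in SAVE_*.
+// Macros prefixed F assume inc_x == inc_y == 1; the S variants use the
+// general strides held in INC_X / INC_Y.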
+
+.macro INIT_F4
+
+ pld [ YO, #Y_PRE ]
+ vsub.f32 s8 , s8 , s8
+ vmov.f32 s9 , s8
+ vmov.f32 s10, s8
+ vmov.f32 s11, s8
+ vmov.f32 s12, s8
+ vmov.f32 s13, s8
+ vmov.f32 s14, s8
+ vmov.f32 s15, s8
+
+.endm
+
+.macro KERNEL_F4X4
+
+ pld [ XO, #X_PRE ]
+ KERNEL_F4X1
+ KERNEL_F4X1
+ KERNEL_F4X1
+ KERNEL_F4X1
+
+.endm
+
+.macro KERNEL_F4X1
+
+ pld [ AO2, #A_PRE ]
+ flds s0 , [ AO1 ]
+ flds s1 , [ AO1, #4 ]
+ flds s2 , [ AO1, #8 ]
+ flds s3 , [ AO1, #12 ]
+
+ flds s4 , [ XO ]
+ flds s5 , [ XO, #4 ]
+
+ fmacs s8 , s0, s4
+ fmacs s9 , s0, s5
+ fmacs s10 , s2, s4
+ fmacs s11 , s2, s5
+
+ KMAC_R s8 , s1, s5
+ KMAC_I s9 , s1, s4
+ KMAC_R s10 , s3, s5
+ KMAC_I s11 , s3, s4
+
+ flds s0 , [ AO1, #16 ]
+ flds s1 , [ AO1, #20 ]
+ flds s2 , [ AO1, #24 ]
+ flds s3 , [ AO1, #28 ]
+
+ fmacs s12 , s0, s4
+ fmacs s13 , s0, s5
+ fmacs s14 , s2, s4
+ fmacs s15 , s2, s5
+
+ KMAC_R s12 , s1, s5
+ KMAC_I s13 , s1, s4
+ KMAC_R s14 , s3, s5
+ KMAC_I s15 , s3, s4
+
+ add XO , XO, #8
+ add AO1 , AO1, LDA
+ add AO2 , AO2, LDA
+
+.endm
+
+.macro SAVE_F4
+
+ flds s0, ALPHA_R
+ flds s1, ALPHA_I
+
+ fldmias YO, { s4 - s7 }
+
+ FMAC_R1 s4 , s0 , s8
+ FMAC_I1 s5 , s0 , s9
+ FMAC_R2 s4 , s1 , s9
+ FMAC_I2 s5 , s1 , s8
+
+ FMAC_R1 s6 , s0 , s10
+ FMAC_I1 s7 , s0 , s11
+ FMAC_R2 s6 , s1 , s11
+ FMAC_I2 s7 , s1 , s10
+
+ fstmias YO!, { s4 - s7 }
+
+ fldmias YO, { s4 - s7 }
+
+ FMAC_R1 s4 , s0 , s12
+ FMAC_I1 s5 , s0 , s13
+ FMAC_R2 s4 , s1 , s13
+ FMAC_I2 s5 , s1 , s12
+
+ FMAC_R1 s6 , s0 , s14
+ FMAC_I1 s7 , s0 , s15
+ FMAC_R2 s6 , s1 , s15
+ FMAC_I2 s7 , s1 , s14
+
+ fstmias YO!, { s4 - s7 }
+
+.endm
+
+
+
+
+.macro INIT_F1
+
+ vsub.f32 s8 , s8 , s8
+ vmov.f32 s9 , s8
+
+.endm
+
+.macro KERNEL_F1X1
+
+ flds s0 , [ AO1 ]
+ flds s1 , [ AO1, #4 ]
+
+ flds s4 , [ XO ]
+ flds s5 , [ XO, #4 ]
+
+ fmacs s8 , s0, s4
+ fmacs s9 , s0, s5
+
+ KMAC_R s8 , s1, s5
+ KMAC_I s9 , s1, s4
+
+ add XO , XO, #8
+ add AO1 , AO1, LDA
+
+
+.endm
+
+.macro SAVE_F1
+
+ flds s0, ALPHA_R
+ flds s1, ALPHA_I
+
+ fldmias YO, { s4 - s5 }
+
+ FMAC_R1 s4 , s0 , s8
+ FMAC_I1 s5 , s0 , s9
+ FMAC_R2 s4 , s1 , s9
+ FMAC_I2 s5 , s1 , s8
+
+ fstmias YO, { s4 - s5 }
+
+ add YO, YO, #8
+
+.endm
+
+/****************************************************************************************/
+
+.macro INIT_S4
+
+ vsub.f32 s8 , s8 , s8
+ vmov.f32 s9 , s8
+ vmov.f32 s10, s8
+ vmov.f32 s11, s8
+ vmov.f32 s12, s8
+ vmov.f32 s13, s8
+ vmov.f32 s14, s8
+ vmov.f32 s15, s8
+
+.endm
+
+.macro KERNEL_S4X4
+
+ KERNEL_S4X1
+ KERNEL_S4X1
+ KERNEL_S4X1
+ KERNEL_S4X1
+
+.endm
+
+.macro KERNEL_S4X1
+
+ flds s0 , [ AO1 ]
+ flds s1 , [ AO1, #4 ]
+ flds s2 , [ AO1, #8 ]
+ flds s3 , [ AO1, #12 ]
+
+ flds s4 , [ XO ]
+ flds s5 , [ XO, #4 ]
+
+ fmacs s8 , s0, s4
+ fmacs s9 , s0, s5
+ fmacs s10 , s2, s4
+ fmacs s11 , s2, s5
+
+ KMAC_R s8 , s1, s5
+ KMAC_I s9 , s1, s4
+ KMAC_R s10 , s3, s5
+ KMAC_I s11 , s3, s4
+
+ flds s0 , [ AO1, #16 ]
+ flds s1 , [ AO1, #20 ]
+ flds s2 , [ AO1, #24 ]
+ flds s3 , [ AO1, #28 ]
+
+ fmacs s12 , s0, s4
+ fmacs s13 , s0, s5
+ fmacs s14 , s2, s4
+ fmacs s15 , s2, s5
+
+ KMAC_R s12 , s1, s5
+ KMAC_I s13 , s1, s4
+ KMAC_R s14 , s3, s5
+ KMAC_I s15 , s3, s4
+
+ add XO , XO, INC_X
+ add AO1 , AO1, LDA
+ add AO2 , AO2, LDA
+
+.endm
+
+.macro SAVE_S4
+
+ flds s0, ALPHA_R
+ flds s1, ALPHA_I
+
+ fldmias YO, { s4 - s5 }
+
+ FMAC_R1 s4 , s0 , s8
+ FMAC_I1 s5 , s0 , s9
+ FMAC_R2 s4 , s1 , s9
+ FMAC_I2 s5 , s1 , s8
+
+ fstmias YO, { s4 - s5 }
+
+ add YO, YO, INC_Y
+
+ fldmias YO, { s6 - s7 }
+
+ FMAC_R1 s6 , s0 , s10
+ FMAC_I1 s7 , s0 , s11
+ FMAC_R2 s6 , s1 , s11
+ FMAC_I2 s7 , s1 , s10
+
+ fstmias YO, { s6 - s7 }
+
+ add YO, YO, INC_Y
+
+ fldmias YO, { s4 - s5 }
+
+ FMAC_R1 s4 , s0 , s12
+ FMAC_I1 s5 , s0 , s13
+ FMAC_R2 s4 , s1 , s13
+ FMAC_I2 s5 , s1 , s12
+
+ fstmias YO, { s4 - s5 }
+
+ add YO, YO, INC_Y
+
+ fldmias YO, { s6 - s7 }
+
+ FMAC_R1 s6 , s0 , s14
+ FMAC_I1 s7 , s0 , s15
+ FMAC_R2 s6 , s1 , s15
+ FMAC_I2 s7 , s1 , s14
+
+ fstmias YO, { s6 - s7 }
+
+ add YO, YO, INC_Y
+
+.endm
+
+
+
+
+.macro INIT_S1
+
+ vsub.f32 s8 , s8 , s8
+ vmov.f32 s9 , s8
+
+.endm
+
+.macro KERNEL_S1X1
+
+ flds s0 , [ AO1 ]
+ flds s1 , [ AO1, #4 ]
+
+ flds s4 , [ XO ]
+ flds s5 , [ XO, #4 ]
+
+ fmacs s8 , s0, s4
+ fmacs s9 , s0, s5
+
+ KMAC_R s8 , s1, s5
+ KMAC_I s9 , s1, s4
+
+ add XO , XO, INC_X
+ add AO1 , AO1, LDA
+
+
+.endm
+
+.macro SAVE_S1
+
+ flds s0, ALPHA_R
+ flds s1, ALPHA_I
+
+ fldmias YO, { s4 - s5 }
+
+ FMAC_R1 s4 , s0 , s8
+ FMAC_I1 s5 , s0 , s9
+ FMAC_R2 s4 , s1 , s9
+ FMAC_I2 s5 , s1 , s8
+
+ fstmias YO, { s4 - s5 }
+
+ add YO, YO, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ push {r4 - r9 , fp}
+ add fp, sp, #28
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ sub r12, fp, #192
+
+#if defined(DOUBLE)
+ vstm r12, { d8 - d15 } // store floating point registers
+#else
+ vstm r12, { s8 - s15 } // store floating point registers
+#endif
+
+ cmp OLD_M, #0
+ ble cgemvn_kernel_L999
+
+ cmp N, #0
+ ble cgemvn_kernel_L999
+
+ str OLD_A, A
+ str OLD_M, M
+ vstr s0 , ALPHA_R
+ vstr s1 , ALPHA_I
+
+
+ ldr INC_X , OLD_INC_X
+ ldr INC_Y , OLD_INC_Y
+
+ cmp INC_X, #0
+ beq cgemvn_kernel_L999
+
+ cmp INC_Y, #0
+ beq cgemvn_kernel_L999
+
+ ldr LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+ lsl LDA, LDA, #4 // LDA * SIZE * 2
+#else
+ lsl LDA, LDA, #3 // LDA * SIZE * 2
+#endif
+
+ cmp INC_X, #1
+ bne cgemvn_kernel_S4_BEGIN
+
+ cmp INC_Y, #1
+ bne cgemvn_kernel_S4_BEGIN
+
+
+cgemvn_kernel_F4_BEGIN:
+
+ ldr YO , Y
+
+ ldr I, M
+ asrs I, I, #2 // I = M / 4
+ ble cgemvn_kernel_F1_BEGIN
+
+cgemvn_kernel_F4X4:
+
+ ldr AO1, A
+ add AO2, AO1, LDA
+ add r3 , AO1, #32
+ str r3 , A
+
+ add AO2, AO2, LDA
+ add AO2, AO2, LDA
+
+ ldr XO , X
+
+ INIT_F4
+
+ asrs J, N, #2 // J = N / 4
+ ble cgemvn_kernel_F4X1
+
+
+cgemvn_kernel_F4X4_10:
+
+ KERNEL_F4X4
+
+ subs J, J, #1
+ bne cgemvn_kernel_F4X4_10
+
+
+cgemvn_kernel_F4X1:
+
+ ands J, N , #3
+ ble cgemvn_kernel_F4_END
+
+cgemvn_kernel_F4X1_10:
+
+ KERNEL_F4X1
+
+ subs J, J, #1
+ bne cgemvn_kernel_F4X1_10
+
+
+cgemvn_kernel_F4_END:
+
+ SAVE_F4
+
+ subs I , I , #1
+ bne cgemvn_kernel_F4X4
+
+
+cgemvn_kernel_F1_BEGIN:
+
+ ldr I, M
+ ands I, I , #3
+ ble cgemvn_kernel_L999
+
+cgemvn_kernel_F1X1:
+
+ ldr AO1, A
+ add r3, AO1, #8
+ str r3, A
+
+ ldr XO , X
+
+ INIT_F1
+
+ mov J, N
+
+
+cgemvn_kernel_F1X1_10:
+
+ KERNEL_F1X1
+
+ subs J, J, #1
+ bne cgemvn_kernel_F1X1_10
+
+
+cgemvn_kernel_F1_END:
+
+ SAVE_F1
+
+ subs I , I , #1
+ bne cgemvn_kernel_F1X1
+
+ b cgemvn_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+cgemvn_kernel_S4_BEGIN:
+
+#if defined(DOUBLE)
+ lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
+ lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
+#else
+ lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
+ lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
+#endif
+
+ ldr YO , Y
+
+ ldr I, M
+ asrs I, I, #2 // I = M / 4
+ ble cgemvn_kernel_S1_BEGIN
+
+cgemvn_kernel_S4X4:
+
+ ldr AO1, A
+ add AO2, AO1, LDA
+ add r3 , AO1, #32
+ str r3 , A
+
+ ldr XO , X
+
+ INIT_S4
+
+ asrs J, N, #2 // J = N / 4
+ ble cgemvn_kernel_S4X1
+
+
+cgemvn_kernel_S4X4_10:
+
+ KERNEL_S4X4
+
+ subs J, J, #1
+ bne cgemvn_kernel_S4X4_10
+
+
+cgemvn_kernel_S4X1:
+
+ ands J, N , #3
+ ble cgemvn_kernel_S4_END
+
+cgemvn_kernel_S4X1_10:
+
+ KERNEL_S4X1
+
+ subs J, J, #1
+ bne cgemvn_kernel_S4X1_10
+
+
+cgemvn_kernel_S4_END:
+
+ SAVE_S4
+
+ subs I , I , #1
+ bne cgemvn_kernel_S4X4
+
+
+cgemvn_kernel_S1_BEGIN:
+
+ ldr I, M
+ ands I, I , #3
+ ble cgemvn_kernel_L999
+
+cgemvn_kernel_S1X1:
+
+ ldr AO1, A
+ add r3, AO1, #8
+ str r3, A
+
+ ldr XO , X
+
+ INIT_S1
+
+ mov J, N
+
+
+cgemvn_kernel_S1X1_10:
+
+ KERNEL_S1X1
+
+ subs J, J, #1
+ bne cgemvn_kernel_S1X1_10
+
+
+cgemvn_kernel_S1_END:
+
+ SAVE_S1
+
+ subs I , I , #1
+ bne cgemvn_kernel_S1X1
+
+
+/*************************************************************************************************************/
+
+cgemvn_kernel_L999:
+
+ sub r3, fp, #192
+
+#if defined(DOUBLE)
+ vldm r3, { d8 - d15 } // restore floating point registers
+#else
+ vldm r3, { s8 - s15 } // restore floating point registers
+#endif
+
+ mov r0, #0 // set return value
+
+ sub sp, fp, #28
+ pop {r4 -r9 ,fp}
+ bx lr
+
+ EPILOGUE
+
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/29 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
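+
+/**************************************************************************************
+* CGEMV kernel (transposed) for ARM VFP: y := alpha * A^T * x + y for
+* single-precision complex data, with CONJ / XCONJ selecting the conjugated
+* variants. Each column of A is reduced to one complex dot product with x.
+**************************************************************************************/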
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#define OLD_A r3
+#define OLD_N r1
+
+#define M r0
+#define AO1 r1
+#define J r2
+
+#define AO2 r4
+#define XO r5
+#define YO r6
+#define LDA r7
+#define INC_X r8
+#define INC_Y r9
+
+#define I r12
+
+#define N [fp, #-252 ]
+#define A [fp, #-256 ]
+
+
+#define X_PRE 512
+#define A_PRE 512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#if !defined(CONJ) && !defined(XCONJ)
+
+ #define KMAC_R fnmacs
+ #define KMAC_I fmacs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fnmacs
+ #define FMAC_I1 fmacs
+ #define FMAC_I2 fmacs
+
+#elif defined(CONJ) && !defined(XCONJ)
+
+ #define KMAC_R fmacs
+ #define KMAC_I fnmacs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fnmacs
+ #define FMAC_I1 fmacs
+ #define FMAC_I2 fmacs
+
+#elif !defined(CONJ) && defined(XCONJ)
+
+ #define KMAC_R fmacs
+ #define KMAC_I fnmacs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fmacs
+ #define FMAC_I1 fnmacs
+ #define FMAC_I2 fmacs
+
+#else
+
+ #define KMAC_R fnmacs
+ #define KMAC_I fmacs
+
+ #define FMAC_R1 fmacs
+ #define FMAC_R2 fmacs
+ #define FMAC_I1 fnmacs
+ #define FMAC_I2 fmacs
+
+#endif
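+
+// The F-path macros assume inc_x == inc_y == 1: KERNEL_F2X1 walks two columns
+// of A in lock-step and keeps two complex dot products in s12-s15. alpha is
+// expected in s0/s1 on entry and is applied to the sums in the SAVE_* macros.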
+
+
+
+.macro INIT_F2
+
+ vsub.f32 s12, s12, s12
+ vsub.f32 s13, s13, s13
+ vsub.f32 s14, s14, s14
+ vsub.f32 s15, s15, s15
+
+.endm
+
+.macro KERNEL_F2X4
+
+ KERNEL_F2X1
+ KERNEL_F2X1
+ KERNEL_F2X1
+ KERNEL_F2X1
+
+.endm
+
+.macro KERNEL_F2X1
+
+ fldmias XO! , { s2 - s3 }
+ fldmias AO1!, { s4 - s5 }
+ fldmias AO2!, { s8 - s9 }
+
+ fmacs s12 , s4 , s2
+ fmacs s13 , s4 , s3
+ KMAC_R s12 , s5 , s3
+ KMAC_I s13 , s5 , s2
+
+ fmacs s14 , s8 , s2
+ fmacs s15 , s8 , s3
+ KMAC_R s14 , s9 , s3
+ KMAC_I s15 , s9 , s2
+
+.endm
+
+.macro SAVE_F2
+
+ fldmias YO, { s4 - s7 }
+
+ FMAC_R1 s4 , s0 , s12
+ FMAC_I1 s5 , s0 , s13
+ FMAC_R2 s4 , s1 , s13
+ FMAC_I2 s5 , s1 , s12
+
+ FMAC_R1 s6 , s0 , s14
+ FMAC_I1 s7 , s0 , s15
+ FMAC_R2 s6 , s1 , s15
+ FMAC_I2 s7 , s1 , s14
+
+ fstmias YO!, { s4 - s7 }
+
+.endm
+
+/************************************************************************************************/
+
+.macro INIT_F1
+
+ vsub.f32 s12, s12, s12
+ vsub.f32 s13, s13, s13
+
+.endm
+
+.macro KERNEL_F1X4
+
+ KERNEL_F1X1
+ KERNEL_F1X1
+ KERNEL_F1X1
+ KERNEL_F1X1
+
+.endm
+
+.macro KERNEL_F1X1
+
+ fldmias XO! , { s2 - s3 }
+ fldmias AO1!, { s4 - s5 }
+
+ fmacs s12 , s4 , s2
+ fmacs s13 , s4 , s3
+ KMAC_R s12 , s5 , s3
+ KMAC_I s13 , s5 , s2
+
+.endm
+
+.macro SAVE_F1
+
+ fldmias YO, { s4 - s5 }
+
+ FMAC_R1 s4 , s0 , s12
+ FMAC_I1 s5 , s0 , s13
+ FMAC_R2 s4 , s1 , s13
+ FMAC_I2 s5 , s1 , s12
+
+ fstmias YO!, { s4 - s5 }
+
+.endm
+
+/************************************************************************************************/
+
+.macro INIT_S2
+
+ vsub.f32 s12, s12, s12
+ vsub.f32 s13, s13, s13
+ vsub.f32 s14, s14, s14
+ vsub.f32 s15, s15, s15
+
+.endm
+
+.macro KERNEL_S2X4
+
+ KERNEL_S2X1
+ KERNEL_S2X1
+ KERNEL_S2X1
+ KERNEL_S2X1
+
+.endm
+
+.macro KERNEL_S2X1
+
+ fldmias XO , { s2 - s3 }
+ fldmias AO1!, { s4 - s5 }
+ fldmias AO2!, { s8 - s9 }
+
+ fmacs s12 , s4 , s2
+ fmacs s13 , s4 , s3
+ KMAC_R s12 , s5 , s3
+ KMAC_I s13 , s5 , s2
+
+ fmacs s14 , s8 , s2
+ fmacs s15 , s8 , s3
+ KMAC_R s14 , s9 , s3
+ KMAC_I s15 , s9 , s2
+
+ add XO, XO, INC_X
+
+.endm
+
+.macro SAVE_S2
+
+ fldmias YO, { s4 - s5 }
+
+ FMAC_R1 s4 , s0 , s12
+ FMAC_I1 s5 , s0 , s13
+ FMAC_R2 s4 , s1 , s13
+ FMAC_I2 s5 , s1 , s12
+
+ fstmias YO, { s4 - s5 }
+
+ add YO, YO, INC_Y
+
+ fldmias YO, { s6 - s7 }
+
+ FMAC_R1 s6 , s0 , s14
+ FMAC_I1 s7 , s0 , s15
+ FMAC_R2 s6 , s1 , s15
+ FMAC_I2 s7 , s1 , s14
+
+ fstmias YO, { s6 - s7 }
+
+ add YO, YO, INC_Y
+
+.endm
+
+/************************************************************************************************/
+
+.macro INIT_S1
+
+ vsub.f32 s12, s12, s12
+ vsub.f32 s13, s13, s13
+
+.endm
+
+.macro KERNEL_S1X4
+
+ KERNEL_S1X1
+ KERNEL_S1X1
+ KERNEL_S1X1
+ KERNEL_S1X1
+
+.endm
+
+.macro KERNEL_S1X1
+
+ fldmias XO , { s2 - s3 }
+ fldmias AO1!, { s4 - s5 }
+
+ fmacs s12 , s4 , s2
+ fmacs s13 , s4 , s3
+ KMAC_R s12 , s5 , s3
+ KMAC_I s13 , s5 , s2
+
+ add XO, XO, INC_X
+
+.endm
+
+.macro SAVE_S1
+
+ fldmias YO, { s4 - s5 }
+
+ FMAC_R1 s4 , s0 , s12
+ FMAC_I1 s5 , s0 , s13
+ FMAC_R2 s4 , s1 , s13
+ FMAC_I2 s5 , s1 , s12
+
+ fstmias YO, { s4 - s5 }
+
+ add YO, YO, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ push {r4 - r9 , fp}
+ add fp, sp, #28
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ sub r12, fp, #192
+
+#if defined(DOUBLE)
+ vstm r12, { d8 - d15 } // store floating point registers
+#else
+ vstm r12, { s8 - s15 } // store floating point registers
+#endif
+
+ cmp M, #0
+ ble cgemvt_kernel_L999
+
+ cmp OLD_N, #0
+ ble cgemvt_kernel_L999
+
+ str OLD_A, A
+ str OLD_N, N
+
+ ldr INC_X , OLD_INC_X
+ ldr INC_Y , OLD_INC_Y
+
+ cmp INC_X, #0
+ beq cgemvt_kernel_L999
+
+ cmp INC_Y, #0
+ beq cgemvt_kernel_L999
+
+ ldr LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+	lsl	LDA, LDA, #4				// LDA * SIZE * 2
+#else
+	lsl	LDA, LDA, #3				// LDA * SIZE * 2
+#endif
+
+ cmp INC_X, #1
+ bne cgemvt_kernel_S2_BEGIN
+
+ cmp INC_Y, #1
+ bne cgemvt_kernel_S2_BEGIN
+
+
+cgemvt_kernel_F2_BEGIN:
+
+ ldr YO , Y
+
+ ldr J, N
+ asrs J, J, #1 // J = N / 2
+ ble cgemvt_kernel_F1_BEGIN
+
+cgemvt_kernel_F2X4:
+
+ ldr AO1, A
+ add AO2, AO1, LDA
+ add r3 , AO2, LDA
+ str r3 , A
+
+ ldr XO , X
+
+ INIT_F2
+
+ asrs I, M, #2 // I = M / 4
+ ble cgemvt_kernel_F2X1
+
+
+cgemvt_kernel_F2X4_10:
+
+ KERNEL_F2X4
+
+ subs I, I, #1
+ bne cgemvt_kernel_F2X4_10
+
+
+cgemvt_kernel_F2X1:
+
+ ands I, M , #3
+ ble cgemvt_kernel_F2_END
+
+cgemvt_kernel_F2X1_10:
+
+ KERNEL_F2X1
+
+ subs I, I, #1
+ bne cgemvt_kernel_F2X1_10
+
+
+cgemvt_kernel_F2_END:
+
+ SAVE_F2
+
+ subs J , J , #1
+ bne cgemvt_kernel_F2X4
+
+
+cgemvt_kernel_F1_BEGIN:
+
+ ldr J, N
+ ands J, J, #1
+ ble cgemvt_kernel_L999
+
+cgemvt_kernel_F1X4:
+
+ ldr AO1, A
+
+ ldr XO , X
+
+ INIT_F1
+
+ asrs I, M, #2 // I = M / 4
+ ble cgemvt_kernel_F1X1
+
+
+cgemvt_kernel_F1X4_10:
+
+ KERNEL_F1X4
+
+ subs I, I, #1
+ bne cgemvt_kernel_F1X4_10
+
+
+cgemvt_kernel_F1X1:
+
+ ands I, M , #3
+ ble cgemvt_kernel_F1_END
+
+cgemvt_kernel_F1X1_10:
+
+ KERNEL_F1X1
+
+ subs I, I, #1
+ bne cgemvt_kernel_F1X1_10
+
+
+cgemvt_kernel_F1_END:
+
+ SAVE_F1
+
+ b cgemvt_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+cgemvt_kernel_S2_BEGIN:
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4			// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #4			// INC_Y * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3			// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #3			// INC_Y * SIZE * 2
+#endif
+
+ ldr YO , Y
+
+ ldr J, N
+ asrs J, J, #1 // J = N / 2
+ ble cgemvt_kernel_S1_BEGIN
+
+cgemvt_kernel_S2X4:
+
+ ldr AO1, A
+ add AO2, AO1, LDA
+ add r3 , AO2, LDA
+ str r3 , A
+
+ ldr XO , X
+
+ INIT_S2
+
+ asrs I, M, #2 // I = M / 4
+ ble cgemvt_kernel_S2X1
+
+
+cgemvt_kernel_S2X4_10:
+
+ KERNEL_S2X4
+
+ subs I, I, #1
+ bne cgemvt_kernel_S2X4_10
+
+
+cgemvt_kernel_S2X1:
+
+ ands I, M , #3
+ ble cgemvt_kernel_S2_END
+
+cgemvt_kernel_S2X1_10:
+
+ KERNEL_S2X1
+
+ subs I, I, #1
+ bne cgemvt_kernel_S2X1_10
+
+
+cgemvt_kernel_S2_END:
+
+ SAVE_S2
+
+ subs J , J , #1
+ bne cgemvt_kernel_S2X4
+
+
+cgemvt_kernel_S1_BEGIN:
+
+ ldr J, N
+ ands J, J, #1
+ ble cgemvt_kernel_L999
+
+cgemvt_kernel_S1X4:
+
+ ldr AO1, A
+
+ ldr XO , X
+
+ INIT_S1
+
+ asrs I, M, #2 // I = M / 4
+ ble cgemvt_kernel_S1X1
+
+
+cgemvt_kernel_S1X4_10:
+
+ KERNEL_S1X4
+
+ subs I, I, #1
+ bne cgemvt_kernel_S1X4_10
+
+
+cgemvt_kernel_S1X1:
+
+ ands I, M , #3
+ ble cgemvt_kernel_S1_END
+
+cgemvt_kernel_S1X1_10:
+
+ KERNEL_S1X1
+
+ subs I, I, #1
+ bne cgemvt_kernel_S1X1_10
+
+
+cgemvt_kernel_S1_END:
+
+ SAVE_S1
+
+
+
+/*************************************************************************************************************/
+
+cgemvt_kernel_L999:
+
+ sub r3, fp, #192
+
+#if defined(DOUBLE)
+ vldm r3, { d8 - d15 } // restore floating point registers
+#else
+ vldm r3, { s8 - s15 } // restore floating point registers
+#endif
+
+ mov r0, #0 // set return value
+
+ sub sp, fp, #28
+ pop {r4 -r9 ,fp}
+ bx lr
+
+ EPILOGUE
+
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/29 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
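+
+/**************************************************************************************
+* ZGEMV kernel (no transpose) for ARM VFP: the double-precision complex
+* counterpart of cgemv_n_vfp.S, using d8-d15 accumulators and fmacd/fnmacd
+* with the same CONJ / XCONJ sign selection and F (unit stride) / S (strided)
+* code paths.
+**************************************************************************************/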
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#define OLD_A r3
+#define OLD_M r0
+
+#define AO1 r0
+#define N r1
+#define J r2
+
+#define AO2 r4
+#define XO r5
+#define YO r6
+#define LDA r7
+#define INC_X r8
+#define INC_Y r9
+
+#define I r12
+
+#define ALPHA_I [fp, #-236]
+#define ALPHA_R [fp, #-244]
+
+#define M [fp, #-252 ]
+#define A [fp, #-256 ]
+
+
+#define X_PRE 64
+#define Y_PRE 0
+#define A_PRE 0
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#if !defined(CONJ) && !defined(XCONJ)
+
+ #define KMAC_R fnmacd
+ #define KMAC_I fmacd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmacd
+ #define FMAC_I2 fmacd
+
+#elif defined(CONJ) && !defined(XCONJ)
+
+ #define KMAC_R fmacd
+ #define KMAC_I fnmacd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmacd
+ #define FMAC_I2 fmacd
+
+#elif !defined(CONJ) && defined(XCONJ)
+
+ #define KMAC_R fmacd
+ #define KMAC_I fnmacd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmacd
+ #define FMAC_I2 fmacd
+
+#else
+
+ #define KMAC_R fnmacd
+ #define KMAC_I fmacd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmacd
+ #define FMAC_I2 fmacd
+
+#endif
+
+.macro INIT_F4
+
+ pld [ YO, #Y_PRE ]
+ vsub.f64 d8 , d8 , d8
+ vmov.f64 d9 , d8
+ vmov.f64 d10, d8
+ vmov.f64 d11, d8
+ vmov.f64 d12, d8
+ vmov.f64 d13, d8
+ vmov.f64 d14, d8
+ vmov.f64 d15, d8
+
+.endm
+
+.macro KERNEL_F4X4
+
+ pld [ XO, #X_PRE ]
+ KERNEL_F4X1
+ KERNEL_F4X1
+ pld [ XO, #X_PRE ]
+ KERNEL_F4X1
+ KERNEL_F4X1
+
+.endm
+
+.macro KERNEL_F4X1
+
+ fldd d0 , [ AO1 ]
+
+ fldd d4 , [ XO ]
+ fldd d5 , [ XO, #8 ]
+
+ pld [ AO2, #A_PRE ]
+
+ fldd d1 , [ AO1, #8 ]
+ fmacd d8 , d0, d4
+ fldd d2 , [ AO1, #16 ]
+ fmacd d9 , d0, d5
+ fldd d3 , [ AO1, #24 ]
+ fmacd d10 , d2, d4
+ fldd d0 , [ AO1, #32 ]
+ fmacd d11 , d2, d5
+
+ KMAC_R d8 , d1, d5
+ KMAC_I d9 , d1, d4
+ KMAC_R d10 , d3, d5
+ fldd d1 , [ AO1, #40 ]
+ KMAC_I d11 , d3, d4
+
+ fldd d2 , [ AO1, #48 ]
+
+ fmacd d12 , d0, d4
+ fldd d3 , [ AO1, #56 ]
+ fmacd d13 , d0, d5
+ pld [ AO2, #A_PRE+32 ]
+ fmacd d14 , d2, d4
+ fmacd d15 , d2, d5
+
+ KMAC_R d12 , d1, d5
+ add XO , XO, #16
+ KMAC_I d13 , d1, d4
+ add AO1 , AO1, LDA
+ KMAC_R d14 , d3, d5
+ add AO2 , AO2, LDA
+ KMAC_I d15 , d3, d4
+
+.endm
+
+.macro SAVE_F4
+
+ fldd d0, ALPHA_R
+ fldd d1, ALPHA_I
+
+ fldmiad YO, { d4 - d7 }
+
+ FMAC_R1 d4 , d0 , d8
+ FMAC_I1 d5 , d0 , d9
+ FMAC_R2 d4 , d1 , d9
+ FMAC_I2 d5 , d1 , d8
+
+ FMAC_R1 d6 , d0 , d10
+ FMAC_I1 d7 , d0 , d11
+ FMAC_R2 d6 , d1 , d11
+ FMAC_I2 d7 , d1 , d10
+
+ fstmiad YO!, { d4 - d7 }
+
+ fldmiad YO, { d4 - d7 }
+
+ FMAC_R1 d4 , d0 , d12
+ FMAC_I1 d5 , d0 , d13
+ FMAC_R2 d4 , d1 , d13
+ FMAC_I2 d5 , d1 , d12
+
+ FMAC_R1 d6 , d0 , d14
+ FMAC_I1 d7 , d0 , d15
+ FMAC_R2 d6 , d1 , d15
+ FMAC_I2 d7 , d1 , d14
+
+ fstmiad YO!, { d4 - d7 }
+
+.endm
+
+
+
+
+.macro INIT_F1
+
+ vsub.f64 d8 , d8 , d8
+ vmov.f64 d9 , d8
+
+.endm
+
+.macro KERNEL_F1X1
+
+ fldd d0 , [ AO1 ]
+ fldd d1 , [ AO1, #8 ]
+
+ fldd d4 , [ XO ]
+ fldd d5 , [ XO, #8 ]
+
+ fmacd d8 , d0, d4
+ fmacd d9 , d0, d5
+
+ KMAC_R d8 , d1, d5
+ KMAC_I d9 , d1, d4
+
+ add XO , XO, #16
+ add AO1 , AO1, LDA
+
+
+.endm
+
+.macro SAVE_F1
+
+ fldd d0, ALPHA_R
+ fldd d1, ALPHA_I
+
+ fldmiad YO, { d4 - d5 }
+
+ FMAC_R1 d4 , d0 , d8
+ FMAC_I1 d5 , d0 , d9
+ FMAC_R2 d4 , d1 , d9
+ FMAC_I2 d5 , d1 , d8
+
+ fstmiad YO, { d4 - d5 }
+
+ add YO, YO, #16
+
+.endm
+
+/****************************************************************************************/
+
+.macro INIT_S4
+
+ vsub.f64 d8 , d8 , d8
+ vmov.f64 d9 , d8
+ vmov.f64 d10, d8
+ vmov.f64 d11, d8
+ vmov.f64 d12, d8
+ vmov.f64 d13, d8
+ vmov.f64 d14, d8
+ vmov.f64 d15, d8
+
+.endm
+
+.macro KERNEL_S4X4
+
+ KERNEL_S4X1
+ KERNEL_S4X1
+ KERNEL_S4X1
+ KERNEL_S4X1
+
+.endm
+
+.macro KERNEL_S4X1
+
+ fldd d0 , [ AO1 ]
+ fldd d1 , [ AO1, #8 ]
+ fldd d2 , [ AO1, #16 ]
+ fldd d3 , [ AO1, #24 ]
+
+ fldd d4 , [ XO ]
+ fldd d5 , [ XO, #8 ]
+
+ fmacd d8 , d0, d4
+ fmacd d9 , d0, d5
+ fmacd d10 , d2, d4
+ fmacd d11 , d2, d5
+
+ KMAC_R d8 , d1, d5
+ KMAC_I d9 , d1, d4
+ KMAC_R d10 , d3, d5
+ KMAC_I d11 , d3, d4
+
+ fldd d0 , [ AO1, #32 ]
+ fldd d1 , [ AO1, #40 ]
+ fldd d2 , [ AO1, #48 ]
+ fldd d3 , [ AO1, #56 ]
+
+ fmacd d12 , d0, d4
+ fmacd d13 , d0, d5
+ fmacd d14 , d2, d4
+ fmacd d15 , d2, d5
+
+ KMAC_R d12 , d1, d5
+ KMAC_I d13 , d1, d4
+ KMAC_R d14 , d3, d5
+ KMAC_I d15 , d3, d4
+
+ add XO , XO, INC_X
+ add AO1 , AO1, LDA
+ add AO2 , AO2, LDA
+
+.endm
+
+.macro SAVE_S4
+
+ fldd d0, ALPHA_R
+ fldd d1, ALPHA_I
+
+ fldmiad YO, { d4 - d5 }
+
+ FMAC_R1 d4 , d0 , d8
+ FMAC_I1 d5 , d0 , d9
+ FMAC_R2 d4 , d1 , d9
+ FMAC_I2 d5 , d1 , d8
+
+ fstmiad YO, { d4 - d5 }
+
+ add YO, YO, INC_Y
+
+ fldmiad YO, { d6 - d7 }
+
+ FMAC_R1 d6 , d0 , d10
+ FMAC_I1 d7 , d0 , d11
+ FMAC_R2 d6 , d1 , d11
+ FMAC_I2 d7 , d1 , d10
+
+ fstmiad YO, { d6 - d7 }
+
+ add YO, YO, INC_Y
+
+ fldmiad YO, { d4 - d5 }
+
+ FMAC_R1 d4 , d0 , d12
+ FMAC_I1 d5 , d0 , d13
+ FMAC_R2 d4 , d1 , d13
+ FMAC_I2 d5 , d1 , d12
+
+ fstmiad YO, { d4 - d5 }
+
+ add YO, YO, INC_Y
+
+ fldmiad YO, { d6 - d7 }
+
+ FMAC_R1 d6 , d0 , d14
+ FMAC_I1 d7 , d0 , d15
+ FMAC_R2 d6 , d1 , d15
+ FMAC_I2 d7 , d1 , d14
+
+ fstmiad YO, { d6 - d7 }
+
+ add YO, YO, INC_Y
+
+.endm
+
+
+
+
+.macro INIT_S1
+
+ vsub.f64 d8 , d8 , d8
+ vmov.f64 d9 , d8
+
+.endm
+
+.macro KERNEL_S1X1
+
+ fldd d0 , [ AO1 ]
+ fldd d1 , [ AO1, #8 ]
+
+ fldd d4 , [ XO ]
+ fldd d5 , [ XO, #8 ]
+
+ fmacd d8 , d0, d4
+ fmacd d9 , d0, d5
+
+ KMAC_R d8 , d1, d5
+ KMAC_I d9 , d1, d4
+
+ add XO , XO, INC_X
+ add AO1 , AO1, LDA
+
+
+.endm
+
+.macro SAVE_S1
+
+ fldd d0, ALPHA_R
+ fldd d1, ALPHA_I
+
+ fldmiad YO, { d4 - d5 }
+
+ FMAC_R1 d4 , d0 , d8
+ FMAC_I1 d5 , d0 , d9
+ FMAC_R2 d4 , d1 , d9
+ FMAC_I2 d5 , d1 , d8
+
+ fstmiad YO, { d4 - d5 }
+
+ add YO, YO, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ push {r4 - r9 , fp}
+ add fp, sp, #28
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ sub r12, fp, #192
+
+#if defined(DOUBLE)
+ vstm r12, { d8 - d15 } // store floating point registers
+#else
+ vstm r12, { s8 - s15 } // store floating point registers
+#endif
+
+ cmp OLD_M, #0
+ ble zgemvn_kernel_L999
+
+ cmp N, #0
+ ble zgemvn_kernel_L999
+
+ str OLD_A, A
+ str OLD_M, M
+ vstr d0 , ALPHA_R
+ vstr d1 , ALPHA_I
+
+
+ ldr INC_X , OLD_INC_X
+ ldr INC_Y , OLD_INC_Y
+
+ cmp INC_X, #0
+ beq zgemvn_kernel_L999
+
+ cmp INC_Y, #0
+ beq zgemvn_kernel_L999
+
+ ldr LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+ lsl LDA, LDA, #4 // LDA * SIZE * 2
+#else
+ lsl LDA, LDA, #3 // LDA * SIZE * 2
+#endif
+
+ cmp INC_X, #1
+ bne zgemvn_kernel_S4_BEGIN
+
+ cmp INC_Y, #1
+ bne zgemvn_kernel_S4_BEGIN
+
+
+zgemvn_kernel_F4_BEGIN:
+
+ ldr YO , Y
+
+ ldr I, M
+ asrs I, I, #2 // I = M / 4
+ ble zgemvn_kernel_F1_BEGIN
+
+zgemvn_kernel_F4X4:
+
+ ldr AO1, A
+ add AO2, AO1, LDA
+ add r3 , AO1, #64
+ str r3 , A
+
+ add AO2, AO2, LDA
+ add AO2, AO2, LDA
+
+ ldr XO , X
+
+ INIT_F4
+
+ asrs J, N, #2 // J = N / 4
+ ble zgemvn_kernel_F4X1
+
+
+zgemvn_kernel_F4X4_10:
+
+ KERNEL_F4X4
+
+ subs J, J, #1
+ bne zgemvn_kernel_F4X4_10
+
+
+zgemvn_kernel_F4X1:
+
+ ands J, N , #3
+ ble zgemvn_kernel_F4_END
+
+zgemvn_kernel_F4X1_10:
+
+ KERNEL_F4X1
+
+ subs J, J, #1
+ bne zgemvn_kernel_F4X1_10
+
+
+zgemvn_kernel_F4_END:
+
+ SAVE_F4
+
+ subs I , I , #1
+ bne zgemvn_kernel_F4X4
+
+
+zgemvn_kernel_F1_BEGIN:
+
+ ldr I, M
+ ands I, I , #3
+ ble zgemvn_kernel_L999
+
+zgemvn_kernel_F1X1:
+
+ ldr AO1, A
+ add r3, AO1, #16
+ str r3, A
+
+ ldr XO , X
+
+ INIT_F1
+
+ mov J, N
+
+
+zgemvn_kernel_F1X1_10:
+
+ KERNEL_F1X1
+
+ subs J, J, #1
+ bne zgemvn_kernel_F1X1_10
+
+
+zgemvn_kernel_F1_END:
+
+ SAVE_F1
+
+ subs I , I , #1
+ bne zgemvn_kernel_F1X1
+
+ b zgemvn_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+zgemvn_kernel_S4_BEGIN:
+
+#if defined(DOUBLE)
+ lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
+ lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
+#else
+ lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
+ lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
+#endif
+
+ ldr YO , Y
+
+ ldr I, M
+ asrs I, I, #2 // I = M / 4
+ ble zgemvn_kernel_S1_BEGIN
+
+zgemvn_kernel_S4X4:
+
+ ldr AO1, A
+ add AO2, AO1, LDA
+ add r3 , AO1, #64
+ str r3 , A
+
+ ldr XO , X
+
+ INIT_S4
+
+ asrs J, N, #2 // J = N / 4
+ ble zgemvn_kernel_S4X1
+
+
+zgemvn_kernel_S4X4_10:
+
+ KERNEL_S4X4
+
+ subs J, J, #1
+ bne zgemvn_kernel_S4X4_10
+
+
+zgemvn_kernel_S4X1:
+
+ ands J, N , #3
+ ble zgemvn_kernel_S4_END
+
+zgemvn_kernel_S4X1_10:
+
+ KERNEL_S4X1
+
+ subs J, J, #1
+ bne zgemvn_kernel_S4X1_10
+
+
+zgemvn_kernel_S4_END:
+
+ SAVE_S4
+
+ subs I , I , #1
+ bne zgemvn_kernel_S4X4
+
+
+zgemvn_kernel_S1_BEGIN:
+
+ ldr I, M
+ ands I, I , #3
+ ble zgemvn_kernel_L999
+
+zgemvn_kernel_S1X1:
+
+ ldr AO1, A
+ add r3, AO1, #16
+ str r3, A
+
+ ldr XO , X
+
+ INIT_S1
+
+ mov J, N
+
+
+zgemvn_kernel_S1X1_10:
+
+ KERNEL_S1X1
+
+ subs J, J, #1
+ bne zgemvn_kernel_S1X1_10
+
+
+zgemvn_kernel_S1_END:
+
+ SAVE_S1
+
+ subs I , I , #1
+ bne zgemvn_kernel_S1X1
+
+
+/*************************************************************************************************************/
+
+zgemvn_kernel_L999:
+
+ sub r3, fp, #192
+
+#if defined(DOUBLE)
+ vldm r3, { d8 - d15 } // restore floating point registers
+#else
+ vldm r3, { s8 - s15 } // restore floating point registers
+#endif
+
+ mov r0, #0 // set return value
+
+ sub sp, fp, #28
+ pop {r4 -r9 ,fp}
+ bx lr
+
+ EPILOGUE
+
--- /dev/null
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/29 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
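+
+/**************************************************************************************
+* ZGEMV kernel (transposed) for ARM VFP: double-precision complex version of
+* cgemv_t_vfp.S. Each column of A is reduced to a complex dot product with x;
+* alpha is expected in d0/d1 and is applied before the result is added to y.
+**************************************************************************************/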
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_LDA [fp, #0 ]
+#define X [fp, #4 ]
+#define OLD_INC_X [fp, #8 ]
+#define Y [fp, #12 ]
+#define OLD_INC_Y [fp, #16 ]
+#define OLD_A r3
+#define OLD_N r1
+
+#define M r0
+#define AO1 r1
+#define J r2
+
+#define AO2 r4
+#define XO r5
+#define YO r6
+#define LDA r7
+#define INC_X r8
+#define INC_Y r9
+
+#define I r12
+
+#define N [fp, #-252 ]
+#define A [fp, #-256 ]
+
+
+#define X_PRE 512
+#define A_PRE 512
+#define Y_PRE 32
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+#if !defined(CONJ) && !defined(XCONJ)
+
+ #define KMAC_R fnmacd
+ #define KMAC_I fmacd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmacd
+ #define FMAC_I2 fmacd
+
+#elif defined(CONJ) && !defined(XCONJ)
+
+ #define KMAC_R fmacd
+ #define KMAC_I fnmacd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fnmacd
+ #define FMAC_I1 fmacd
+ #define FMAC_I2 fmacd
+
+#elif !defined(CONJ) && defined(XCONJ)
+
+ #define KMAC_R fmacd
+ #define KMAC_I fnmacd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmacd
+ #define FMAC_I2 fmacd
+
+#else
+
+ #define KMAC_R fnmacd
+ #define KMAC_I fmacd
+
+ #define FMAC_R1 fmacd
+ #define FMAC_R2 fmacd
+ #define FMAC_I1 fnmacd
+ #define FMAC_I2 fmacd
+
+#endif
+
+
+
+.macro INIT_F2
+
+ vsub.f64 d12, d12, d12
+ vsub.f64 d13, d13, d13
+ vsub.f64 d14, d14, d14
+ vsub.f64 d15, d15, d15
+
+.endm
+
+.macro KERNEL_F2X4
+
+ KERNEL_F2X1
+ KERNEL_F2X1
+ KERNEL_F2X1
+ KERNEL_F2X1
+
+.endm
+
+.macro KERNEL_F2X1
+
+ fldmiad XO! , { d2 - d3 }
+ fldmiad AO1!, { d4 - d5 }
+
+ fmacd d12 , d4 , d2
+ fmacd d13 , d4 , d3
+ fldmiad AO2!, { d8 - d9 }
+ KMAC_R d12 , d5 , d3
+ KMAC_I d13 , d5 , d2
+
+ fmacd d14 , d8 , d2
+ fmacd d15 , d8 , d3
+ KMAC_R d14 , d9 , d3
+ KMAC_I d15 , d9 , d2
+
+.endm
+
+.macro SAVE_F2
+
+ fldmiad YO, { d4 - d7 }
+
+ FMAC_R1 d4 , d0 , d12
+ FMAC_I1 d5 , d0 , d13
+ FMAC_R2 d4 , d1 , d13
+ FMAC_I2 d5 , d1 , d12
+
+ FMAC_R1 d6 , d0 , d14
+ FMAC_I1 d7 , d0 , d15
+ FMAC_R2 d6 , d1 , d15
+ FMAC_I2 d7 , d1 , d14
+
+ fstmiad YO!, { d4 - d7 }
+
+.endm
+
+/************************************************************************************************/
+
+.macro INIT_F1
+
+ vsub.f64 d12, d12, d12
+ vsub.f64 d13, d13, d13
+
+.endm
+
+.macro KERNEL_F1X4
+
+ KERNEL_F1X1
+ KERNEL_F1X1
+ KERNEL_F1X1
+ KERNEL_F1X1
+
+.endm
+
+.macro KERNEL_F1X1
+
+ fldmiad XO! , { d2 - d3 }
+ fldmiad AO1!, { d4 - d5 }
+
+ fmacd d12 , d4 , d2
+ fmacd d13 , d4 , d3
+ KMAC_R d12 , d5 , d3
+ KMAC_I d13 , d5 , d2
+
+.endm
+
+.macro SAVE_F1
+
+ fldmiad YO, { d4 - d5 }
+
+ FMAC_R1 d4 , d0 , d12
+ FMAC_I1 d5 , d0 , d13
+ FMAC_R2 d4 , d1 , d13
+ FMAC_I2 d5 , d1 , d12
+
+ fstmiad YO!, { d4 - d5 }
+
+.endm
+
+/************************************************************************************************/
+
+.macro INIT_S2
+
+ vsub.f64 d12, d12, d12
+ vsub.f64 d13, d13, d13
+ vsub.f64 d14, d14, d14
+ vsub.f64 d15, d15, d15
+
+.endm
+
+.macro KERNEL_S2X4
+
+ KERNEL_S2X1
+ KERNEL_S2X1
+ KERNEL_S2X1
+ KERNEL_S2X1
+
+.endm
+
+.macro KERNEL_S2X1
+
+ fldmiad XO , { d2 - d3 }
+ fldmiad AO1!, { d4 - d5 }
+ fldmiad AO2!, { d8 - d9 }
+
+ fmacd d12 , d4 , d2
+ fmacd d13 , d4 , d3
+ KMAC_R d12 , d5 , d3
+ KMAC_I d13 , d5 , d2
+
+ fmacd d14 , d8 , d2
+ fmacd d15 , d8 , d3
+ KMAC_R d14 , d9 , d3
+ KMAC_I d15 , d9 , d2
+
+ add XO, XO, INC_X
+
+.endm
+
+.macro SAVE_S2
+
+ fldmiad YO, { d4 - d5 }
+
+ FMAC_R1 d4 , d0 , d12
+ FMAC_I1 d5 , d0 , d13
+ FMAC_R2 d4 , d1 , d13
+ FMAC_I2 d5 , d1 , d12
+
+ fstmiad YO, { d4 - d5 }
+
+ add YO, YO, INC_Y
+
+ fldmiad YO, { d6 - d7 }
+
+ FMAC_R1 d6 , d0 , d14
+ FMAC_I1 d7 , d0 , d15
+ FMAC_R2 d6 , d1 , d15
+ FMAC_I2 d7 , d1 , d14
+
+ fstmiad YO, { d6 - d7 }
+
+ add YO, YO, INC_Y
+
+.endm
+
+/************************************************************************************************/
+
+.macro INIT_S1
+
+ vsub.f64 d12, d12, d12
+ vsub.f64 d13, d13, d13
+
+.endm
+
+.macro KERNEL_S1X4
+
+ KERNEL_S1X1
+ KERNEL_S1X1
+ KERNEL_S1X1
+ KERNEL_S1X1
+
+.endm
+
+.macro KERNEL_S1X1
+
+ fldmiad XO , { d2 - d3 }
+ fldmiad AO1!, { d4 - d5 }
+
+ fmacd d12 , d4 , d2
+ fmacd d13 , d4 , d3
+ KMAC_R d12 , d5 , d3
+ KMAC_I d13 , d5 , d2
+
+ add XO, XO, INC_X
+
+.endm
+
+.macro SAVE_S1
+
+ fldmiad YO, { d4 - d5 }
+
+ FMAC_R1 d4 , d0 , d12
+ FMAC_I1 d5 , d0 , d13
+ FMAC_R2 d4 , d1 , d13
+ FMAC_I2 d5 , d1 , d12
+
+ fstmiad YO, { d4 - d5 }
+
+ add YO, YO, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ push {r4 - r9 , fp}
+ add fp, sp, #28
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ sub r12, fp, #192
+
+#if defined(DOUBLE)
+ vstm r12, { d8 - d15 } // store floating point registers
+#else
+ vstm r12, { s8 - s15 } // store floating point registers
+#endif
+
+ cmp M, #0
+ ble zgemvt_kernel_L999
+
+ cmp OLD_N, #0
+ ble zgemvt_kernel_L999
+
+ str OLD_A, A
+ str OLD_N, N
+
+ ldr INC_X , OLD_INC_X
+ ldr INC_Y , OLD_INC_Y
+
+ cmp INC_X, #0
+ beq zgemvt_kernel_L999
+
+ cmp INC_Y, #0
+ beq zgemvt_kernel_L999
+
+ ldr LDA, OLD_LDA
+
+
+#if defined(DOUBLE)
+	lsl	LDA, LDA, #4				// LDA * SIZE * 2
+#else
+	lsl	LDA, LDA, #3				// LDA * SIZE * 2
+#endif
+
+ cmp INC_X, #1
+ bne zgemvt_kernel_S2_BEGIN
+
+ cmp INC_Y, #1
+ bne zgemvt_kernel_S2_BEGIN
+
+
+zgemvt_kernel_F2_BEGIN:
+
+ ldr YO , Y
+
+ ldr J, N
+ asrs J, J, #1 // J = N / 2
+ ble zgemvt_kernel_F1_BEGIN
+
+zgemvt_kernel_F2X4:
+
+ ldr AO1, A
+ add AO2, AO1, LDA
+ add r3 , AO2, LDA
+ str r3 , A
+
+ ldr XO , X
+
+ INIT_F2
+
+ asrs I, M, #2 // I = M / 4
+ ble zgemvt_kernel_F2X1
+
+
+zgemvt_kernel_F2X4_10:
+
+ KERNEL_F2X4
+
+ subs I, I, #1
+ bne zgemvt_kernel_F2X4_10
+
+
+zgemvt_kernel_F2X1:
+
+ ands I, M , #3
+ ble zgemvt_kernel_F2_END
+
+zgemvt_kernel_F2X1_10:
+
+ KERNEL_F2X1
+
+ subs I, I, #1
+ bne zgemvt_kernel_F2X1_10
+
+
+zgemvt_kernel_F2_END:
+
+ SAVE_F2
+
+ subs J , J , #1
+ bne zgemvt_kernel_F2X4
+
+
+zgemvt_kernel_F1_BEGIN:
+
+ ldr J, N
+ ands J, J, #1
+ ble zgemvt_kernel_L999
+
+zgemvt_kernel_F1X4:
+
+ ldr AO1, A
+
+ ldr XO , X
+
+ INIT_F1
+
+ asrs I, M, #2 // I = M / 4
+ ble zgemvt_kernel_F1X1
+
+
+zgemvt_kernel_F1X4_10:
+
+ KERNEL_F1X4
+
+ subs I, I, #1
+ bne zgemvt_kernel_F1X4_10
+
+
+zgemvt_kernel_F1X1:
+
+ ands I, M , #3
+ ble zgemvt_kernel_F1_END
+
+zgemvt_kernel_F1X1_10:
+
+ KERNEL_F1X1
+
+ subs I, I, #1
+ bne zgemvt_kernel_F1X1_10
+
+
+zgemvt_kernel_F1_END:
+
+ SAVE_F1
+
+ b zgemvt_kernel_L999
+
+
+
+/*************************************************************************************************************/
+
+zgemvt_kernel_S2_BEGIN:
+
+#if defined(DOUBLE)
+	lsl	INC_X, INC_X, #4			// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #4			// INC_Y * SIZE * 2
+#else
+	lsl	INC_X, INC_X, #3			// INC_X * SIZE * 2
+	lsl	INC_Y, INC_Y, #3			// INC_Y * SIZE * 2
+#endif
+
+ ldr YO , Y
+
+ ldr J, N
+ asrs J, J, #1 // J = N / 2
+ ble zgemvt_kernel_S1_BEGIN
+
+zgemvt_kernel_S2X4:
+
+ ldr AO1, A
+ add AO2, AO1, LDA
+ add r3 , AO2, LDA
+ str r3 , A
+
+ ldr XO , X
+
+ INIT_S2
+
+ asrs I, M, #2 // I = M / 4
+ ble zgemvt_kernel_S2X1
+
+
+zgemvt_kernel_S2X4_10:
+
+ KERNEL_S2X4
+
+ subs I, I, #1
+ bne zgemvt_kernel_S2X4_10
+
+
+zgemvt_kernel_S2X1:
+
+ ands I, M , #3
+ ble zgemvt_kernel_S2_END
+
+zgemvt_kernel_S2X1_10:
+
+ KERNEL_S2X1
+
+ subs I, I, #1
+ bne zgemvt_kernel_S2X1_10
+
+
+zgemvt_kernel_S2_END:
+
+ SAVE_S2
+
+ subs J , J , #1
+ bne zgemvt_kernel_S2X4
+
+
+zgemvt_kernel_S1_BEGIN:
+
+ ldr J, N
+ ands J, J, #1
+ ble zgemvt_kernel_L999
+
+zgemvt_kernel_S1X4:
+
+ ldr AO1, A
+
+ ldr XO , X
+
+ INIT_S1
+
+ asrs I, M, #2 // I = M / 4
+ ble zgemvt_kernel_S1X1
+
+
+zgemvt_kernel_S1X4_10:
+
+ KERNEL_S1X4
+
+ subs I, I, #1
+ bne zgemvt_kernel_S1X4_10
+
+
+zgemvt_kernel_S1X1:
+
+ ands I, M , #3
+ ble zgemvt_kernel_S1_END
+
+zgemvt_kernel_S1X1_10:
+
+ KERNEL_S1X1
+
+ subs I, I, #1
+ bne zgemvt_kernel_S1X1_10
+
+
+zgemvt_kernel_S1_END:
+
+ SAVE_S1
+
+
+
+/*************************************************************************************************************/
+
+zgemvt_kernel_L999:
+
+ sub r3, fp, #192
+
+#if defined(DOUBLE)
+ vldm r3, { d8 - d15 } // restore floating point registers
+#else
+ vldm r3, { s8 - s15 } // restore floating point registers
+#endif
+
+ mov r0, #0 // set return value
+
+ sub sp, fp, #28
+ pop {r4 -r9 ,fp}
+ bx lr
+
+ EPILOGUE
+