1 /***************************************************************************
2 Copyright (c) 2013, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
28 /**************************************************************************************
34 **************************************************************************************/
// Memory operands addressed relative to the frame pointer (fp).
// Offsets #0, #8 and #16 lie at or above fp.
// NOTE(review): presumably these are stack-passed arguments from the caller;
// confirm against the prologue, which is not visible in this excerpt.
41 #define OLD_LDA [fp, #0 ]
43 #define OLD_INC_X [fp, #8 ]
45 #define OLD_INC_Y [fp, #16 ]
// Slots below fp. FP_ZERO and FP_ZERO_0 name the same location (fp-228);
// FP_ZERO_1 is the location 4 bytes above it (fp-224).
62 #define FP_ZERO [fp, #-228]
63 #define FP_ZERO_0 [fp, #-228]
64 #define FP_ZERO_1 [fp, #-224]
// ALPHA_I and ALPHA_R are 8 bytes apart (fp-236 and fp-244).
67 #define ALPHA_I [fp, #-236]
68 #define ALPHA_R [fp, #-244]
// M and A are 4 bytes apart (fp-252 and fp-256).
70 #define M [fp, #-252 ]
71 #define A [fp, #-256 ]
78 /**************************************************************************************/
// Bind each FMAC_* / KMAC_* name to a VFP multiply-accumulate mnemonic
// (fmacd or fnmacd) depending on whether CONJ and XCONJ are defined.
// NOTE(review): only part of each branch is visible in this excerpt, so the
// full set of names defined per branch cannot be confirmed from here.
// Branch: neither CONJ nor XCONJ defined.
80 #if !defined(CONJ) && !defined(XCONJ)
86 #define FMAC_R2 fnmacd
// Branch: CONJ defined, XCONJ not defined.
90 #elif defined(CONJ) && !defined(XCONJ)
96 #define FMAC_R2 fnmacd
// Branch: XCONJ defined, CONJ not defined.
100 #elif !defined(CONJ) && defined(XCONJ)
103 #define KMAC_I fnmacd
105 #define FMAC_R1 fmacd
106 #define FMAC_R2 fmacd
107 #define FMAC_I1 fnmacd
108 #define FMAC_I2 fmacd
// NOTE(review): the definitions from KMAC_R onward repeat names already set
// above, so they presumably belong to a further branch whose directive is not
// visible in this excerpt - confirm.
112 #define KMAC_R fnmacd
115 #define FMAC_R1 fmacd
116 #define FMAC_R2 fmacd
117 #define FMAC_I1 fnmacd
118 #define FMAC_I2 fmacd
// Fragments of the load / accumulate / store sequences that use
// post-incrementing stores to YO.
// NOTE(review): the enclosing macro headers are not visible in this excerpt.
// Load doubles from AO1 at byte offsets 8, 16 and 24 into d1-d3.
156 fldd d1 , [ AO1, #8 ]
158 fldd d2 , [ AO1, #16 ]
160 fldd d3 , [ AO1, #24 ]
// Reuse d0-d3 for the doubles at byte offsets 32 through 56.
162 fldd d0 , [ AO1, #32 ]
168 fldd d1 , [ AO1, #40 ]
171 fldd d2 , [ AO1, #48 ]
174 fldd d3 , [ AO1, #56 ]
// Prefetch hint for the address AO2 + A_PRE + 32.
176 pld [ AO2, #A_PRE+32 ]
// Load four doubles from YO into d4-d7 (YO itself is not modified by the load).
195 fldmiad YO, { d4 - d7 }
// Accumulate into d6 and d7 from the products of d0/d1 with d10/d11; the
// actual mnemonic of each FMAC_* depends on the CONJ/XCONJ selection.
// NOTE(review): presumably d0/d1 hold the real/imaginary parts of alpha and
// d10/d11 a complex partial sum - confirm against the full macro.
202 FMAC_R1 d6 , d0 , d10
203 FMAC_I1 d7 , d0 , d11
204 FMAC_R2 d6 , d1 , d11
205 FMAC_I2 d7 , d1 , d10
// Store d4-d7 to YO and advance YO past them (writeback form "YO!").
207 fstmiad YO!, { d4 - d7 }
// Load the next four doubles from the advanced YO.
209 fldmiad YO, { d4 - d7 }
// Same accumulate pattern: d4/d5 take products with d12/d13.
211 FMAC_R1 d4 , d0 , d12
212 FMAC_I1 d5 , d0 , d13
213 FMAC_R2 d4 , d1 , d13
214 FMAC_I2 d5 , d1 , d12
// d6/d7 take products with d14/d15.
216 FMAC_R1 d6 , d0 , d14
217 FMAC_I1 d7 , d0 , d15
218 FMAC_R2 d6 , d1 , d15
219 FMAC_I2 d7 , d1 , d14
// Store d4-d7 and advance YO again.
221 fstmiad YO!, { d4 - d7 }
// Load the double at byte offset 8 from AO1 into d1.
238 fldd d1 , [ AO1, #8 ]
// Load two doubles from YO into d4-d5, then store d4-d5 back to the same
// address (no writeback on either instruction).
260 fldmiad YO, { d4 - d5 }
267 fstmiad YO, { d4 - d5 }
273 /****************************************************************************************/
// Fragments of the load / accumulate / store sequences that handle two
// doubles of YO at a time and never use writeback on YO.
// NOTE(review): the enclosing macro headers and any YO pointer updates between
// the store/load pairs are not visible in this excerpt.
// Load doubles from AO1 at byte offsets 8, 16 and 24 into d1-d3.
300 fldd d1 , [ AO1, #8 ]
301 fldd d2 , [ AO1, #16 ]
302 fldd d3 , [ AO1, #24 ]
// Reuse d0-d3 for the doubles at byte offsets 32 through 56.
317 fldd d0 , [ AO1, #32 ]
318 fldd d1 , [ AO1, #40 ]
319 fldd d2 , [ AO1, #48 ]
320 fldd d3 , [ AO1, #56 ]
// Load two doubles from YO into d4-d5 and store d4-d5 back to YO.
343 fldmiad YO, { d4 - d5 }
350 fstmiad YO, { d4 - d5 }
// Load two doubles from YO into d6-d7.
354 fldmiad YO, { d6 - d7 }
// Accumulate into d6/d7 from the products of d0/d1 with d10/d11; the actual
// mnemonic of each FMAC_* depends on the CONJ/XCONJ selection.
// NOTE(review): presumably d0/d1 hold the real/imaginary parts of alpha -
// confirm against the full macro.
356 FMAC_R1 d6 , d0 , d10
357 FMAC_I1 d7 , d0 , d11
358 FMAC_R2 d6 , d1 , d11
359 FMAC_I2 d7 , d1 , d10
// Store d6-d7 back to YO.
361 fstmiad YO, { d6 - d7 }
// Same pattern with d4/d5 and the products with d12/d13.
365 fldmiad YO, { d4 - d5 }
367 FMAC_R1 d4 , d0 , d12
368 FMAC_I1 d5 , d0 , d13
369 FMAC_R2 d4 , d1 , d13
370 FMAC_I2 d5 , d1 , d12
372 fstmiad YO, { d4 - d5 }
// Same pattern with d6/d7 and the products with d14/d15.
376 fldmiad YO, { d6 - d7 }
378 FMAC_R1 d6 , d0 , d14
379 FMAC_I1 d7 , d0 , d15
380 FMAC_R2 d6 , d1 , d15
381 FMAC_I2 d7 , d1 , d14
383 fstmiad YO, { d6 - d7 }
// Load the double at byte offset 8 from AO1 into d1.
402 fldd d1 , [ AO1, #8 ]
// Load two doubles from YO into d4-d5, then store d4-d5 back to YO.
424 fldmiad YO, { d4 - d5 }
431 fstmiad YO, { d4 - d5 }
439 /**************************************************************************************
440 * End of macro definitions
441 **************************************************************************************/
// Entry setup: reserve STACKSIZE bytes of stack, save floating point registers
// at the address in r12, bail out early to zgemvn_kernel_L999, scale LDA to a
// byte stride and select the code path.
// NOTE(review): the instructions that set r12 and the compare instructions
// feeding the conditional branches are not visible in this excerpt.
448 sub sp, sp, #STACKSIZE // reserve stack
// Two alternative register saves (d8-d15 or s8-s15).
// NOTE(review): presumably selected by a preprocessor conditional that is not
// visible here - confirm.
453 vstm r12, { d8 - d15 } // store floating point registers
455 vstm r12, { s8 - s15 } // store floating point registers
// Early exits to zgemvn_kernel_L999 when the preceding test yields "less than
// or equal".
463 ble zgemvn_kernel_L999
466 ble zgemvn_kernel_L999
// Fetch the increments from their fp-relative slots.
474 ldr INC_X , OLD_INC_X
475 ldr INC_Y , OLD_INC_Y
// Early exits to zgemvn_kernel_L999 when the preceding test yields "equal".
478 beq zgemvn_kernel_L999
481 beq zgemvn_kernel_L999
// Two alternative LDA scalings: shift by 4 multiplies by 16, shift by 3
// multiplies by 8.
// NOTE(review): presumably selected by a preprocessor conditional on the
// element size - confirm.
487 lsl LDA, LDA, #4 // LDA * SIZE * 2
489 lsl LDA, LDA, #3 // LDA * SIZE * 2
// Take the zgemvn_kernel_S4_BEGIN path when the preceding compares report
// "not equal"; otherwise fall through to zgemvn_kernel_F4_BEGIN.
493 bne zgemvn_kernel_S4_BEGIN
496 bne zgemvn_kernel_S4_BEGIN
// F path: blocks of four first, then a one-at-a-time remainder.
// NOTE(review): the loop bodies, counter updates and the branch targets
// zgemvn_kernel_F4X4, zgemvn_kernel_F4X1 and zgemvn_kernel_F1X1 are not visible
// in this excerpt.
499 zgemvn_kernel_F4_BEGIN:
// I is shifted right by 2 with flags set; skip to the remainder section when
// the result is zero or negative.
504 asrs I, I, #2 // I = M / 4
505 ble zgemvn_kernel_F1_BEGIN
// J counts groups of four taken from N; skip to zgemvn_kernel_F4X1 when there
// are none.
521 asrs J, N, #2 // J = N / 4
522 ble zgemvn_kernel_F4X1
// Loop that repeats while the preceding flag-setting instruction is non-zero.
525 zgemvn_kernel_F4X4_10:
530 bne zgemvn_kernel_F4X4_10
536 ble zgemvn_kernel_F4_END
// Second inner loop of the four-wide section.
538 zgemvn_kernel_F4X1_10:
543 bne zgemvn_kernel_F4X1_10
// End of one four-wide block; loop back to zgemvn_kernel_F4X4 while the
// preceding flag-setting instruction is non-zero.
546 zgemvn_kernel_F4_END:
551 bne zgemvn_kernel_F4X4
// Remainder section of the F path; exits to zgemvn_kernel_L999 when nothing is
// left.
554 zgemvn_kernel_F1_BEGIN:
558 ble zgemvn_kernel_L999
573 zgemvn_kernel_F1X1_10:
578 bne zgemvn_kernel_F1X1_10
581 zgemvn_kernel_F1_END:
586 bne zgemvn_kernel_F1X1
592 /*************************************************************************************************************/
// S path: same loop structure as the F path, entered via the bne branches to
// zgemvn_kernel_S4_BEGIN.
// NOTE(review): the loop bodies, counter updates and the branch targets
// zgemvn_kernel_S4X4, zgemvn_kernel_S4X1 and zgemvn_kernel_S1X1 are not visible
// in this excerpt.
594 zgemvn_kernel_S4_BEGIN:
// Two alternative scalings of INC_X / INC_Y: shift by 4 multiplies by 16,
// shift by 3 multiplies by 8.
// NOTE(review): presumably selected by a preprocessor conditional on the
// element size - confirm.
597 lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
598 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
600 lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
601 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
// I is shifted right by 2 with flags set; skip to the remainder section when
// the result is zero or negative.
607 asrs I, I, #2 // I = M / 4
608 ble zgemvn_kernel_S1_BEGIN
// J counts groups of four taken from N; skip to zgemvn_kernel_S4X1 when there
// are none.
621 asrs J, N, #2 // J = N / 4
622 ble zgemvn_kernel_S4X1
// Loop that repeats while the preceding flag-setting instruction is non-zero.
625 zgemvn_kernel_S4X4_10:
630 bne zgemvn_kernel_S4X4_10
636 ble zgemvn_kernel_S4_END
// Second inner loop of the four-wide section.
638 zgemvn_kernel_S4X1_10:
643 bne zgemvn_kernel_S4X1_10
// End of one four-wide block; loop back to zgemvn_kernel_S4X4 while the
// preceding flag-setting instruction is non-zero.
646 zgemvn_kernel_S4_END:
651 bne zgemvn_kernel_S4X4
// Remainder section of the S path; exits to zgemvn_kernel_L999 when nothing is
// left.
654 zgemvn_kernel_S1_BEGIN:
658 ble zgemvn_kernel_L999
673 zgemvn_kernel_S1X1_10:
678 bne zgemvn_kernel_S1X1_10
681 zgemvn_kernel_S1_END:
686 bne zgemvn_kernel_S1X1
689 /*************************************************************************************************************/
// Exit sequence: reload the floating point registers from the address in r3
// and return 0 in r0.
// NOTE(review): two alternative restores (d8-d15 or s8-s15) are shown;
// presumably a preprocessor conditional that is not visible here picks one, and
// it should mirror the save done at entry - confirm. The instruction that sets
// r3 is not visible in this excerpt.
696 vldm r3, { d8 - d15 } // restore floating point registers
698 vldm r3, { s8 - s15 } // restore floating point registers
701 mov r0, #0 // set return value