1 /***************************************************************************
2 Copyright (c) 2013, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
28 /**************************************************************************************
34 **************************************************************************************/
// Frame-pointer-relative stack slots. Each name expands to a complete memory
// operand, so it can be written directly as the address of a load or store
// (for example "ldr INC_X , OLD_INC_X" further down in this file).
//
// OLD_* slots sit at non-negative offsets from fp, 8 bytes apart.
// NOTE(review): presumably these are arguments the caller passed on the
// stack - confirm against the calling convention used for this kernel.
41 #define OLD_LDA [fp, #0 ]
43 #define OLD_INC_X [fp, #8 ]
45 #define OLD_INC_Y [fp, #16 ]
// The remaining slots sit at negative offsets from fp.
// ALPHA_I / ALPHA_R are 8 bytes apart; M / A are 4 bytes apart.
// NOTE(review): presumably these live in the area reserved by
// "sub sp, sp, #STACKSIZE" in the prologue - confirm the frame layout.
62 #define ALPHA_I [fp, #-236]
63 #define ALPHA_R [fp, #-244]
65 #define M [fp, #-252 ]
66 #define A [fp, #-256 ]
73 /**************************************************************************************/
// Choose the VFP multiply-accumulate flavour behind each FMAC_* name,
// depending on the CONJ / XCONJ build flags. fmacd adds the product to the
// destination register, fnmacd subtracts it, so this table fixes the sign of
// every partial product in the FMAC_R1/FMAC_I1/FMAC_R2/FMAC_I2 sequences
// used later in the file.
// Only some of the defines of each branch are visible in this excerpt.
75 #if !defined(CONJ) && !defined(XCONJ)
81 #define FMAC_R2 fnmacd
85 #elif defined(CONJ) && !defined(XCONJ)
91 #define FMAC_R2 fnmacd
95 #elif !defined(CONJ) && defined(XCONJ)
100 #define FMAC_R1 fmacd
101 #define FMAC_R2 fmacd
102 #define FMAC_I1 fnmacd
103 #define FMAC_I2 fmacd
// KMAC_R is not used in the visible part of the file.
107 #define KMAC_R fnmacd
// NOTE(review): the same four names are defined again below with identical
// values. The directive that separates the two groups is not visible in this
// excerpt (presumably an #else covering CONJ && XCONJ) - confirm, and confirm
// that identical mappings for both cases are intended.
110 #define FMAC_R1 fmacd
111 #define FMAC_R2 fmacd
112 #define FMAC_I1 fnmacd
113 #define FMAC_I2 fmacd
// Fragments of the helper macro bodies for the 4-elements-per-pass path.
// NOTE(review): the .macro / .endm lines and several instructions are not
// visible in this excerpt, so the split into init / kernel / save parts below
// is inferred from the instruction pattern - confirm against the full file.
//
// Clear d8 by subtracting it from itself.
// NOTE(review): x - x is NaN when x is NaN or infinite, so this yields 0.0
// only if d8 holds a finite value on entry - confirm that stale register
// contents cannot reach this point, or load an explicit zero instead.
120 vsub.f64 d8 , d8 , d8
// Load doubles from AO1 at byte offsets 8..56 into d0-d3; the registers are
// reused for the second group of four (offsets 32..56). The load at offset 0
// and the arithmetic between the loads are not visible here.
// NOTE(review): presumably consecutive (real, imag) pairs of matrix
// elements - confirm.
151 fldd d1 , [ AO1, #8 ]
153 fldd d2 , [ AO1, #16 ]
155 fldd d3 , [ AO1, #24 ]
157 fldd d0 , [ AO1, #32 ]
163 fldd d1 , [ AO1, #40 ]
166 fldd d2 , [ AO1, #48 ]
169 fldd d3 , [ AO1, #56 ]
// Prefetch hint for data at AO2 + A_PRE + 32.
171 pld [ AO2, #A_PRE+32 ]
// Read four doubles at YO into d4-d7 (YO itself is not modified).
190 fldmiad YO, { d4 - d7 }
// d6 accumulates d0*d10 and d1*d11, d7 accumulates d0*d11 and d1*d10; each
// product is added or subtracted according to the FMAC_* selection above.
// NOTE(review): presumably (d0, d1) is alpha and (d10, d11) an accumulated
// complex sum, making this y += alpha * sum - confirm. The matching update
// of d4/d5 is not visible in this excerpt.
197 FMAC_R1 d6 , d0 , d10
198 FMAC_I1 d7 , d0 , d11
199 FMAC_R2 d6 , d1 , d11
200 FMAC_I2 d7 , d1 , d10
// Write d4-d7 back and advance YO past them (writeback form, 32 bytes).
202 fstmiad YO!, { d4 - d7 }
// Same read / update / write sequence for the next four doubles at YO,
// using d12/d13 for d4/d5 and d14/d15 for d6/d7.
204 fldmiad YO, { d4 - d7 }
206 FMAC_R1 d4 , d0 , d12
207 FMAC_I1 d5 , d0 , d13
208 FMAC_R2 d4 , d1 , d13
209 FMAC_I2 d5 , d1 , d12
211 FMAC_R1 d6 , d0 , d14
212 FMAC_I1 d7 , d0 , d15
213 FMAC_R2 d6 , d1 , d15
214 FMAC_I2 d7 , d1 , d14
216 fstmiad YO!, { d4 - d7 }
// Fragments of the helper macro bodies for the 1-element-per-pass path.
// NOTE(review): macro boundaries and most instructions are not visible in
// this excerpt - confirm against the full file.
//
// Clear d8 by subtracting it from itself.
// NOTE(review): the result is NaN, not 0.0, if d8 holds NaN or infinity on
// entry - confirm this cannot happen or load an explicit zero.
225 vsub.f64 d8 , d8 , d8
// Load the double at AO1 + 8 into d1 (the load at offset 0 is not visible).
233 fldd d1 , [ AO1, #8 ]
// Read two doubles at YO into d4-d5; the update between load and store is
// not visible here.
255 fldmiad YO, { d4 - d5 }
// Store d4-d5 back to the same address. No writeback is used, so YO must be
// advanced by a separate instruction that is not visible in this excerpt.
262 fstmiad YO, { d4 - d5 }
268 /****************************************************************************************/
// Fragments of the helper macro bodies for the second 4-elements-per-pass
// path. Unlike the earlier variant, every access to YO here moves two
// doubles and uses no writeback.
// NOTE(review): presumably this is the non-unit-stride variant, with YO
// stepped by INC_Y between the load/store pairs by instructions that are not
// visible in this excerpt - confirm against the full file.
//
// Clear d8 by subtracting it from itself.
// NOTE(review): the result is NaN, not 0.0, if d8 holds NaN or infinity on
// entry - confirm this cannot happen or load an explicit zero.
272 vsub.f64 d8 , d8 , d8
// Load doubles from AO1 at byte offsets 8..24 into d1-d3 (offset 0 load not
// visible).
295 fldd d1 , [ AO1, #8 ]
296 fldd d2 , [ AO1, #16 ]
297 fldd d3 , [ AO1, #24 ]
// Reload d0-d3 from AO1 at byte offsets 32..56.
312 fldd d0 , [ AO1, #32 ]
313 fldd d1 , [ AO1, #40 ]
314 fldd d2 , [ AO1, #48 ]
315 fldd d3 , [ AO1, #56 ]
// First element: read two doubles at YO, update (not visible), store back.
338 fldmiad YO, { d4 - d5 }
345 fstmiad YO, { d4 - d5 }
// Second element: d6 accumulates d0*d10 and d1*d11, d7 accumulates d0*d11
// and d1*d10, with signs given by the FMAC_* selection.
349 fldmiad YO, { d6 - d7 }
351 FMAC_R1 d6 , d0 , d10
352 FMAC_I1 d7 , d0 , d11
353 FMAC_R2 d6 , d1 , d11
354 FMAC_I2 d7 , d1 , d10
356 fstmiad YO, { d6 - d7 }
// Third element: same pattern with d12/d13 folded into d4/d5.
360 fldmiad YO, { d4 - d5 }
362 FMAC_R1 d4 , d0 , d12
363 FMAC_I1 d5 , d0 , d13
364 FMAC_R2 d4 , d1 , d13
365 FMAC_I2 d5 , d1 , d12
367 fstmiad YO, { d4 - d5 }
// Fourth element: same pattern with d14/d15 folded into d6/d7.
371 fldmiad YO, { d6 - d7 }
373 FMAC_R1 d6 , d0 , d14
374 FMAC_I1 d7 , d0 , d15
375 FMAC_R2 d6 , d1 , d15
376 FMAC_I2 d7 , d1 , d14
378 fstmiad YO, { d6 - d7 }
// Fragments of the helper macro bodies for the second 1-element-per-pass
// path.
// NOTE(review): macro boundaries and most instructions are not visible in
// this excerpt - confirm against the full file.
//
// Clear d8 by subtracting it from itself.
// NOTE(review): the result is NaN, not 0.0, if d8 holds NaN or infinity on
// entry - confirm this cannot happen or load an explicit zero.
389 vsub.f64 d8 , d8 , d8
// Load the double at AO1 + 8 into d1 (the load at offset 0 is not visible).
397 fldd d1 , [ AO1, #8 ]
// Read two doubles at YO, update them (not visible), and store them back to
// the same address; no writeback is used on YO.
419 fldmiad YO, { d4 - d5 }
426 fstmiad YO, { d4 - d5 }
434 /**************************************************************************************
435 * End of macro definitions
436 **************************************************************************************/
// Function prologue and argument screening. The register pushes, the setup
// of fp and r12, and the compare instructions feeding the branches below are
// not visible in this excerpt.
443 sub sp, sp, #STACKSIZE // reserve stack
// Save the floating point registers this routine must preserve to the
// address held in r12.
// NOTE(review): the two stores below are alternatives (double-precision vs
// single-precision register names); the preprocessor conditional selecting
// one of them is not visible in this excerpt - confirm.
448 vstm r12, { d8 - d15 } // store floating point registers
450 vstm r12, { s8 - s15 } // store floating point registers
// Two early exits to the common epilogue on a "less than or equal" result.
// NOTE(review): presumably these reject non-positive M and N - the compares
// are not visible, confirm.
454 ble zgemvn_kernel_L999
457 ble zgemvn_kernel_L999
// Fetch the two increments from their fp-relative stack slots.
465 ldr INC_X , OLD_INC_X
466 ldr INC_Y , OLD_INC_Y
// Two early exits to the common epilogue on an "equal" result.
// NOTE(review): presumably these reject a zero INC_X / INC_Y - the compares
// are not visible, confirm.
469 beq zgemvn_kernel_L999
472 beq zgemvn_kernel_L999
// Convert LDA from elements to bytes. A shift by 4 multiplies by 16 (two
// 8-byte values per element), a shift by 3 multiplies by 8 (two 4-byte
// values per element).
// NOTE(review): only one of the two shifts can be intended for a given
// build; the conditional selecting it is not visible - confirm.
478 lsl LDA, LDA, #4 // LDA * SIZE * 2
480 lsl LDA, LDA, #3 // LDA * SIZE * 2
// Divert to the S4 code path on a "not equal" result; otherwise fall through
// to the F4 code path.
// NOTE(review): presumably taken when INC_X or INC_Y differs from 1 - the
// compares are not visible, confirm.
484 bne zgemvn_kernel_S4_BEGIN
487 bne zgemvn_kernel_S4_BEGIN
// F-path driver, reached by falling through both bne instructions that divert
// to zgemvn_kernel_S4_BEGIN.
// Pointer setup, macro invocations and loop-counter updates are not visible
// in this excerpt; only labels, count computations and branches remain.
490 zgemvn_kernel_F4_BEGIN:
// Number of full groups of four along M. asrs also sets the flags, so the
// ble that follows skips straight to the one-at-a-time section when there is
// no full group.
495 asrs I, I, #2 // I = M / 4
496 ble zgemvn_kernel_F1_BEGIN
// Number of full groups of four along N; with none, go to the F4X1 section
// (its label is not visible in this excerpt).
512 asrs J, N, #2 // J = N / 4
513 ble zgemvn_kernel_F4X1
// Inner loop over groups of four along N; repeats while the flags from the
// (not visible) counter update are non-zero.
516 zgemvn_kernel_F4X4_10:
521 bne zgemvn_kernel_F4X4_10
// Skip the remainder loop when the (not visible) remainder computation
// leaves nothing to do.
527 ble zgemvn_kernel_F4_END
// Inner loop over the leftover single steps along N.
529 zgemvn_kernel_F4X1_10:
534 bne zgemvn_kernel_F4X1_10
// End of one group of four along M; loop back to zgemvn_kernel_F4X4 (label
// not visible in this excerpt) while groups remain.
537 zgemvn_kernel_F4_END:
542 bne zgemvn_kernel_F4X4
// One-at-a-time section for what is left along M.
545 zgemvn_kernel_F1_BEGIN:
// Nothing left: go to the common epilogue.
549 ble zgemvn_kernel_L999
// Inner loop along N for a single element of M.
564 zgemvn_kernel_F1X1_10:
569 bne zgemvn_kernel_F1X1_10
// End of one element along M; loop back to zgemvn_kernel_F1X1 (label not
// visible in this excerpt) while elements remain.
572 zgemvn_kernel_F1_END:
577 bne zgemvn_kernel_F1X1
583 /*************************************************************************************************************/
// S-path driver, entered from the prologue through the two
// "bne zgemvn_kernel_S4_BEGIN" branches. Its loop structure mirrors the
// F-path driver above.
// Pointer setup, macro invocations and loop-counter updates are not visible
// in this excerpt.
585 zgemvn_kernel_S4_BEGIN:
// Convert the increments from elements to bytes. A shift by 4 multiplies by
// 16 (two 8-byte values per element), a shift by 3 multiplies by 8 (two
// 4-byte values per element).
// NOTE(review): the two pairs below are alternatives; the preprocessor
// conditional selecting one pair is not visible in this excerpt - confirm.
588 lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
589 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
591 lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
592 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
// Number of full groups of four along M; with none, go to the one-at-a-time
// section.
598 asrs I, I, #2 // I = M / 4
599 ble zgemvn_kernel_S1_BEGIN
// Number of full groups of four along N; with none, go to the S4X1 section
// (its label is not visible in this excerpt).
612 asrs J, N, #2 // J = N / 4
613 ble zgemvn_kernel_S4X1
// Inner loop over groups of four along N.
616 zgemvn_kernel_S4X4_10:
621 bne zgemvn_kernel_S4X4_10
// Skip the remainder loop when nothing is left along N.
627 ble zgemvn_kernel_S4_END
// Inner loop over the leftover single steps along N.
629 zgemvn_kernel_S4X1_10:
634 bne zgemvn_kernel_S4X1_10
// End of one group of four along M; loop back to zgemvn_kernel_S4X4 (label
// not visible in this excerpt) while groups remain.
637 zgemvn_kernel_S4_END:
642 bne zgemvn_kernel_S4X4
// One-at-a-time section for what is left along M.
645 zgemvn_kernel_S1_BEGIN:
// Nothing left: go to the common epilogue.
649 ble zgemvn_kernel_L999
// Inner loop along N for a single element of M.
664 zgemvn_kernel_S1X1_10:
669 bne zgemvn_kernel_S1X1_10
// End of one element along M; loop back to zgemvn_kernel_S1X1 (label not
// visible in this excerpt) while elements remain.
672 zgemvn_kernel_S1_END:
677 bne zgemvn_kernel_S1X1
680 /*************************************************************************************************************/
// Common exit path. The zgemvn_kernel_L999 label, the setup of r3, the stack
// release and the return instruction are not visible in this excerpt.
//
// Reload the floating point registers saved in the prologue.
// NOTE(review): the two loads below are alternatives matching the two vstm
// forms in the prologue; the conditional selecting one is not visible -
// confirm. The prologue saved through r12 while this restores through r3;
// presumably both are set to the same frame address - confirm.
687 vldm r3, { d8 - d15 } // restore floating point registers
689 vldm r3, { s8 - s15 } // restore floating point registers
// The routine always reports 0 in r0.
692 mov r0, #0 // set return value