1 /***************************************************************************
2 Copyright (c) 2013, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
28 /**************************************************************************************
34 **************************************************************************************/
// Frame-pointer-relative memory operands used by the kernel body.
// OLD_LDA, OLD_INC_X and OLD_INC_Y sit at non-negative offsets from fp; the
// body reads OLD_INC_X and OLD_INC_Y with `ldr` into INC_X / INC_Y.
// Presumably these are caller-passed stack arguments -- confirm against the
// calling convention this build targets.
41 #define OLD_LDA [fp, #0 ]
43 #define OLD_INC_X [fp, #8 ]
45 #define OLD_INC_Y [fp, #16 ]
// N and A are slots at negative offsets from fp (below the frame pointer).
// Presumably they lie inside the area reserved by `sub sp, sp, #STACKSIZE`
// -- TODO confirm that STACKSIZE and the fp setup cover offset -256.
62 #define N [fp, #-252 ]
63 #define A [fp, #-256 ]
69 /**************************************************************************************
71 **************************************************************************************/
// Compile-time selection of the VFP multiply-accumulate mnemonics behind the
// FMAC_* names that the y write-back sequences use.
// Each name expands to either fmacs (Sd = Sd + Sn * Sm) or fnmacs
// (Sd = Sd - Sn * Sm), so the CONJ / XCONJ combination only changes the sign
// with which each partial product is folded into the destination.
// NOTE(review): several #define lines and at least one branch boundary
// (#else / #endif) of this conditional are not visible in this excerpt, so the
// branch membership of the trailing definitions below cannot be confirmed here.
73 #if !defined(CONJ) && !defined(XCONJ)
79 #define FMAC_R2 fnmacs
83 #elif defined(CONJ) && !defined(XCONJ)
89 #define FMAC_R2 fnmacs
93 #elif !defined(CONJ) && defined(XCONJ)
100 #define FMAC_I1 fnmacs
101 #define FMAC_I2 fmacs
// KMAC_R is not referenced by any line visible in this excerpt; presumably it
// is used by the inner-product macros -- verify in the full file.
105 #define KMAC_R fnmacs
108 #define FMAC_R1 fmacs
109 #define FMAC_R2 fmacs
110 #define FMAC_I1 fnmacs
111 #define FMAC_I2 fmacs
// --- Two-column load step, post-incremented pointers (fragment) ---
// Loads one pair of singles through XO into s2/s3, one pair through AO1 into
// s4/s5 and one pair through AO2 into s8/s9. The `!` suffix writes the
// advanced address back, so all three pointers step past the pair just read.
// NOTE(review): the enclosing .macro header and the multiply-accumulate lines
// that consume these registers are not visible in this excerpt.
137 fldmias XO! , { s2 - s3 }
138 fldmias AO1!, { s4 - s5 }
139 fldmias AO2!, { s8 - s9 }
// --- Write-back of two results into y (fragment) ---
// Reads four singles at YO into s4-s7, folds s12..s15 into them scaled by
// s0/s1, then stores the four singles back and advances YO (`!`).
// Presumably s0/s1 hold alpha (re, im) and s12/s13, s14/s15 hold the two
// complex accumulators -- TODO confirm against the init/kernel macros.
155 fldmias YO, { s4 - s7 }
// First pair (s4, s5): updated from s12/s13.
157 FMAC_R1 s4 , s0 , s12
158 FMAC_I1 s5 , s0 , s13
159 FMAC_R2 s4 , s1 , s13
160 FMAC_I2 s5 , s1 , s12
// Second pair (s6, s7): same pattern, updated from s14/s15.
162 FMAC_R1 s6 , s0 , s14
163 FMAC_I1 s7 , s0 , s15
164 FMAC_R2 s6 , s1 , s15
165 FMAC_I2 s7 , s1 , s14
167 fstmias YO!, { s4 - s7 }
171 /************************************************************************************************/
// --- One-column load step, post-incremented pointers (fragment) ---
// Loads one pair of singles through XO into s2/s3 and one pair through AO1
// into s4/s5; both pointers are advanced by the `!` write-back.
// NOTE(review): the enclosing .macro header and the multiply-accumulate lines
// are not visible in this excerpt.
191 fldmias XO! , { s2 - s3 }
192 fldmias AO1!, { s4 - s5 }
// --- Write-back of one result into y (fragment) ---
// Reads two singles at YO into s4/s5, folds s12/s13 into them scaled by
// s0/s1, then stores the pair back and advances YO.
// Presumably s0/s1 = alpha (re, im) and s12/s13 = the complex accumulator
// -- TODO confirm.
203 fldmias YO, { s4 - s5 }
205 FMAC_R1 s4 , s0 , s12
206 FMAC_I1 s5 , s0 , s13
207 FMAC_R2 s4 , s1 , s13
208 FMAC_I2 s5 , s1 , s12
210 fstmias YO!, { s4 - s5 }
214 /************************************************************************************************/
// --- Two-column load step, XO without write-back (fragment) ---
// Same register usage as the post-increment variant, but the XO load has no
// `!`, so XO is left unchanged by the load itself; AO1 and AO2 still advance.
// Presumably XO is stepped separately by the x increment on a line not shown
// here -- verify in the full file.
236 fldmias XO , { s2 - s3 }
237 fldmias AO1!, { s4 - s5 }
238 fldmias AO2!, { s8 - s9 }
// --- Write-back of two results into y, one pair at a time (fragment) ---
// First half: load the pair at YO into s4/s5, fold in s12/s13 scaled by
// s0/s1, store it back. No write-back is used, so YO itself is not changed by
// these load/store instructions.
256 fldmias YO, { s4 - s5 }
258 FMAC_R1 s4 , s0 , s12
259 FMAC_I1 s5 , s0 , s13
260 FMAC_R2 s4 , s1 , s13
261 FMAC_I2 s5 , s1 , s12
263 fstmias YO, { s4 - s5 }
// Second half: same pattern for s6/s7 with s14/s15.
// NOTE(review): as shown, both halves address the same YO; an instruction
// advancing YO between them is presumably elided from this excerpt -- confirm.
267 fldmias YO, { s6 - s7 }
269 FMAC_R1 s6 , s0 , s14
270 FMAC_I1 s7 , s0 , s15
271 FMAC_R2 s6 , s1 , s15
272 FMAC_I2 s7 , s1 , s14
274 fstmias YO, { s6 - s7 }
280 /************************************************************************************************/
// --- One-column load step, XO without write-back (fragment) ---
// Loads a pair through XO into s2/s3 (XO not modified by the load) and a pair
// through AO1 into s4/s5 (AO1 advanced by `!`).
// Presumably XO is stepped by the x increment on a line not shown here.
300 fldmias XO , { s2 - s3 }
301 fldmias AO1!, { s4 - s5 }
// --- Write-back of one result into y, YO without write-back (fragment) ---
// Loads the pair at YO into s4/s5, folds in s12/s13 scaled by s0/s1 and
// stores it back to the same address.
// Presumably s0/s1 = alpha (re, im) and s12/s13 = the accumulator -- TODO confirm.
314 fldmias YO, { s4 - s5 }
316 FMAC_R1 s4 , s0 , s12
317 FMAC_I1 s5 , s0 , s13
318 FMAC_R2 s4 , s1 , s13
319 FMAC_I2 s5 , s1 , s12
321 fstmias YO, { s4 - s5 }
329 /**************************************************************************************
330 * End of macro definitions
331 **************************************************************************************/
// --- Entry: frame setup, early exits and path selection (fragment) ---
// Many lines (pushes, compares, preprocessor guards) are not visible in this
// excerpt; comments below describe only what the shown instructions do.
338 sub sp, sp, #STACKSIZE // reserve stack
// Two alternative saves of VFP registers to the address in r12; presumably
// exactly one is assembled, selected by a preprocessor guard not shown here.
// NOTE(review): in the VFP register file s8-s15 alias d4-d7, not d8-d15, so
// the two variants save different physical registers -- confirm this is intended.
343 vstm r12, { d8 - d15 } // store floating point registers
345 vstm r12, { s8 - s15 } // store floating point registers
// Early exits to cgemvt_kernel_L999 when the preceding (not shown) compares
// yield "less than or equal"; presumably these reject M <= 0 and N <= 0.
349 ble cgemvt_kernel_L999
352 ble cgemvt_kernel_L999
// Fetch the x and y increments from their fp-relative slots.
357 ldr INC_X , OLD_INC_X
358 ldr INC_Y , OLD_INC_Y
// Early exits on equality; presumably the hidden compares test the
// increments against zero -- verify in the full file.
361 beq cgemvt_kernel_L999
364 beq cgemvt_kernel_L999
// Scale LDA by 16 or by 8 (shift by 4 or 3); presumably one variant per
// precision, selected by a guard not shown. 8 bytes matches the two-single
// pairs moved by the fldmias loads in the visible macros.
370 lsl LDA, LDA, #4 // LDA * SIZE
372 lsl LDA, LDA, #3 // LDA * SIZE
// Take the S path when either hidden compare reports "not equal"; otherwise
// fall through to the F path. Presumably the compares test for unit stride.
376 bne cgemvt_kernel_S2_BEGIN
379 bne cgemvt_kernel_S2_BEGIN
// --- F path: loop skeleton (fragment) ---
// Only labels, counter setup and branches are visible; the macro invocations,
// counter decrements and pointer updates between them are not in this excerpt.
382 cgemvt_kernel_F2_BEGIN:
// Halve the column counter (flags set by asrs); skip the two-column section
// when the result is <= 0.
387 asrs J, J, #1 // J = N / 2
388 ble cgemvt_kernel_F1_BEGIN
// Row counter for the 4-way unrolled loop; go to the remainder handling when
// M / 4 is <= 0. The target label cgemvt_kernel_F2X1 is defined on a line
// not shown here.
401 asrs I, M, #2 // I = M / 4
402 ble cgemvt_kernel_F2X1
// Unrolled loop: repeats while the (hidden) flag-setting update is non-zero.
405 cgemvt_kernel_F2X4_10:
410 bne cgemvt_kernel_F2X4_10
// Skip the remainder loop when the hidden remainder computation is <= 0.
416 ble cgemvt_kernel_F2_END
// Remainder loop, presumably one row per iteration -- confirm.
418 cgemvt_kernel_F2X1_10:
423 bne cgemvt_kernel_F2X1_10
// End of one two-column pass; loops back to cgemvt_kernel_F2X4 (label not
// shown) while the hidden column-counter update is non-zero.
426 cgemvt_kernel_F2_END:
431 bne cgemvt_kernel_F2X4
// Single leftover column: exit to L999 when the hidden test is <= 0,
// otherwise run the same unrolled-plus-remainder structure for one column.
434 cgemvt_kernel_F1_BEGIN:
438 ble cgemvt_kernel_L999
448 asrs I, M, #2 // I = M / 4
449 ble cgemvt_kernel_F1X1
452 cgemvt_kernel_F1X4_10:
457 bne cgemvt_kernel_F1X4_10
463 ble cgemvt_kernel_F1_END
465 cgemvt_kernel_F1X1_10:
470 bne cgemvt_kernel_F1X1_10
473 cgemvt_kernel_F1_END:
481 /*************************************************************************************************************/
// --- S path: loop skeleton (fragment) ---
// Reached via `bne cgemvt_kernel_S2_BEGIN` from the entry code. Mirrors the
// F path structure; macro invocations, decrements and pointer updates between
// the visible lines are not in this excerpt.
483 cgemvt_kernel_S2_BEGIN:
// Scale both increments by 16 or by 8 (shift by 4 or 3); presumably one pair
// is assembled per precision, selected by a guard not shown here.
486 lsl INC_X, INC_X, #4 // INC_X * SIZE
487 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE
489 lsl INC_X, INC_X, #3 // INC_X * SIZE
490 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
// Halve the column counter; skip the two-column section when the result <= 0.
496 asrs J, J, #1 // J = N / 2
497 ble cgemvt_kernel_S1_BEGIN
// Row counter for the 4-way unrolled loop; cgemvt_kernel_S2X1 is defined on a
// line not shown here.
510 asrs I, M, #2 // I = M / 4
511 ble cgemvt_kernel_S2X1
// Unrolled loop: repeats while the hidden flag-setting update is non-zero.
514 cgemvt_kernel_S2X4_10:
519 bne cgemvt_kernel_S2X4_10
// Skip the remainder loop when the hidden remainder computation is <= 0.
525 ble cgemvt_kernel_S2_END
// Remainder loop, presumably one row per iteration -- confirm.
527 cgemvt_kernel_S2X1_10:
532 bne cgemvt_kernel_S2X1_10
// End of one two-column pass; loops back to cgemvt_kernel_S2X4 (label not
// shown) while the hidden column-counter update is non-zero.
535 cgemvt_kernel_S2_END:
540 bne cgemvt_kernel_S2X4
// Single leftover column: exit to L999 when the hidden test is <= 0.
543 cgemvt_kernel_S1_BEGIN:
547 ble cgemvt_kernel_L999
557 asrs I, M, #2 // I = M / 4
558 ble cgemvt_kernel_S1X1
561 cgemvt_kernel_S1X4_10:
566 bne cgemvt_kernel_S1X4_10
572 ble cgemvt_kernel_S1_END
574 cgemvt_kernel_S1X1_10:
579 bne cgemvt_kernel_S1X1_10
582 cgemvt_kernel_S1_END:
588 /*************************************************************************************************************/
// --- Exit: restore VFP registers and return 0 (fragment) ---
// Two alternative restores from the address in r3, mirroring the two saves at
// entry; presumably exactly one is assembled under a guard not shown here.
// The setup of r3 and the final stack/return instructions are not visible.
595 vldm r3, { d8 - d15 } // restore floating point registers
597 vldm r3, { s8 - s15 } // restore floating point registers
600 mov r0, #0 // set return value