1 /***************************************************************************
2 Copyright (c) 2013, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
28 /**************************************************************************************
34 **************************************************************************************/
// fp-relative memory operands used as named slots by the kernel.
// OLD_* sit at non-negative offsets from fp; the remaining names sit below fp.
// NOTE(review): OLD_* are presumably the stack-passed arguments of the caller;
// confirm against the instruction that sets up fp.
41 #define OLD_LDA [fp, #0 ]
43 #define OLD_INC_X [fp, #8 ]
45 #define OLD_INC_Y [fp, #16 ]
// FP_ZERO and FP_ZERO_0 name the same slot (fp-228); FP_ZERO_1 is the slot
// 4 bytes above it (fp-224).
62 #define FP_ZERO [fp, #-228]
63 #define FP_ZERO_0 [fp, #-228]
64 #define FP_ZERO_1 [fp, #-224]
// Slots named N and A, 4 bytes apart, below fp.
66 #define N [fp, #-252 ]
67 #define A [fp, #-256 ]
73 /**************************************************************************************
75 **************************************************************************************/
// Bind the KMAC_*/FMAC_* names to a VFP multiply-accumulate mnemonic depending
// on which of CONJ and XCONJ are defined. fmacs adds the product to the
// destination register; fnmacs subtracts the product from it.
// Case: neither CONJ nor XCONJ.
77 #if !defined(CONJ) && !defined(XCONJ)
83 #define FMAC_R2 fnmacs
// Case: CONJ only.
87 #elif defined(CONJ) && !defined(XCONJ)
93 #define FMAC_R2 fnmacs
// Case: XCONJ only.
97 #elif !defined(CONJ) && defined(XCONJ)
100 #define KMAC_I fnmacs
102 #define FMAC_R1 fmacs
103 #define FMAC_R2 fmacs
104 #define FMAC_I1 fnmacs
105 #define FMAC_I2 fmacs
// NOTE(review): the group below repeats the FMAC_* bindings of the XCONJ-only
// case; it is presumably the remaining (CONJ && XCONJ) case, but no #else is
// visible ahead of it here. Confirm.
109 #define KMAC_R fnmacs
112 #define FMAC_R1 fmacs
113 #define FMAC_R2 fmacs
114 #define FMAC_I1 fnmacs
115 #define FMAC_I2 fmacs
// Loads for one step of the two-column path: two singles each from XO, AO1 and
// AO2. The "!" writeback advances each pointer past the loaded pair.
// NOTE(review): each pair is presumably one complex element (real, imaginary). Confirm.
141 fldmias XO! , { s2 - s3 }
142 fldmias AO1!, { s4 - s5 }
143 fldmias AO2!, { s8 - s9 }
// Read-modify-write of four consecutive singles at YO. s4..s7 are loaded, each
// takes two FMAC_* updates formed from s0/s1 and s12..s15, and the four values
// are stored back with writeback so YO moves past them.
// NOTE(review): s0/s1 look like the real/imaginary parts of a scalar and
// s12..s15 like two accumulated complex results. Confirm against the code
// that fills them.
159 fldmias YO, { s4 - s7 }
// First pair at YO: s4, s5 updated from s12, s13.
161 FMAC_R1 s4 , s0 , s12
162 FMAC_I1 s5 , s0 , s13
163 FMAC_R2 s4 , s1 , s13
164 FMAC_I2 s5 , s1 , s12
// Second pair at YO: s6, s7 updated from s14, s15.
166 FMAC_R1 s6 , s0 , s14
167 FMAC_I1 s7 , s0 , s15
168 FMAC_R2 s6 , s1 , s15
169 FMAC_I2 s7 , s1 , s14
171 fstmias YO!, { s4 - s7 }
175 /************************************************************************************************/
// Loads for one step of the one-column path: two singles from XO and two from
// AO1, both pointers advanced by writeback.
195 fldmias XO! , { s2 - s3 }
196 fldmias AO1!, { s4 - s5 }
// Read-modify-write of two singles at YO: s4, s5 are loaded, updated with
// FMAC_* from s0/s1 and s12/s13, and stored back with writeback on YO.
// NOTE(review): s0/s1 are presumably the scalar's real/imaginary parts and
// s12/s13 the accumulated result. Confirm.
207 fldmias YO, { s4 - s5 }
209 FMAC_R1 s4 , s0 , s12
210 FMAC_I1 s5 , s0 , s13
211 FMAC_R2 s4 , s1 , s13
212 FMAC_I2 s5 , s1 , s12
214 fstmias YO!, { s4 - s5 }
218 /************************************************************************************************/
// Loads for one step of the two-column path where XO is read without
// writeback; AO1 and AO2 still advance by writeback.
// NOTE(review): XO is presumably stepped by a separate add of INC_X. Confirm.
240 fldmias XO , { s2 - s3 }
241 fldmias AO1!, { s4 - s5 }
242 fldmias AO2!, { s8 - s9 }
// First read-modify-write at YO (no writeback): s4, s5 updated from s12, s13.
260 fldmias YO, { s4 - s5 }
262 FMAC_R1 s4 , s0 , s12
263 FMAC_I1 s5 , s0 , s13
264 FMAC_R2 s4 , s1 , s13
265 FMAC_I2 s5 , s1 , s12
267 fstmias YO, { s4 - s5 }
// Second read-modify-write at YO (no writeback): s6, s7 updated from s14, s15.
// NOTE(review): the load and store use the same YO, so YO must be moved
// between the two groups by an instruction that is not visible here. Confirm.
271 fldmias YO, { s6 - s7 }
273 FMAC_R1 s6 , s0 , s14
274 FMAC_I1 s7 , s0 , s15
275 FMAC_R2 s6 , s1 , s15
276 FMAC_I2 s7 , s1 , s14
278 fstmias YO, { s6 - s7 }
284 /************************************************************************************************/
// Loads for one step of the one-column path where XO is read without
// writeback; AO1 advances by writeback.
// NOTE(review): XO is presumably stepped by a separate add of INC_X. Confirm.
304 fldmias XO , { s2 - s3 }
305 fldmias AO1!, { s4 - s5 }
// Read-modify-write of two singles at YO without writeback: s4, s5 are loaded,
// updated with FMAC_* from s0/s1 and s12/s13, and stored to the same address.
318 fldmias YO, { s4 - s5 }
320 FMAC_R1 s4 , s0 , s12
321 FMAC_I1 s5 , s0 , s13
322 FMAC_R2 s4 , s1 , s13
323 FMAC_I2 s5 , s1 , s12
325 fstmias YO, { s4 - s5 }
333 /**************************************************************************************
334 * End of macro definitions
335 **************************************************************************************/
// Entry sequence: reserve the local frame, spill FP registers, reject
// degenerate arguments, scale LDA and choose a code path.
342 sub sp, sp, #STACKSIZE // reserve stack
// NOTE(review): the two vstm forms below store to the same r12 and are
// presumably alternatives chosen by a preprocessor conditional. Confirm.
347 vstm r12, { d8 - d15 } // store floating point registers
349 vstm r12, { s8 - s15 } // store floating point registers
// Early exits to cgemvt_kernel_L999; the comparisons that set the flags are
// not visible here.
357 ble cgemvt_kernel_L999
360 ble cgemvt_kernel_L999
// Fetch the increments from their fp-relative slots.
365 ldr INC_X , OLD_INC_X
366 ldr INC_Y , OLD_INC_Y
369 beq cgemvt_kernel_L999
372 beq cgemvt_kernel_L999
// LDA is scaled by 16 or by 8.
// NOTE(review): presumably one shift per element size, selected by a
// preprocessor conditional. Confirm.
378 lsl LDA, LDA, #4 // LDA * SIZE
380 lsl LDA, LDA, #3 // LDA * SIZE
// Either test failing sends execution to cgemvt_kernel_S2_BEGIN; otherwise
// control continues to the code that follows.
384 bne cgemvt_kernel_S2_BEGIN
387 bne cgemvt_kernel_S2_BEGIN
// F path, two-column stage. J counts column pairs; I counts groups of four
// rows, then a second loop follows.
// NOTE(review): the second loop presumably handles the rows left over after
// the groups of four. Confirm.
390 cgemvt_kernel_F2_BEGIN:
// The ble tests the flags produced by the shift.
395 asrs J, J, #1 // J = N / 2
396 ble cgemvt_kernel_F1_BEGIN
409 asrs I, M, #2 // I = M / 4
410 ble cgemvt_kernel_F2X1
// Loop over groups of four rows.
413 cgemvt_kernel_F2X4_10:
418 bne cgemvt_kernel_F2X4_10
424 ble cgemvt_kernel_F2_END
// Loop over single rows.
426 cgemvt_kernel_F2X1_10:
431 bne cgemvt_kernel_F2X1_10
434 cgemvt_kernel_F2_END:
// Back-edge of the column-pair loop.
439 bne cgemvt_kernel_F2X4
// F path, one-column stage: same row-loop shape, exiting to cgemvt_kernel_L999
// when there is nothing to do.
442 cgemvt_kernel_F1_BEGIN:
446 ble cgemvt_kernel_L999
456 asrs I, M, #2 // I = M / 4
457 ble cgemvt_kernel_F1X1
460 cgemvt_kernel_F1X4_10:
465 bne cgemvt_kernel_F1X4_10
471 ble cgemvt_kernel_F1_END
473 cgemvt_kernel_F1X1_10:
478 bne cgemvt_kernel_F1X1_10
481 cgemvt_kernel_F1_END:
489 /*************************************************************************************************************/
// S path: reached from the bne branches that target this label. It has the
// same loop skeleton as the F path, after INC_X and INC_Y are scaled.
491 cgemvt_kernel_S2_BEGIN:
// INC_X and INC_Y are scaled by 16 or by 8.
// NOTE(review): presumably one pair of shifts per element size, selected by a
// preprocessor conditional. Confirm.
494 lsl INC_X, INC_X, #4 // INC_X * SIZE
495 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE
497 lsl INC_X, INC_X, #3 // INC_X * SIZE
498 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
// Two-column stage; the ble tests the flags produced by the shift.
504 asrs J, J, #1 // J = N / 2
505 ble cgemvt_kernel_S1_BEGIN
518 asrs I, M, #2 // I = M / 4
519 ble cgemvt_kernel_S2X1
// Loop over groups of four rows.
522 cgemvt_kernel_S2X4_10:
527 bne cgemvt_kernel_S2X4_10
533 ble cgemvt_kernel_S2_END
// Loop over single rows.
535 cgemvt_kernel_S2X1_10:
540 bne cgemvt_kernel_S2X1_10
543 cgemvt_kernel_S2_END:
// Back-edge of the column-pair loop.
548 bne cgemvt_kernel_S2X4
// One-column stage, exiting to cgemvt_kernel_L999 when there is nothing to do.
551 cgemvt_kernel_S1_BEGIN:
555 ble cgemvt_kernel_L999
565 asrs I, M, #2 // I = M / 4
566 ble cgemvt_kernel_S1X1
569 cgemvt_kernel_S1X4_10:
574 bne cgemvt_kernel_S1X4_10
580 ble cgemvt_kernel_S1_END
582 cgemvt_kernel_S1X1_10:
587 bne cgemvt_kernel_S1X1_10
590 cgemvt_kernel_S1_END:
596 /*************************************************************************************************************/
// Exit sequence: reload the FP registers from the address in r3 and return 0 in r0.
// NOTE(review): the two vldm forms read from the same r3 and are presumably
// alternatives chosen by a preprocessor conditional. Confirm.
603 vldm r3, { d8 - d15 } // restore floating point registers
605 vldm r3, { s8 - s15 } // restore floating point registers
608 mov r0, #0 // set return value