1 /***************************************************************************
2 Copyright (c) 2013, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
28 /**************************************************************************************
34 **************************************************************************************/
// Shorthand operands for values addressed relative to the frame pointer (fp).
//
// The OLD_* names sit at non-negative offsets from fp and are fetched with ldr
// further down in the file (presumably the stack-passed arguments -- confirm
// against the prologue). Two layouts are listed: the first follows the
// !defined(__ARM_PCS_VFP) test and additionally names OLD_ALPHAI and
// OLD_A_SOFTFP; the second places OLD_LDA / OLD_INC_X / OLD_INC_Y 8 bytes lower.
// NOTE(review): the directive that separates the two layouts is not visible in
// this excerpt.
41 #if !defined(__ARM_PCS_VFP)
43 #define OLD_ALPHAI [fp, #0 ]
44 #define OLD_A_SOFTFP [fp, #4 ]
45 #define OLD_LDA [fp, #8 ]
47 #define OLD_INC_X [fp, #16 ]
49 #define OLD_INC_Y [fp, #24 ]
51 #define OLD_LDA [fp, #0 ]
53 #define OLD_INC_X [fp, #8 ]
55 #define OLD_INC_Y [fp, #16 ]
// Slots at negative offsets from fp. FP_ZERO and FP_ZERO_0 are two names for
// the same word at [fp, #-228]; FP_ZERO_1 is the word 4 bytes above it.
74 #define FP_ZERO [fp, #-228]
75 #define FP_ZERO_0 [fp, #-228]
76 #define FP_ZERO_1 [fp, #-224]
// Frame slots named N and A (4 bytes apart).
78 #define N [fp, #-252 ]
79 #define A [fp, #-256 ]
85 /**************************************************************************************
87 **************************************************************************************/
// Pick the VFP multiply-accumulate mnemonic behind each KMAC_* / FMAC_* name
// from the CONJ / XCONJ build flags. fmacs adds the product to the destination
// register, fnmacs subtracts it (pre-UAL spellings of vmla.f32 / vmls.f32).
// Only part of each branch is visible in this excerpt.
//
// Neither CONJ nor XCONJ defined.
89 #if !defined(CONJ) && !defined(XCONJ)
95 #define FMAC_R2 fnmacs
// CONJ only: KMAC_I subtracts; of the FMAC_* terms only FMAC_R2 subtracts.
99 #elif defined(CONJ) && !defined(XCONJ)
102 #define KMAC_I fnmacs
104 #define FMAC_R1 fmacs
105 #define FMAC_R2 fnmacs
106 #define FMAC_I1 fmacs
107 #define FMAC_I2 fmacs
// XCONJ only: KMAC_I subtracts; of the FMAC_* terms only FMAC_I1 subtracts.
109 #elif !defined(CONJ) && defined(XCONJ)
112 #define KMAC_I fnmacs
114 #define FMAC_R1 fmacs
115 #define FMAC_R2 fmacs
116 #define FMAC_I1 fnmacs
117 #define FMAC_I2 fmacs
// Remaining case: KMAC_R subtracts; FMAC_* terms match the XCONJ-only branch.
// NOTE(review): presumably the #else branch (both CONJ and XCONJ defined); the
// directive itself is not visible in this excerpt -- confirm.
121 #define KMAC_R fnmacs
124 #define FMAC_R1 fmacs
125 #define FMAC_R2 fmacs
126 #define FMAC_I1 fnmacs
127 #define FMAC_I2 fmacs
// Contiguous path, two A pointers (enclosing macro headers are not visible in
// this excerpt).
//
// Post-increment loads ("!" = base writeback): two floats each from XO, AO1 and
// AO2, so every pointer advances by 8 bytes per step.
153 fldmias XO! , { s2 - s3 }
154 fldmias AO1!, { s4 - s5 }
155 fldmias AO2!, { s8 - s9 }
// Result update: read four floats at YO without moving the pointer.
171 fldmias YO, { s4 - s7 }
// (s4,s5) is updated with the four cross products of (s0,s1) and (s12,s13);
// each term is added or subtracted according to the FMAC_* definitions.
// NOTE(review): s0/s1 presumably hold the alpha scalar and s12..s15 the
// accumulated sums -- both are set outside this excerpt; confirm.
173 FMAC_R1 s4 , s0 , s12
174 FMAC_I1 s5 , s0 , s13
175 FMAC_R2 s4 , s1 , s13
176 FMAC_I2 s5 , s1 , s12
// Same pattern for the second pair: (s6,s7) with (s0,s1) and (s14,s15).
178 FMAC_R1 s6 , s0 , s14
179 FMAC_I1 s7 , s0 , s15
180 FMAC_R2 s6 , s1 , s15
181 FMAC_I2 s7 , s1 , s14
// Write the four floats back and advance YO past them (16 bytes).
183 fstmias YO!, { s4 - s7 }
187 /************************************************************************************************/
// Contiguous path, single A pointer (enclosing macro headers are not visible in
// this excerpt).
//
// Post-increment loads: two floats from XO and two from AO1.
207 fldmias XO! , { s2 - s3 }
208 fldmias AO1!, { s4 - s5 }
// Result update: read two floats at YO without moving the pointer.
219 fldmias YO, { s4 - s5 }
// (s4,s5) is updated with the four cross products of (s0,s1) and (s12,s13);
// add/subtract per term is selected by the FMAC_* definitions.
221 FMAC_R1 s4 , s0 , s12
222 FMAC_I1 s5 , s0 , s13
223 FMAC_R2 s4 , s1 , s13
224 FMAC_I2 s5 , s1 , s12
// Write the pair back and advance YO by 8 bytes.
226 fstmias YO!, { s4 - s5 }
230 /************************************************************************************************/
// Strided path, two A pointers (enclosing macro headers are not visible in this
// excerpt).
//
// XO is read without writeback here, unlike the contiguous variant; AO1 and AO2
// still post-increment.
// NOTE(review): the instruction that steps XO is not visible in this excerpt
// (presumably an add of INC_X) -- confirm.
252 fldmias XO , { s2 - s3 }
253 fldmias AO1!, { s4 - s5 }
254 fldmias AO2!, { s8 - s9 }
// First result pair: load, update with (s0,s1) x (s12,s13), store in place.
272 fldmias YO, { s4 - s5 }
274 FMAC_R1 s4 , s0 , s12
275 FMAC_I1 s5 , s0 , s13
276 FMAC_R2 s4 , s1 , s13
277 FMAC_I2 s5 , s1 , s12
279 fstmias YO, { s4 - s5 }
// Second result pair: same sequence with (s14,s15) into (s6,s7). Neither the
// load nor the store moves YO.
// NOTE(review): the step of YO between the two pairs is not visible in this
// excerpt (presumably an add of INC_Y) -- confirm.
283 fldmias YO, { s6 - s7 }
285 FMAC_R1 s6 , s0 , s14
286 FMAC_I1 s7 , s0 , s15
287 FMAC_R2 s6 , s1 , s15
288 FMAC_I2 s7 , s1 , s14
290 fstmias YO, { s6 - s7 }
296 /************************************************************************************************/
// Strided path, single A pointer (enclosing macro headers are not visible in
// this excerpt).
//
// XO is read without writeback; AO1 post-increments by 8 bytes.
316 fldmias XO , { s2 - s3 }
317 fldmias AO1!, { s4 - s5 }
// Result update in place: load the pair at YO, apply the four cross products of
// (s0,s1) and (s12,s13) with signs taken from FMAC_*, store back to the same
// address (no writeback on either access).
330 fldmias YO, { s4 - s5 }
332 FMAC_R1 s4 , s0 , s12
333 FMAC_I1 s5 , s0 , s13
334 FMAC_R2 s4 , s1 , s13
335 FMAC_I2 s5 , s1 , s12
337 fstmias YO, { s4 - s5 }
345 /**************************************************************************************
346 * End of macro definitions
347 **************************************************************************************/
// Function entry (the entry label and frame-pointer setup are not visible in
// this excerpt).
354 sub sp, sp, #STACKSIZE // reserve stack
// Spill VFP registers to the address held in r12. The two vstm lines are
// alternatives; the conditional selecting between them is not visible here
// (presumably DOUBLE vs. single precision -- confirm).
// NOTE(review): under AAPCS the callee-saved VFP registers are s16-s31
// (d8-d15); s8-s15 are caller-saved, so the single-precision spill is not
// required by the ABI -- confirm whether it is intentional.
359 vstm r12, { d8 - d15 } // store floating point registers
361 vstm r12, { s8 - s15 } // store floating point registers
// Early exits to cgemvt_kernel_L999 when the preceding comparisons (not visible
// in this excerpt) give a signed "less than or equal" result.
369 ble cgemvt_kernel_L999
372 ble cgemvt_kernel_L999
// Without the VFP procedure-call variant, A is fetched from its fp-relative slot.
374 #if !defined(__ARM_PCS_VFP)
377 ldr OLD_A, OLD_A_SOFTFP
// Fetch both increments from their fp-relative slots.
383 ldr INC_X , OLD_INC_X
384 ldr INC_Y , OLD_INC_Y
// Exit when the preceding tests (not visible here) report equality.
387 beq cgemvt_kernel_L999
390 beq cgemvt_kernel_L999
// Scale LDA by 16 or by 8; the two shifts are alternatives whose selecting
// conditional is not visible in this excerpt.
396 lsl LDA, LDA, #4 // LDA * SIZE
398 lsl LDA, LDA, #3 // LDA * SIZE
// A "not equal" result from either preceding comparison (not visible here)
// routes to the S2 path, which scales INC_X / INC_Y before use; otherwise
// execution falls through to cgemvt_kernel_F2_BEGIN.
402 bne cgemvt_kernel_S2_BEGIN
405 bne cgemvt_kernel_S2_BEGIN
// F path: loop skeleton only -- the kernel macro invocations and the counter
// updates between these branches are not visible in this excerpt.
408 cgemvt_kernel_F2_BEGIN:
// asrs sets the flags, so a halved count <= 0 skips straight to the F1 section.
413 asrs J, J, #1 // J = N / 2
414 ble cgemvt_kernel_F1_BEGIN
// Inner trip count for the 4-way body; when it is <= 0 go to the 1-way body.
427 asrs I, M, #2 // I = M / 4
428 ble cgemvt_kernel_F2X1
// 4-way inner loop: repeats while the flag-setting instruction before the bne
// (not visible here) leaves a non-zero result.
431 cgemvt_kernel_F2X4_10:
436 bne cgemvt_kernel_F2X4_10
// Skip the 1-way loop when its count is <= 0 (count computation not visible).
442 ble cgemvt_kernel_F2_END
// 1-way inner loop.
444 cgemvt_kernel_F2X1_10:
449 bne cgemvt_kernel_F2X1_10
// End of one outer iteration; branch back to cgemvt_kernel_F2X4 (label not
// visible in this excerpt) while the outer count is non-zero.
452 cgemvt_kernel_F2_END:
457 bne cgemvt_kernel_F2X4
// F1 section: same inner structure as above, entered from the F2 section.
460 cgemvt_kernel_F1_BEGIN:
// Nothing left to do -> common exit (the test feeding this ble is not visible).
464 ble cgemvt_kernel_L999
474 asrs I, M, #2 // I = M / 4
475 ble cgemvt_kernel_F1X1
// 4-way inner loop.
478 cgemvt_kernel_F1X4_10:
483 bne cgemvt_kernel_F1X4_10
489 ble cgemvt_kernel_F1_END
// 1-way inner loop.
491 cgemvt_kernel_F1X1_10:
496 bne cgemvt_kernel_F1X1_10
499 cgemvt_kernel_F1_END:
507 /*************************************************************************************************************/
// S path: reached from the dispatch branches near the top of the function.
// Loop skeleton only -- the kernel macro invocations and counter updates
// between these branches are not visible in this excerpt.
509 cgemvt_kernel_S2_BEGIN:
// Scale both increments by 16 or by 8. The #4 and #3 pairs are alternatives;
// their selecting conditional is not visible in this excerpt.
512 lsl INC_X, INC_X, #4 // INC_X * SIZE
513 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE
515 lsl INC_X, INC_X, #3 // INC_X * SIZE
516 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
// asrs sets the flags, so a halved count <= 0 skips straight to the S1 section.
522 asrs J, J, #1 // J = N / 2
523 ble cgemvt_kernel_S1_BEGIN
// Inner trip count for the 4-way body; when it is <= 0 go to the 1-way body.
536 asrs I, M, #2 // I = M / 4
537 ble cgemvt_kernel_S2X1
// 4-way inner loop.
540 cgemvt_kernel_S2X4_10:
545 bne cgemvt_kernel_S2X4_10
// Skip the 1-way loop when its count is <= 0 (count computation not visible).
551 ble cgemvt_kernel_S2_END
// 1-way inner loop.
553 cgemvt_kernel_S2X1_10:
558 bne cgemvt_kernel_S2X1_10
// End of one outer iteration; branch back to cgemvt_kernel_S2X4 (label not
// visible in this excerpt) while the outer count is non-zero.
561 cgemvt_kernel_S2_END:
566 bne cgemvt_kernel_S2X4
// S1 section: same inner structure as above, entered from the S2 section.
569 cgemvt_kernel_S1_BEGIN:
// Nothing left to do -> common exit (the test feeding this ble is not visible).
573 ble cgemvt_kernel_L999
583 asrs I, M, #2 // I = M / 4
584 ble cgemvt_kernel_S1X1
// 4-way inner loop.
587 cgemvt_kernel_S1X4_10:
592 bne cgemvt_kernel_S1X4_10
598 ble cgemvt_kernel_S1_END
// 1-way inner loop.
600 cgemvt_kernel_S1X1_10:
605 bne cgemvt_kernel_S1X1_10
608 cgemvt_kernel_S1_END:
614 /*************************************************************************************************************/
// Common exit (the cgemvt_kernel_L999 label and the setup of r3 are not visible
// in this excerpt). Reload the VFP registers spilled at entry; the two vldm
// lines are alternatives mirroring the d8-d15 / s8-s15 choice made there.
621 vldm r3, { d8 - d15 } // restore floating point registers
623 vldm r3, { s8 - s15 } // restore floating point registers
// The function always reports 0 in r0.
626 mov r0, #0 // set return value