1 /***************************************************************************
2 Copyright (c) 2013, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
28 /**************************************************************************************
34 **************************************************************************************/
// Names for caller-provided argument slots, addressed relative to fp.
// Two layouts are defined. The first group follows the
// "#if !defined(__ARM_PCS_VFP)" test below. The second group redefines
// OLD_LDA, OLD_INC_X and OLD_INC_Y at smaller offsets and presumably sits in
// the matching #else branch, whose directive lines are not visible in this
// view -- TODO confirm.
41 #if !defined(__ARM_PCS_VFP)
// This layout also names slots for alpha (real and imaginary parts, 8 bytes
// apart) and for the A pointer.
42 #define OLD_ALPHAR [fp, #0 ]
43 #define OLD_ALPHAI [fp, #8 ]
// OLD_A_SOFTFP is the slot the function body reads with "ldr OLD_A, OLD_A_SOFTFP".
44 #define OLD_A_SOFTFP [fp, #16]
45 #define OLD_LDA [fp, #20]
// OLD_INC_X / OLD_INC_Y are the slots loaded into INC_X / INC_Y in the function body.
47 #define OLD_INC_X [fp, #28]
49 #define OLD_INC_Y [fp, #36]
// Second layout: the same three names, starting at offset 0.
51 #define OLD_LDA [fp, #0 ]
53 #define OLD_INC_X [fp, #8 ]
55 #define OLD_INC_Y [fp, #16 ]
// Names for local slots at negative offsets from fp.
// Presumably these lie inside the area reserved by "sub sp, sp, #STACKSIZE";
// STACKSIZE itself is not visible in this view -- TODO confirm.
// FP_ZERO and FP_ZERO_0 name the same address; FP_ZERO_1 is 4 bytes above it.
74 #define FP_ZERO [fp, #-228]
75 #define FP_ZERO_0 [fp, #-228]
76 #define FP_ZERO_1 [fp, #-224]
// ALPHA_I and ALPHA_R are 8 bytes apart, i.e. one double each.
79 #define ALPHA_I [fp, #-236]
80 #define ALPHA_R [fp, #-244]
// M and A are 4 bytes apart, i.e. one 32-bit word each.
82 #define M [fp, #-252 ]
83 #define A [fp, #-256 ]
90 /**************************************************************************************/
// Selects the multiply-accumulate mnemonic behind each FMAC_* / KMAC_* name,
// depending on whether CONJ and/or XCONJ is defined.
// fmacd adds the product to the destination and fnmacd subtracts it (these
// are the legacy VFP mnemonics for VMLA.F64 / VMLS.F64), so each variant
// fixes the sign of one partial product of a complex multiply.
// Only some of the defines in each branch are visible in this view.
//
// Variant 1: neither CONJ nor XCONJ.
92 #if !defined(CONJ) && !defined(XCONJ)
98 #define FMAC_R2 fnmacd
100 #define FMAC_I2 fmacd
// Variant 2: CONJ only.
102 #elif defined(CONJ) && !defined(XCONJ)
105 #define KMAC_I fnmacd
107 #define FMAC_R1 fmacd
108 #define FMAC_R2 fnmacd
109 #define FMAC_I1 fmacd
110 #define FMAC_I2 fmacd
// Variant 3: XCONJ only.
112 #elif !defined(CONJ) && defined(XCONJ)
115 #define KMAC_I fnmacd
117 #define FMAC_R1 fmacd
118 #define FMAC_R2 fmacd
119 #define FMAC_I1 fnmacd
120 #define FMAC_I2 fmacd
// Remaining variant: presumably the #else branch (both CONJ and XCONJ
// defined); the directive line is not visible here -- TODO confirm.
124 #define KMAC_R fnmacd
127 #define FMAC_R1 fmacd
128 #define FMAC_R2 fmacd
129 #define FMAC_I1 fnmacd
130 #define FMAC_I2 fmacd
// Fragments of kernel macro bodies. The .macro/.endm lines and many
// instructions between the lines below are not visible in this view.
//
// Loads of 8-byte doubles from fixed offsets off the address in AO1.
168 fldd d1 , [ AO1, #8 ]
170 fldd d2 , [ AO1, #16 ]
172 fldd d3 , [ AO1, #24 ]
// d0-d3 are loaded again from the next 32 bytes.
174 fldd d0 , [ AO1, #32 ]
180 fldd d1 , [ AO1, #40 ]
183 fldd d2 , [ AO1, #48 ]
186 fldd d3 , [ AO1, #56 ]
// Prefetch hint for the address AO2 + A_PRE + 32.
188 pld [ AO2, #A_PRE+32 ]
// Read four doubles at YO into d4-d7 (no write-back, so YO is unchanged).
207 fldmiad YO, { d4 - d7 }
// Accumulate the four partial products of (d0,d1) * (d10,d11) into (d6,d7).
// The sign of each term comes from the FMAC_* selection.
214 FMAC_R1 d6 , d0 , d10
215 FMAC_I1 d7 , d0 , d11
216 FMAC_R2 d6 , d1 , d11
217 FMAC_I2 d7 , d1 , d10
// Write d4-d7 back and advance YO by 32 bytes ("!" requests write-back).
219 fstmiad YO!, { d4 - d7 }
// Same pattern for the next four doubles at YO.
221 fldmiad YO, { d4 - d7 }
// (d4,d5) accumulates (d0,d1) * (d12,d13).
223 FMAC_R1 d4 , d0 , d12
224 FMAC_I1 d5 , d0 , d13
225 FMAC_R2 d4 , d1 , d13
226 FMAC_I2 d5 , d1 , d12
// (d6,d7) accumulates (d0,d1) * (d14,d15).
228 FMAC_R1 d6 , d0 , d14
229 FMAC_I1 d7 , d0 , d15
230 FMAC_R2 d6 , d1 , d15
231 FMAC_I2 d7 , d1 , d14
233 fstmiad YO!, { d4 - d7 }
// Presumably from a different macro that handles a single element;
// the macro boundary is not visible -- TODO confirm.
250 fldd d1 , [ AO1, #8 ]
// Read and write back two doubles at YO; neither instruction moves YO.
272 fldmiad YO, { d4 - d5 }
279 fstmiad YO, { d4 - d5 }
285 /****************************************************************************************/
// Fragments of a second group of kernel macro bodies. The .macro/.endm lines
// and many instructions between the lines below are not visible in this view.
// Here every load/store of YO moves two doubles and uses no write-back.
// Any instruction that advances YO between them is not visible -- TODO confirm.
//
// Loads of 8-byte doubles from fixed offsets off the address in AO1.
312 fldd d1 , [ AO1, #8 ]
313 fldd d2 , [ AO1, #16 ]
314 fldd d3 , [ AO1, #24 ]
// d0-d3 are loaded again from the next 32 bytes.
329 fldd d0 , [ AO1, #32 ]
330 fldd d1 , [ AO1, #40 ]
331 fldd d2 , [ AO1, #48 ]
332 fldd d3 , [ AO1, #56 ]
// First pair: read into (d4,d5), write back (update code not visible).
355 fldmiad YO, { d4 - d5 }
362 fstmiad YO, { d4 - d5 }
// Second pair: (d6,d7) accumulates the partial products of (d0,d1) * (d10,d11).
// The sign of each term comes from the FMAC_* selection.
366 fldmiad YO, { d6 - d7 }
368 FMAC_R1 d6 , d0 , d10
369 FMAC_I1 d7 , d0 , d11
370 FMAC_R2 d6 , d1 , d11
371 FMAC_I2 d7 , d1 , d10
373 fstmiad YO, { d6 - d7 }
// Third pair: (d4,d5) accumulates (d0,d1) * (d12,d13).
377 fldmiad YO, { d4 - d5 }
379 FMAC_R1 d4 , d0 , d12
380 FMAC_I1 d5 , d0 , d13
381 FMAC_R2 d4 , d1 , d13
382 FMAC_I2 d5 , d1 , d12
384 fstmiad YO, { d4 - d5 }
// Fourth pair: (d6,d7) accumulates (d0,d1) * (d14,d15).
388 fldmiad YO, { d6 - d7 }
390 FMAC_R1 d6 , d0 , d14
391 FMAC_I1 d7 , d0 , d15
392 FMAC_R2 d6 , d1 , d15
393 FMAC_I2 d7 , d1 , d14
395 fstmiad YO, { d6 - d7 }
// Presumably from a different macro that handles a single element;
// the macro boundary is not visible -- TODO confirm.
414 fldd d1 , [ AO1, #8 ]
436 fldmiad YO, { d4 - d5 }
443 fstmiad YO, { d4 - d5 }
451 /**************************************************************************************
452 * End of macro definitions
453 **************************************************************************************/
// Function entry fragment: reserve the local frame, save FP registers, leave
// early on some conditions, fetch stack arguments, scale LDA and pick a path.
// The entry label, register pushes and all compare instructions are not
// visible in this view.
460 sub sp, sp, #STACKSIZE // reserve stack
// Two alternative saves to the address in r12 (r12 is set on a line not
// shown). Presumably exactly one is assembled, chosen by a preprocessor
// conditional that is not visible -- TODO confirm.
// NOTE(review): s8-s15 overlay d4-d7, not d8-d15, so the two alternatives do
// not cover the same registers. The kernel macros in this file use d10-d15.
465 vstm r12, { d8 - d15 } // store floating point registers
467 vstm r12, { s8 - s15 } // store floating point registers
// Early exits to zgemvn_kernel_L999 on "less than or equal". The compares
// that set the flags are not visible (presumably M <= 0 and N <= 0).
475 ble zgemvn_kernel_L999
478 ble zgemvn_kernel_L999
// Without the VFP procedure-call standard, the A pointer is read from its stack slot.
480 #if !defined(__ARM_PCS_VFP)
483 ldr OLD_A, OLD_A_SOFTFP
// Fetch the increments from their stack slots.
492 ldr INC_X , OLD_INC_X
493 ldr INC_Y , OLD_INC_Y
// Exits on "equal". The compares are not visible (presumably a zero
// increment -- TODO confirm).
496 beq zgemvn_kernel_L999
499 beq zgemvn_kernel_L999
// Two alternative scalings of LDA: << 4 multiplies by 16 and << 3 by 8.
// Presumably selected by element precision; the conditional is not visible.
505 lsl LDA, LDA, #4 // LDA * SIZE * 2
507 lsl LDA, LDA, #3 // LDA * SIZE * 2
// Branch to the S4 path on "not equal"; otherwise fall through towards the
// F4 path. The compares are not visible (presumably INC_X / INC_Y against 1).
511 bne zgemvn_kernel_S4_BEGIN
514 bne zgemvn_kernel_S4_BEGIN
// F path loop skeleton. Only labels, shifts and branches are visible here.
// The macro invocations, counter updates and several branch-target labels
// (F4X4, F4X1, F1X1) lie on lines not shown.
517 zgemvn_kernel_F4_BEGIN:
// asrs sets the flags, so the ble below is taken when M / 4 is not positive.
522 asrs I, I, #2 // I = M / 4
523 ble zgemvn_kernel_F1_BEGIN
// Fewer than four columns: skip the four-column inner loop.
539 asrs J, N, #2 // J = N / 4
540 ble zgemvn_kernel_F4X1
// Inner loop over groups of four columns. The decrement feeding bne is not visible.
543 zgemvn_kernel_F4X4_10:
548 bne zgemvn_kernel_F4X4_10
// Presumably tests for leftover columns (N mod 4) before the single-column
// loop -- TODO confirm.
554 ble zgemvn_kernel_F4_END
556 zgemvn_kernel_F4X1_10:
561 bne zgemvn_kernel_F4X1_10
// End of one block of four rows; loop back while the row counter is non-zero.
564 zgemvn_kernel_F4_END:
569 bne zgemvn_kernel_F4X4
// Remaining rows handled one at a time; exit when the compare gives "less or equal".
572 zgemvn_kernel_F1_BEGIN:
576 ble zgemvn_kernel_L999
591 zgemvn_kernel_F1X1_10:
596 bne zgemvn_kernel_F1X1_10
599 zgemvn_kernel_F1_END:
604 bne zgemvn_kernel_F1X1
610 /*************************************************************************************************************/
// S path loop skeleton, reached from the prologue's "bne zgemvn_kernel_S4_BEGIN".
// Only labels, shifts and branches are visible here. The macro invocations,
// counter updates and several branch-target labels (S4X4, S4X1, S1X1) lie
// on lines not shown.
612 zgemvn_kernel_S4_BEGIN:
// Two alternative scalings of the increments: << 4 multiplies by 16 and
// << 3 by 8. Presumably selected by element precision; the conditional is
// not visible -- TODO confirm.
615 lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
616 lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
618 lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
619 lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
// asrs sets the flags, so the ble below is taken when M / 4 is not positive.
625 asrs I, I, #2 // I = M / 4
626 ble zgemvn_kernel_S1_BEGIN
// Fewer than four columns: skip the four-column inner loop.
639 asrs J, N, #2 // J = N / 4
640 ble zgemvn_kernel_S4X1
// Inner loop over groups of four columns. The decrement feeding bne is not visible.
643 zgemvn_kernel_S4X4_10:
648 bne zgemvn_kernel_S4X4_10
// Presumably tests for leftover columns (N mod 4) before the single-column
// loop -- TODO confirm.
654 ble zgemvn_kernel_S4_END
656 zgemvn_kernel_S4X1_10:
661 bne zgemvn_kernel_S4X1_10
// End of one block of four rows; loop back while the row counter is non-zero.
664 zgemvn_kernel_S4_END:
669 bne zgemvn_kernel_S4X4
// Remaining rows handled one at a time; exit when the compare gives "less or equal".
672 zgemvn_kernel_S1_BEGIN:
676 ble zgemvn_kernel_L999
691 zgemvn_kernel_S1X1_10:
696 bne zgemvn_kernel_S1X1_10
699 zgemvn_kernel_S1_END:
704 bne zgemvn_kernel_S1X1
707 /*************************************************************************************************************/
// Exit fragment: reload the saved FP registers and set the return value.
// The zgemvn_kernel_L999 label, the computation of r3 and the final
// stack/return instructions are not visible in this view.
// Two alternative restores mirror the two alternative saves at entry.
// Presumably exactly one is assembled -- TODO confirm. The save used r12 as
// the base and the restore uses r3, so r3 must hold the same address here.
714 vldm r3, { d8 - d15 } // restore floating point registers
716 vldm r3, { s8 - s15 } // restore floating point registers
719 mov r0, #0 // set return value