1 /***************************************************************************
2 Copyright (c) 2013, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *****************************************************************************/
28 /**************************************************************************************
34 **************************************************************************************/
45 #define OLD_ALPHA_R s0
46 #define OLD_ALPHA_I s1
48 /******************************************************
49 * [fp, #-128] - [fp, #-64] is reserved
50 * for store and restore of floating point registers
52 *******************************************************/
54 #define A [fp, #-248 ]
55 #define LDC [fp, #-252 ]
56 #define M [fp, #-256 ]
57 #define N [fp, #-260 ]
58 #define K [fp, #-264 ]
60 #define FP_ZERO [fp, #-240]
61 #define FP_ZERO_0 [fp, # -240]
62 #define FP_ZERO_1 [fp, # -236]
64 #define ALPHA_I [fp, #-272]
65 #define ALPHA_R [fp, #-280]
67 #if !defined(__ARM_PCS_VFP)
68 #define OLD_ALPHAR_SOFTFP r3
69 #define OLD_ALPHAI_SOFTFP [fp, #4]
70 #define OLD_A_SOFTFP [fp, #8 ]
73 #define OLD_LDC [fp, #20 ]
77 #define OLD_LDC [fp, #12 ]
99 /**************************************************************************************
101 **************************************************************************************/
104 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
106 #define KMAC_R vmls.f32
109 #define FMAC_R1 fmacs
110 #define FMAC_R2 vmls.f32
111 #define FMAC_I1 fmacs
112 #define FMAC_I2 fmacs
114 #elif defined(CN) || defined(CT)
117 #define KMAC_I vmls.f32
119 #define FMAC_R1 fmacs
120 #define FMAC_R2 vmls.f32
121 #define FMAC_I1 fmacs
122 #define FMAC_I2 fmacs
124 #elif defined(NC) || defined(TC)
127 #define KMAC_I vmls.f32
129 #define FMAC_R1 fmacs
130 #define FMAC_R2 fmacs
131 #define FMAC_I1 vmls.f32
132 #define FMAC_I2 fmacs
136 #define KMAC_R vmls.f32
139 #define FMAC_R1 fmacs
140 #define FMAC_R2 fmacs
141 #define FMAC_I1 vmls.f32
142 #define FMAC_I2 fmacs
148 /**************************************************************************************
150 **************************************************************************************/
168 fldmias AO!, { s0 - s3 }
170 fldmias BO!, { s4 - s7 }
200 fldmias AO!, { s0 - s3 }
202 fldmias BO!, { s4 - s7 }
228 fldmias AO!, { s0 - s3 }
229 fldmias BO!, { s4 - s7 }
257 fldmias AO!, { s0 - s3 }
258 fldmias BO!, { s4 - s7 }
285 fldmias AO!, { s0 - s3 }
286 fldmias BO!, { s4 - s7 }
320 fldmias CO1, { s4 - s7 }
327 FMAC_R1 s6 , s0 , s10
328 FMAC_I1 s7 , s0 , s11
329 FMAC_R2 s6 , s1 , s11
330 FMAC_I2 s7 , s1 , s10
332 fstmias CO1, { s4 - s7 }
334 fldmias CO2, { s4 - s7 }
336 FMAC_R1 s4 , s0 , s12
337 FMAC_I1 s5 , s0 , s13
338 FMAC_R2 s4 , s1 , s13
339 FMAC_I2 s5 , s1 , s12
341 FMAC_R1 s6 , s0 , s14
342 FMAC_I1 s7 , s0 , s15
343 FMAC_R2 s6 , s1 , s15
344 FMAC_I2 s7 , s1 , s14
346 fstmias CO2, { s4 - s7 }
352 /******************************************************************************/
372 flds s7 , [ BO, #12 ]
399 flds s7 , [ BO, #12 ]
424 flds s7 , [ BO, #12 ]
451 flds s7 , [ BO, #12 ]
476 flds s7 , [ BO, #12 ]
503 fldmias CO1, { s4 - s5 }
510 fstmias CO1, { s4 - s5 }
512 fldmias CO2, { s4 - s5 }
514 FMAC_R1 s4 , s0 , s12
515 FMAC_I1 s5 , s0 , s13
516 FMAC_R2 s4 , s1 , s13
517 FMAC_I2 s5 , s1 , s12
519 fstmias CO2, { s4 - s5 }
526 /******************************************************************************/
543 flds s3 , [ AO, #12 ]
570 flds s3 , [ AO, #12 ]
595 flds s3 , [ AO, #12 ]
622 flds s3 , [ AO, #12 ]
647 flds s3 , [ AO, #12 ]
674 fldmias CO1, { s4 - s7 }
681 FMAC_R1 s6 , s0 , s10
682 FMAC_I1 s7 , s0 , s11
683 FMAC_R2 s6 , s1 , s11
684 FMAC_I2 s7 , s1 , s10
686 fstmias CO1, { s4 - s7 }
693 /******************************************************************************/
803 fldmias CO1, { s4 - s5 }
810 fstmias CO1, { s4 - s5 }
816 /**************************************************************************************
817 * End of macro definitions
818 **************************************************************************************/
826 sub sp, sp, #STACKSIZE // reserve stack
828 #if !defined(__ARM_PCS_VFP)
829 vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP // non-VFP calling convention: copy r3 into s0 (OLD_ALPHA_R)
830 vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP // load s1 (OLD_ALPHA_I) from [fp, #4]
831 ldr OLD_A, OLD_A_SOFTFP // load OLD_A from [fp, #8]
837 vstr OLD_ALPHA_R, ALPHA_R // save s0 to the ALPHA_R slot at [fp, #-280]
838 vstr OLD_ALPHA_I, ALPHA_I // save s1 to the ALPHA_I slot at [fp, #-272]
841 vstm r3, { s8 - s15} // store floating point registers
848 lsl r3, r3, #3 // ldc = ldc * 4 * 2 (shift left by 3 = multiply by 8)
855 asrs J, J, #1 // J = J / 2
856 ble cgemm_kernel_L1_BEGIN // branch when the shifted J is zero or negative (ble; assumes V is clear)
858 cgemm_kernel_L2_BEGIN:
860 ldr CO1, C // CO1 = C
862 lsl r4 , r4 , #1 // LDC * 2
864 str r3 , C // store r3 back into C
872 cgemm_kernel_L2_M2_BEGIN:
875 asrs I, I, #1 // I = I / 2
876 ble cgemm_kernel_L2_M1_BEGIN // branch when the shifted I is zero or negative (ble; assumes V is clear)
878 cgemm_kernel_L2_M2_20:
882 asrs L , K1, #3 // L = K1 / 8 (arithmetic shift right by 3)
884 blt cgemm_kernel_L2_M2_30
901 cgemm_kernel_L2_M2_22:
914 bgt cgemm_kernel_L2_M2_22
926 b cgemm_kernel_L2_M2_44 // continue at the K1 & 7 remainder handling
929 cgemm_kernel_L2_M2_30:
931 ble cgemm_kernel_L2_M2_40
934 ble cgemm_kernel_L2_M2_32
957 b cgemm_kernel_L2_M2_44 // continue at the K1 & 7 remainder handling
959 cgemm_kernel_L2_M2_32:
962 ble cgemm_kernel_L2_M2_40
974 b cgemm_kernel_L2_M2_44 // continue at the K1 & 7 remainder handling
977 cgemm_kernel_L2_M2_40:
982 cgemm_kernel_L2_M2_44:
984 ands L , K1, #7 // L = K1 & 7 (K1 % 8 for non-negative K1)
985 ble cgemm_kernel_L2_M2_100 // remainder is zero: skip the _46 loop
987 cgemm_kernel_L2_M2_46:
992 bne cgemm_kernel_L2_M2_46
994 cgemm_kernel_L2_M2_100:
998 cgemm_kernel_L2_M2_END:
1001 bne cgemm_kernel_L2_M2_20
1004 cgemm_kernel_L2_M1_BEGIN:
1007 tst I, #1 // set flags from I & 1; I itself is not modified
1008 beq cgemm_kernel_L2_END // bit 0 of I clear: branch to cgemm_kernel_L2_END (beq reads only Z; ble would also read V, which tst does not update)
1010 cgemm_kernel_L2_M1_20:
1015 asrs L , K1, #3 // L = K1 / 8 (arithmetic shift right by 3)
1016 ble cgemm_kernel_L2_M1_40 // K1 / 8 is zero or negative: go straight to the remainder handling
1018 cgemm_kernel_L2_M1_22:
1031 bgt cgemm_kernel_L2_M1_22
1034 cgemm_kernel_L2_M1_40:
1036 ands L , K1, #7 // L = K1 & 7 (K1 % 8 for non-negative K1)
1037 ble cgemm_kernel_L2_M1_100 // remainder is zero: skip the _42 loop
1039 cgemm_kernel_L2_M1_42:
1044 bgt cgemm_kernel_L2_M1_42
1046 cgemm_kernel_L2_M1_100:
1051 cgemm_kernel_L2_END:
1055 lsl r4, r4, #4 // k * 2 * 4 * 2 (shift left by 4 = multiply by 16)
1056 add r3, r3, r4 // B = B + K * 2 * 8
1060 bgt cgemm_kernel_L2_BEGIN
1064 /*********************************************************************************************/
1066 cgemm_kernel_L1_BEGIN:
1070 ble cgemm_kernel_L999
1073 ldr CO1, C // CO1 = C
1076 str r3 , C // store r3 back into C
1080 cgemm_kernel_L1_M2_BEGIN:
1083 asrs I, I, #1 // I = I / 2
1084 ble cgemm_kernel_L1_M1_BEGIN // branch when the shifted I is zero or negative (ble; assumes V is clear)
1086 cgemm_kernel_L1_M2_20:
1090 asrs L , K1, #3 // L = K1 / 8 (arithmetic shift right by 3)
1092 blt cgemm_kernel_L1_M2_30
1109 cgemm_kernel_L1_M2_22:
1122 bgt cgemm_kernel_L1_M2_22
1134 b cgemm_kernel_L1_M2_44 // continue at the K1 & 7 remainder handling
1137 cgemm_kernel_L1_M2_30:
1139 ble cgemm_kernel_L1_M2_40
1142 ble cgemm_kernel_L1_M2_32
1165 b cgemm_kernel_L1_M2_44 // continue at the K1 & 7 remainder handling
1167 cgemm_kernel_L1_M2_32:
1170 ble cgemm_kernel_L1_M2_40
1182 b cgemm_kernel_L1_M2_44 // continue at the K1 & 7 remainder handling
1185 cgemm_kernel_L1_M2_40:
1190 cgemm_kernel_L1_M2_44:
1192 ands L , K1, #7 // L = K1 & 7 (K1 % 8 for non-negative K1)
1193 ble cgemm_kernel_L1_M2_100 // remainder is zero: skip the _46 loop
1195 cgemm_kernel_L1_M2_46:
1200 bne cgemm_kernel_L1_M2_46
1202 cgemm_kernel_L1_M2_100:
1206 cgemm_kernel_L1_M2_END:
1209 bne cgemm_kernel_L1_M2_20
1212 cgemm_kernel_L1_M1_BEGIN:
1215 tst I, #1 // set flags from I & 1; I itself is not modified
1216 beq cgemm_kernel_L1_END // bit 0 of I clear: branch to cgemm_kernel_L1_END (beq reads only Z; ble would also read V, which tst does not update)
1218 cgemm_kernel_L1_M1_20:
1223 asrs L , K1, #3 // L = K1 / 8 (arithmetic shift right by 3)
1224 ble cgemm_kernel_L1_M1_40 // K1 / 8 is zero or negative: go straight to the remainder handling
1226 cgemm_kernel_L1_M1_22:
1239 bgt cgemm_kernel_L1_M1_22
1242 cgemm_kernel_L1_M1_40:
1244 ands L , K1, #7 // L = K1 & 7 (K1 % 8 for non-negative K1)
1245 ble cgemm_kernel_L1_M1_100 // remainder is zero: skip the _42 loop
1247 cgemm_kernel_L1_M1_42:
1252 bgt cgemm_kernel_L1_M1_42
1254 cgemm_kernel_L1_M1_100:
1259 cgemm_kernel_L1_END:
1266 vldm r3, { s8 - s15} // restore floating point registers
1268 movs r0, #0 // set return value: r0 = 0