2 * Copyright (c) 2017 ARM Limited.
4 * SPDX-License-Identifier: MIT
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// Primary declaration of the 4x16 SGEMM kernel template. The `tail` template
// argument selects one of the explicit specialisations that follow; the ones
// visible in this file set TAIL_SIZE to 0, 1 and 2 and derive the number of
// 4-wide steps along K as (K - TAIL_SIZE) / 4.
// NOTE(review): presumably callers pass tail == K % 4 - confirm at the call site.
//
// a, b          : input matrices (only read)
// c             : output matrix (written)
// M, K, N       : problem dimensions
// *_row_stride  : row strides of a, b and c, counted in float elements
25 template <const unsigned int tail>
26 inline void sgemm_4x16_impl(
27 const float* const a, const float* const b, float *c,
28 const int M, const int K, const int N,
29 const int a_row_stride,
30 const int b_row_stride,
31 const int c_row_stride
// Specialisation of sgemm_4x16_impl for tail == 0 (see TAIL_SIZE below).
//
// The output is walked in tiles of M_BLOCK (4) rows by N_BLOCK (16) columns.
// For every tile an AArch64 NEON inline-assembly kernel accumulates products
// with FMLA into sixteen 4-float vector registers (v0-v15) and finally stores
// them through cptr with STP.
//
// Parameters:
//   a, b          - input matrices, read through aptr / bptr
//   c             - output matrix, written through cptr
//   M, N          - determine the number of row / column tiles
//   K             - determines the number of 4-wide steps k = (K - TAIL_SIZE) / 4
//   a_row_stride, b_row_stride, c_row_stride
//                 - row strides in float elements; multiplied by sizeof(float)
//                   before being passed to the assembly as byte strides
//
// NOTE(review): the stores below always write a whole 4x16 tile and the tile
// counts come from iceildiv (defined elsewhere, presumably a ceiling division),
// so this appears to rely on M and N being padded to multiples of 4 and 16 -
// confirm against the callers.
// NOTE(review): the line numbers embedded in this listing skip in places, so
// some statements of this definition (braces, asm opener, branches, labels,
// part of the operand lists) are not shown here.
35 inline void sgemm_4x16_impl<0>(
36 const float* const a, const float* const b, float *c,
37 const int M, const int K, const int N,
38 const int a_row_stride,
39 const int b_row_stride,
40 const int c_row_stride
42 const int TAIL_SIZE = 0;
43 const int M_BLOCK = 4;
44 const int N_BLOCK = 16;
46 const int m_blocks = iceildiv(M, M_BLOCK);
47 const int n_blocks = iceildiv(N, N_BLOCK);
49 // For each block of output rows
50 for (int mblock = 0; mblock < m_blocks; mblock++) {
51 // For each block of output columns
52 for (int nblock = 0; nblock < n_blocks; nblock++) {
// Tile origin: aptr -> first of the 4 rows of a, bptr -> first of the 16
// columns of b, cptr -> top-left element of the output tile.
53 const float *aptr = a + mblock*M_BLOCK*a_row_stride;
54 const float *bptr = b + nblock*N_BLOCK;
55 float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
// Number of 4-wide steps along K (integer division).
56 int k = (K - TAIL_SIZE) / 4;
// Symbolic register names. vCrc / qCrc: accumulator for tile row r and
// 4-float column group c (v0-v15). vAr / qAr / dAr / sAr: values of A for
// tile row r (v16-v19). vBc / qBc: column group c of the current row of B
// (v20-v23).
62 "vC11 .req v0\n" "vC12 .req v1\n" "vC13 .req v2\n" "vC14 .req v3\n"
63 "qC11 .req q0\n" "qC12 .req q1\n" "qC13 .req q2\n" "qC14 .req q3\n"
64 "vC21 .req v4\n" "vC22 .req v5\n" "vC23 .req v6\n" "vC24 .req v7\n"
65 "qC21 .req q4\n" "qC22 .req q5\n" "qC23 .req q6\n" "qC24 .req q7\n"
66 "vC31 .req v8\n" "vC32 .req v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
67 "qC31 .req q8\n" "qC32 .req q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
68 "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
69 "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
70 "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
71 "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
72 "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
73 "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
74 "vB1 .req v20\n" "qB1 .req q20\n"
75 "vB2 .req v21\n" "qB2 .req q21\n"
76 "vB3 .req v22\n" "qB3 .req q22\n"
77 "vB4 .req v23\n" "qB4 .req q23\n"
79 // Clear accumulators, initialise pointers
// aptr2..aptr4 address the three rows of A after aptr, each one
// a_row_stride_bytes further on.
// NOTE(review): the aptr2-4 names are introduced on lines not shown here; the
// x20-x22 entries in the clobber list suggest they alias those registers.
81 "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
83 "add aptr3, aptr2, %x[a_row_stride_bytes]\n"
85 "add aptr4, aptr3, %x[a_row_stride_bytes]\n"
// Preload four K-values of A rows 1 and 2 (post-incrementing each pointer by
// 16 bytes) and the first three column groups of the current row of B.
87 "ldr qA1, [%x[aptr]], #0x10\n"
89 "ldr qA2, [ aptr2], #0x10\n"
91 "ldr qB1, [%x[bptr], #0x00]\n"
93 "ldr qB2, [%x[bptr], #0x10]\n"
95 "ldr qB3, [%x[bptr], #0x20]\n"
// Decrement the step counter; SUBS also updates the condition flags.
104 "subs %x[k], %x[k], #1\n"
// K-step on lane 0 of every A vector: 16 FMLAs (4 rows x 4 column groups).
// Interleaved with them are the loads of A rows 3-4 and of B group 4, the
// advance of bptr to the next row of B, and the loads of that row's groups 1-3.
108 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
109 "ldr qA3, [ aptr3], #0x10\n"
110 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
111 "ldr qA4, [ aptr4], #0x10\n"
112 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
113 "ldr qB4, [%x[bptr], #0x30]\n"
114 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
115 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
116 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
117 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
118 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
119 "ldr qB1, [%x[bptr], #0x00]\n"
120 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
121 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
122 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
123 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
124 "ldr qB2, [%x[bptr], #0x10]\n"
125 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
126 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
127 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
128 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
129 "ldr qB3, [%x[bptr], #0x20]\n"
130 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
// K-step on lane 1 of the A vectors, with the next row of B loaded alongside.
132 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
133 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
134 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
135 "ldr qB4, [%x[bptr], #0x30]\n"
136 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
137 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
138 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
139 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
140 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
141 "ldr qB1, [%x[bptr], #0x00]\n"
142 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
143 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
144 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
145 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
146 "ldr qB2, [%x[bptr], #0x10]\n"
147 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
148 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
149 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
150 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
151 "ldr qB3, [%x[bptr], #0x20]\n"
152 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
// K-step on lane 2 of the A vectors.
154 "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
155 "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
156 "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
157 "ldr qB4, [%x[bptr], #0x30]\n"
158 "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
159 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
160 "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
161 "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
162 "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
163 "ldr qB1, [%x[bptr], #0x00]\n"
164 "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
165 "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
166 "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
167 "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
168 "ldr qB2, [%x[bptr], #0x10]\n"
169 "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
170 "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
171 "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
172 "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
173 "ldr qB3, [%x[bptr], #0x20]\n"
174 "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
// K-step on lane 3 of the A vectors. The step counter is decremented here and,
// once vA1 / vA2 have been used for the last time, they are reloaded with the
// next four K-values of A rows 1 and 2.
176 "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
177 "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
178 "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
179 "ldr qB4, [%x[bptr], #0x30]\n"
180 "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
181 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
182 "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
183 "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
184 "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
185 "ldr qB1, [%x[bptr], #0x00]\n"
186 "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
187 "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
188 "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
189 "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
190 "ldr qB2, [%x[bptr], #0x10]\n"
191 "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
192 "subs %x[k], %x[k], #1\n"
193 "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
194 "ldr qA1, [%x[aptr]], #0x10\n"
195 "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
196 "ldr qA2, [ aptr2], #0x10\n"
197 "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
198 "ldr qB3, [%x[bptr], #0x20]\n"
199 "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
// Second copy of the four-lane sequence, ending in the stores to C.
// NOTE(review): presumably this is the final, non-looping pass; the loop
// branch and labels that separate it from the code above are not shown here.
// Lane 0:
203 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
204 "ldr qA3, [ aptr3], #0x10\n"
205 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
206 "ldr qA4, [ aptr4], #0x10\n"
207 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
208 "ldr qB4, [%x[bptr], #0x30]\n"
209 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
210 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
211 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
212 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
213 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
214 "ldr qB1, [%x[bptr], #0x00]\n"
215 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
216 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
217 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
218 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
219 "ldr qB2, [%x[bptr], #0x10]\n"
220 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
221 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
222 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
223 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
224 "ldr qB3, [%x[bptr], #0x20]\n"
225 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
// Lane 1:
227 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
228 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
229 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
230 "ldr qB4, [%x[bptr], #0x30]\n"
231 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
232 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
233 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
234 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
235 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
236 "ldr qB1, [%x[bptr], #0x00]\n"
237 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
238 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
239 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
240 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
241 "ldr qB2, [%x[bptr], #0x10]\n"
242 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
243 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
244 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
245 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
246 "ldr qB3, [%x[bptr], #0x20]\n"
247 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
// Lane 2:
249 "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
250 "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
251 "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
252 "ldr qB4, [%x[bptr], #0x30]\n"
253 "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
254 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
255 "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
256 "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
257 "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
258 "ldr qB1, [%x[bptr], #0x00]\n"
259 "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
260 "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
261 "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
262 "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
263 "ldr qB2, [%x[bptr], #0x10]\n"
264 "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
265 "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
266 "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
267 "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
268 "ldr qB3, [%x[bptr], #0x20]\n"
269 "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
// Lane 3, reordered row by row: as soon as a row's four accumulators are
// complete they are written with two STPs (byte offsets 0x00 and 0x20) and
// cptr is advanced by one row of C. No further B row is loaded here.
271 "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
272 "ldr qB4, [%x[bptr], #0x30]\n"
273 "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
274 "stp qC11, qC12, [%x[cptr], #0x00]\n"
275 "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
276 "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
277 "stp qC13, qC14, [%x[cptr], #0x20]\n"
278 "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
279 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
280 "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
281 "stp qC21, qC22, [%x[cptr], #0x00]\n"
282 "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
283 "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
284 "stp qC23, qC24, [%x[cptr], #0x20]\n"
285 "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
286 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
287 "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
288 "stp qC31, qC32, [%x[cptr], #0x00]\n"
289 "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
290 "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
291 "stp qC33, qC34, [%x[cptr], #0x20]\n"
292 "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
293 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
294 "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
295 "stp qC41, qC42, [%x[cptr], #0x00]\n"
296 "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
297 "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
298 "stp qC43, qC44, [%x[cptr], #0x20]\n"
299 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
// Drop the symbolic register names again; the other specialisations in this
// file define the same names in their own asm statements.
301 ".unreq vB4\n" ".unreq qB4\n"
302 ".unreq vB3\n" ".unreq qB3\n"
303 ".unreq vB2\n" ".unreq qB2\n"
304 ".unreq vB1\n" ".unreq qB1\n"
305 ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
306 ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
307 ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
308 ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
309 ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
310 ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
311 ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
312 ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
313 ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
314 ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
315 ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
316 ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
// Read-write operands: aptr is modified by the post-incrementing loads.
// NOTE(review): the remaining read-write operands (bptr, cptr, k are all
// written by the assembly) are on lines not shown here.
321 : [aptr] "+r" (aptr),
// Inputs: the row strides converted from elements to bytes.
325 : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
326 [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
327 [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
// Clobbers: condition flags (SUBS), memory (STP stores), x20-x22 and the
// vector registers named above.
328 : "cc", "memory", "x20", "x21", "x22",
329 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
330 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
// Specialisation of sgemm_4x16_impl for tail == 1 (see TAIL_SIZE below).
//
// The output is walked in tiles of M_BLOCK (4) rows by N_BLOCK (16) columns.
// For every tile an AArch64 NEON inline-assembly kernel accumulates products
// with FMLA into sixteen 4-float vector registers (v0-v15). After the 4-wide
// steps along K it performs one extra single-element K step (A is loaded one
// float at a time through sA1..sA4 and only lane 0 is used), during which the
// accumulators are stored through cptr with STP.
//
// Parameters:
//   a, b          - input matrices, read through aptr / bptr
//   c             - output matrix, written through cptr
//   M, N          - determine the number of row / column tiles
//   K             - determines the number of 4-wide steps k = (K - TAIL_SIZE) / 4
//   a_row_stride, b_row_stride, c_row_stride
//                 - row strides in float elements; multiplied by sizeof(float)
//                   before being passed to the assembly as byte strides
//
// NOTE(review): the stores below always write a whole 4x16 tile and the tile
// counts come from iceildiv (defined elsewhere, presumably a ceiling division),
// so this appears to rely on M and N being padded to multiples of 4 and 16 -
// confirm against the callers.
// NOTE(review): the line numbers embedded in this listing skip in places, so
// some statements of this definition (braces, asm opener, branches, some
// labels, part of the operand lists) are not shown here.
338 inline void sgemm_4x16_impl<1>(
339 const float* const a, const float* const b, float *c,
340 const int M, const int K, const int N,
341 const int a_row_stride,
342 const int b_row_stride,
343 const int c_row_stride
345 const int TAIL_SIZE = 1;
346 const int M_BLOCK = 4;
347 const int N_BLOCK = 16;
349 const int m_blocks = iceildiv(M, M_BLOCK);
350 const int n_blocks = iceildiv(N, N_BLOCK);
352 // For each block of output rows
353 for (int mblock = 0; mblock < m_blocks; mblock++) {
354 // For each block of output columns
355 for (int nblock = 0; nblock < n_blocks; nblock++) {
// Tile origin: aptr -> first of the 4 rows of a, bptr -> first of the 16
// columns of b, cptr -> top-left element of the output tile.
356 const float *aptr = a + mblock*M_BLOCK*a_row_stride;
357 const float *bptr = b + nblock*N_BLOCK;
358 float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
// Number of 4-wide steps along K; the one remaining element is handled by the
// single-element step at the end of the assembly.
359 int k = (K - TAIL_SIZE) / 4;
// Symbolic register names. vCrc / qCrc: accumulator for tile row r and
// 4-float column group c (v0-v15). vAr / qAr / dAr / sAr: values of A for
// tile row r (v16-v19). vBc / qBc: column group c of the current row of B
// (v20-v23).
365 "vC11 .req v0\n" "vC12 .req v1\n" "vC13 .req v2\n" "vC14 .req v3\n"
366 "qC11 .req q0\n" "qC12 .req q1\n" "qC13 .req q2\n" "qC14 .req q3\n"
367 "vC21 .req v4\n" "vC22 .req v5\n" "vC23 .req v6\n" "vC24 .req v7\n"
368 "qC21 .req q4\n" "qC22 .req q5\n" "qC23 .req q6\n" "qC24 .req q7\n"
369 "vC31 .req v8\n" "vC32 .req v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
370 "qC31 .req q8\n" "qC32 .req q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
371 "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
372 "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
373 "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
374 "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
375 "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
376 "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
377 "vB1 .req v20\n" "qB1 .req q20\n"
378 "vB2 .req v21\n" "qB2 .req q21\n"
379 "vB3 .req v22\n" "qB3 .req q22\n"
380 "vB4 .req v23\n" "qB4 .req q23\n"
382 // Clear accumulators, initialise pointers
// Preload the first three column groups of the current row of B.
384 "ldr qB1, [%x[bptr], #0x00]\n"
386 "ldr qB2, [%x[bptr], #0x10]\n"
388 "ldr qB3, [%x[bptr], #0x20]\n"
// aptr2..aptr4 address the three rows of A after aptr, each one
// a_row_stride_bytes further on.
// NOTE(review): the aptr2-4 names are introduced on lines not shown here; the
// x20-x22 entries in the clobber list suggest they alias those registers.
390 "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
392 "add aptr3, aptr2, %x[a_row_stride_bytes]\n"
394 "add aptr4, aptr3, %x[a_row_stride_bytes]\n"
398 // Prepare for tail in K
// Direct route to the single-element step: load one float of A rows 1 and 2
// (post-incrementing by 4 bytes) and branch forward to label 2.
// NOTE(review): presumably taken only when there are no 4-wide steps (k == 0);
// the conditional branch that bypasses this path is not shown here.
400 "ldr sA1, [%x[aptr]], #0x04\n"
402 "ldr sA2, [ aptr2], #0x04\n"
410 "b 2f\n" // Jump to tail
412 "3:" // Prepare for loop over K
// Load four K-values of A rows 1 and 2, post-incrementing by 16 bytes.
414 "ldr qA1, [%x[aptr]], #0x10\n"
416 "ldr qA2, [ aptr2], #0x10\n"
// Decrement the step counter; SUBS also updates the condition flags.
424 "subs %x[k], %x[k], #1\n"
// K-step on lane 0 of every A vector: 16 FMLAs (4 rows x 4 column groups).
// Interleaved with them are the loads of A rows 3-4 and of B group 4, the
// advance of bptr to the next row of B, and the loads of that row's groups 1-3.
428 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
429 "ldr qA3, [ aptr3], #0x10\n"
430 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
431 "ldr qA4, [ aptr4], #0x10\n"
432 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
433 "ldr qB4, [%x[bptr], #0x30]\n"
434 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
435 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
436 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
437 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
438 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
439 "ldr qB1, [%x[bptr], #0x00]\n"
440 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
441 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
442 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
443 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
444 "ldr qB2, [%x[bptr], #0x10]\n"
445 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
446 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
447 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
448 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
449 "ldr qB3, [%x[bptr], #0x20]\n"
450 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
// K-step on lane 1 of the A vectors, with the next row of B loaded alongside.
452 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
453 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
454 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
455 "ldr qB4, [%x[bptr], #0x30]\n"
456 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
457 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
458 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
459 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
460 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
461 "ldr qB1, [%x[bptr], #0x00]\n"
462 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
463 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
464 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
465 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
466 "ldr qB2, [%x[bptr], #0x10]\n"
467 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
468 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
469 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
470 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
471 "ldr qB3, [%x[bptr], #0x20]\n"
472 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
// K-step on lane 2 of the A vectors.
474 "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
475 "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
476 "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
477 "ldr qB4, [%x[bptr], #0x30]\n"
478 "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
479 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
480 "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
481 "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
482 "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
483 "ldr qB1, [%x[bptr], #0x00]\n"
484 "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
485 "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
486 "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
487 "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
488 "ldr qB2, [%x[bptr], #0x10]\n"
489 "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
490 "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
491 "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
492 "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
493 "ldr qB3, [%x[bptr], #0x20]\n"
494 "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
// K-step on lane 3 of the A vectors. The step counter is decremented here and,
// once vA1 / vA2 have been used for the last time, they are reloaded with the
// next four K-values of A rows 1 and 2.
496 "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
497 "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
498 "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
499 "ldr qB4, [%x[bptr], #0x30]\n"
500 "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
501 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
502 "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
503 "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
504 "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
505 "ldr qB1, [%x[bptr], #0x00]\n"
506 "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
507 "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
508 "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
509 "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
510 "ldr qB2, [%x[bptr], #0x10]\n"
511 "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
512 "subs %x[k], %x[k], #1\n"
513 "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
514 "ldr qA1, [%x[aptr]], #0x10\n"
515 "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
516 "ldr qA2, [ aptr2], #0x10\n"
517 "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
518 "ldr qB3, [%x[bptr], #0x20]\n"
519 "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
// Last 4-wide step: the same four-lane sequence once more, but its lane-3
// part preloads single floats of A (sA1 / sA2) for the one-element step that
// follows, instead of full vectors.
// Lane 0:
522 "4:" // Tail iteration
523 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
524 "ldr qA3, [ aptr3], #0x10\n"
525 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
526 "ldr qA4, [ aptr4], #0x10\n"
527 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
528 "ldr qB4, [%x[bptr], #0x30]\n"
529 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
530 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
531 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
532 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
533 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
534 "ldr qB1, [%x[bptr], #0x00]\n"
535 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
536 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
537 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
538 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
539 "ldr qB2, [%x[bptr], #0x10]\n"
540 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
541 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
542 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
543 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
544 "ldr qB3, [%x[bptr], #0x20]\n"
545 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
// Lane 1:
547 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
548 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
549 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
550 "ldr qB4, [%x[bptr], #0x30]\n"
551 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
552 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
553 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
554 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
555 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
556 "ldr qB1, [%x[bptr], #0x00]\n"
557 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
558 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
559 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
560 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
561 "ldr qB2, [%x[bptr], #0x10]\n"
562 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
563 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
564 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
565 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
566 "ldr qB3, [%x[bptr], #0x20]\n"
567 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
// Lane 2:
569 "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
570 "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
571 "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
572 "ldr qB4, [%x[bptr], #0x30]\n"
573 "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
574 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
575 "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
576 "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
577 "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
578 "ldr qB1, [%x[bptr], #0x00]\n"
579 "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
580 "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
581 "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
582 "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
583 "ldr qB2, [%x[bptr], #0x10]\n"
584 "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
585 "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
586 "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
587 "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
588 "ldr qB3, [%x[bptr], #0x20]\n"
589 "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
// Lane 3; sA1 / sA2 are reloaded with the single remaining K-value of A rows
// 1 and 2 (4-byte post-increment) once their vector contents are used up.
591 "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
592 "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
593 "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
594 "ldr qB4, [%x[bptr], #0x30]\n"
595 "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
596 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
597 "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
598 "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
599 "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
600 "ldr qB1, [%x[bptr], #0x00]\n"
601 "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
602 "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
603 "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
604 "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
605 "ldr qB2, [%x[bptr], #0x10]\n"
606 "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
607 "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
608 "ldr sA1, [%x[aptr]], #0x04\n"
609 "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
610 "ldr sA2, [ aptr2], #0x04\n"
611 "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
612 "ldr qB3, [%x[bptr], #0x20]\n"
613 "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
// Single-element K step (lane 0 only), processed row by row: sA3 / sA4 are
// loaded just before they are needed, and as soon as a row's four
// accumulators are complete they are written with two STPs (byte offsets 0x00
// and 0x20) and cptr is advanced by one row of C.
// NOTE(review): label 2, the target of the "b 2f" above, is presumably placed
// here on a line not shown.
616 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
617 "ldr qB4, [%x[bptr], #0x30]\n"
618 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
619 "stp qC11, qC12, [%x[cptr], #0x00]\n"
620 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
621 "ldr sA3, [ aptr3], #0x04\n"
622 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
623 "stp qC13, qC14, [%x[cptr], #0x20]\n"
624 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
625 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
626 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
627 "stp qC21, qC22, [%x[cptr], #0x00]\n"
628 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
629 "ldr sA4, [ aptr4], #0x04\n"
630 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
631 "stp qC23, qC24, [%x[cptr], #0x20]\n"
632 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
633 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
634 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
635 "stp qC31, qC32, [%x[cptr], #0x00]\n"
636 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
637 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
638 "stp qC33, qC34, [%x[cptr], #0x20]\n"
639 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
640 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
641 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
642 "stp qC41, qC42, [%x[cptr], #0x00]\n"
643 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
644 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
645 "stp qC43, qC44, [%x[cptr], #0x20]\n"
646 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
// Drop the symbolic register names again; the other specialisations in this
// file define the same names in their own asm statements.
648 ".unreq vB4\n" ".unreq qB4\n"
649 ".unreq vB3\n" ".unreq qB3\n"
650 ".unreq vB2\n" ".unreq qB2\n"
651 ".unreq vB1\n" ".unreq qB1\n"
652 ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
653 ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
654 ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
655 ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
656 ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
657 ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
658 ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
659 ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
660 ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
661 ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
662 ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
663 ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
// Read-write operands: aptr is modified by the post-incrementing loads.
// NOTE(review): the remaining read-write operands (bptr, cptr, k are all
// written by the assembly) are on lines not shown here.
668 : [aptr] "+r" (aptr),
// Inputs: the row strides converted from elements to bytes.
672 : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
673 [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
674 [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
// Clobbers: condition flags (SUBS), memory (STP stores), x20-x22 and the
// vector registers named above.
675 : "cc", "memory", "x20", "x21", "x22",
676 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
677 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
685 inline void sgemm_4x16_impl<2>(
686 const float* const a, const float* const b, float *c,
687 const int M, const int K, const int N,
688 const int a_row_stride,
689 const int b_row_stride,
690 const int c_row_stride
692 const int TAIL_SIZE = 2;
693 const int M_BLOCK = 4;
694 const int N_BLOCK = 16;
696 const int m_blocks = iceildiv(M, M_BLOCK);
697 const int n_blocks = iceildiv(N, N_BLOCK);
699 // For each block of output rows
700 for (int mblock = 0; mblock < m_blocks; mblock++) {
701 // For each block of output columns
702 for (int nblock = 0; nblock < n_blocks; nblock++) {
703 const float *aptr = a + mblock*M_BLOCK*a_row_stride;
704 const float *bptr = b + nblock*N_BLOCK;
705 float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
706 int k = (K - TAIL_SIZE) / 4;
712 "vC11 .req v0\n" "vC12 .req v1\n" "vC13 .req v2\n" "vC14 .req v3\n"
713 "qC11 .req q0\n" "qC12 .req q1\n" "qC13 .req q2\n" "qC14 .req q3\n"
714 "vC21 .req v4\n" "vC22 .req v5\n" "vC23 .req v6\n" "vC24 .req v7\n"
715 "qC21 .req q4\n" "qC22 .req q5\n" "qC23 .req q6\n" "qC24 .req q7\n"
716 "vC31 .req v8\n" "vC32 .req v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
717 "qC31 .req q8\n" "qC32 .req q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
718 "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
719 "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
720 "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
721 "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
722 "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
723 "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
724 "vB1 .req v20\n" "qB1 .req q20\n"
725 "vB2 .req v21\n" "qB2 .req q21\n"
726 "vB3 .req v22\n" "qB3 .req q22\n"
727 "vB4 .req v23\n" "qB4 .req q23\n"
729 // Clear accumulators, initialise pointers
731 "ldr qB1, [%x[bptr], #0x00]\n"
733 "ldr qB2, [%x[bptr], #0x10]\n"
735 "ldr qB3, [%x[bptr], #0x20]\n"
737 "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
739 "add aptr3, aptr2, %x[a_row_stride_bytes]\n"
741 "add aptr4, aptr3, %x[a_row_stride_bytes]\n"
745 // Prepare for tail in K
747 "ldr dA1, [%x[aptr]], #0x08\n"
749 "ldr dA2, [ aptr2], #0x08\n"
757 "b 2f\n" // Jump to tail
759 "3:" // Prepare for loop over K
761 "ldr qA1, [%x[aptr]], #0x10\n"
763 "ldr qA2, [ aptr2], #0x10\n"
771 "subs %x[k], %x[k], #1\n"
775 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
776 "ldr qA3, [ aptr3], #0x10\n"
777 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
778 "ldr qA4, [ aptr4], #0x10\n"
779 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
780 "ldr qB4, [%x[bptr], #0x30]\n"
781 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
782 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
783 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
784 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
785 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
786 "ldr qB1, [%x[bptr], #0x00]\n"
787 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
788 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
789 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
790 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
791 "ldr qB2, [%x[bptr], #0x10]\n"
792 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
793 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
794 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
795 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
796 "ldr qB3, [%x[bptr], #0x20]\n"
797 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
799 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
800 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
801 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
802 "ldr qB4, [%x[bptr], #0x30]\n"
803 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
804 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
805 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
806 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
807 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
808 "ldr qB1, [%x[bptr], #0x00]\n"
809 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
810 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
811 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
812 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
813 "ldr qB2, [%x[bptr], #0x10]\n"
814 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
815 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
816 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
817 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
818 "ldr qB3, [%x[bptr], #0x20]\n"
819 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
821 "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
822 "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
823 "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
824 "ldr qB4, [%x[bptr], #0x30]\n"
825 "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
826 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
827 "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
828 "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
829 "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
830 "ldr qB1, [%x[bptr], #0x00]\n"
831 "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
832 "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
833 "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
834 "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
835 "ldr qB2, [%x[bptr], #0x10]\n"
836 "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
837 "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
838 "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
839 "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
840 "ldr qB3, [%x[bptr], #0x20]\n"
841 "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
843 "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
844 "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
845 "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
846 "ldr qB4, [%x[bptr], #0x30]\n"
847 "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
848 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
849 "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
850 "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
851 "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
852 "ldr qB1, [%x[bptr], #0x00]\n"
853 "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
854 "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
855 "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
856 "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
857 "ldr qB2, [%x[bptr], #0x10]\n"
858 "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
859 "subs %x[k], %x[k], #1\n"
860 "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
861 "ldr qA1, [%x[aptr]], #0x10\n"
862 "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
863 "ldr qA2, [ aptr2], #0x10\n"
864 "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
865 "ldr qB3, [%x[bptr], #0x20]\n"
866 "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
869 "4:" // Tail iteration
870 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
871 "ldr qA3, [ aptr3], #0x10\n"
872 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
873 "ldr qA4, [ aptr4], #0x10\n"
874 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
875 "ldr qB4, [%x[bptr], #0x30]\n"
876 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
877 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
878 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
879 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
880 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
881 "ldr qB1, [%x[bptr], #0x00]\n"
882 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
883 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
884 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
885 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
886 "ldr qB2, [%x[bptr], #0x10]\n"
887 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
888 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
889 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
890 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
891 "ldr qB3, [%x[bptr], #0x20]\n"
892 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
894 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
895 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
896 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
897 "ldr qB4, [%x[bptr], #0x30]\n"
898 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
899 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
900 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
901 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
902 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
903 "ldr qB1, [%x[bptr], #0x00]\n"
904 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
905 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
906 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
907 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
908 "ldr qB2, [%x[bptr], #0x10]\n"
909 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
910 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
911 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
912 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
913 "ldr qB3, [%x[bptr], #0x20]\n"
914 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
916 "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
917 "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
918 "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
919 "ldr qB4, [%x[bptr], #0x30]\n"
920 "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
921 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
922 "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
923 "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
924 "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
925 "ldr qB1, [%x[bptr], #0x00]\n"
926 "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
927 "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
928 "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
929 "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
930 "ldr qB2, [%x[bptr], #0x10]\n"
931 "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
932 "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
933 "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
934 "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
935 "ldr qB3, [%x[bptr], #0x20]\n"
936 "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
938 "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
939 "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
940 "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
941 "ldr qB4, [%x[bptr], #0x30]\n"
942 "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
943 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
944 "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
945 "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
946 "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
947 "ldr qB1, [%x[bptr], #0x00]\n"
948 "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
949 "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
950 "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
951 "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
952 "ldr qB2, [%x[bptr], #0x10]\n"
953 "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
954 "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
955 "ldr dA1, [%x[aptr]], #0x08\n"
956 "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
957 "ldr dA2, [ aptr2], #0x08\n"
958 "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
959 "ldr qB3, [%x[bptr], #0x20]\n"
960 "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
963 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
964 "ldr dA3, [ aptr3], #0x08\n"
965 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
966 "ldr dA4, [ aptr4], #0x08\n"
967 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
968 "ldr qB4, [%x[bptr], #0x30]\n"
969 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
970 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
971 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
972 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
973 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
974 "ldr qB1, [%x[bptr], #0x00]\n"
975 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
976 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
977 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
978 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
979 "ldr qB2, [%x[bptr], #0x10]\n"
980 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
981 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
982 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
983 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
984 "ldr qB3, [%x[bptr], #0x20]\n"
985 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
987 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
988 "ldr qB4, [%x[bptr], #0x30]\n"
989 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
990 "stp qC11, qC12, [%x[cptr], #0x00]\n"
991 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
992 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
993 "stp qC13, qC14, [%x[cptr], #0x20]\n"
994 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
995 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
996 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
997 "stp qC21, qC22, [%x[cptr], #0x00]\n"
998 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
999 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
1000 "stp qC23, qC24, [%x[cptr], #0x20]\n"
1001 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
1002 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1003 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
1004 "stp qC31, qC32, [%x[cptr], #0x00]\n"
1005 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
1006 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
1007 "stp qC33, qC34, [%x[cptr], #0x20]\n"
1008 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
1009 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1010 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
1011 "stp qC41, qC42, [%x[cptr], #0x00]\n"
1012 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
1013 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
1014 "stp qC43, qC44, [%x[cptr], #0x20]\n"
1015 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1017 ".unreq vB4\n" ".unreq qB4\n"
1018 ".unreq vB3\n" ".unreq qB3\n"
1019 ".unreq vB2\n" ".unreq qB2\n"
1020 ".unreq vB1\n" ".unreq qB1\n"
1021 ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
1022 ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
1023 ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
1024 ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
1025 ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
1026 ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
1027 ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
1028 ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
1029 ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
1030 ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
1031 ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
1032 ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
1037 : [aptr] "+r" (aptr),
1041 : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
1042 [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
1043 [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
1044 : "cc", "memory", "x20", "x21", "x22",
1045 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
1046 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
1054 inline void sgemm_4x16_impl<3>(
1055 const float* const a, const float* const b, float *c,
1056 const int M, const int K, const int N,
1057 const int a_row_stride,
1058 const int b_row_stride,
1059 const int c_row_stride
1061 const int TAIL_SIZE = 3;
1062 const int M_BLOCK = 4;
1063 const int N_BLOCK = 16;
1065 const int m_blocks = iceildiv(M, M_BLOCK);
1066 const int n_blocks = iceildiv(N, N_BLOCK);
1068 // For each block of output rows
1069 for (int mblock = 0; mblock < m_blocks; mblock++) {
1070 // For each block of output columns
1071 for (int nblock = 0; nblock < n_blocks; nblock++) {
1072 const float *aptr = a + mblock*M_BLOCK*a_row_stride;
1073 const float *bptr = b + nblock*N_BLOCK;
1074 float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
1075 int k = (K - TAIL_SIZE) / 4;
1081 "vC11 .req v0\n" "vC12 .req v1\n" "vC13 .req v2\n" "vC14 .req v3\n"
1082 "qC11 .req q0\n" "qC12 .req q1\n" "qC13 .req q2\n" "qC14 .req q3\n"
1083 "vC21 .req v4\n" "vC22 .req v5\n" "vC23 .req v6\n" "vC24 .req v7\n"
1084 "qC21 .req q4\n" "qC22 .req q5\n" "qC23 .req q6\n" "qC24 .req q7\n"
1085 "vC31 .req v8\n" "vC32 .req v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
1086 "qC31 .req q8\n" "qC32 .req q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
1087 "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
1088 "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
1089 "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
1090 "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
1091 "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
1092 "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
1093 "vB1 .req v20\n" "qB1 .req q20\n"
1094 "vB2 .req v21\n" "qB2 .req q21\n"
1095 "vB3 .req v22\n" "qB3 .req q22\n"
1096 "vB4 .req v23\n" "qB4 .req q23\n"
1098 // Clear accumulators, initialise pointers
1099 "movi vC11.4s, #0\n"
1100 "ldr qB1, [%x[bptr], #0x00]\n"
1101 "movi vC12.4s, #0\n"
1102 "ldr qB2, [%x[bptr], #0x10]\n"
1103 "movi vC13.4s, #0\n"
1104 "ldr qB3, [%x[bptr], #0x20]\n"
1105 "movi vC14.4s, #0\n"
1106 "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
1107 "movi vC21.4s, #0\n"
1108 "add aptr3, aptr2, %x[a_row_stride_bytes]\n"
1109 "movi vC22.4s, #0\n"
1110 "add aptr4, aptr3, %x[a_row_stride_bytes]\n"
1111 "movi vC23.4s, #0\n"
1114 // Prepare for tail in K
1115 "movi vC24.4s, #0\n"
1116 "ldr dA1, [%x[aptr]], #0x08\n"
1117 "movi vC31.4s, #0\n"
1118 "ldr dA2, [ aptr2], #0x08\n"
1119 "movi vC32.4s, #0\n"
1120 "movi vC33.4s, #0\n"
1121 "movi vC34.4s, #0\n"
1122 "movi vC41.4s, #0\n"
1123 "movi vC42.4s, #0\n"
1124 "movi vC43.4s, #0\n"
1125 "movi vC44.4s, #0\n"
1126 "b 2f\n" // Jump to tail
1128 "3:" // Prepare for loop over K
1129 "movi vC24.4s, #0\n"
1130 "ldr qA1, [%x[aptr]], #0x10\n"
1131 "movi vC31.4s, #0\n"
1132 "ldr qA2, [ aptr2], #0x10\n"
1133 "movi vC32.4s, #0\n"
1134 "movi vC33.4s, #0\n"
1135 "movi vC34.4s, #0\n"
1136 "movi vC41.4s, #0\n"
1137 "movi vC42.4s, #0\n"
1138 "movi vC43.4s, #0\n"
1139 "movi vC44.4s, #0\n"
1140 "subs %x[k], %x[k], #1\n"
1144 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
1145 "ldr qA3, [ aptr3], #0x10\n"
1146 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
1147 "ldr qA4, [ aptr4], #0x10\n"
1148 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
1149 "ldr qB4, [%x[bptr], #0x30]\n"
1150 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
1151 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1152 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
1153 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
1154 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
1155 "ldr qB1, [%x[bptr], #0x00]\n"
1156 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
1157 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
1158 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
1159 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
1160 "ldr qB2, [%x[bptr], #0x10]\n"
1161 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
1162 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
1163 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
1164 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
1165 "ldr qB3, [%x[bptr], #0x20]\n"
1166 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
1168 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
1169 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
1170 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
1171 "ldr qB4, [%x[bptr], #0x30]\n"
1172 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
1173 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1174 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
1175 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
1176 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
1177 "ldr qB1, [%x[bptr], #0x00]\n"
1178 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
1179 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
1180 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
1181 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
1182 "ldr qB2, [%x[bptr], #0x10]\n"
1183 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
1184 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
1185 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
1186 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
1187 "ldr qB3, [%x[bptr], #0x20]\n"
1188 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
1190 "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
1191 "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
1192 "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
1193 "ldr qB4, [%x[bptr], #0x30]\n"
1194 "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
1195 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1196 "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
1197 "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
1198 "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
1199 "ldr qB1, [%x[bptr], #0x00]\n"
1200 "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
1201 "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
1202 "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
1203 "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
1204 "ldr qB2, [%x[bptr], #0x10]\n"
1205 "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
1206 "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
1207 "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
1208 "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
1209 "ldr qB3, [%x[bptr], #0x20]\n"
1210 "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
1212 "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
1213 "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
1214 "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
1215 "ldr qB4, [%x[bptr], #0x30]\n"
1216 "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
1217 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1218 "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
1219 "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
1220 "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
1221 "ldr qB1, [%x[bptr], #0x00]\n"
1222 "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
1223 "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
1224 "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
1225 "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
1226 "ldr qB2, [%x[bptr], #0x10]\n"
1227 "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
1228 "subs %x[k], %x[k], #1\n"
1229 "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
1230 "ldr qA1, [%x[aptr]], #0x10\n"
1231 "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
1232 "ldr qA2, [ aptr2], #0x10\n"
1233 "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
1234 "ldr qB3, [%x[bptr], #0x20]\n"
1235 "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
1238 "4:" // Tail iteration
1239 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
1240 "ldr qA3, [ aptr3], #0x10\n"
1241 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
1242 "ldr qA4, [ aptr4], #0x10\n"
1243 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
1244 "ldr qB4, [%x[bptr], #0x30]\n"
1245 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
1246 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1247 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
1248 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
1249 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
1250 "ldr qB1, [%x[bptr], #0x00]\n"
1251 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
1252 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
1253 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
1254 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
1255 "ldr qB2, [%x[bptr], #0x10]\n"
1256 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
1257 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
1258 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
1259 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
1260 "ldr qB3, [%x[bptr], #0x20]\n"
1261 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
1263 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
1264 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
1265 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
1266 "ldr qB4, [%x[bptr], #0x30]\n"
1267 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
1268 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1269 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
1270 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
1271 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
1272 "ldr qB1, [%x[bptr], #0x00]\n"
1273 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
1274 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
1275 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
1276 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
1277 "ldr qB2, [%x[bptr], #0x10]\n"
1278 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
1279 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
1280 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
1281 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
1282 "ldr qB3, [%x[bptr], #0x20]\n"
1283 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
1285 "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
1286 "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
1287 "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
1288 "ldr qB4, [%x[bptr], #0x30]\n"
1289 "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
1290 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1291 "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
1292 "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
1293 "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
1294 "ldr qB1, [%x[bptr], #0x00]\n"
1295 "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
1296 "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
1297 "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
1298 "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
1299 "ldr qB2, [%x[bptr], #0x10]\n"
1300 "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
1301 "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
1302 "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
1303 "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
1304 "ldr qB3, [%x[bptr], #0x20]\n"
1305 "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
1307 "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
1308 "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
1309 "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
1310 "ldr qB4, [%x[bptr], #0x30]\n"
1311 "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
1312 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1313 "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
1314 "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
1315 "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
1316 "ldr qB1, [%x[bptr], #0x00]\n"
1317 "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
1318 "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
1319 "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
1320 "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
1321 "ldr qB2, [%x[bptr], #0x10]\n"
1322 "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
1323 "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
1324 "ldr dA1, [%x[aptr]], #0x08\n"
1325 "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
1326 "ldr dA2, [ aptr2], #0x08\n"
1327 "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
1328 "ldr qB3, [%x[bptr], #0x20]\n"
1329 "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
1332 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
1333 "ldr dA3, [ aptr3], #0x08\n"
1334 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
1335 "ldr dA4, [ aptr4], #0x08\n"
1336 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
1337 "ldr qB4, [%x[bptr], #0x30]\n"
1338 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
1339 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1340 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
1341 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
1342 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
1343 "ldr qB1, [%x[bptr], #0x00]\n"
1344 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
1345 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
1346 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
1347 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
1348 "ldr qB2, [%x[bptr], #0x10]\n"
1349 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
1350 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
1351 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
1352 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
1353 "ldr qB3, [%x[bptr], #0x20]\n"
1354 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
1356 "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
1357 "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
1358 "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
1359 "ldr qB4, [%x[bptr], #0x30]\n"
1360 "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
1361 "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1362 "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
1363 "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
1364 "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
1365 "ldr qB1, [%x[bptr], #0x00]\n"
1366 "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
1367 "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
1368 "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
1369 "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
1370 "ldr qB2, [%x[bptr], #0x10]\n"
1371 "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
1372 "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
1373 "ldr sA1, [%x[aptr]], #0x04\n"
1374 "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
1375 "ldr sA2, [ aptr2], #0x04\n"
1376 "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
1377 "ldr qB3, [%x[bptr], #0x20]\n"
1378 "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
1380 "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
1381 "ldr qB4, [%x[bptr], #0x30]\n"
1382 "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
1383 "stp qC11, qC12, [%x[cptr], #0x00]\n"
1384 "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
1385 "ldr sA3, [ aptr3], #0x04\n"
1386 "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
1387 "stp qC13, qC14, [%x[cptr], #0x20]\n"
1388 "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
1389 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1390 "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
1391 "stp qC21, qC22, [%x[cptr], #0x00]\n"
1392 "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
1393 "ldr sA4, [ aptr4], #0x04\n"
1394 "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
1395 "stp qC23, qC24, [%x[cptr], #0x20]\n"
1396 "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
1397 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1398 "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
1399 "stp qC31, qC32, [%x[cptr], #0x00]\n"
1400 "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
1401 "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
1402 "stp qC33, qC34, [%x[cptr], #0x20]\n"
1403 "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
1404 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1405 "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
1406 "stp qC41, qC42, [%x[cptr], #0x00]\n"
1407 "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
1408 "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
1409 "stp qC43, qC44, [%x[cptr], #0x20]\n"
1410 "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1412 ".unreq vB4\n" ".unreq qB4\n"
1413 ".unreq vB3\n" ".unreq qB3\n"
1414 ".unreq vB2\n" ".unreq qB2\n"
1415 ".unreq vB1\n" ".unreq qB1\n"
1416 ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
1417 ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
1418 ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
1419 ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
1420 ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
1421 ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
1422 ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
1423 ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
1424 ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
1425 ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
1426 ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
1427 ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
1432 : [aptr] "+r" (aptr),
1436 : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
1437 [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
1438 [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
1439 : "cc", "memory", "x20", "x21", "x22",
1440 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
1441 "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",