arm_compute v18.02
[platform/upstream/armcl.git] / arm_compute / core / NEON / kernels / convolution / winograd / gemm / a64_sgemm_4x16.hpp
1 /*
2  * Copyright (c) 2017 ARM Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24
/**
 * Single-precision matrix multiply computed in 4-row by 16-column output
 * tiles. Only declared here; the work is done by the explicit specialisations
 * selected through `tail`.
 *
 * NOTE(review): `tail` appears to be the remainder of K modulo 4 (each
 * specialisation sets TAIL_SIZE equal to it) -- confirm against the caller.
 *
 * @param a             Pointer to the first element of matrix A.
 * @param b             Pointer to the first element of matrix B.
 * @param c             Pointer to the first element of the output matrix C.
 * @param M             Number of rows of A and C.
 * @param K             Size of the inner (reduction) dimension.
 * @param N             Number of columns of B and C.
 * @param a_row_stride  Distance between consecutive rows of A, in floats.
 * @param b_row_stride  Distance between consecutive rows of B, in floats.
 * @param c_row_stride  Distance between consecutive rows of C, in floats.
 */
template <const unsigned int tail>
inline void sgemm_4x16_impl(
  const float* const a,
  const float* const b,
  float *c,
  const int M,
  const int K,
  const int N,
  const int a_row_stride,
  const int b_row_stride,
  const int c_row_stride
);
33
34 template <>
35 inline void sgemm_4x16_impl<0>(
36   const float* const a, const float* const b, float *c,
37   const int M, const int K, const int N,
38   const int a_row_stride,
39   const int b_row_stride,
40   const int c_row_stride
41 ) {
42   const int TAIL_SIZE = 0;
43   const int M_BLOCK = 4;
44   const int N_BLOCK = 16;
45
46   const int m_blocks = iceildiv(M, M_BLOCK);
47   const int n_blocks = iceildiv(N, N_BLOCK);
48
49   // For each block of output rows
50   for (int mblock = 0; mblock < m_blocks; mblock++) {
51     // For each block of output columns
52     for (int nblock = 0; nblock < n_blocks; nblock++) {
53       const float *aptr = a + mblock*M_BLOCK*a_row_stride;
54       const float *bptr = b + nblock*N_BLOCK;
55       float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
56       int k = (K - TAIL_SIZE) / 4;
57
58       asm volatile(
59         "aptr2 .req X20\n"
60         "aptr3 .req X21\n"
61         "aptr4 .req X22\n"
62         "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
63         "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
64         "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
65         "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
66         "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
67         "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
68         "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
69         "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
70         "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
71         "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
72         "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
73         "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
74         "vB1 .req v20\n" "qB1 .req q20\n"
75         "vB2 .req v21\n" "qB2 .req q21\n"
76         "vB3 .req v22\n" "qB3 .req q22\n"
77         "vB4 .req v23\n" "qB4 .req q23\n"
78
79         // Clear accumulators, initialise pointers
80         "movi vC11.4s, #0\n"
81         "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
82         "movi vC12.4s, #0\n"
83         "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
84         "movi vC13.4s, #0\n"
85         "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
86         "movi vC14.4s, #0\n"
87         "ldr qA1, [%x[aptr]], #0x10\n"
88         "movi vC21.4s, #0\n"
89         "ldr qA2, [   aptr2], #0x10\n"
90         "movi vC22.4s, #0\n"
91         "ldr qB1, [%x[bptr], #0x00]\n"
92         "movi vC23.4s, #0\n"
93         "ldr qB2, [%x[bptr], #0x10]\n"
94         "movi vC24.4s, #0\n"
95         "ldr qB3, [%x[bptr], #0x20]\n"
96         "movi vC31.4s, #0\n"
97         "movi vC32.4s, #0\n"
98         "movi vC33.4s, #0\n"
99         "movi vC34.4s, #0\n"
100         "movi vC41.4s, #0\n"
101         "movi vC42.4s, #0\n"
102         "movi vC43.4s, #0\n"
103         "movi vC44.4s, #0\n"
104         "subs %x[k], %x[k], #1\n"
105         "beq 2f\n"
106
107         "1:"  // Loop proper
108           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
109           "ldr qA3, [   aptr3], #0x10\n"
110           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
111           "ldr qA4, [   aptr4], #0x10\n"
112           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
113           "ldr qB4, [%x[bptr], #0x30]\n"
114           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
115           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
116           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
117           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
118           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
119           "ldr qB1, [%x[bptr], #0x00]\n"
120           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
121           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
122           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
123           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
124           "ldr qB2, [%x[bptr], #0x10]\n"
125           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
126           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
127           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
128           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
129           "ldr qB3, [%x[bptr], #0x20]\n"
130           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
131
132           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
133           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
134           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
135           "ldr qB4, [%x[bptr], #0x30]\n"
136           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
137           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
138           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
139           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
140           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
141           "ldr qB1, [%x[bptr], #0x00]\n"
142           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
143           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
144           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
145           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
146           "ldr qB2, [%x[bptr], #0x10]\n"
147           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
148           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
149           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
150           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
151           "ldr qB3, [%x[bptr], #0x20]\n"
152           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
153
154           "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
155           "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
156           "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
157           "ldr qB4, [%x[bptr], #0x30]\n"
158           "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
159           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
160           "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
161           "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
162           "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
163           "ldr qB1, [%x[bptr], #0x00]\n"
164           "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
165           "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
166           "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
167           "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
168           "ldr qB2, [%x[bptr], #0x10]\n"
169           "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
170           "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
171           "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
172           "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
173           "ldr qB3, [%x[bptr], #0x20]\n"
174           "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
175
176           "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
177           "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
178           "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
179           "ldr qB4, [%x[bptr], #0x30]\n"
180           "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
181           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
182           "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
183           "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
184           "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
185           "ldr qB1, [%x[bptr], #0x00]\n"
186           "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
187           "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
188           "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
189           "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
190           "ldr qB2, [%x[bptr], #0x10]\n"
191           "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
192           "subs %x[k], %x[k], #1\n"
193           "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
194           "ldr qA1, [%x[aptr]], #0x10\n"
195           "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
196           "ldr qA2, [   aptr2], #0x10\n"
197           "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
198           "ldr qB3, [%x[bptr], #0x20]\n"
199           "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
200           "bne 1b\n"
201
202         "2:"  // Tail
203           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
204           "ldr qA3, [   aptr3], #0x10\n"
205           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
206           "ldr qA4, [   aptr4], #0x10\n"
207           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
208           "ldr qB4, [%x[bptr], #0x30]\n"
209           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
210           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
211           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
212           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
213           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
214           "ldr qB1, [%x[bptr], #0x00]\n"
215           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
216           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
217           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
218           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
219           "ldr qB2, [%x[bptr], #0x10]\n"
220           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
221           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
222           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
223           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
224           "ldr qB3, [%x[bptr], #0x20]\n"
225           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
226
227           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
228           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
229           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
230           "ldr qB4, [%x[bptr], #0x30]\n"
231           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
232           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
233           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
234           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
235           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
236           "ldr qB1, [%x[bptr], #0x00]\n"
237           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
238           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
239           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
240           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
241           "ldr qB2, [%x[bptr], #0x10]\n"
242           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
243           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
244           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
245           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
246           "ldr qB3, [%x[bptr], #0x20]\n"
247           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
248
249           "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
250           "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
251           "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
252           "ldr qB4, [%x[bptr], #0x30]\n"
253           "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
254           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
255           "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
256           "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
257           "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
258           "ldr qB1, [%x[bptr], #0x00]\n"
259           "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
260           "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
261           "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
262           "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
263           "ldr qB2, [%x[bptr], #0x10]\n"
264           "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
265           "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
266           "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
267           "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
268           "ldr qB3, [%x[bptr], #0x20]\n"
269           "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
270
271           "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
272           "ldr qB4, [%x[bptr], #0x30]\n"
273           "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
274           "stp qC11, qC12, [%x[cptr], #0x00]\n"
275           "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
276           "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
277           "stp qC13, qC14, [%x[cptr], #0x20]\n"
278           "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
279           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
280           "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
281           "stp qC21, qC22, [%x[cptr], #0x00]\n"
282           "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
283           "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
284           "stp qC23, qC24, [%x[cptr], #0x20]\n"
285           "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
286           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
287           "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
288           "stp qC31, qC32, [%x[cptr], #0x00]\n"
289           "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
290           "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
291           "stp qC33, qC34, [%x[cptr], #0x20]\n"
292           "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
293           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
294           "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
295           "stp qC41, qC42, [%x[cptr], #0x00]\n"
296           "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
297           "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
298           "stp qC43, qC44, [%x[cptr], #0x20]\n"
299           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
300
301         ".unreq vB4\n" ".unreq qB4\n"
302         ".unreq vB3\n" ".unreq qB3\n"
303         ".unreq vB2\n" ".unreq qB2\n"
304         ".unreq vB1\n" ".unreq qB1\n"
305         ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
306         ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
307         ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
308         ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
309         ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
310         ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
311         ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
312         ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
313         ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
314         ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
315         ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
316         ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
317         ".unreq aptr2\n"
318         ".unreq aptr3\n"
319         ".unreq aptr4\n"
320
321         : [aptr] "+r" (aptr),
322           [bptr] "+r" (bptr),
323           [cptr] "+r" (cptr),
324           [k] "+r" (k)
325         : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
326           [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
327           [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
328         : "cc", "memory", "x20", "x21", "x22",
329           "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
330           "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
331           "v21", "v22", "v23"
332       );
333     }
334   }
335 }
336
337 template <>
338 inline void sgemm_4x16_impl<1>(
339   const float* const a, const float* const b, float *c,
340   const int M, const int K, const int N,
341   const int a_row_stride,
342   const int b_row_stride,
343   const int c_row_stride
344 ) {
345   const int TAIL_SIZE = 1;
346   const int M_BLOCK = 4;
347   const int N_BLOCK = 16;
348
349   const int m_blocks = iceildiv(M, M_BLOCK);
350   const int n_blocks = iceildiv(N, N_BLOCK);
351
352   // For each block of output rows
353   for (int mblock = 0; mblock < m_blocks; mblock++) {
354     // For each block of output columns
355     for (int nblock = 0; nblock < n_blocks; nblock++) {
356       const float *aptr = a + mblock*M_BLOCK*a_row_stride;
357       const float *bptr = b + nblock*N_BLOCK;
358       float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
359       int k = (K - TAIL_SIZE) / 4;
360
361       asm volatile(
362         "aptr2 .req X20\n"
363         "aptr3 .req X21\n"
364         "aptr4 .req X22\n"
365         "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
366         "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
367         "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
368         "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
369         "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
370         "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
371         "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
372         "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
373         "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
374         "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
375         "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
376         "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
377         "vB1 .req v20\n" "qB1 .req q20\n"
378         "vB2 .req v21\n" "qB2 .req q21\n"
379         "vB3 .req v22\n" "qB3 .req q22\n"
380         "vB4 .req v23\n" "qB4 .req q23\n"
381
382         // Clear accumulators, initialise pointers
383         "movi vC11.4s, #0\n"
384         "ldr qB1, [%x[bptr], #0x00]\n"
385         "movi vC12.4s, #0\n"
386         "ldr qB2, [%x[bptr], #0x10]\n"
387         "movi vC13.4s, #0\n"
388         "ldr qB3, [%x[bptr], #0x20]\n"
389         "movi vC14.4s, #0\n"
390         "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
391         "movi vC21.4s, #0\n"
392         "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
393         "movi vC22.4s, #0\n"
394         "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
395         "movi vC23.4s, #0\n"
396         "cbnz %x[k], 3f\n"
397
398         // Prepare for tail in K
399         "movi vC24.4s, #0\n"
400         "ldr sA1, [%x[aptr]], #0x04\n"
401         "movi vC31.4s, #0\n"
402         "ldr sA2, [   aptr2], #0x04\n"
403         "movi vC32.4s, #0\n"
404         "movi vC33.4s, #0\n"
405         "movi vC34.4s, #0\n"
406         "movi vC41.4s, #0\n"
407         "movi vC42.4s, #0\n"
408         "movi vC43.4s, #0\n"
409         "movi vC44.4s, #0\n"
410         "b 2f\n"  // Jump to tail
411
412         "3:"  // Prepare for loop over K
413           "movi vC24.4s, #0\n"
414           "ldr qA1, [%x[aptr]], #0x10\n"
415           "movi vC31.4s, #0\n"
416           "ldr qA2, [   aptr2], #0x10\n"
417           "movi vC32.4s, #0\n"
418           "movi vC33.4s, #0\n"
419           "movi vC34.4s, #0\n"
420           "movi vC41.4s, #0\n"
421           "movi vC42.4s, #0\n"
422           "movi vC43.4s, #0\n"
423           "movi vC44.4s, #0\n"
424           "subs %x[k], %x[k], #1\n"
425           "beq 4f\n"
426
427         "1:"  // Loop proper
428           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
429           "ldr qA3, [   aptr3], #0x10\n"
430           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
431           "ldr qA4, [   aptr4], #0x10\n"
432           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
433           "ldr qB4, [%x[bptr], #0x30]\n"
434           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
435           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
436           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
437           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
438           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
439           "ldr qB1, [%x[bptr], #0x00]\n"
440           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
441           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
442           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
443           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
444           "ldr qB2, [%x[bptr], #0x10]\n"
445           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
446           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
447           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
448           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
449           "ldr qB3, [%x[bptr], #0x20]\n"
450           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
451
452           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
453           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
454           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
455           "ldr qB4, [%x[bptr], #0x30]\n"
456           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
457           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
458           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
459           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
460           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
461           "ldr qB1, [%x[bptr], #0x00]\n"
462           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
463           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
464           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
465           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
466           "ldr qB2, [%x[bptr], #0x10]\n"
467           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
468           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
469           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
470           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
471           "ldr qB3, [%x[bptr], #0x20]\n"
472           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
473
474           "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
475           "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
476           "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
477           "ldr qB4, [%x[bptr], #0x30]\n"
478           "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
479           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
480           "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
481           "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
482           "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
483           "ldr qB1, [%x[bptr], #0x00]\n"
484           "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
485           "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
486           "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
487           "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
488           "ldr qB2, [%x[bptr], #0x10]\n"
489           "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
490           "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
491           "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
492           "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
493           "ldr qB3, [%x[bptr], #0x20]\n"
494           "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
495
496           "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
497           "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
498           "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
499           "ldr qB4, [%x[bptr], #0x30]\n"
500           "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
501           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
502           "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
503           "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
504           "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
505           "ldr qB1, [%x[bptr], #0x00]\n"
506           "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
507           "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
508           "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
509           "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
510           "ldr qB2, [%x[bptr], #0x10]\n"
511           "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
512           "subs %x[k], %x[k], #1\n"
513           "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
514           "ldr qA1, [%x[aptr]], #0x10\n"
515           "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
516           "ldr qA2, [   aptr2], #0x10\n"
517           "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
518           "ldr qB3, [%x[bptr], #0x20]\n"
519           "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
520           "bne 1b\n"
521
522         "4:"  // Tail iteration
523           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
524           "ldr qA3, [   aptr3], #0x10\n"
525           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
526           "ldr qA4, [   aptr4], #0x10\n"
527           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
528           "ldr qB4, [%x[bptr], #0x30]\n"
529           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
530           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
531           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
532           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
533           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
534           "ldr qB1, [%x[bptr], #0x00]\n"
535           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
536           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
537           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
538           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
539           "ldr qB2, [%x[bptr], #0x10]\n"
540           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
541           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
542           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
543           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
544           "ldr qB3, [%x[bptr], #0x20]\n"
545           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
546
547           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
548           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
549           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
550           "ldr qB4, [%x[bptr], #0x30]\n"
551           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
552           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
553           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
554           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
555           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
556           "ldr qB1, [%x[bptr], #0x00]\n"
557           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
558           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
559           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
560           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
561           "ldr qB2, [%x[bptr], #0x10]\n"
562           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
563           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
564           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
565           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
566           "ldr qB3, [%x[bptr], #0x20]\n"
567           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
568
569           "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
570           "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
571           "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
572           "ldr qB4, [%x[bptr], #0x30]\n"
573           "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
574           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
575           "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
576           "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
577           "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
578           "ldr qB1, [%x[bptr], #0x00]\n"
579           "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
580           "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
581           "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
582           "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
583           "ldr qB2, [%x[bptr], #0x10]\n"
584           "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
585           "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
586           "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
587           "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
588           "ldr qB3, [%x[bptr], #0x20]\n"
589           "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
590
591           "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
592           "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
593           "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
594           "ldr qB4, [%x[bptr], #0x30]\n"
595           "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
596           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
597           "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
598           "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
599           "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
600           "ldr qB1, [%x[bptr], #0x00]\n"
601           "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
602           "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
603           "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
604           "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
605           "ldr qB2, [%x[bptr], #0x10]\n"
606           "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
607           "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
608           "ldr sA1, [%x[aptr]], #0x04\n"
609           "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
610           "ldr sA2, [   aptr2], #0x04\n"
611           "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
612           "ldr qB3, [%x[bptr], #0x20]\n"
613           "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
614
615         "2:"  // Common tail
616           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
617           "ldr qB4, [%x[bptr], #0x30]\n"
618           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
619           "stp qC11, qC12, [%x[cptr], #0x00]\n"
620           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
621           "ldr sA3, [   aptr3], #0x04\n"
622           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
623           "stp qC13, qC14, [%x[cptr], #0x20]\n"
624           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
625           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
626           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
627           "stp qC21, qC22, [%x[cptr], #0x00]\n"
628           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
629           "ldr sA4, [   aptr4], #0x04\n"
630           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
631           "stp qC23, qC24, [%x[cptr], #0x20]\n"
632           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
633           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
634           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
635           "stp qC31, qC32, [%x[cptr], #0x00]\n"
636           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
637           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
638           "stp qC33, qC34, [%x[cptr], #0x20]\n"
639           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
640           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
641           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
642           "stp qC41, qC42, [%x[cptr], #0x00]\n"
643           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
644           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
645           "stp qC43, qC44, [%x[cptr], #0x20]\n"
646           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
647
648         ".unreq vB4\n" ".unreq qB4\n"
649         ".unreq vB3\n" ".unreq qB3\n"
650         ".unreq vB2\n" ".unreq qB2\n"
651         ".unreq vB1\n" ".unreq qB1\n"
652         ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
653         ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
654         ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
655         ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
656         ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
657         ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
658         ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
659         ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
660         ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
661         ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
662         ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
663         ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
664         ".unreq aptr2\n"
665         ".unreq aptr3\n"
666         ".unreq aptr4\n"
667
668         : [aptr] "+r" (aptr),
669           [bptr] "+r" (bptr),
670           [cptr] "+r" (cptr),
671           [k] "+r" (k)
672         : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
673           [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
674           [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
675         : "cc", "memory", "x20", "x21", "x22",
676           "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
677           "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
678           "v21", "v22", "v23"
679       );
680     }
681   }
682 }
683
/*
 * sgemm_4x16_impl<2>: single-precision matrix multiply computed in tiles of
 * 4 rows x 16 columns of C, specialised for a K-tail of two (TAIL_SIZE == 2).
 *
 *   a, b, c        Base pointers of A, B and C.
 *   M, K, N        A is walked as M rows of K contiguous floats, B as K rows
 *                  of N contiguous floats, C as M rows of N contiguous floats.
 *   a_row_stride,
 *   b_row_stride,
 *   c_row_stride   Distance between consecutive rows, in floats (they are
 *                  multiplied by sizeof(float) before reaching the assembly).
 *
 * C is overwritten, not accumulated into: every accumulator is zeroed with
 * `movi` and the tile is only ever stored (`stp`), never loaded.
 *
 * NOTE(review): the assembly always handles a complete 4x16 tile and runs
 * (K - 2) / 4 unrolled steps followed by exactly two tail steps. No masking
 * of partial tiles is visible here, so presumably the caller guarantees
 * K % 4 == 2 and buffers padded to multiples of 4 rows / 16 columns - confirm.
 */
684 template <>
685 inline void sgemm_4x16_impl<2>(
686   const float* const a, const float* const b, float *c,
687   const int M, const int K, const int N,
688   const int a_row_stride,
689   const int b_row_stride,
690   const int c_row_stride
691 ) {
692   const int TAIL_SIZE = 2;  // Values of K left over after the 4x-unrolled loop
693   const int M_BLOCK = 4;    // Rows of C produced per tile
694   const int N_BLOCK = 16;   // Columns of C produced per tile
695
696   const int m_blocks = iceildiv(M, M_BLOCK);
697   const int n_blocks = iceildiv(N, N_BLOCK);
698
699   // For each block of output rows
700   for (int mblock = 0; mblock < m_blocks; mblock++) {
701     // For each block of output columns
702     for (int nblock = 0; nblock < n_blocks; nblock++) {
703       const float *aptr = a + mblock*M_BLOCK*a_row_stride;
704       const float *bptr = b + nblock*N_BLOCK;
705       float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
706       int k = (K - TAIL_SIZE) / 4;  // Number of full 4-step passes over K
707
708       asm volatile(
            // Register aliases (released again with .unreq at the end):
            //   aptr2..aptr4  pointers to rows 2-4 of the A block (row 1 uses %[aptr])
            //   vCij / qCij   accumulator for row i, column quad j of the C tile
            //   vAi/qAi/dAi   values of row i of A (4 floats as q, 2 floats as d)
            //   vBj / qBj     column quad j of the current row of B
709         "aptr2 .req X20\n"
710         "aptr3 .req X21\n"
711         "aptr4 .req X22\n"
712         "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
713         "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
714         "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
715         "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
716         "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
717         "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
718         "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
719         "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
720         "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
721         "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
722         "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
723         "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
724         "vB1 .req v20\n" "qB1 .req q20\n"
725         "vB2 .req v21\n" "qB2 .req q21\n"
726         "vB3 .req v22\n" "qB3 .req q22\n"
727         "vB4 .req v23\n" "qB4 .req q23\n"
728
729         // Clear accumulators, initialise pointers
            // (the first three quads of the first row of B are preloaded here;
            // the remaining accumulators are cleared on whichever path is taken)
730         "movi vC11.4s, #0\n"
731         "ldr qB1, [%x[bptr], #0x00]\n"
732         "movi vC12.4s, #0\n"
733         "ldr qB2, [%x[bptr], #0x10]\n"
734         "movi vC13.4s, #0\n"
735         "ldr qB3, [%x[bptr], #0x20]\n"
736         "movi vC14.4s, #0\n"
737         "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
738         "movi vC21.4s, #0\n"
739         "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
740         "movi vC22.4s, #0\n"
741         "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
742         "movi vC23.4s, #0\n"
743         "cbnz %x[k], 3f\n"  // k != 0: at least one full 4-step pass to run
744
745         // Prepare for tail in K
            // (k == 0: only the two tail values exist, so load just two floats
            // from rows 1 and 2 of A; rows 3 and 4 are loaded at label 2)
746         "movi vC24.4s, #0\n"
747         "ldr dA1, [%x[aptr]], #0x08\n"
748         "movi vC31.4s, #0\n"
749         "ldr dA2, [   aptr2], #0x08\n"
750         "movi vC32.4s, #0\n"
751         "movi vC33.4s, #0\n"
752         "movi vC34.4s, #0\n"
753         "movi vC41.4s, #0\n"
754         "movi vC42.4s, #0\n"
755         "movi vC43.4s, #0\n"
756         "movi vC44.4s, #0\n"
757         "b 2f\n"  // Jump to tail
758
759         "3:"  // Prepare for loop over K
            // (load four floats from rows 1 and 2 of A; rows 3 and 4 are loaded
            // at the top of each 4-step pass)
760           "movi vC24.4s, #0\n"
761           "ldr qA1, [%x[aptr]], #0x10\n"
762           "movi vC31.4s, #0\n"
763           "ldr qA2, [   aptr2], #0x10\n"
764           "movi vC32.4s, #0\n"
765           "movi vC33.4s, #0\n"
766           "movi vC34.4s, #0\n"
767           "movi vC41.4s, #0\n"
768           "movi vC42.4s, #0\n"
769           "movi vC43.4s, #0\n"
770           "movi vC44.4s, #0\n"
771           "subs %x[k], %x[k], #1\n"
772           "beq 4f\n"  // Only one full pass: go straight to the final (non-looping) copy
773
774         "1:"  // Loop proper
            // Each pass consumes four values of K: lanes s[0]..s[3] of vA1..vA4
            // against four successive rows of B (bptr advances by one row of B
            // per step). Loads for the next step are interleaved with the FMLAs.
775           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
776           "ldr qA3, [   aptr3], #0x10\n"
777           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
778           "ldr qA4, [   aptr4], #0x10\n"
779           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
780           "ldr qB4, [%x[bptr], #0x30]\n"
781           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
782           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
783           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
784           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
785           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
786           "ldr qB1, [%x[bptr], #0x00]\n"
787           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
788           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
789           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
790           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
791           "ldr qB2, [%x[bptr], #0x10]\n"
792           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
793           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
794           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
795           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
796           "ldr qB3, [%x[bptr], #0x20]\n"
797           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
798
799           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
800           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
801           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
802           "ldr qB4, [%x[bptr], #0x30]\n"
803           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
804           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
805           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
806           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
807           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
808           "ldr qB1, [%x[bptr], #0x00]\n"
809           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
810           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
811           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
812           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
813           "ldr qB2, [%x[bptr], #0x10]\n"
814           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
815           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
816           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
817           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
818           "ldr qB3, [%x[bptr], #0x20]\n"
819           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
820
821           "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
822           "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
823           "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
824           "ldr qB4, [%x[bptr], #0x30]\n"
825           "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
826           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
827           "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
828           "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
829           "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
830           "ldr qB1, [%x[bptr], #0x00]\n"
831           "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
832           "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
833           "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
834           "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
835           "ldr qB2, [%x[bptr], #0x10]\n"
836           "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
837           "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
838           "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
839           "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
840           "ldr qB3, [%x[bptr], #0x20]\n"
841           "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
842
843           "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
844           "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
845           "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
846           "ldr qB4, [%x[bptr], #0x30]\n"
847           "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
848           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
849           "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
850           "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
851           "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
852           "ldr qB1, [%x[bptr], #0x00]\n"
853           "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
854           "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
855           "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
856           "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
857           "ldr qB2, [%x[bptr], #0x10]\n"
858           "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
859           "subs %x[k], %x[k], #1\n"  // Sets the flags tested by the `bne` below
860           "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
861           "ldr qA1, [%x[aptr]], #0x10\n"  // Reload rows 1-2 of A only after their last use above
862           "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
863           "ldr qA2, [   aptr2], #0x10\n"
864           "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
865           "ldr qB3, [%x[bptr], #0x20]\n"
866           "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
867           "bne 1b\n"
868
869         "4:"  // Tail iteration
            // Final full 4-step pass: same arithmetic as the loop body, but it
            // does not branch back, and at its end it loads only two floats
            // (dA1/dA2) from rows 1 and 2 of A, ready for the two-step tail.
870           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
871           "ldr qA3, [   aptr3], #0x10\n"
872           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
873           "ldr qA4, [   aptr4], #0x10\n"
874           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
875           "ldr qB4, [%x[bptr], #0x30]\n"
876           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
877           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
878           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
879           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
880           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
881           "ldr qB1, [%x[bptr], #0x00]\n"
882           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
883           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
884           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
885           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
886           "ldr qB2, [%x[bptr], #0x10]\n"
887           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
888           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
889           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
890           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
891           "ldr qB3, [%x[bptr], #0x20]\n"
892           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
893
894           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
895           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
896           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
897           "ldr qB4, [%x[bptr], #0x30]\n"
898           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
899           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
900           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
901           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
902           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
903           "ldr qB1, [%x[bptr], #0x00]\n"
904           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
905           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
906           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
907           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
908           "ldr qB2, [%x[bptr], #0x10]\n"
909           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
910           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
911           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
912           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
913           "ldr qB3, [%x[bptr], #0x20]\n"
914           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
915
916           "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
917           "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
918           "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
919           "ldr qB4, [%x[bptr], #0x30]\n"
920           "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
921           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
922           "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
923           "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
924           "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
925           "ldr qB1, [%x[bptr], #0x00]\n"
926           "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
927           "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
928           "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
929           "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
930           "ldr qB2, [%x[bptr], #0x10]\n"
931           "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
932           "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
933           "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
934           "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
935           "ldr qB3, [%x[bptr], #0x20]\n"
936           "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
937
938           "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
939           "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
940           "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
941           "ldr qB4, [%x[bptr], #0x30]\n"
942           "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
943           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
944           "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
945           "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
946           "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
947           "ldr qB1, [%x[bptr], #0x00]\n"
948           "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
949           "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
950           "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
951           "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
952           "ldr qB2, [%x[bptr], #0x10]\n"
953           "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
954           "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
955           "ldr dA1, [%x[aptr]], #0x08\n"  // Two floats only: the K-tail values of row 1
956           "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
957           "ldr dA2, [   aptr2], #0x08\n"  // Two floats only: the K-tail values of row 2
958           "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
959           "ldr qB3, [%x[bptr], #0x20]\n"
960           "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
961
962         "2:"  // Common tail
            // Handles the last two values of K. The first step (lane s[0]) is
            // scheduled like a loop step; in the second step (lane s[1]) each
            // row of the tile is finished and stored in turn, with cptr moved
            // on by one row of C after each row.
963           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
964           "ldr dA3, [   aptr3], #0x08\n"
965           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
966           "ldr dA4, [   aptr4], #0x08\n"
967           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
968           "ldr qB4, [%x[bptr], #0x30]\n"
969           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
970           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
971           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
972           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
973           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
974           "ldr qB1, [%x[bptr], #0x00]\n"
975           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
976           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
977           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
978           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
979           "ldr qB2, [%x[bptr], #0x10]\n"
980           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
981           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
982           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
983           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
984           "ldr qB3, [%x[bptr], #0x20]\n"
985           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
986
987           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
988           "ldr qB4, [%x[bptr], #0x30]\n"
989           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
990           "stp qC11, qC12, [%x[cptr], #0x00]\n"
991           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
992           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
993           "stp qC13, qC14, [%x[cptr], #0x20]\n"
994           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
995           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
996           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
997           "stp qC21, qC22, [%x[cptr], #0x00]\n"
998           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
999           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
1000           "stp qC23, qC24, [%x[cptr], #0x20]\n"
1001           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
1002           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1003           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
1004           "stp qC31, qC32, [%x[cptr], #0x00]\n"
1005           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
1006           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
1007           "stp qC33, qC34, [%x[cptr], #0x20]\n"
1008           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
1009           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1010           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
1011           "stp qC41, qC42, [%x[cptr], #0x00]\n"
1012           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
1013           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
1014           "stp qC43, qC44, [%x[cptr], #0x20]\n"
1015           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1016
            // Release every alias defined with .req at the top of this asm block
1017         ".unreq vB4\n" ".unreq qB4\n"
1018         ".unreq vB3\n" ".unreq qB3\n"
1019         ".unreq vB2\n" ".unreq qB2\n"
1020         ".unreq vB1\n" ".unreq qB1\n"
1021         ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
1022         ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
1023         ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
1024         ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
1025         ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
1026         ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
1027         ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
1028         ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
1029         ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
1030         ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
1031         ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
1032         ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
1033         ".unreq aptr2\n"
1034         ".unreq aptr3\n"
1035         ".unreq aptr4\n"
1036
            // Read-write operands: the assembly advances all three pointers and
            // counts k down, so they are declared "+r".
1037         : [aptr] "+r" (aptr),
1038           [bptr] "+r" (bptr),
1039           [cptr] "+r" (cptr),
1040           [k] "+r" (k)
            // Read-only operands: row strides converted from floats to bytes.
1041         : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
1042           [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
1043           [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
            // Clobbers: flags (subs), memory (stores to C), the three scratch
            // pointers x20-x22 and every vector register aliased above (v0-v23).
1044         : "cc", "memory", "x20", "x21", "x22",
1045           "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
1046           "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
1047           "v21", "v22", "v23"
1048       );
1049     }
1050   }
1051 }
1052
1053 template <>
1054 inline void sgemm_4x16_impl<3>(
1055   const float* const a, const float* const b, float *c,
1056   const int M, const int K, const int N,
1057   const int a_row_stride,
1058   const int b_row_stride,
1059   const int c_row_stride
1060 ) {
1061   const int TAIL_SIZE = 3;
1062   const int M_BLOCK = 4;
1063   const int N_BLOCK = 16;
1064
1065   const int m_blocks = iceildiv(M, M_BLOCK);
1066   const int n_blocks = iceildiv(N, N_BLOCK);
1067
1068   // For each block of output rows
1069   for (int mblock = 0; mblock < m_blocks; mblock++) {
1070     // For each block of output columns
1071     for (int nblock = 0; nblock < n_blocks; nblock++) {
1072       const float *aptr = a + mblock*M_BLOCK*a_row_stride;
1073       const float *bptr = b + nblock*N_BLOCK;
1074       float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
1075       int k = (K - TAIL_SIZE) / 4;
1076
1077       asm volatile(
1078         "aptr2 .req X20\n"
1079         "aptr3 .req X21\n"
1080         "aptr4 .req X22\n"
1081         "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
1082         "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
1083         "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
1084         "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
1085         "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
1086         "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
1087         "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
1088         "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
1089         "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
1090         "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
1091         "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
1092         "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
1093         "vB1 .req v20\n" "qB1 .req q20\n"
1094         "vB2 .req v21\n" "qB2 .req q21\n"
1095         "vB3 .req v22\n" "qB3 .req q22\n"
1096         "vB4 .req v23\n" "qB4 .req q23\n"
1097
1098         // Clear accumulators, initialise pointers
1099         "movi vC11.4s, #0\n"
1100         "ldr qB1, [%x[bptr], #0x00]\n"
1101         "movi vC12.4s, #0\n"
1102         "ldr qB2, [%x[bptr], #0x10]\n"
1103         "movi vC13.4s, #0\n"
1104         "ldr qB3, [%x[bptr], #0x20]\n"
1105         "movi vC14.4s, #0\n"
1106         "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
1107         "movi vC21.4s, #0\n"
1108         "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
1109         "movi vC22.4s, #0\n"
1110         "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
1111         "movi vC23.4s, #0\n"
1112         "cbnz %x[k], 3f\n"
1113
1114         // Prepare for tail in K
1115         "movi vC24.4s, #0\n"
1116         "ldr dA1, [%x[aptr]], #0x08\n"
1117         "movi vC31.4s, #0\n"
1118         "ldr dA2, [   aptr2], #0x08\n"
1119         "movi vC32.4s, #0\n"
1120         "movi vC33.4s, #0\n"
1121         "movi vC34.4s, #0\n"
1122         "movi vC41.4s, #0\n"
1123         "movi vC42.4s, #0\n"
1124         "movi vC43.4s, #0\n"
1125         "movi vC44.4s, #0\n"
1126         "b 2f\n"  // Jump to tail
1127
1128         "3:"  // Prepare for loop over K
1129           "movi vC24.4s, #0\n"
1130           "ldr qA1, [%x[aptr]], #0x10\n"
1131           "movi vC31.4s, #0\n"
1132           "ldr qA2, [   aptr2], #0x10\n"
1133           "movi vC32.4s, #0\n"
1134           "movi vC33.4s, #0\n"
1135           "movi vC34.4s, #0\n"
1136           "movi vC41.4s, #0\n"
1137           "movi vC42.4s, #0\n"
1138           "movi vC43.4s, #0\n"
1139           "movi vC44.4s, #0\n"
1140           "subs %x[k], %x[k], #1\n"
1141           "beq 4f\n"
1142
1143         "1:"  // Loop proper
1144           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
1145           "ldr qA3, [   aptr3], #0x10\n"
1146           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
1147           "ldr qA4, [   aptr4], #0x10\n"
1148           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
1149           "ldr qB4, [%x[bptr], #0x30]\n"
1150           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
1151           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1152           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
1153           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
1154           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
1155           "ldr qB1, [%x[bptr], #0x00]\n"
1156           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
1157           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
1158           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
1159           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
1160           "ldr qB2, [%x[bptr], #0x10]\n"
1161           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
1162           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
1163           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
1164           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
1165           "ldr qB3, [%x[bptr], #0x20]\n"
1166           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
1167
1168           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
1169           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
1170           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
1171           "ldr qB4, [%x[bptr], #0x30]\n"
1172           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
1173           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1174           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
1175           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
1176           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
1177           "ldr qB1, [%x[bptr], #0x00]\n"
1178           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
1179           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
1180           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
1181           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
1182           "ldr qB2, [%x[bptr], #0x10]\n"
1183           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
1184           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
1185           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
1186           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
1187           "ldr qB3, [%x[bptr], #0x20]\n"
1188           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
1189
1190           "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
1191           "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
1192           "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
1193           "ldr qB4, [%x[bptr], #0x30]\n"
1194           "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
1195           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1196           "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
1197           "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
1198           "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
1199           "ldr qB1, [%x[bptr], #0x00]\n"
1200           "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
1201           "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
1202           "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
1203           "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
1204           "ldr qB2, [%x[bptr], #0x10]\n"
1205           "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
1206           "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
1207           "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
1208           "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
1209           "ldr qB3, [%x[bptr], #0x20]\n"
1210           "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
1211
1212           "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
1213           "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
1214           "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
1215           "ldr qB4, [%x[bptr], #0x30]\n"
1216           "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
1217           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1218           "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
1219           "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
1220           "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
1221           "ldr qB1, [%x[bptr], #0x00]\n"
1222           "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
1223           "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
1224           "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
1225           "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
1226           "ldr qB2, [%x[bptr], #0x10]\n"
1227           "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
1228           "subs %x[k], %x[k], #1\n"
1229           "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
1230           "ldr qA1, [%x[aptr]], #0x10\n"
1231           "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
1232           "ldr qA2, [   aptr2], #0x10\n"
1233           "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
1234           "ldr qB3, [%x[bptr], #0x20]\n"
1235           "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
1236           "bne 1b\n"
1237
1238         "4:"  // Tail iteration
1239           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
1240           "ldr qA3, [   aptr3], #0x10\n"
1241           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
1242           "ldr qA4, [   aptr4], #0x10\n"
1243           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
1244           "ldr qB4, [%x[bptr], #0x30]\n"
1245           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
1246           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1247           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
1248           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
1249           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
1250           "ldr qB1, [%x[bptr], #0x00]\n"
1251           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
1252           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
1253           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
1254           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
1255           "ldr qB2, [%x[bptr], #0x10]\n"
1256           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
1257           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
1258           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
1259           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
1260           "ldr qB3, [%x[bptr], #0x20]\n"
1261           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
1262
1263           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
1264           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
1265           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
1266           "ldr qB4, [%x[bptr], #0x30]\n"
1267           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
1268           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1269           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
1270           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
1271           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
1272           "ldr qB1, [%x[bptr], #0x00]\n"
1273           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
1274           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
1275           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
1276           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
1277           "ldr qB2, [%x[bptr], #0x10]\n"
1278           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
1279           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
1280           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
1281           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
1282           "ldr qB3, [%x[bptr], #0x20]\n"
1283           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
1284
1285           "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
1286           "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
1287           "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
1288           "ldr qB4, [%x[bptr], #0x30]\n"
1289           "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
1290           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1291           "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
1292           "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
1293           "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
1294           "ldr qB1, [%x[bptr], #0x00]\n"
1295           "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
1296           "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
1297           "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
1298           "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
1299           "ldr qB2, [%x[bptr], #0x10]\n"
1300           "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
1301           "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
1302           "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
1303           "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
1304           "ldr qB3, [%x[bptr], #0x20]\n"
1305           "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
1306
1307           "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
1308           "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
1309           "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
1310           "ldr qB4, [%x[bptr], #0x30]\n"
1311           "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
1312           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1313           "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
1314           "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
1315           "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
1316           "ldr qB1, [%x[bptr], #0x00]\n"
1317           "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
1318           "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
1319           "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
1320           "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
1321           "ldr qB2, [%x[bptr], #0x10]\n"
1322           "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
1323           "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
1324           "ldr dA1, [%x[aptr]], #0x08\n"
1325           "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
1326           "ldr dA2, [   aptr2], #0x08\n"
1327           "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
1328           "ldr qB3, [%x[bptr], #0x20]\n"
1329           "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
1330
1331         "2:"  // Common tail
1332           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
1333           "ldr dA3, [   aptr3], #0x08\n"
1334           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
1335           "ldr dA4, [   aptr4], #0x08\n"
1336           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
1337           "ldr qB4, [%x[bptr], #0x30]\n"
1338           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
1339           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1340           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
1341           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
1342           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
1343           "ldr qB1, [%x[bptr], #0x00]\n"
1344           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
1345           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
1346           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
1347           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
1348           "ldr qB2, [%x[bptr], #0x10]\n"
1349           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
1350           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
1351           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
1352           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
1353           "ldr qB3, [%x[bptr], #0x20]\n"
1354           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
1355
1356           "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
1357           "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
1358           "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
1359           "ldr qB4, [%x[bptr], #0x30]\n"
1360           "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
1361           "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
1362           "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
1363           "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
1364           "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
1365           "ldr qB1, [%x[bptr], #0x00]\n"
1366           "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
1367           "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
1368           "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
1369           "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
1370           "ldr qB2, [%x[bptr], #0x10]\n"
1371           "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
1372           "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
1373           "ldr sA1, [%x[aptr]], #0x04\n"
1374           "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
1375           "ldr sA2, [   aptr2], #0x04\n"
1376           "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
1377           "ldr qB3, [%x[bptr], #0x20]\n"
1378           "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
1379
1380           "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
1381           "ldr qB4, [%x[bptr], #0x30]\n"
1382           "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
1383           "stp qC11, qC12, [%x[cptr], #0x00]\n"
1384           "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
1385           "ldr sA3, [   aptr3], #0x04\n"
1386           "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
1387           "stp qC13, qC14, [%x[cptr], #0x20]\n"
1388           "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
1389           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1390           "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
1391           "stp qC21, qC22, [%x[cptr], #0x00]\n"
1392           "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
1393           "ldr sA4, [   aptr4], #0x04\n"
1394           "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
1395           "stp qC23, qC24, [%x[cptr], #0x20]\n"
1396           "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
1397           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1398           "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
1399           "stp qC31, qC32, [%x[cptr], #0x00]\n"
1400           "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
1401           "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
1402           "stp qC33, qC34, [%x[cptr], #0x20]\n"
1403           "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
1404           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1405           "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
1406           "stp qC41, qC42, [%x[cptr], #0x00]\n"
1407           "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
1408           "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
1409           "stp qC43, qC44, [%x[cptr], #0x20]\n"
1410           "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
1411
1412         ".unreq vB4\n" ".unreq qB4\n"
1413         ".unreq vB3\n" ".unreq qB3\n"
1414         ".unreq vB2\n" ".unreq qB2\n"
1415         ".unreq vB1\n" ".unreq qB1\n"
1416         ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
1417         ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
1418         ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
1419         ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
1420         ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
1421         ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
1422         ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
1423         ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
1424         ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
1425         ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
1426         ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
1427         ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
1428         ".unreq aptr2\n"
1429         ".unreq aptr3\n"
1430         ".unreq aptr4\n"
1431
1432         : [aptr] "+r" (aptr),
1433           [bptr] "+r" (bptr),
1434           [cptr] "+r" (cptr),
1435           [k] "+r" (k)
1436         : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
1437           [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
1438           [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
1439         : "cc", "memory", "x20", "x21", "x22",
1440           "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
1441           "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
1442           "v21", "v22", "v23"
1443       );
1444     }
1445   }
1446 }