fix build error
[platform/upstream/openblas.git] / kernel / mips / cgemm_kernel_8x4_msa.c
1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
/*
 * CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * Performs one step of complex products for an 8x4 tile: the two A
 * operand pairs (src_a0r/src_a0i, src_a1r/src_a1i) are combined with
 * four B elements (two from src_b0, two from src_b1) and folded into
 * res0_r/res0_i .. res7_r/res7_i.  Every identifier used here (pa0,
 * pb0, src_*, res*) is a variable of the expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;         res_r OP1= a_i * b_i;
 *     res_i OP2= (OP4 a_r) * b_i;   res_i OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r.
 *
 * NOTE(review): LD_SP4_INC/LD_SP2_INC, PCKEVOD_W2_SP and SPLATI_W2_SP
 * are not defined in this file (presumably macros_msa.h).  They appear
 * to load-and-advance the pointer, split interleaved re/im lanes, and
 * broadcast one (re, im) pair of B - confirm against that header.
 */
#define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4)    \
{                                                        \
    LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3);  \
    LD_SP2_INC(pb0, 4, src_b0, src_b1);                  \
                                                         \
    /* (src_a0, src_a1) -> src_a0r / src_a0i */          \
    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);     \
    /* (src_a2, src_a3) -> src_a1r / src_a1i */          \
    PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i);     \
                                                         \
    /* 0th col */                                        \
    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);             \
    res0_r OP0## = src_a0r * src_br;                     \
    res0_r OP1## = src_a0i * src_bi;                     \
    res0_i OP2## = (OP4 src_a0r) * src_bi;               \
    res0_i OP3## = src_a0i * src_br;                     \
                                                         \
    res1_r OP0## = src_a1r * src_br;                     \
    res1_r OP1## = src_a1i * src_bi;                     \
    res1_i OP2## = (OP4 src_a1r) * src_bi;               \
    res1_i OP3## = src_a1i * src_br;                     \
                                                         \
    /* 1st col */                                        \
    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);             \
    res2_r OP0## = src_a0r * src_br;                     \
    res2_r OP1## = src_a0i * src_bi;                     \
    res2_i OP2## = (OP4 src_a0r) * src_bi;               \
    res2_i OP3## = src_a0i * src_br;                     \
                                                         \
    res3_r OP0## = src_a1r * src_br;                     \
    res3_r OP1## = src_a1i * src_bi;                     \
    res3_i OP2## = (OP4 src_a1r) * src_bi;               \
    res3_i OP3## = src_a1i * src_br;                     \
                                                         \
    /* 2nd col */                                        \
    SPLATI_W2_SP(src_b1, 0, src_br, src_bi);             \
    res4_r OP0## = src_a0r * src_br;                     \
    res4_r OP1## = src_a0i * src_bi;                     \
    res4_i OP2## = (OP4 src_a0r) * src_bi;               \
    res4_i OP3## = src_a0i * src_br;                     \
                                                         \
    res5_r OP0## = src_a1r * src_br;                     \
    res5_r OP1## = src_a1i * src_bi;                     \
    res5_i OP2## = (OP4 src_a1r) * src_bi;               \
    res5_i OP3## = src_a1i * src_br;                     \
                                                         \
    /* 3rd col */                                        \
    SPLATI_W2_SP(src_b1, 2, src_br, src_bi);             \
    res6_r OP0## = src_a0r * src_br;                     \
    res6_r OP1## = src_a0i * src_bi;                     \
    res6_i OP2## = (OP4 src_a0r) * src_bi;               \
    res6_i OP3## = src_a0i * src_br;                     \
                                                         \
    res7_r OP0## = src_a1r * src_br;                     \
    res7_r OP1## = src_a1i * src_bi;                     \
    res7_i OP2## = (OP4 src_a1r) * src_bi;               \
    res7_i OP3## = src_a1i * src_br;                     \
}
87
/*
 * CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * Performs one step of complex products for an 8x2 tile: the two A
 * operand pairs (src_a0r/src_a0i, src_a1r/src_a1i) are combined with
 * the two B elements held in src_b0 and folded into res0_r/res0_i ..
 * res3_r/res3_i.  All identifiers are variables of the expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;         res_r OP1= a_i * b_i;
 *     res_i OP2= (OP4 a_r) * b_i;   res_i OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r.
 *
 * pb0 is only passed to LD_SP here; nothing in this macro visibly
 * advances it, so the caller presumably does.
 *
 * NOTE(review): LD_SP4_INC, LD_SP, PCKEVOD_W2_SP and SPLATI_W2_SP are
 * not defined in this file (presumably macros_msa.h) - confirm their
 * exact load/split/broadcast semantics there.
 */
#define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4)    \
{                                                        \
    LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3);  \
    src_b0 = LD_SP(pb0);                                 \
                                                         \
    /* (src_a0, src_a1) -> src_a0r / src_a0i */          \
    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);     \
    /* (src_a2, src_a3) -> src_a1r / src_a1i */          \
    PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i);     \
                                                         \
    /* 0th col */                                        \
    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);             \
    res0_r OP0## = src_a0r * src_br;                     \
    res0_r OP1## = src_a0i * src_bi;                     \
    res0_i OP2## = (OP4 src_a0r) * src_bi;               \
    res0_i OP3## = src_a0i * src_br;                     \
                                                         \
    res1_r OP0## = src_a1r * src_br;                     \
    res1_r OP1## = src_a1i * src_bi;                     \
    res1_i OP2## = (OP4 src_a1r) * src_bi;               \
    res1_i OP3## = src_a1i * src_br;                     \
                                                         \
    /* 1st col */                                        \
    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);             \
    res2_r OP0## = src_a0r * src_br;                     \
    res2_r OP1## = src_a0i * src_bi;                     \
    res2_i OP2## = (OP4 src_a0r) * src_bi;               \
    res2_i OP3## = src_a0i * src_br;                     \
                                                         \
    res3_r OP0## = src_a1r * src_br;                     \
    res3_r OP1## = src_a1i * src_bi;                     \
    res3_i OP2## = (OP4 src_a1r) * src_bi;               \
    res3_i OP3## = src_a1i * src_br;                     \
}
120
/*
 * CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * Performs one step of complex products for an 8x1 tile: the two A
 * operand pairs (src_a0r/src_a0i, src_a1r/src_a1i) are combined with a
 * single B element and folded into res0_r/res0_i and res1_r/res1_i.
 * All identifiers are variables of the expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;         res_r OP1= a_i * b_i;
 *     res_i OP2= (OP4 a_r) * b_i;   res_i OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r.
 *
 * NOTE(review): the B element is fetched by reading *pb0 through a
 * 'double *' cast, i.e. one double-sized object is read in a single
 * access.  That is a pointer-cast type pun; it presumably relies on
 * the packed buffer's alignment and the build's aliasing settings -
 * confirm.  pb0 is not visibly advanced in this macro.
 * LD_SP4_INC, PCKEVOD_W2_SP and SPLATI_W2_SP are not defined in this
 * file (presumably macros_msa.h).
 */
#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4)                 \
{                                                                     \
    LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3);               \
    /* src_bi is first used as a staging register for the raw load */ \
    src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0));  \
    SPLATI_W2_SP(src_bi, 0, src_br, src_bi);                          \
                                                                      \
    /* (src_a0, src_a1) -> src_a0r / src_a0i */                       \
    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);                  \
    /* (src_a2, src_a3) -> src_a1r / src_a1i */                       \
    PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i);                  \
                                                                      \
    /* 0th col */                                                     \
    res0_r OP0## = src_a0r * src_br;                                  \
    res0_r OP1## = src_a0i * src_bi;                                  \
    res0_i OP2## = (OP4 src_a0r) * src_bi;                            \
    res0_i OP3## = src_a0i * src_br;                                  \
                                                                      \
    res1_r OP0## = src_a1r * src_br;                                  \
    res1_r OP1## = src_a1i * src_bi;                                  \
    res1_i OP2## = (OP4 src_a1r) * src_bi;                            \
    res1_i OP3## = src_a1i * src_br;                                  \
}
141
/*
 * CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * Performs one step of complex products for a 4x4 tile: one A operand
 * pair (src_a0r/src_a0i) is combined with four B elements (two from
 * src_b0, two from src_b1).  Only the even-numbered accumulators
 * res0, res2, res4 and res6 (_r/_i) are written.  All identifiers are
 * variables of the expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;         res_r OP1= a_i * b_i;
 *     res_i OP2= (OP4 a_r) * b_i;   res_i OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r; it is
 * parenthesised so the sign visibly binds to a_r, as in the 8xN
 * variants.
 *
 * NOTE(review): LD_SP2_INC, PCKEVOD_W2_SP and SPLATI_W2_SP are not
 * defined in this file (presumably macros_msa.h) - confirm their
 * exact load/split/broadcast semantics there.
 */
#define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)  \
{                                                      \
    LD_SP2_INC(pa0, 4, src_a0, src_a1);                \
    LD_SP2_INC(pb0, 4, src_b0, src_b1);                \
                                                       \
    /* (src_a0, src_a1) -> src_a0r / src_a0i */        \
    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);   \
                                                       \
    /* 0th col */                                      \
    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);           \
    res0_r OP0## = src_a0r * src_br;                   \
    res0_r OP1## = src_a0i * src_bi;                   \
    res0_i OP2## = (OP4 src_a0r) * src_bi;             \
    res0_i OP3## = src_a0i * src_br;                   \
                                                       \
    /* 1st col */                                      \
    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);           \
    res2_r OP0## = src_a0r * src_br;                   \
    res2_r OP1## = src_a0i * src_bi;                   \
    res2_i OP2## = (OP4 src_a0r) * src_bi;             \
    res2_i OP3## = src_a0i * src_br;                   \
                                                       \
    /* 2nd col */                                      \
    SPLATI_W2_SP(src_b1, 0, src_br, src_bi);           \
    res4_r OP0## = src_a0r * src_br;                   \
    res4_r OP1## = src_a0i * src_bi;                   \
    res4_i OP2## = (OP4 src_a0r) * src_bi;             \
    res4_i OP3## = src_a0i * src_br;                   \
                                                       \
    /* 3rd col */                                      \
    SPLATI_W2_SP(src_b1, 2, src_br, src_bi);           \
    res6_r OP0## = src_a0r * src_br;                   \
    res6_r OP1## = src_a0i * src_bi;                   \
    res6_i OP2## = (OP4 src_a0r) * src_bi;             \
    res6_i OP3## = src_a0i * src_br;                   \
}
177
/*
 * CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * Performs one step of complex products for a 4x2 tile: one A operand
 * pair (src_a0r/src_a0i) is combined with the two B elements held in
 * src_b0.  Only accumulators res0 and res2 (_r/_i) are written.  All
 * identifiers are variables of the expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;         res_r OP1= a_i * b_i;
 *     res_i OP2= (OP4 a_r) * b_i;   res_i OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r; it is
 * parenthesised so the sign visibly binds to a_r, as in the 8xN
 * variants.
 *
 * pb0 is only passed to LD_SP here; nothing in this macro visibly
 * advances it, so the caller presumably does.
 *
 * NOTE(review): LD_SP2_INC, LD_SP, PCKEVOD_W2_SP and SPLATI_W2_SP are
 * not defined in this file (presumably macros_msa.h) - confirm.
 */
#define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)  \
{                                                      \
    LD_SP2_INC(pa0, 4, src_a0, src_a1);                \
    src_b0 = LD_SP(pb0);                               \
                                                       \
    /* (src_a0, src_a1) -> src_a0r / src_a0i */        \
    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);   \
                                                       \
    /* 0th col */                                      \
    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);           \
    res0_r OP0## = src_a0r * src_br;                   \
    res0_r OP1## = src_a0i * src_bi;                   \
    res0_i OP2## = (OP4 src_a0r) * src_bi;             \
    res0_i OP3## = src_a0i * src_br;                   \
                                                       \
    /* 1st col */                                      \
    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);           \
    res2_r OP0## = src_a0r * src_br;                   \
    res2_r OP1## = src_a0i * src_bi;                   \
    res2_i OP2## = (OP4 src_a0r) * src_bi;             \
    res2_i OP3## = src_a0i * src_br;                   \
}
199
/*
 * CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * Performs one step of complex products for a 4x1 tile: one A operand
 * pair (src_a0r/src_a0i) is combined with a single B element and
 * folded into res0_r/res0_i.  All identifiers are variables of the
 * expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;         res_r OP1= a_i * b_i;
 *     res_i OP2= (OP4 a_r) * b_i;   res_i OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r; it is
 * parenthesised so the sign visibly binds to a_r, as in the 8xN
 * variants.
 *
 * NOTE(review): the B element is fetched by reading *pb0 through a
 * 'double *' cast (one double-sized access).  That is a pointer-cast
 * type pun; it presumably relies on the packed buffer's alignment and
 * the build's aliasing settings - confirm.  pb0 is not visibly
 * advanced in this macro.  LD_SP2_INC, PCKEVOD_W2_SP and SPLATI_W2_SP
 * are not defined in this file (presumably macros_msa.h).
 */
#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)                 \
{                                                                     \
    LD_SP2_INC(pa0, 4, src_a0, src_a1);                               \
    /* src_bi is first used as a staging register for the raw load */ \
    src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0));  \
    SPLATI_W2_SP(src_bi, 0, src_br, src_bi);                          \
                                                                      \
    /* (src_a0, src_a1) -> src_a0r / src_a0i */                       \
    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);                  \
                                                                      \
    /* 0th col */                                                     \
    res0_r OP0## = src_a0r * src_br;                                  \
    res0_r OP1## = src_a0i * src_bi;                                  \
    res0_i OP2## = (OP4 src_a0r) * src_bi;                            \
    res0_i OP3## = src_a0i * src_br;                                  \
}
214
/*
 * CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar step for a 2x4 tile.  Reads two (re, im) pairs from
 * pa0[0..3] and four from pb0[0..7] and updates res0..res15, where
 * each even/odd pair res(2n)/res(2n+1) receives the real/imaginary
 * partial products of one A element times one B element.  Order of
 * the pairs: for each B column, first a0 then a1.  pa0 and pb0 are
 * only indexed, never advanced.  All identifiers are variables of the
 * expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     re OP0= a_r * b_r;         re OP1= a_i * b_i;
 *     im OP2= (OP4 a_r) * b_i;   im OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r.
 */
#define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4)  \
{                                                  \
    /* 0th col */                                  \
    a0_r = pa0[0];                                 \
    a0_i = pa0[1];                                 \
    b0_r = pb0[0];                                 \
    b0_i = pb0[1];                                 \
                                                   \
    res0 OP0## = a0_r * b0_r;                      \
    res0 OP1## = a0_i * b0_i;                      \
    res1 OP2## = (OP4 a0_r) * b0_i;                \
    res1 OP3## = a0_i * b0_r;                      \
                                                   \
    a1_r = pa0[2];                                 \
    a1_i = pa0[3];                                 \
    res2 OP0## = a1_r * b0_r;                      \
    res2 OP1## = a1_i * b0_i;                      \
    res3 OP2## = (OP4 a1_r) * b0_i;                \
    res3 OP3## = a1_i * b0_r;                      \
                                                   \
    /* 1st col */                                  \
    b1_r = pb0[2];                                 \
    b1_i = pb0[3];                                 \
    res4 OP0## = a0_r * b1_r;                      \
    res4 OP1## = a0_i * b1_i;                      \
    res5 OP2## = (OP4 a0_r) * b1_i;                \
    res5 OP3## = a0_i * b1_r;                      \
                                                   \
    res6 OP0## = a1_r * b1_r;                      \
    res6 OP1## = a1_i * b1_i;                      \
    res7 OP2## = (OP4 a1_r) * b1_i;                \
    res7 OP3## = a1_i * b1_r;                      \
                                                   \
    /* 2nd col */                                  \
    b2_r = pb0[4];                                 \
    b2_i = pb0[5];                                 \
    res8 OP0## = a0_r * b2_r;                      \
    res8 OP1## = a0_i * b2_i;                      \
    res9 OP2## = (OP4 a0_r) * b2_i;                \
    res9 OP3## = a0_i * b2_r;                      \
                                                   \
    res10 OP0## = a1_r * b2_r;                     \
    res10 OP1## = a1_i * b2_i;                     \
    res11 OP2## = (OP4 a1_r) * b2_i;               \
    res11 OP3## = a1_i * b2_r;                     \
                                                   \
    /* 3rd col */                                  \
    b3_r = pb0[6];                                 \
    b3_i = pb0[7];                                 \
    res12 OP0## = a0_r * b3_r;                     \
    res12 OP1## = a0_i * b3_i;                     \
    res13 OP2## = (OP4 a0_r) * b3_i;               \
    res13 OP3## = a0_i * b3_r;                     \
                                                   \
    res14 OP0## = a1_r * b3_r;                     \
    res14 OP1## = a1_i * b3_i;                     \
    res15 OP2## = (OP4 a1_r) * b3_i;               \
    res15 OP3## = a1_i * b3_r;                     \
}
273
/*
 * CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar step for a 2x2 tile.  Reads two (re, im) pairs from
 * pa0[0..3] and two from pb0[0..3] and updates res0..res7, where each
 * even/odd pair res(2n)/res(2n+1) receives the real/imaginary partial
 * products of one A element times one B element.  Order of the
 * pairs: for each B column, first a0 then a1.  pa0 and pb0 are only
 * indexed, never advanced.  All identifiers are variables of the
 * expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     re OP0= a_r * b_r;         re OP1= a_i * b_i;
 *     im OP2= (OP4 a_r) * b_i;   im OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r.
 */
#define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4)  \
{                                                  \
    /* 0th col */                                  \
    a0_r = pa0[0];                                 \
    a0_i = pa0[1];                                 \
    b0_r = pb0[0];                                 \
    b0_i = pb0[1];                                 \
                                                   \
    res0 OP0## = a0_r * b0_r;                      \
    res0 OP1## = a0_i * b0_i;                      \
    res1 OP2## = (OP4 a0_r) * b0_i;                \
    res1 OP3## = a0_i * b0_r;                      \
                                                   \
    a1_r = pa0[2];                                 \
    a1_i = pa0[3];                                 \
    res2 OP0## = a1_r * b0_r;                      \
    res2 OP1## = a1_i * b0_i;                      \
    res3 OP2## = (OP4 a1_r) * b0_i;                \
    res3 OP3## = a1_i * b0_r;                      \
                                                   \
    /* 1st col */                                  \
    b1_r = pb0[2];                                 \
    b1_i = pb0[3];                                 \
    res4 OP0## = a0_r * b1_r;                      \
    res4 OP1## = a0_i * b1_i;                      \
    res5 OP2## = (OP4 a0_r) * b1_i;                \
    res5 OP3## = a0_i * b1_r;                      \
                                                   \
    res6 OP0## = a1_r * b1_r;                      \
    res6 OP1## = a1_i * b1_i;                      \
    res7 OP2## = (OP4 a1_r) * b1_i;                \
    res7 OP3## = a1_i * b1_r;                      \
}
307
/*
 * CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar step for a 2x1 tile.  Reads two (re, im) pairs from
 * pa0[0..3] and one from pb0[0..1] and updates res0/res1 (a0 * b0)
 * and res2/res3 (a1 * b0) with the real/imaginary partial products.
 * pa0 and pb0 are only indexed, never advanced.  All identifiers are
 * variables of the expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     re OP0= a_r * b_r;         re OP1= a_i * b_i;
 *     im OP2= (OP4 a_r) * b_i;   im OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r.
 */
#define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4)  \
{                                                  \
    /* 0th col */                                  \
    a0_r = pa0[0];                                 \
    a0_i = pa0[1];                                 \
    b0_r = pb0[0];                                 \
    b0_i = pb0[1];                                 \
                                                   \
    res0 OP0## = a0_r * b0_r;                      \
    res0 OP1## = a0_i * b0_i;                      \
    res1 OP2## = (OP4 a0_r) * b0_i;                \
    res1 OP3## = a0_i * b0_r;                      \
                                                   \
    a1_r = pa0[2];                                 \
    a1_i = pa0[3];                                 \
    res2 OP0## = a1_r * b0_r;                      \
    res2 OP1## = a1_i * b0_i;                      \
    res3 OP2## = (OP4 a1_r) * b0_i;                \
    res3 OP3## = a1_i * b0_r;                      \
}
328
/*
 * CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar step for a 1x4 tile.  Reads one (re, im) pair from
 * pa0[0..1] and four from pb0[0..7] and updates res0..res7, where
 * res(2n)/res(2n+1) receive the real/imaginary partial products of
 * a0 times the n-th B element.  pa0 and pb0 are only indexed, never
 * advanced.  All identifiers are variables of the expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     re OP0= a_r * b_r;         re OP1= a_i * b_i;
 *     im OP2= (OP4 a_r) * b_i;   im OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r.
 */
#define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4)  \
{                                                  \
    /* 0th col */                                  \
    a0_r = pa0[0];                                 \
    a0_i = pa0[1];                                 \
    b0_r = pb0[0];                                 \
    b0_i = pb0[1];                                 \
                                                   \
    res0 OP0## = a0_r * b0_r;                      \
    res0 OP1## = a0_i * b0_i;                      \
    res1 OP2## = (OP4 a0_r) * b0_i;                \
    res1 OP3## = a0_i * b0_r;                      \
                                                   \
    /* 1st col */                                  \
    b1_r = pb0[2];                                 \
    b1_i = pb0[3];                                 \
    res2 OP0## = a0_r * b1_r;                      \
    res2 OP1## = a0_i * b1_i;                      \
    res3 OP2## = (OP4 a0_r) * b1_i;                \
    res3 OP3## = a0_i * b1_r;                      \
                                                   \
    /* 2nd col */                                  \
    b2_r = pb0[4];                                 \
    b2_i = pb0[5];                                 \
    res4 OP0## = a0_r * b2_r;                      \
    res4 OP1## = a0_i * b2_i;                      \
    res5 OP2## = (OP4 a0_r) * b2_i;                \
    res5 OP3## = a0_i * b2_r;                      \
                                                   \
    /* 3rd col */                                  \
    b3_r = pb0[6];                                 \
    b3_i = pb0[7];                                 \
    res6 OP0## = a0_r * b3_r;                      \
    res6 OP1## = a0_i * b3_i;                      \
    res7 OP2## = (OP4 a0_r) * b3_i;                \
    res7 OP3## = a0_i * b3_r;                      \
}
366
/*
 * CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar step for a 1x2 tile.  Reads one (re, im) pair from
 * pa0[0..1] and two from pb0[0..3] and updates res0/res1 (a0 * b0)
 * and res2/res3 (a0 * b1) with the real/imaginary partial products.
 * pa0 and pb0 are only indexed, never advanced.  All identifiers are
 * variables of the expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     re OP0= a_r * b_r;         re OP1= a_i * b_i;
 *     im OP2= (OP4 a_r) * b_i;   im OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r.
 */
#define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4)  \
{                                                  \
    /* 0th col */                                  \
    a0_r = pa0[0];                                 \
    a0_i = pa0[1];                                 \
    b0_r = pb0[0];                                 \
    b0_i = pb0[1];                                 \
                                                   \
    res0 OP0## = a0_r * b0_r;                      \
    res0 OP1## = a0_i * b0_i;                      \
    res1 OP2## = (OP4 a0_r) * b0_i;                \
    res1 OP3## = a0_i * b0_r;                      \
                                                   \
    /* 1st col */                                  \
    b1_r = pb0[2];                                 \
    b1_i = pb0[3];                                 \
    res2 OP0## = a0_r * b1_r;                      \
    res2 OP1## = a0_i * b1_i;                      \
    res3 OP2## = (OP4 a0_r) * b1_i;                \
    res3 OP3## = a0_i * b1_r;                      \
}
388
/*
 * CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar step for a single element.  Reads one (re, im) pair from
 * pa0[0..1] and one from pb0[0..1] and updates res0 (real part) and
 * res1 (imaginary part) with the four partial products.  pa0 and pb0
 * are only indexed, never advanced.  All identifiers are variables of
 * the expansion site.
 *
 * Each of OP0..OP3 is pasted onto '=' to pick the assignment operator
 * (an empty argument gives '=', '+' gives '+=', '-' gives '-='):
 *     res0 OP0= a_r * b_r;         res0 OP1= a_i * b_i;
 *     res1 OP2= (OP4 a_r) * b_i;   res1 OP3= a_i * b_r;
 * OP4 is a unary sign (or nothing) written in front of a_r.
 */
#define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)  \
{                                                  \
    /* 0th col */                                  \
    a0_r = pa0[0];                                 \
    a0_i = pa0[1];                                 \
    b0_r = pb0[0];                                 \
    b0_i = pb0[1];                                 \
                                                   \
    res0 OP0## = a0_r * b0_r;                      \
    res0 OP1## = a0_i * b0_i;                      \
    res1 OP2## = (OP4 a0_r) * b0_i;                \
    res1 OP3## = a0_i * b0_r;                      \
}
402
/*
 * CGEMM_SCALE_8X4_MSA
 *
 * Write-back for an 8x4 tile.  For each output pointer pc0..pc3 in
 * turn: four vectors are loaded into dst0..dst3, split into
 * dst0_r/dst0_i and dst1_r/dst1_i, updated with the complex product
 * of alpha and the matching accumulators,
 *     dst_r += alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;   dst_i += alpha_i * res_r;
 * then merged back into dst0..dst3 and stored.  pcN is paired with
 * accumulators res(2N) and res(2N+1).  All identifiers (pc*, dst*,
 * res*, alpha_r, alpha_i) are variables of the expansion site.
 *
 * NOTE(review): LD_SP4, PCKEVOD_W2_SP, ILVRL_W2_SP and ST_SP4_INC are
 * not defined in this file (presumably macros_msa.h).  They appear to
 * load, de-interleave re/im, re-interleave, and store-and-advance -
 * confirm against that header.
 */
#define CGEMM_SCALE_8X4_MSA                      \
{                                                \
    /* column addressed by pc0: res0, res1 */    \
    LD_SP4(pc0, 4, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res0_r;                  \
    dst0_r -= alpha_i * res0_i;                  \
    dst0_i += alpha_r * res0_i;                  \
    dst0_i += alpha_i * res0_r;                  \
                                                 \
    dst1_r += alpha_r * res1_r;                  \
    dst1_r -= alpha_i * res1_i;                  \
    dst1_i += alpha_r * res1_i;                  \
    dst1_i += alpha_i * res1_r;                  \
                                                 \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
                                                 \
    /* column addressed by pc1: res2, res3 */    \
    LD_SP4(pc1, 4, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res2_r;                  \
    dst0_r -= alpha_i * res2_i;                  \
    dst0_i += alpha_r * res2_i;                  \
    dst0_i += alpha_i * res2_r;                  \
                                                 \
    dst1_r += alpha_r * res3_r;                  \
    dst1_r -= alpha_i * res3_i;                  \
    dst1_i += alpha_r * res3_i;                  \
    dst1_i += alpha_i * res3_r;                  \
                                                 \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);  \
                                                 \
    /* column addressed by pc2: res4, res5 */    \
    LD_SP4(pc2, 4, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res4_r;                  \
    dst0_r -= alpha_i * res4_i;                  \
    dst0_i += alpha_r * res4_i;                  \
    dst0_i += alpha_i * res4_r;                  \
                                                 \
    dst1_r += alpha_r * res5_r;                  \
    dst1_r -= alpha_i * res5_i;                  \
    dst1_i += alpha_r * res5_i;                  \
    dst1_i += alpha_i * res5_r;                  \
                                                 \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4);  \
                                                 \
    /* column addressed by pc3: res6, res7 */    \
    LD_SP4(pc3, 4, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res6_r;                  \
    dst0_r -= alpha_i * res6_i;                  \
    dst0_i += alpha_r * res6_i;                  \
    dst0_i += alpha_i * res6_r;                  \
                                                 \
    dst1_r += alpha_r * res7_r;                  \
    dst1_r -= alpha_i * res7_i;                  \
    dst1_i += alpha_r * res7_i;                  \
    dst1_i += alpha_i * res7_r;                  \
                                                 \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4);  \
}
485
/*
 * CGEMM_SCALE_8X2_MSA
 *
 * Write-back for an 8x2 tile.  For each output pointer pc0 and pc1 in
 * turn: four vectors are loaded into dst0..dst3, split into
 * dst0_r/dst0_i and dst1_r/dst1_i, updated with the complex product
 * of alpha and the matching accumulators,
 *     dst_r += alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;   dst_i += alpha_i * res_r;
 * then merged back into dst0..dst3 and stored.  pc0 is paired with
 * res0/res1 and pc1 with res2/res3.  All identifiers (pc*, dst*,
 * res*, alpha_r, alpha_i) are variables of the expansion site.
 *
 * NOTE(review): LD_SP4, PCKEVOD_W2_SP, ILVRL_W2_SP and ST_SP4_INC are
 * not defined in this file (presumably macros_msa.h).  They appear to
 * load, de-interleave re/im, re-interleave, and store-and-advance -
 * confirm against that header.
 */
#define CGEMM_SCALE_8X2_MSA                      \
{                                                \
    /* column addressed by pc0: res0, res1 */    \
    LD_SP4(pc0, 4, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res0_r;                  \
    dst0_r -= alpha_i * res0_i;                  \
    dst0_i += alpha_r * res0_i;                  \
    dst0_i += alpha_i * res0_r;                  \
                                                 \
    dst1_r += alpha_r * res1_r;                  \
    dst1_r -= alpha_i * res1_i;                  \
    dst1_i += alpha_r * res1_i;                  \
    dst1_i += alpha_i * res1_r;                  \
                                                 \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
                                                 \
    /* column addressed by pc1: res2, res3 */    \
    LD_SP4(pc1, 4, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res2_r;                  \
    dst0_r -= alpha_i * res2_i;                  \
    dst0_i += alpha_r * res2_i;                  \
    dst0_i += alpha_i * res2_r;                  \
                                                 \
    dst1_r += alpha_r * res3_r;                  \
    dst1_r -= alpha_i * res3_i;                  \
    dst1_i += alpha_r * res3_i;                  \
    dst1_i += alpha_i * res3_r;                  \
                                                 \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);  \
}
528
/*
 * CGEMM_SCALE_8X1_MSA
 *
 * Write-back for an 8x1 tile.  Four vectors are loaded from pc0 into
 * dst0..dst3, split into dst0_r/dst0_i and dst1_r/dst1_i, updated
 * with the complex product of alpha and accumulators res0/res1,
 *     dst_r += alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;   dst_i += alpha_i * res_r;
 * then merged back into dst0..dst3 and stored through pc0.  All
 * identifiers (pc0, dst*, res*, alpha_r, alpha_i) are variables of
 * the expansion site.
 *
 * NOTE(review): LD_SP4, PCKEVOD_W2_SP, ILVRL_W2_SP and ST_SP4_INC are
 * not defined in this file (presumably macros_msa.h).  They appear to
 * load, de-interleave re/im, re-interleave, and store-and-advance -
 * confirm against that header.
 */
#define CGEMM_SCALE_8X1_MSA                      \
{                                                \
    LD_SP4(pc0, 4, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res0_r;                  \
    dst0_r -= alpha_i * res0_i;                  \
    dst0_i += alpha_r * res0_i;                  \
    dst0_i += alpha_i * res0_r;                  \
                                                 \
    dst1_r += alpha_r * res1_r;                  \
    dst1_r -= alpha_i * res1_i;                  \
    dst1_i += alpha_r * res1_i;                  \
    dst1_i += alpha_i * res1_r;                  \
                                                 \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
}
551
/*
 * CGEMM_SCALE_4X4_MSA
 *
 * Write-back for a 4x4 tile.  For each output pointer pc0..pc3 in
 * turn: two vectors are loaded into dst0/dst1, split into
 * dst0_r/dst0_i, updated with the complex product of alpha and the
 * matching accumulator,
 *     dst_r += alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;   dst_i += alpha_i * res_r;
 * then merged back into dst0/dst1 and stored.  pc0..pc3 are paired
 * with the even-numbered accumulators res0, res2, res4 and res6.  All
 * identifiers (pc*, dst*, res*, alpha_r, alpha_i) are variables of
 * the expansion site.
 *
 * NOTE(review): LD_SP2, PCKEVOD_W2_SP, ILVRL_W2_SP and ST_SP2_INC are
 * not defined in this file (presumably macros_msa.h).  They appear to
 * load, de-interleave re/im, re-interleave, and store-and-advance -
 * confirm against that header.
 */
#define CGEMM_SCALE_4X4_MSA                     \
{                                               \
    /* column addressed by pc0: res0 */         \
    LD_SP2(pc0, 4, dst0, dst1);                 \
                                                \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res0_r;                 \
    dst0_r -= alpha_i * res0_i;                 \
    dst0_i += alpha_r * res0_i;                 \
    dst0_i += alpha_i * res0_r;                 \
                                                \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    ST_SP2_INC(dst0, dst1, pc0, 4);             \
                                                \
    /* column addressed by pc1: res2 */         \
    LD_SP2(pc1, 4, dst0, dst1);                 \
                                                \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res2_r;                 \
    dst0_r -= alpha_i * res2_i;                 \
    dst0_i += alpha_r * res2_i;                 \
    dst0_i += alpha_i * res2_r;                 \
                                                \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    ST_SP2_INC(dst0, dst1, pc1, 4);             \
                                                \
    /* column addressed by pc2: res4 */         \
    LD_SP2(pc2, 4, dst0, dst1);                 \
                                                \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res4_r;                 \
    dst0_r -= alpha_i * res4_i;                 \
    dst0_i += alpha_r * res4_i;                 \
    dst0_i += alpha_i * res4_r;                 \
                                                \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    ST_SP2_INC(dst0, dst1, pc2, 4);             \
                                                \
    /* column addressed by pc3: res6 */         \
    LD_SP2(pc3, 4, dst0, dst1);                 \
                                                \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res6_r;                 \
    dst0_r -= alpha_i * res6_i;                 \
    dst0_i += alpha_r * res6_i;                 \
    dst0_i += alpha_i * res6_r;                 \
                                                \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    ST_SP2_INC(dst0, dst1, pc3, 4);             \
}
606
/*
 * Non-TRMM write-back for two columns: C += alpha * res for the data
 * addressed by pc0 (accumulators res0_r/res0_i) and pc1 (res2_r/res2_i).
 *
 * Each column is read with LD_SP2, passed through PCKEVOD_W2_SP to get
 * dst0_r / dst0_i (presumably the de-interleaved real and imaginary lanes;
 * see macros_msa.h), updated with the complex product alpha * res,
 * recombined by ILVRL_W2_SP and written back through ST_SP2_INC (which,
 * going by its name, also advances the column pointer).
 *
 * Taken from the enclosing scope: pc0, pc1, alpha_r, alpha_i, res0_*,
 * res2_*, and the temporaries dst0, dst1, dst0_r, dst0_i.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_SCALE_4X2_MSA                     \
{                                               \
    /* column 0 <- res0 */                      \
    LD_SP2(pc0, 4, dst0, dst1);                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
    dst0_r = dst0_r + alpha_r * res0_r;         \
    dst0_i = dst0_i + alpha_r * res0_i;         \
    dst0_r = dst0_r - alpha_i * res0_i;         \
    dst0_i = dst0_i + alpha_i * res0_r;         \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
    ST_SP2_INC(dst0, dst1, pc0, 4);             \
                                                \
    /* column 1 <- res2 */                      \
    LD_SP2(pc1, 4, dst0, dst1);                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
    dst0_r = dst0_r + alpha_r * res2_r;         \
    dst0_i = dst0_i + alpha_r * res2_i;         \
    dst0_r = dst0_r - alpha_i * res2_i;         \
    dst0_i = dst0_i + alpha_i * res2_r;         \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
    ST_SP2_INC(dst0, dst1, pc1, 4);             \
}
635
/*
 * Non-TRMM write-back for a single column: C += alpha * res for the data
 * addressed by pc0, using the accumulator pair res0_r / res0_i.
 *
 * The column is read with LD_SP2, passed through PCKEVOD_W2_SP to get
 * dst0_r / dst0_i (presumably the de-interleaved real and imaginary lanes;
 * see macros_msa.h), updated with the complex product alpha * res,
 * recombined by ILVRL_W2_SP and written back through ST_SP2_INC (which,
 * going by its name, also advances pc0).
 *
 * Taken from the enclosing scope: pc0, alpha_r, alpha_i, res0_r, res0_i,
 * and the temporaries dst0, dst1, dst0_r, dst0_i.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_SCALE_4X1_MSA                     \
{                                               \
    LD_SP2(pc0, 4, dst0, dst1);                 \
    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    /* contribution of the real part of alpha */\
    dst0_r = dst0_r + alpha_r * res0_r;         \
    dst0_i = dst0_i + alpha_r * res0_i;         \
    /* cross terms from the imaginary part */   \
    dst0_r = dst0_r - alpha_i * res0_i;         \
    dst0_i = dst0_i + alpha_i * res0_r;         \
                                                \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
    ST_SP2_INC(dst0, dst1, pc0, 4);             \
}
651
/*
 * Scalar non-TRMM write-back for a tile of 2 complex rows by 4 columns:
 * C += alpha * res.
 *
 * Column N is addressed by pcN; pcN[0]/pcN[1] are the real/imaginary parts
 * of the first element and pcN[2]/pcN[3] those of the second.  The sixteen
 * scalars res0..res15 hold the (real, imaginary) accumulator pairs in the
 * same order, four per column.  alphar / alphai are the real and imaginary
 * parts of alpha.
 *
 * The pointers are not advanced here; the caller does that.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_SCALE_2X4                  \
{                                        \
    /* column 0: alphar terms, then alphai cross terms */ \
    pc0[0] = pc0[0] + alphar * res0;     \
    pc0[1] = pc0[1] + alphar * res1;     \
    pc0[2] = pc0[2] + alphar * res2;     \
    pc0[3] = pc0[3] + alphar * res3;     \
    pc0[0] = pc0[0] - alphai * res1;     \
    pc0[1] = pc0[1] + alphai * res0;     \
    pc0[2] = pc0[2] - alphai * res3;     \
    pc0[3] = pc0[3] + alphai * res2;     \
                                         \
    /* column 1 */                       \
    pc1[0] = pc1[0] + alphar * res4;     \
    pc1[1] = pc1[1] + alphar * res5;     \
    pc1[2] = pc1[2] + alphar * res6;     \
    pc1[3] = pc1[3] + alphar * res7;     \
    pc1[0] = pc1[0] - alphai * res5;     \
    pc1[1] = pc1[1] + alphai * res4;     \
    pc1[2] = pc1[2] - alphai * res7;     \
    pc1[3] = pc1[3] + alphai * res6;     \
                                         \
    /* column 2 */                       \
    pc2[0] = pc2[0] + alphar * res8;     \
    pc2[1] = pc2[1] + alphar * res9;     \
    pc2[2] = pc2[2] + alphar * res10;    \
    pc2[3] = pc2[3] + alphar * res11;    \
    pc2[0] = pc2[0] - alphai * res9;     \
    pc2[1] = pc2[1] + alphai * res8;     \
    pc2[2] = pc2[2] - alphai * res11;    \
    pc2[3] = pc2[3] + alphai * res10;    \
                                         \
    /* column 3 */                       \
    pc3[0] = pc3[0] + alphar * res12;    \
    pc3[1] = pc3[1] + alphar * res13;    \
    pc3[2] = pc3[2] + alphar * res14;    \
    pc3[3] = pc3[3] + alphar * res15;    \
    pc3[0] = pc3[0] - alphai * res13;    \
    pc3[1] = pc3[1] + alphai * res12;    \
    pc3[2] = pc3[2] - alphai * res15;    \
    pc3[3] = pc3[3] + alphai * res14;    \
}
694
/*
 * Scalar non-TRMM write-back for a tile of 2 complex rows by 2 columns:
 * C += alpha * res.
 *
 * pc0 / pc1 address the two columns; indices 0/1 are the real/imaginary
 * parts of the first element and 2/3 those of the second.  res0..res3 feed
 * column 0 and res4..res7 feed column 1, as (real, imaginary) pairs.
 * alphar / alphai are the real and imaginary parts of alpha.
 *
 * The pointers are not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_SCALE_2X2                 \
{                                       \
    /* column 0: alphar terms, then alphai cross terms */ \
    pc0[0] = pc0[0] + alphar * res0;    \
    pc0[1] = pc0[1] + alphar * res1;    \
    pc0[2] = pc0[2] + alphar * res2;    \
    pc0[3] = pc0[3] + alphar * res3;    \
    pc0[0] = pc0[0] - alphai * res1;    \
    pc0[1] = pc0[1] + alphai * res0;    \
    pc0[2] = pc0[2] - alphai * res3;    \
    pc0[3] = pc0[3] + alphai * res2;    \
                                        \
    /* column 1 */                      \
    pc1[0] = pc1[0] + alphar * res4;    \
    pc1[1] = pc1[1] + alphar * res5;    \
    pc1[2] = pc1[2] + alphar * res6;    \
    pc1[3] = pc1[3] + alphar * res7;    \
    pc1[0] = pc1[0] - alphai * res5;    \
    pc1[1] = pc1[1] + alphai * res4;    \
    pc1[2] = pc1[2] - alphai * res7;    \
    pc1[3] = pc1[3] + alphai * res6;    \
}
717
/*
 * Scalar non-TRMM write-back for 2 complex rows of a single column:
 * C += alpha * res.
 *
 * pc0[0]/pc0[1] are the real/imaginary parts of the first element and
 * pc0[2]/pc0[3] those of the second; (res0, res1) and (res2, res3) are the
 * matching accumulator pairs.  alphar / alphai are the real and imaginary
 * parts of alpha.
 *
 * pc0 is not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_SCALE_2X1                 \
{                                       \
    /* contribution of the real part of alpha */ \
    pc0[0] = pc0[0] + alphar * res0;    \
    pc0[1] = pc0[1] + alphar * res1;    \
    pc0[2] = pc0[2] + alphar * res2;    \
    pc0[3] = pc0[3] + alphar * res3;    \
                                        \
    /* cross terms from the imaginary part of alpha */ \
    pc0[0] = pc0[0] - alphai * res1;    \
    pc0[1] = pc0[1] + alphai * res0;    \
    pc0[2] = pc0[2] - alphai * res3;    \
    pc0[3] = pc0[3] + alphai * res2;    \
}
730
/*
 * Scalar non-TRMM write-back for 1 complex row across 4 columns:
 * C += alpha * res.
 *
 * pcN[0]/pcN[1] are the real/imaginary parts of the element in column N.
 * The accumulator pairs are (res0, res1), (res2, res3), (res4, res5) and
 * (res6, res7) for columns 0..3.  alphar / alphai are the real and
 * imaginary parts of alpha.
 *
 * The pointers are not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_SCALE_1X4                 \
{                                       \
    /* column 0 */                      \
    pc0[0] = pc0[0] + alphar * res0;    \
    pc0[1] = pc0[1] + alphar * res1;    \
    pc0[0] = pc0[0] - alphai * res1;    \
    pc0[1] = pc0[1] + alphai * res0;    \
                                        \
    /* column 1 */                      \
    pc1[0] = pc1[0] + alphar * res2;    \
    pc1[1] = pc1[1] + alphar * res3;    \
    pc1[0] = pc1[0] - alphai * res3;    \
    pc1[1] = pc1[1] + alphai * res2;    \
                                        \
    /* column 2 */                      \
    pc2[0] = pc2[0] + alphar * res4;    \
    pc2[1] = pc2[1] + alphar * res5;    \
    pc2[0] = pc2[0] - alphai * res5;    \
    pc2[1] = pc2[1] + alphai * res4;    \
                                        \
    /* column 3 */                      \
    pc3[0] = pc3[0] + alphar * res6;    \
    pc3[1] = pc3[1] + alphar * res7;    \
    pc3[0] = pc3[0] - alphai * res7;    \
    pc3[1] = pc3[1] + alphai * res6;    \
}
753
/*
 * Scalar non-TRMM write-back for 1 complex row across 2 columns:
 * C += alpha * res.
 *
 * pc0[0]/pc0[1] are the real/imaginary parts of the element in column 0,
 * pc1[0]/pc1[1] those of the element in column 1.  (res0, res1) and
 * (res2, res3) are the matching accumulator pairs; alphar / alphai are the
 * real and imaginary parts of alpha.
 *
 * Fix: the second column used to be written at pc1[2]/pc1[3], i.e. one
 * complex element past the row being processed.  That left the real
 * target un-updated and modified memory outside the one-row tile.  The
 * indices now match the one-row layout used by CGEMM_SCALE_1X4.
 *
 * The pointers are not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_SCALE_1X2       \
{                             \
    /* column 0 */            \
    pc0[0] += alphar * res0;  \
    pc0[0] -= alphai * res1;  \
    pc0[1] += alphar * res1;  \
    pc0[1] += alphai * res0;  \
                              \
    /* column 1 */            \
    pc1[0] += alphar * res2;  \
    pc1[0] -= alphai * res3;  \
    pc1[1] += alphar * res3;  \
    pc1[1] += alphai * res2;  \
}
766
/*
 * Scalar non-TRMM write-back for a single complex element:
 * C += alpha * res.
 *
 * pc0[0]/pc0[1] are the real/imaginary parts of the element, (res0, res1)
 * the accumulated product, alphar / alphai the real and imaginary parts of
 * alpha.  pc0 is not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_SCALE_1X1                 \
{                                       \
    pc0[0] = pc0[0] + alphar * res0;    \
    pc0[1] = pc0[1] + alphar * res1;    \
    pc0[0] = pc0[0] - alphai * res1;    \
    pc0[1] = pc0[1] + alphai * res0;    \
}
774
/*
 * TRMM write-back used by the `m >> 3` loop of the four-column loop:
 * C = alpha * res.  The previous contents of C are never read.
 *
 * Column N (pointer pcN) consumes two accumulator pairs:
 *   pc0 <- res0, res1    pc1 <- res2, res3
 *   pc2 <- res4, res5    pc3 <- res6, res7
 * For each pair the complex product
 *   (alpha_r + i*alpha_i) * (resK_r + i*resK_i)
 * is formed in dstX_r / dstX_i, the halves are recombined with ILVRL_W2_SP
 * (presumably re-interleaving real and imaginary lanes; see macros_msa.h)
 * and the four resulting vectors are written through ST_SP4_INC (which,
 * going by its name, also advances pcN).
 *
 * Taken from the enclosing scope: pc0..pc3, alpha_r, alpha_i, res0..res7
 * (_r/_i), and the temporaries dst0..dst3, dst0_r, dst0_i, dst1_r, dst1_i.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_8X4_MSA                 \
{                                                \
    /* column 0 <- res0, res1 */                 \
    dst0_r = alpha_r * res0_r;                   \
    dst0_i = alpha_r * res0_i;                   \
    dst1_r = alpha_r * res1_r;                   \
    dst1_i = alpha_r * res1_i;                   \
    dst0_r = dst0_r - alpha_i * res0_i;          \
    dst0_i = dst0_i + alpha_i * res0_r;          \
    dst1_r = dst1_r - alpha_i * res1_i;          \
    dst1_i = dst1_i + alpha_i * res1_r;          \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
                                                 \
    /* column 1 <- res2, res3 */                 \
    dst0_r = alpha_r * res2_r;                   \
    dst0_i = alpha_r * res2_i;                   \
    dst1_r = alpha_r * res3_r;                   \
    dst1_i = alpha_r * res3_i;                   \
    dst0_r = dst0_r - alpha_i * res2_i;          \
    dst0_i = dst0_i + alpha_i * res2_r;          \
    dst1_r = dst1_r - alpha_i * res3_i;          \
    dst1_i = dst1_i + alpha_i * res3_r;          \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);  \
                                                 \
    /* column 2 <- res4, res5 */                 \
    dst0_r = alpha_r * res4_r;                   \
    dst0_i = alpha_r * res4_i;                   \
    dst1_r = alpha_r * res5_r;                   \
    dst1_i = alpha_r * res5_i;                   \
    dst0_r = dst0_r - alpha_i * res4_i;          \
    dst0_i = dst0_i + alpha_i * res4_r;          \
    dst1_r = dst1_r - alpha_i * res5_i;          \
    dst1_i = dst1_i + alpha_i * res5_r;          \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4);  \
                                                 \
    /* column 3 <- res6, res7 */                 \
    dst0_r = alpha_r * res6_r;                   \
    dst0_i = alpha_r * res6_i;                   \
    dst1_r = alpha_r * res7_r;                   \
    dst1_i = alpha_r * res7_i;                   \
    dst0_r = dst0_r - alpha_i * res6_i;          \
    dst0_i = dst0_i + alpha_i * res6_r;          \
    dst1_r = dst1_r - alpha_i * res7_i;          \
    dst1_i = dst1_i + alpha_i * res7_r;          \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4);  \
}
837
/*
 * TRMM write-back for two columns: C = alpha * res.  The previous contents
 * of C are never read.
 *
 * pc0 consumes the accumulator pairs res0, res1 and pc1 consumes res2,
 * res3.  For each pair the complex product
 *   (alpha_r + i*alpha_i) * (resK_r + i*resK_i)
 * is formed in dstX_r / dstX_i, the halves are recombined with ILVRL_W2_SP
 * (presumably re-interleaving real and imaginary lanes; see macros_msa.h)
 * and the four resulting vectors are written through ST_SP4_INC (which,
 * going by its name, also advances the column pointer).
 *
 * Taken from the enclosing scope: pc0, pc1, alpha_r, alpha_i, res0..res3
 * (_r/_i), and the temporaries dst0..dst3, dst0_r, dst0_i, dst1_r, dst1_i.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_8X2_MSA                 \
{                                                \
    /* column 0 <- res0, res1 */                 \
    dst0_r = alpha_r * res0_r;                   \
    dst0_i = alpha_r * res0_i;                   \
    dst1_r = alpha_r * res1_r;                   \
    dst1_i = alpha_r * res1_i;                   \
    dst0_r = dst0_r - alpha_i * res0_i;          \
    dst0_i = dst0_i + alpha_i * res0_r;          \
    dst1_r = dst1_r - alpha_i * res1_i;          \
    dst1_i = dst1_i + alpha_i * res1_r;          \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
                                                 \
    /* column 1 <- res2, res3 */                 \
    dst0_r = alpha_r * res2_r;                   \
    dst0_i = alpha_r * res2_i;                   \
    dst1_r = alpha_r * res3_r;                   \
    dst1_i = alpha_r * res3_i;                   \
    dst0_r = dst0_r - alpha_i * res2_i;          \
    dst0_i = dst0_i + alpha_i * res2_r;          \
    dst1_r = dst1_r - alpha_i * res3_i;          \
    dst1_i = dst1_i + alpha_i * res3_r;          \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);  \
}
870
/*
 * TRMM write-back for a single column: C = alpha * res.  The previous
 * contents of C are never read.
 *
 * The accumulator pairs res0 and res1 are each multiplied by
 * (alpha_r + i*alpha_i); the real/imaginary halves are recombined with
 * ILVRL_W2_SP (presumably re-interleaving the lanes; see macros_msa.h) and
 * the four resulting vectors are written through ST_SP4_INC (which, going
 * by its name, also advances pc0).
 *
 * Taken from the enclosing scope: pc0, alpha_r, alpha_i, res0_*, res1_*,
 * and the temporaries dst0..dst3, dst0_r, dst0_i, dst1_r, dst1_i.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_8X1_MSA                 \
{                                                \
    /* products with the real part of alpha */   \
    dst0_r = alpha_r * res0_r;                   \
    dst0_i = alpha_r * res0_i;                   \
    dst1_r = alpha_r * res1_r;                   \
    dst1_i = alpha_r * res1_i;                   \
                                                 \
    /* cross terms from the imaginary part */    \
    dst0_r = dst0_r - alpha_i * res0_i;          \
    dst0_i = dst0_i + alpha_i * res0_r;          \
    dst1_r = dst1_r - alpha_i * res1_i;          \
    dst1_i = dst1_i + alpha_i * res1_r;          \
                                                 \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
}
888
/*
 * TRMM write-back used by the `m & 4` branch of the four-column loop:
 * C = alpha * res.  The previous contents of C are never read.
 *
 * Column N (pointer pcN) consumes one accumulator pair:
 *   pc0 <- res0    pc1 <- res2    pc2 <- res4    pc3 <- res6
 * The complex product (alpha_r + i*alpha_i) * (resK_r + i*resK_i) is
 * formed in dst0_r / dst0_i, recombined with ILVRL_W2_SP (presumably
 * re-interleaving real and imaginary lanes; see macros_msa.h) and written
 * through ST_SP2_INC (which, going by its name, also advances pcN).
 *
 * Taken from the enclosing scope: pc0..pc3, alpha_r, alpha_i, res0_*,
 * res2_*, res4_*, res6_*, and the temporaries dst0, dst1, dst0_r, dst0_i.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_4X4_MSA              \
{                                             \
    /* column 0 <- res0 */                    \
    dst0_r = alpha_r * res0_r;                \
    dst0_i = alpha_r * res0_i;                \
    dst0_r = dst0_r - alpha_i * res0_i;       \
    dst0_i = dst0_i + alpha_i * res0_r;       \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
    ST_SP2_INC(dst0, dst1, pc0, 4);           \
                                              \
    /* column 1 <- res2 */                    \
    dst0_r = alpha_r * res2_r;                \
    dst0_i = alpha_r * res2_i;                \
    dst0_r = dst0_r - alpha_i * res2_i;       \
    dst0_i = dst0_i + alpha_i * res2_r;       \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
    ST_SP2_INC(dst0, dst1, pc1, 4);           \
                                              \
    /* column 2 <- res4 */                    \
    dst0_r = alpha_r * res4_r;                \
    dst0_i = alpha_r * res4_i;                \
    dst0_r = dst0_r - alpha_i * res4_i;       \
    dst0_i = dst0_i + alpha_i * res4_r;       \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
    ST_SP2_INC(dst0, dst1, pc2, 4);           \
                                              \
    /* column 3 <- res6 */                    \
    dst0_r = alpha_r * res6_r;                \
    dst0_i = alpha_r * res6_i;                \
    dst0_r = dst0_r - alpha_i * res6_i;       \
    dst0_i = dst0_i + alpha_i * res6_r;       \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
    ST_SP2_INC(dst0, dst1, pc3, 4);           \
}
927
/*
 * TRMM write-back for two columns: C = alpha * res.  The previous contents
 * of C are never read.
 *
 * pc0 consumes the accumulator pair res0 and pc1 consumes res2.  The
 * complex product (alpha_r + i*alpha_i) * (resK_r + i*resK_i) is formed in
 * dst0_r / dst0_i, recombined with ILVRL_W2_SP (presumably re-interleaving
 * real and imaginary lanes; see macros_msa.h) and written through
 * ST_SP2_INC (which, going by its name, also advances the column pointer).
 *
 * Taken from the enclosing scope: pc0, pc1, alpha_r, alpha_i, res0_*,
 * res2_*, and the temporaries dst0, dst1, dst0_r, dst0_i.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_4X2_MSA              \
{                                             \
    /* column 0 <- res0 */                    \
    dst0_r = alpha_r * res0_r;                \
    dst0_i = alpha_r * res0_i;                \
    dst0_r = dst0_r - alpha_i * res0_i;       \
    dst0_i = dst0_i + alpha_i * res0_r;       \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
    ST_SP2_INC(dst0, dst1, pc0, 4);           \
                                              \
    /* column 1 <- res2 */                    \
    dst0_r = alpha_r * res2_r;                \
    dst0_i = alpha_r * res2_i;                \
    dst0_r = dst0_r - alpha_i * res2_i;       \
    dst0_i = dst0_i + alpha_i * res2_r;       \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
    ST_SP2_INC(dst0, dst1, pc1, 4);           \
}
948
/*
 * TRMM write-back for a single column: C = alpha * res.  The previous
 * contents of C are never read.
 *
 * The accumulator pair res0_r / res0_i is multiplied by
 * (alpha_r + i*alpha_i); the halves are recombined with ILVRL_W2_SP
 * (presumably re-interleaving real and imaginary lanes; see macros_msa.h)
 * and written through ST_SP2_INC (which, going by its name, also advances
 * pc0).
 *
 * Taken from the enclosing scope: pc0, alpha_r, alpha_i, res0_r, res0_i,
 * and the temporaries dst0, dst1, dst0_r, dst0_i.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_4X1_MSA              \
{                                             \
    /* products with the real part of alpha */\
    dst0_r = alpha_r * res0_r;                \
    dst0_i = alpha_r * res0_i;                \
    /* cross terms from the imaginary part */ \
    dst0_r = dst0_r - alpha_i * res0_i;       \
    dst0_i = dst0_i + alpha_i * res0_r;       \
                                              \
    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
    ST_SP2_INC(dst0, dst1, pc0, 4);           \
}
960
/*
 * Scalar TRMM write-back for a tile of 2 complex rows by 4 columns:
 * C = alpha * res.  The previous contents of C are overwritten.
 *
 * Column N is addressed by pcN; pcN[0]/pcN[1] are the real/imaginary parts
 * of the first element and pcN[2]/pcN[3] those of the second.  The sixteen
 * scalars res0..res15 hold the (real, imaginary) accumulator pairs in the
 * same order, four per column.  alphar / alphai are the real and imaginary
 * parts of alpha.
 *
 * The pointers are not advanced here; the caller does that.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_2X4             \
{                                        \
    /* column 0: alphar products, then alphai cross terms */ \
    pc0[0] = alphar * res0;              \
    pc0[1] = alphar * res1;              \
    pc0[2] = alphar * res2;              \
    pc0[3] = alphar * res3;              \
    pc0[0] = pc0[0] - alphai * res1;     \
    pc0[1] = pc0[1] + alphai * res0;     \
    pc0[2] = pc0[2] - alphai * res3;     \
    pc0[3] = pc0[3] + alphai * res2;     \
                                         \
    /* column 1 */                       \
    pc1[0] = alphar * res4;              \
    pc1[1] = alphar * res5;              \
    pc1[2] = alphar * res6;              \
    pc1[3] = alphar * res7;              \
    pc1[0] = pc1[0] - alphai * res5;     \
    pc1[1] = pc1[1] + alphai * res4;     \
    pc1[2] = pc1[2] - alphai * res7;     \
    pc1[3] = pc1[3] + alphai * res6;     \
                                         \
    /* column 2 */                       \
    pc2[0] = alphar * res8;              \
    pc2[1] = alphar * res9;              \
    pc2[2] = alphar * res10;             \
    pc2[3] = alphar * res11;             \
    pc2[0] = pc2[0] - alphai * res9;     \
    pc2[1] = pc2[1] + alphai * res8;     \
    pc2[2] = pc2[2] - alphai * res11;    \
    pc2[3] = pc2[3] + alphai * res10;    \
                                         \
    /* column 3 */                       \
    pc3[0] = alphar * res12;             \
    pc3[1] = alphar * res13;             \
    pc3[2] = alphar * res14;             \
    pc3[3] = alphar * res15;             \
    pc3[0] = pc3[0] - alphai * res13;    \
    pc3[1] = pc3[1] + alphai * res12;    \
    pc3[2] = pc3[2] - alphai * res15;    \
    pc3[3] = pc3[3] + alphai * res14;    \
}
1003
/*
 * Scalar TRMM write-back for a tile of 2 complex rows by 2 columns:
 * C = alpha * res.  The previous contents of C are overwritten.
 *
 * pc0 / pc1 address the two columns; indices 0/1 are the real/imaginary
 * parts of the first element and 2/3 those of the second.  res0..res3 feed
 * column 0 and res4..res7 feed column 1, as (real, imaginary) pairs.
 * alphar / alphai are the real and imaginary parts of alpha.
 *
 * The pointers are not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_2X2            \
{                                       \
    /* column 0: alphar products, then alphai cross terms */ \
    pc0[0] = alphar * res0;             \
    pc0[1] = alphar * res1;             \
    pc0[2] = alphar * res2;             \
    pc0[3] = alphar * res3;             \
    pc0[0] = pc0[0] - alphai * res1;    \
    pc0[1] = pc0[1] + alphai * res0;    \
    pc0[2] = pc0[2] - alphai * res3;    \
    pc0[3] = pc0[3] + alphai * res2;    \
                                        \
    /* column 1 */                      \
    pc1[0] = alphar * res4;             \
    pc1[1] = alphar * res5;             \
    pc1[2] = alphar * res6;             \
    pc1[3] = alphar * res7;             \
    pc1[0] = pc1[0] - alphai * res5;    \
    pc1[1] = pc1[1] + alphai * res4;    \
    pc1[2] = pc1[2] - alphai * res7;    \
    pc1[3] = pc1[3] + alphai * res6;    \
}
1026
/*
 * Scalar TRMM write-back for 2 complex rows of a single column:
 * C = alpha * res.  The previous contents of C are overwritten.
 *
 * pc0[0]/pc0[1] are the real/imaginary parts of the first element and
 * pc0[2]/pc0[3] those of the second; (res0, res1) and (res2, res3) are the
 * matching accumulator pairs.  alphar / alphai are the real and imaginary
 * parts of alpha.
 *
 * pc0 is not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_2X1            \
{                                       \
    /* products with the real part of alpha */ \
    pc0[0] = alphar * res0;             \
    pc0[1] = alphar * res1;             \
    pc0[2] = alphar * res2;             \
    pc0[3] = alphar * res3;             \
                                        \
    /* cross terms from the imaginary part of alpha */ \
    pc0[0] = pc0[0] - alphai * res1;    \
    pc0[1] = pc0[1] + alphai * res0;    \
    pc0[2] = pc0[2] - alphai * res3;    \
    pc0[3] = pc0[3] + alphai * res2;    \
}
1039
/*
 * Scalar TRMM write-back for 1 complex row across 4 columns:
 * C = alpha * res.  The previous contents of C are overwritten.
 *
 * pcN[0]/pcN[1] are the real/imaginary parts of the element in column N.
 * The accumulator pairs are (res0, res1), (res2, res3), (res4, res5) and
 * (res6, res7) for columns 0..3.  alphar / alphai are the real and
 * imaginary parts of alpha.
 *
 * The pointers are not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_1X4            \
{                                       \
    /* column 0 */                      \
    pc0[0] = alphar * res0;             \
    pc0[1] = alphar * res1;             \
    pc0[0] = pc0[0] - alphai * res1;    \
    pc0[1] = pc0[1] + alphai * res0;    \
                                        \
    /* column 1 */                      \
    pc1[0] = alphar * res2;             \
    pc1[1] = alphar * res3;             \
    pc1[0] = pc1[0] - alphai * res3;    \
    pc1[1] = pc1[1] + alphai * res2;    \
                                        \
    /* column 2 */                      \
    pc2[0] = alphar * res4;             \
    pc2[1] = alphar * res5;             \
    pc2[0] = pc2[0] - alphai * res5;    \
    pc2[1] = pc2[1] + alphai * res4;    \
                                        \
    /* column 3 */                      \
    pc3[0] = alphar * res6;             \
    pc3[1] = alphar * res7;             \
    pc3[0] = pc3[0] - alphai * res7;    \
    pc3[1] = pc3[1] + alphai * res6;    \
}
1062
/*
 * Scalar TRMM write-back for 1 complex row across 2 columns:
 * C = alpha * res.  The previous contents of C are overwritten.
 *
 * pc0[0]/pc0[1] are the real/imaginary parts of the element in column 0,
 * pc1[0]/pc1[1] those of the element in column 1.  (res0, res1) and
 * (res2, res3) are the matching accumulator pairs; alphar / alphai are the
 * real and imaginary parts of alpha.
 *
 * Fix: the second column used to be stored at pc1[2]/pc1[3], i.e. one
 * complex element past the row being processed.  That left the real
 * target unwritten and clobbered memory outside the one-row tile.  The
 * indices now match the one-row layout used by CGEMM_TRMM_SCALE_1X4.
 *
 * The pointers are not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_1X2  \
{                             \
    /* column 0 */            \
    pc0[0] = alphar * res0;   \
    pc0[0] -= alphai * res1;  \
    pc0[1] = alphar * res1;   \
    pc0[1] += alphai * res0;  \
                              \
    /* column 1 */            \
    pc1[0] = alphar * res2;   \
    pc1[0] -= alphai * res3;  \
    pc1[1] = alphar * res3;   \
    pc1[1] += alphai * res2;  \
}
1075
/*
 * Scalar TRMM write-back for a single complex element:
 * C = alpha * res.  The previous contents of C are overwritten.
 *
 * pc0[0]/pc0[1] receive the real/imaginary parts of the result, (res0,
 * res1) is the accumulated product, alphar / alphai are the real and
 * imaginary parts of alpha.  pc0 is not advanced here.
 * Expands to a braced block; no trailing semicolon is needed.
 */
#define CGEMM_TRMM_SCALE_1X1            \
{                                       \
    pc0[0] = alphar * res0;             \
    pc0[1] = alphar * res1;             \
    pc0[0] = pc0[0] - alphai * res1;    \
    pc0[1] = pc0[1] + alphai * res0;    \
}
1083
1084 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
1085           FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
1086 #ifdef TRMMKERNEL
1087          , BLASLONG offset
1088 #endif
1089           )
1090 {
1091     BLASLONG i, j, l, temp;
1092 #if defined(TRMMKERNEL)
1093     BLASLONG off;
1094 #endif
1095     FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
1096     FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
1097     FLOAT res8, res9, res10, res11, res12, res13, res14, res15;
1098     FLOAT a0_r, a1_r, a0_i, a1_i, b0_i, b1_i, b2_i, b3_i;
1099     FLOAT b0_r, b1_r, b2_r, b3_r;
1100     v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1;
1101     v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
1102     v4f32 dst0, dst1, dst2, dst3, alpha_r, alpha_i;
1103     v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
1104     v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
1105     v4f32 dst0_r, dst0_i, dst1_r, dst1_i;
1106
1107     alpha_r = COPY_FLOAT_TO_VECTOR(alphar);
1108     alpha_i = COPY_FLOAT_TO_VECTOR(alphai);
1109
1110 #if defined(TRMMKERNEL) && !defined(LEFT)
1111     off = -offset;
1112 #endif
1113
1114     for (j = (n >> 2); j--;)
1115     {
1116         pc0 = C;
1117         pc1 = pc0 + 2 * ldc;
1118         pc2 = pc1 + 2 * ldc;
1119         pc3 = pc2 + 2 * ldc;
1120
1121 #if defined(TRMMKERNEL) && defined(LEFT)
1122         off = offset;
1123 #endif
1124
1125         pa0 = A;
1126
1127         for (i = (m >> 3); i--;)
1128         {
1129 #if defined(TRMMKERNEL)
1130 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1131             pb0 = B;
1132 #else
1133             pa0 += off * 2 * 8;
1134             pb0 = B + off * 2 * 4;
1135 #endif
1136
1137 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1138             temp = k - off;
1139 #elif defined(LEFT)
1140             temp = off + 8; // number of values in A
1141 #else
1142             temp = off + 4; // number of values in B
1143 #endif
1144 #else
1145             pb0 = B;
1146             temp = k;
1147 #endif
1148
1149 #ifdef ENABLE_PREFETCH
1150             __asm__ __volatile__(
1151                 "pref   0,   64(%[pa0])   \n\t"
1152                 "pref   0,   96(%[pa0])   \n\t"
1153                 "pref   0,   32(%[pb0])   \n\t"
1154
1155                 :
1156                 : [pa0] "r" (pa0), [pb0] "r" (pb0)
1157             );
1158 #endif
1159
1160 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1161             CGEMM_KERNEL_8X4_MSA(, -, , +, +);
1162 #endif
1163 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1164             CGEMM_KERNEL_8X4_MSA(, +, , +, -);
1165 #endif
1166 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1167             CGEMM_KERNEL_8X4_MSA(, +, , -, +);
1168 #endif
1169 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1170             CGEMM_KERNEL_8X4_MSA(, -, , -, -);
1171 #endif
1172
1173             for (l = (temp - 1); l--;)
1174             {
1175 #ifdef ENABLE_PREFETCH
1176                 __asm__ __volatile__(
1177                     "pref   0,   64(%[pa0])   \n\t"
1178                     "pref   0,   96(%[pa0])   \n\t"
1179                     "pref   0,   32(%[pb0])   \n\t"
1180
1181                     :
1182                     : [pa0] "r" (pa0), [pb0] "r" (pb0)
1183                 );
1184 #endif
1185
1186 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1187                 CGEMM_KERNEL_8X4_MSA(+, -, +, +,);
1188 #endif
1189 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1190                 CGEMM_KERNEL_8X4_MSA(+, +, -, +,);
1191 #endif
1192 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1193                 CGEMM_KERNEL_8X4_MSA(+, +, +, -,);
1194 #endif
1195 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1196                 CGEMM_KERNEL_8X4_MSA(+, -, -, -,);
1197 #endif
1198             }
1199
1200 #if defined(TRMMKERNEL)
1201             CGEMM_TRMM_SCALE_8X4_MSA
1202 #else
1203             CGEMM_SCALE_8X4_MSA
1204 #endif
1205
1206 #if defined(TRMMKERNEL)
1207 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1208             temp = k - off;
1209 #ifdef LEFT
1210             temp -= 8; // number of values in A
1211 #else
1212             temp -= 4; // number of values in B
1213 #endif
1214             pa0 += temp * 2 * 8;
1215             pb0 += temp * 2 * 4;
1216 #endif
1217
1218 #ifdef LEFT
1219             off += 8; // number of values in A
1220 #endif
1221 #endif
1222         }
1223
1224         if (m & 4)
1225         {
1226 #if defined(TRMMKERNEL)
1227 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1228             pb0 = B;
1229 #else
1230             pa0 += off * 2 * 4;
1231             pb0 = B + off * 2 * 4;
1232 #endif
1233
1234 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1235             temp = k - off;
1236 #elif defined(LEFT)
1237             temp = off + 4; // number of values in A
1238 #else
1239             temp = off + 4; // number of values in B
1240 #endif
1241 #else
1242             pb0 = B;
1243             temp = k;
1244 #endif
1245
1246 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1247             CGEMM_KERNEL_4X4_MSA(, -, , +, +);
1248 #endif
1249 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1250             CGEMM_KERNEL_4X4_MSA(, +, , +, -);
1251 #endif
1252 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1253             CGEMM_KERNEL_4X4_MSA(, +, , -, +);
1254 #endif
1255 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1256             CGEMM_KERNEL_4X4_MSA(, -, , -, -);
1257 #endif
1258
1259             for (l = (temp - 1); l--;)
1260             {
1261 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1262                 CGEMM_KERNEL_4X4_MSA(+, -, +, +,);
1263 #endif
1264 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1265                 CGEMM_KERNEL_4X4_MSA(+, +, -, +,);
1266 #endif
1267 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1268                 CGEMM_KERNEL_4X4_MSA(+, +, +, -,);
1269 #endif
1270 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1271                 CGEMM_KERNEL_4X4_MSA(+, -, -, -,);
1272 #endif
1273             }
1274
1275 #if defined(TRMMKERNEL)
1276             CGEMM_TRMM_SCALE_4X4_MSA
1277 #else
1278             CGEMM_SCALE_4X4_MSA
1279 #endif
1280
1281 #if defined(TRMMKERNEL)
1282 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1283             temp = k - off;
1284 #ifdef LEFT
1285             temp -= 4; // number of values in A
1286 #else
1287             temp -= 4; // number of values in B
1288 #endif
1289             pa0 += temp * 2 * 4;
1290             pb0 += temp * 2 * 4;
1291 #endif
1292
1293 #ifdef LEFT
1294             off += 4; // number of values in A
1295 #endif
1296 #endif
1297         }
1298
1299         if (m & 2)
1300         {
1301 #if defined(TRMMKERNEL)
1302 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1303             pb0 = B;
1304 #else
1305             pa0 += off * 2 * 2;
1306             pb0 = B + off * 2 * 4;
1307 #endif
1308
1309 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1310             temp = k - off;
1311 #elif defined(LEFT)
1312             temp = off + 2; // number of values in A
1313 #else
1314             temp = off + 4; // number of values in B
1315 #endif
1316 #else
1317             pb0 = B;
1318             temp = k;
1319 #endif
1320
1321 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1322             CGEMM_KERNEL_2X4(, -, , +, +);
1323 #endif
1324 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1325             CGEMM_KERNEL_2X4(, +, , +, -);
1326 #endif
1327 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1328             CGEMM_KERNEL_2X4(, +, , -, +);
1329 #endif
1330 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1331             CGEMM_KERNEL_2X4(, -, , -, -);
1332 #endif
1333
1334             pa0 += 4;
1335             pb0 += 8;
1336
1337             for (l = (temp - 1); l--;)
1338             {
1339 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1340                 CGEMM_KERNEL_2X4(+, -, +, +,);
1341 #endif
1342 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1343                 CGEMM_KERNEL_2X4(+, +, -, +,);
1344 #endif
1345 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1346                 CGEMM_KERNEL_2X4(+, +, +, -,);
1347 #endif
1348 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1349                 CGEMM_KERNEL_2X4(+, -, -, -,);
1350 #endif
1351
1352                 pa0 += 4;
1353                 pb0 += 8;
1354             }
1355
1356 #if defined(TRMMKERNEL)
1357             CGEMM_TRMM_SCALE_2X4
1358 #else
1359             CGEMM_SCALE_2X4
1360 #endif
1361             pc0 += 4;
1362             pc1 += 4;
1363             pc2 += 4;
1364             pc3 += 4;
1365
1366 #if defined(TRMMKERNEL)
1367 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1368             temp = k - off;
1369 #ifdef LEFT
1370             temp -= 2; // number of values in A
1371 #else
1372             temp -= 4; // number of values in B
1373 #endif
1374             pa0 += temp * 2 * 2;
1375             pb0 += temp * 2 * 4;
1376 #endif
1377
1378 #ifdef LEFT
1379             off += 2; // number of values in A
1380 #endif
1381 #endif
1382         }
1383
1384         if (m & 1)
1385         {
1386 #if defined(TRMMKERNEL)
1387 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1388             pb0 = B;
1389 #else
1390             pa0 += off * 2 * 1;
1391             pb0 = B + off * 2 * 4;
1392 #endif
1393
1394 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1395             temp = k - off;
1396 #elif defined(LEFT)
1397             temp = off + 1; // number of values in A
1398 #else
1399             temp = off + 4; // number of values in B
1400 #endif
1401 #else
1402             pb0 = B;
1403             temp = k;
1404 #endif
1405
1406 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1407             CGEMM_KERNEL_1X4(, -, , +, +);
1408 #endif
1409 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1410             CGEMM_KERNEL_1X4(, +, , +, -);
1411 #endif
1412 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1413             CGEMM_KERNEL_1X4(, +, , -, +);
1414 #endif
1415 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1416             CGEMM_KERNEL_1X4(, -, , -, -);
1417 #endif
1418
1419             pa0 += 2;
1420             pb0 += 8;
1421
1422             for (l = (temp - 1); l--;)
1423             {
1424 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1425                 CGEMM_KERNEL_1X4(+, -, +, +,);
1426 #endif
1427 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1428                 CGEMM_KERNEL_1X4(+, +, -, +,);
1429 #endif
1430 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1431                 CGEMM_KERNEL_1X4(+, +, +, -,);
1432 #endif
1433 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1434                 CGEMM_KERNEL_1X4(+, -, -, -,);
1435 #endif
1436
1437                 pa0 += 2;
1438                 pb0 += 8;
1439             }
1440
1441 #if defined(TRMMKERNEL)
1442             CGEMM_TRMM_SCALE_1X4
1443 #else
1444             CGEMM_SCALE_1X4
1445 #endif
1446             pc0 += 2;
1447             pc1 += 2;
1448             pc2 += 2;
1449             pc3 += 2;
1450
1451 #if defined(TRMMKERNEL)
1452 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1453             temp = k - off;
1454 #ifdef LEFT
1455             temp -= 1; // number of values in A
1456 #else
1457             temp -= 4; // number of values in B
1458 #endif
1459             pa0 += temp * 2 * 1;
1460             pb0 += temp * 2 * 4;
1461 #endif
1462
1463 #ifdef LEFT
1464             off += 1; // number of values in A
1465 #endif
1466 #endif
1467         }
1468
1469 #if defined(TRMMKERNEL) && !defined(LEFT)
1470         off += 4; // number of values in B
1471 #endif
1472
1473         B += (k << 3);
1474         C += (ldc << 3);
1475     }
1476
1477     if (n & 2)
1478     {
1479         pc0 = C;
1480         pc1 = pc0 + 2 * ldc;
1481
1482 #if defined(TRMMKERNEL) && defined(LEFT)
1483         off = offset;
1484 #endif
1485
1486         pa0 = A;
1487
1488         for (i = (m >> 3); i--;)
1489         {
1490 #if defined(TRMMKERNEL)
1491 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1492             pb0 = B;
1493 #else
1494             pa0 += off * 2 * 8;
1495             pb0 = B + off * 2 * 2;
1496 #endif
1497
1498 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1499             temp = k - off;
1500 #elif defined(LEFT)
1501             temp = off + 8; // number of values in A
1502 #else
1503             temp = off + 2; // number of values in B
1504 #endif
1505 #else
1506             pb0 = B;
1507             temp = k;
1508 #endif
1509
1510 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1511             CGEMM_KERNEL_8X2_MSA(, -, , +, +);
1512 #endif
1513 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1514             CGEMM_KERNEL_8X2_MSA(, +, , +, -);
1515 #endif
1516 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1517             CGEMM_KERNEL_8X2_MSA(, +, , -, +);
1518 #endif
1519 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1520             CGEMM_KERNEL_8X2_MSA(, -, , -, -);
1521 #endif
1522
1523             pb0 += 4;
1524
1525             for (l = (temp - 1); l--;)
1526             {
1527 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1528                 CGEMM_KERNEL_8X2_MSA(+, -, +, +,);
1529 #endif
1530 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1531                 CGEMM_KERNEL_8X2_MSA(+, +, -, +,);
1532 #endif
1533 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1534                 CGEMM_KERNEL_8X2_MSA(+, +, +, -,);
1535 #endif
1536 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1537                 CGEMM_KERNEL_8X2_MSA(+, -, -, -,);
1538 #endif
1539
1540                 pb0 += 4;
1541             }
1542
1543 #if defined(TRMMKERNEL)
1544             CGEMM_TRMM_SCALE_8X2_MSA
1545 #else
1546             CGEMM_SCALE_8X2_MSA
1547 #endif
1548
1549 #if defined(TRMMKERNEL)
1550 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1551             temp = k - off;
1552 #ifdef LEFT
1553             temp -= 8; // number of values in A
1554 #else
1555             temp -= 2; // number of values in B
1556 #endif
1557             pa0 += temp * 2 * 8;
1558             pb0 += temp * 2 * 2;
1559 #endif
1560
1561 #ifdef LEFT
1562             off += 8; // number of values in A
1563 #endif
1564 #endif
1565         }
1566
1567         if (m & 4)
1568         {
1569 #if defined(TRMMKERNEL)
1570 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1571             pb0 = B;
1572 #else
1573             pa0 += off * 2 * 4;
1574             pb0 = B + off * 2 * 2;
1575 #endif
1576
1577 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1578             temp = k - off;
1579 #elif defined(LEFT)
1580             temp = off + 4; // number of values in A
1581 #else
1582             temp = off + 2; // number of values in B
1583 #endif
1584 #else
1585             pb0 = B;
1586             temp = k;
1587 #endif
1588
1589 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1590             CGEMM_KERNEL_4X2_MSA(, -, , +, +);
1591 #endif
1592 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1593             CGEMM_KERNEL_4X2_MSA(, +, , +, -);
1594 #endif
1595 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1596             CGEMM_KERNEL_4X2_MSA(, +, , -, +);
1597 #endif
1598 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1599             CGEMM_KERNEL_4X2_MSA(, -, , -, -);
1600 #endif
1601
1602             pb0 += 4;
1603
1604             for (l = (temp - 1); l--;)
1605             {
1606 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1607                 CGEMM_KERNEL_4X2_MSA(+, -, +, +,);
1608 #endif
1609 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1610                 CGEMM_KERNEL_4X2_MSA(+, +, -, +,);
1611 #endif
1612 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1613                 CGEMM_KERNEL_4X2_MSA(+, +, +, -,);
1614 #endif
1615 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1616                 CGEMM_KERNEL_4X2_MSA(+, -, -, -,);
1617 #endif
1618
1619                 pb0 += 4;
1620             }
1621
1622 #if defined(TRMMKERNEL)
1623             CGEMM_TRMM_SCALE_4X2_MSA
1624 #else
1625             CGEMM_SCALE_4X2_MSA
1626 #endif
1627
1628 #if defined(TRMMKERNEL)
1629 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1630             temp = k - off;
1631 #ifdef LEFT
1632             temp -= 4; // number of values in A
1633 #else
1634             temp -= 2; // number of values in B
1635 #endif
1636             pa0 += temp * 2 * 4;
1637             pb0 += temp * 2 * 2;
1638 #endif
1639
1640 #ifdef LEFT
1641             off += 4; // number of values in A
1642 #endif
1643 #endif
1644         }
1645
1646         if (m & 2)
1647         {
1648 #if defined(TRMMKERNEL)
1649 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1650             pb0 = B;
1651 #else
1652             pa0 += off * 2 * 2;
1653             pb0 = B + off * 2 * 2;
1654 #endif
1655
1656 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1657             temp = k - off;
1658 #elif defined(LEFT)
1659             temp = off + 2; // number of values in A
1660 #else
1661             temp = off + 2; // number of values in B
1662 #endif
1663 #else
1664             pb0 = B;
1665             temp = k;
1666 #endif
1667
1668 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1669             CGEMM_KERNEL_2X2(, -, , +, +);
1670 #endif
1671 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1672             CGEMM_KERNEL_2X2(, +, , +, -);
1673 #endif
1674 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1675             CGEMM_KERNEL_2X2(, +, , -, +);
1676 #endif
1677 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1678             CGEMM_KERNEL_2X2(, -, , -, -);
1679 #endif
1680
1681             pa0 += 4;
1682             pb0 += 4;
1683
1684             for (l = (temp - 1); l--;)
1685             {
1686 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1687                 CGEMM_KERNEL_2X2(+, -, +, +,);
1688 #endif
1689 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1690                 CGEMM_KERNEL_2X2(+, +, -, +,);
1691 #endif
1692 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1693                 CGEMM_KERNEL_2X2(+, +, +, -,);
1694 #endif
1695 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1696                 CGEMM_KERNEL_2X2(+, -, -, -,);
1697 #endif
1698
1699                 pa0 += 4;
1700                 pb0 += 4;
1701             }
1702
1703 #if defined(TRMMKERNEL)
1704             CGEMM_TRMM_SCALE_2X2
1705 #else
1706             CGEMM_SCALE_2X2
1707 #endif
1708             pc0 += 4;
1709             pc1 += 4;
1710
1711 #if defined(TRMMKERNEL)
1712 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1713             temp = k - off;
1714 #ifdef LEFT
1715             temp -= 2; // number of values in A
1716 #else
1717             temp -= 2; // number of values in B
1718 #endif
1719             pa0 += temp * 2 * 2;
1720             pb0 += temp * 2 * 2;
1721 #endif
1722
1723 #ifdef LEFT
1724             off += 2; // number of values in A
1725 #endif
1726 #endif
1727         }
1728
1729         if (m & 1)
1730         {
1731 #if defined(TRMMKERNEL)
1732 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1733             pb0 = B;
1734 #else
1735             pa0 += off * 2 * 1;
1736             pb0 = B + off * 2 * 2;
1737 #endif
1738
1739 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1740             temp = k - off;
1741 #elif defined(LEFT)
1742             temp = off + 1; // number of values in A
1743 #else
1744             temp = off + 2; // number of values in B
1745 #endif
1746 #else
1747             pb0 = B;
1748             temp = k;
1749 #endif
1750
1751 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1752             CGEMM_KERNEL_1X2(, -, , +, +);
1753 #endif
1754 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1755             CGEMM_KERNEL_1X2(, +, , +, -);
1756 #endif
1757 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1758             CGEMM_KERNEL_1X2(, +, , -, +);
1759 #endif
1760 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1761             CGEMM_KERNEL_1X2(, -, , -, -);
1762 #endif
1763
1764             pa0 += 2;
1765             pb0 += 4;
1766
1767             for (l = (temp - 1); l--;)
1768             {
1769 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1770                 CGEMM_KERNEL_1X2(+, -, +, +,);
1771 #endif
1772 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1773                 CGEMM_KERNEL_1X2(+, +, -, +,);
1774 #endif
1775 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1776                 CGEMM_KERNEL_1X2(+, +, +, -,);
1777 #endif
1778 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1779                 CGEMM_KERNEL_1X2(+, -, -, -,);
1780 #endif
1781
1782                 pa0 += 2;
1783                 pb0 += 4;
1784             }
1785
1786 #if defined(TRMMKERNEL)
1787             CGEMM_TRMM_SCALE_1X2
1788 #else
1789             CGEMM_SCALE_1X2
1790 #endif
1791             pc0 += 2;
1792             pc1 += 2;
1793
1794 #if defined(TRMMKERNEL)
1795 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1796             temp = k - off;
1797 #ifdef LEFT
1798             temp -= 1; // number of values in A
1799 #else
1800             temp -= 2; // number of values in B
1801 #endif
1802             pa0 += temp * 2 * 1;
1803             pb0 += temp * 2 * 2;
1804 #endif
1805
1806 #ifdef LEFT
1807             off += 1; // number of values in A
1808 #endif
1809 #endif
1810         }
1811
1812 #if defined(TRMMKERNEL) && !defined(LEFT)
1813         off += 2; // number of values in B
1814 #endif
1815
1816         B += (k << 2);
1817         C += (ldc << 2);
1818     }
1819
1820     if (n & 1)
1821     {
1822         pc0 = C;
1823
1824 #if defined(TRMMKERNEL) && defined(LEFT)
1825         off = offset;
1826 #endif
1827
1828         pa0 = A;
1829
1830         for (i = (m >> 3); i--;)
1831         {
1832 #if defined(TRMMKERNEL)
1833 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1834             pb0 = B;
1835 #else
1836             pa0 += off * 2 * 8;
1837             pb0 = B + off * 2 * 1;
1838 #endif
1839
1840 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1841             temp = k - off;
1842 #elif defined(LEFT)
1843             temp = off + 8; // number of values in A
1844 #else
1845             temp = off + 1; // number of values in B
1846 #endif
1847 #else
1848             pb0 = B;
1849             temp = k;
1850 #endif
1851
1852 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1853             CGEMM_KERNEL_8X1_MSA(, -, , +, +);
1854 #endif
1855 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1856             CGEMM_KERNEL_8X1_MSA(, +, , +, -);
1857 #endif
1858 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1859             CGEMM_KERNEL_8X1_MSA(, +, , -, +);
1860 #endif
1861 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1862             CGEMM_KERNEL_8X1_MSA(, -, , -, -);
1863 #endif
1864
1865             pb0 += 2;
1866
1867             for (l = (temp - 1); l--;)
1868             {
1869 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1870                 CGEMM_KERNEL_8X1_MSA(+, -, +, +,);
1871 #endif
1872 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1873                 CGEMM_KERNEL_8X1_MSA(+, +, -, +,);
1874 #endif
1875 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1876                 CGEMM_KERNEL_8X1_MSA(+, +, +, -,);
1877 #endif
1878 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1879                 CGEMM_KERNEL_8X1_MSA(+, -, -, -,);
1880 #endif
1881
1882                 pb0 += 2;
1883             }
1884
1885 #if defined(TRMMKERNEL)
1886             CGEMM_TRMM_SCALE_8X1_MSA
1887 #else
1888             CGEMM_SCALE_8X1_MSA
1889 #endif
1890
1891 #if defined(TRMMKERNEL)
1892 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1893             temp = k - off;
1894 #ifdef LEFT
1895             temp -= 8; // number of values in A
1896 #else
1897             temp -= 1; // number of values in B
1898 #endif
1899             pa0 += temp * 2 * 8;
1900             pb0 += temp * 2 * 1;
1901 #endif
1902
1903 #ifdef LEFT
1904             off += 8; // number of values in A
1905 #endif
1906 #endif
1907         }
1908
1909         if (m & 4)
1910         {
1911 #if defined(TRMMKERNEL)
1912 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1913             pb0 = B;
1914 #else
1915             pa0 += off * 2 * 4;
1916             pb0 = B + off * 2 * 1;
1917 #endif
1918
1919 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1920             temp = k - off;
1921 #elif defined(LEFT)
1922             temp = off + 4; // number of values in A
1923 #else
1924             temp = off + 1; // number of values in B
1925 #endif
1926 #else
1927             pb0 = B;
1928             temp = k;
1929 #endif
1930
1931 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1932             CGEMM_KERNEL_4X1_MSA(, -, , +, +);
1933 #endif
1934 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1935             CGEMM_KERNEL_4X1_MSA(, +, , +, -);
1936 #endif
1937 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1938             CGEMM_KERNEL_4X1_MSA(, +, , -, +);
1939 #endif
1940 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1941             CGEMM_KERNEL_4X1_MSA(, -, , -, -);
1942 #endif
1943
1944             pb0 += 2;
1945
1946             for (l = (temp - 1); l--;)
1947             {
1948 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1949                 CGEMM_KERNEL_4X1_MSA(+, -, +, +,);
1950 #endif
1951 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1952                 CGEMM_KERNEL_4X1_MSA(+, +, -, +,);
1953 #endif
1954 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1955                 CGEMM_KERNEL_4X1_MSA(+, +, +, -,);
1956 #endif
1957 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1958                 CGEMM_KERNEL_4X1_MSA(+, -, -, -,);
1959 #endif
1960
1961                 pb0 += 2;
1962             }
1963
1964 #if defined(TRMMKERNEL)
1965             CGEMM_TRMM_SCALE_4X1_MSA
1966 #else
1967             CGEMM_SCALE_4X1_MSA
1968 #endif
1969
1970 #if defined(TRMMKERNEL)
1971 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1972             temp = k - off;
1973 #ifdef LEFT
1974             temp -= 4; // number of values in A
1975 #else
1976             temp -= 1; // number of values in B
1977 #endif
1978             pa0 += temp * 2 * 4;
1979             pb0 += temp * 2 * 1;
1980 #endif
1981
1982 #ifdef LEFT
1983             off += 4; // number of values in A
1984 #endif
1985 #endif
1986         }
1987
1988         if (m & 2)
1989         {
1990 #if defined(TRMMKERNEL)
1991 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1992             pb0 = B;
1993 #else
1994             pa0 += off * 2 * 2;
1995             pb0 = B + off * 2 * 1;
1996 #endif
1997
1998 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1999             temp = k - off;
2000 #elif defined(LEFT)
2001             temp = off + 2; // number of values in A
2002 #else
2003             temp = off + 1; // number of values in B
2004 #endif
2005 #else
2006             pb0 = B;
2007             temp = k;
2008 #endif
2009
2010 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2011             CGEMM_KERNEL_2X1(, -, , +, +);
2012 #endif
2013 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2014             CGEMM_KERNEL_2X1(, +, , +, -);
2015 #endif
2016 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2017             CGEMM_KERNEL_2X1(, +, , -, +);
2018 #endif
2019 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2020             CGEMM_KERNEL_2X1(, -, , -, -);
2021 #endif
2022
2023             pa0 += 4;
2024             pb0 += 2;
2025
2026             for (l = (temp - 1); l--;)
2027             {
2028 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2029                 CGEMM_KERNEL_2X1(+, -, +, +,);
2030 #endif
2031 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2032                 CGEMM_KERNEL_2X1(+, +, -, +,);
2033 #endif
2034 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2035                 CGEMM_KERNEL_2X1(+, +, +, -,);
2036 #endif
2037 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2038                 CGEMM_KERNEL_2X1(+, -, -, -,);
2039 #endif
2040
2041                 pa0 += 4;
2042                 pb0 += 2;
2043             }
2044
2045 #if defined(TRMMKERNEL)
2046             CGEMM_TRMM_SCALE_2X1
2047 #else
2048             CGEMM_SCALE_2X1
2049 #endif
2050             pc0 += 4;
2051
2052 #if defined(TRMMKERNEL)
2053 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2054             temp = k - off;
2055 #ifdef LEFT
2056             temp -= 2; // number of values in A
2057 #else
2058             temp -= 1; // number of values in B
2059 #endif
2060             pa0 += temp * 2 * 2;
2061             pb0 += temp * 2 * 1;
2062 #endif
2063
2064 #ifdef LEFT
2065             off += 2; // number of values in A
2066 #endif
2067 #endif
2068         }
2069
2070         if (m & 1)
2071         {
2072 #if defined(TRMMKERNEL)
2073 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2074             pb0 = B;
2075 #else
2076             pa0 += off * 2 * 1;
2077             pb0 = B + off * 2 * 1;
2078 #endif
2079
2080 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2081             temp = k - off;
2082 #elif defined(LEFT)
2083             temp = off + 1; // number of values in A
2084 #else
2085             temp = off + 1; // number of values in B
2086 #endif
2087 #else
2088             pb0 = B;
2089             temp = k;
2090 #endif
2091
2092 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2093             CGEMM_KERNEL_1X1(, -, , +, +);
2094 #endif
2095 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2096             CGEMM_KERNEL_1X1(, +, , +, -);
2097 #endif
2098 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2099             CGEMM_KERNEL_1X1(, +, , -, +);
2100 #endif
2101 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2102             CGEMM_KERNEL_1X1(, -, , -, -);
2103 #endif
2104
2105             pa0 += 2;
2106             pb0 += 2;
2107
2108             for (l = (temp - 1); l--;)
2109             {
2110 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2111                 CGEMM_KERNEL_1X1(+, -, +, +,);
2112 #endif
2113 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2114                 CGEMM_KERNEL_1X1(+, +, -, +,);
2115 #endif
2116 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2117                 CGEMM_KERNEL_1X1(+, +, +, -,);
2118 #endif
2119 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2120                 CGEMM_KERNEL_1X1(+, -, -, -,);
2121 #endif
2122
2123                 pa0 += 2;
2124                 pb0 += 2;
2125             }
2126
2127 #if defined(TRMMKERNEL)
2128             CGEMM_TRMM_SCALE_1X1
2129 #else
2130             CGEMM_SCALE_1X1
2131 #endif
2132             pc0 += 2;
2133
2134 #if defined(TRMMKERNEL)
2135 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2136             temp = k - off;
2137 #ifdef LEFT
2138             temp -= 1; // number of values in A
2139 #else
2140             temp -= 1; // number of values in B
2141 #endif
2142             pa0 += temp * 2 * 1;
2143             pb0 += temp * 2 * 1;
2144 #endif
2145
2146 #ifdef LEFT
2147             off += 1; // number of values in A
2148 #endif
2149 #endif
2150         }
2151
2152 #if defined(TRMMKERNEL) && !defined(LEFT)
2153         off += 1; // number of values in B
2154 #endif
2155
2156         B += (k << 1);
2157         C += (ldc << 1);
2158     }
2159
2160     return 0;
2161 }