fix build error
[platform/upstream/openblas.git] / kernel / mips / zgemm_kernel_4x4_msa.c
1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
/*
 * ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step for a 4x4 tile.  Four vectors are fetched through
 * pa0 and four through pb0 with LD_DP4_INC (stride argument 2; presumably
 * the pointers are advanced by the helper -- confirm in macros_msa.h).  The
 * A vectors are split pairwise by PCKEVOD_D2_DP into src_a0r/src_a0i and
 * src_a1r/src_a1i; each B vector is expanded by SPLATI_D2_DP into
 * src_br/src_bi.  Accumulators: res0/res1 for src_b0, res2/res3 for src_b1,
 * res4/res5 for src_b2, res6/res7 for src_b3 (each with _r and _i parts).
 *
 * Each OPn is token-pasted onto '=' to build the assignment operator, so an
 * empty argument gives '=', '+' gives '+=' and '-' gives '-=':
 *     res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *     res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP4 is emitted in front of the a_r * b_i product (presumably a unary sign).
 *
 * Not hygienic: every identifier used below must already be declared at the
 * expansion site.  src_br/src_bi are overwritten for every column, so the
 * per-column statement order must be kept.
 */
#define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)    \
{                                                        \
    LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);  \
    LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3);  \
                                                         \
    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);     \
    PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i);     \
                                                         \
    /* 0th col */                                        \
    SPLATI_D2_DP(src_b0, src_br, src_bi);                \
    res0_r OP0## = src_a0r * src_br;                     \
    res0_r OP1## = src_a0i * src_bi;                     \
    res0_i OP2## = OP4 src_a0r * src_bi;                 \
    res0_i OP3## = src_a0i * src_br;                     \
                                                         \
    res1_r OP0## = src_a1r * src_br;                     \
    res1_r OP1## = src_a1i * src_bi;                     \
    res1_i OP2## = OP4 src_a1r * src_bi;                 \
    res1_i OP3## = src_a1i * src_br;                     \
                                                         \
    /* 1st col */                                        \
    SPLATI_D2_DP(src_b1, src_br, src_bi);                \
    res2_r OP0## = src_a0r * src_br;                     \
    res2_r OP1## = src_a0i * src_bi;                     \
    res2_i OP2## = OP4 src_a0r * src_bi;                 \
    res2_i OP3## = src_a0i * src_br;                     \
                                                         \
    res3_r OP0## = src_a1r * src_br;                     \
    res3_r OP1## = src_a1i * src_bi;                     \
    res3_i OP2## = OP4 src_a1r * src_bi;                 \
    res3_i OP3## = src_a1i * src_br;                     \
                                                         \
    /* 2nd col */                                        \
    SPLATI_D2_DP(src_b2, src_br, src_bi);                \
    res4_r OP0## = src_a0r * src_br;                     \
    res4_r OP1## = src_a0i * src_bi;                     \
    res4_i OP2## = OP4 src_a0r * src_bi;                 \
    res4_i OP3## = src_a0i * src_br;                     \
                                                         \
    res5_r OP0## = src_a1r * src_br;                     \
    res5_r OP1## = src_a1i * src_bi;                     \
    res5_i OP2## = OP4 src_a1r * src_bi;                 \
    res5_i OP3## = src_a1i * src_br;                     \
                                                         \
    /* 3rd col */                                        \
    SPLATI_D2_DP(src_b3, src_br, src_bi);                \
    res6_r OP0## = src_a0r * src_br;                     \
    res6_r OP1## = src_a0i * src_bi;                     \
    res6_i OP2## = OP4 src_a0r * src_bi;                 \
    res6_i OP3## = src_a0i * src_br;                     \
                                                         \
    res7_r OP0## = src_a1r * src_br;                     \
    res7_r OP1## = src_a1i * src_bi;                     \
    res7_i OP2## = OP4 src_a1r * src_bi;                 \
    res7_i OP3## = src_a1i * src_br;                     \
}
87
/*
 * ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step for a 2x4 tile.  Two vectors are fetched through pa0
 * (LD_DP2_INC) and four through pb0 (LD_DP4_INC), both with stride argument
 * 2 (presumably advancing the pointers -- confirm in macros_msa.h).  The A
 * pair is split by PCKEVOD_D2_DP into src_a0r/src_a0i; each B vector is
 * expanded by SPLATI_D2_DP into src_br/src_bi.  Accumulators used: res0,
 * res2, res4 and res6 (each with _r and _i parts), one per B vector.
 *
 * Each OPn is token-pasted onto '=' to build the assignment operator (empty
 * gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *     res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP4 is emitted in front of the a_r * b_i product (presumably a unary sign).
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4)    \
{                                                        \
    LD_DP2_INC(pa0, 2, src_a0, src_a1);                  \
    LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3);  \
                                                         \
    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);     \
                                                         \
    /* 0th col */                                        \
    SPLATI_D2_DP(src_b0, src_br, src_bi);                \
    res0_r OP0## = src_a0r * src_br;                     \
    res0_r OP1## = src_a0i * src_bi;                     \
    res0_i OP2## = OP4 src_a0r * src_bi;                 \
    res0_i OP3## = src_a0i * src_br;                     \
                                                         \
    /* 1st col */                                        \
    SPLATI_D2_DP(src_b1, src_br, src_bi);                \
    res2_r OP0## = src_a0r * src_br;                     \
    res2_r OP1## = src_a0i * src_bi;                     \
    res2_i OP2## = OP4 src_a0r * src_bi;                 \
    res2_i OP3## = src_a0i * src_br;                     \
                                                         \
    /* 2nd col */                                        \
    SPLATI_D2_DP(src_b2, src_br, src_bi);                \
    res4_r OP0## = src_a0r * src_br;                     \
    res4_r OP1## = src_a0i * src_bi;                     \
    res4_i OP2## = OP4 src_a0r * src_bi;                 \
    res4_i OP3## = src_a0i * src_br;                     \
                                                         \
    /* 3rd col */                                        \
    SPLATI_D2_DP(src_b3, src_br, src_bi);                \
    res6_r OP0## = src_a0r * src_br;                     \
    res6_r OP1## = src_a0i * src_bi;                     \
    res6_i OP2## = OP4 src_a0r * src_bi;                 \
    res6_i OP3## = src_a0i * src_br;                     \
}
123
/*
 * ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step for a 1x4 tile.  A single vector is read with
 * LD_DP(pa0); no increment of pa0 is visible in this macro, so advancing it
 * is left to the expansion site.  Four B vectors are fetched through pb0 with
 * LD_DP4_INC (stride argument 2).  PCKEVOD_D2_DP is applied to (src_a0,
 * src_a0) to build src_a0r/src_a0i, and to B vector pairs to build
 * src_br/src_bi, so two B vectors are handled per accumulator:
 * res0 covers src_b0/src_b1, res1 covers src_b2/src_b3.
 *
 * Each OPn is token-pasted onto '=' to build the assignment operator (empty
 * gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *     res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP4 is emitted in front of the a_r * b_i product (presumably a unary sign).
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4)    \
{                                                        \
    src_a0 = LD_DP(pa0);                                 \
    LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3);  \
                                                         \
    PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i);     \
                                                         \
    /* 0th and 1st col */                                \
    PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi);       \
    res0_r OP0## = src_a0r * src_br;                     \
    res0_r OP1## = src_a0i * src_bi;                     \
    res0_i OP2## = OP4 src_a0r * src_bi;                 \
    res0_i OP3## = src_a0i * src_br;                     \
                                                         \
    /* 2nd and 3rd col */                                \
    PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi);       \
    res1_r OP0## = src_a0r * src_br;                     \
    res1_r OP1## = src_a0i * src_bi;                     \
    res1_i OP2## = OP4 src_a0r * src_bi;                 \
    res1_i OP3## = src_a0i * src_br;                     \
}
145
/*
 * ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step for a 4x2 tile.  Four vectors are fetched through
 * pa0 (LD_DP4_INC) and two through pb0 (LD_DP2_INC), both with stride
 * argument 2 (presumably advancing the pointers -- confirm in
 * macros_msa.h).  The A vectors are split pairwise by PCKEVOD_D2_DP into
 * src_a0r/src_a0i and src_a1r/src_a1i; each B vector is expanded by
 * SPLATI_D2_DP into src_br/src_bi.  Accumulators: res0/res1 for src_b0 and
 * res2/res3 for src_b1 (each with _r and _i parts).
 *
 * Each OPn is token-pasted onto '=' to build the assignment operator (empty
 * gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *     res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP4 is emitted in front of the a_r * b_i product (presumably a unary sign).
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)    \
{                                                        \
    LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);  \
    LD_DP2_INC(pb0, 2, src_b0, src_b1);                  \
                                                         \
    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);     \
    PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i);     \
                                                         \
    /* 0th col */                                        \
    SPLATI_D2_DP(src_b0, src_br, src_bi);                \
    res0_r OP0## = src_a0r * src_br;                     \
    res0_r OP1## = src_a0i * src_bi;                     \
    res0_i OP2## = OP4 src_a0r * src_bi;                 \
    res0_i OP3## = src_a0i * src_br;                     \
                                                         \
    res1_r OP0## = src_a1r * src_br;                     \
    res1_r OP1## = src_a1i * src_bi;                     \
    res1_i OP2## = OP4 src_a1r * src_bi;                 \
    res1_i OP3## = src_a1i * src_br;                     \
                                                         \
    /* 1st col */                                        \
    SPLATI_D2_DP(src_b1, src_br, src_bi);                \
    res2_r OP0## = src_a0r * src_br;                     \
    res2_r OP1## = src_a0i * src_bi;                     \
    res2_i OP2## = OP4 src_a0r * src_bi;                 \
    res2_i OP3## = src_a0i * src_br;                     \
                                                         \
    res3_r OP0## = src_a1r * src_br;                     \
    res3_r OP1## = src_a1i * src_bi;                     \
    res3_i OP2## = OP4 src_a1r * src_bi;                 \
    res3_i OP3## = src_a1i * src_br;                     \
}
178
/*
 * ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step for a 2x2 tile.  Two vectors are fetched through pa0
 * and two through pb0 with LD_DP2_INC (stride argument 2; presumably the
 * pointers are advanced by the helper -- confirm in macros_msa.h).  The A
 * pair is split by PCKEVOD_D2_DP into src_a0r/src_a0i; each B vector is
 * expanded by SPLATI_D2_DP into src_br/src_bi.  Accumulators: res0 for
 * src_b0 and res2 for src_b1 (each with _r and _i parts).
 *
 * Each OPn is token-pasted onto '=' to build the assignment operator (empty
 * gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *     res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP4 is emitted in front of the a_r * b_i product (presumably a unary sign).
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4)  \
{                                                      \
    LD_DP2_INC(pa0, 2, src_a0, src_a1);                \
    LD_DP2_INC(pb0, 2, src_b0, src_b1);                \
                                                       \
    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);   \
                                                       \
    /* 0th col */                                      \
    SPLATI_D2_DP(src_b0, src_br, src_bi);              \
    res0_r OP0## = src_a0r * src_br;                   \
    res0_r OP1## = src_a0i * src_bi;                   \
    res0_i OP2## = OP4 src_a0r * src_bi;               \
    res0_i OP3## = src_a0i * src_br;                   \
                                                       \
    /* 1st col */                                      \
    SPLATI_D2_DP(src_b1, src_br, src_bi);              \
    res2_r OP0## = src_a0r * src_br;                   \
    res2_r OP1## = src_a0i * src_bi;                   \
    res2_i OP2## = OP4 src_a0r * src_bi;               \
    res2_i OP3## = src_a0i * src_br;                   \
}
200
/*
 * ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step for a 1x2 tile.  A single vector is read with
 * LD_DP(pa0); no increment of pa0 is visible in this macro, so advancing it
 * is left to the expansion site.  Two B vectors are fetched through pb0 with
 * LD_DP2_INC (stride argument 2).  PCKEVOD_D2_DP is applied to (src_a0,
 * src_a0) to build src_a0r/src_a0i and to (src_b1, src_b0) to build
 * src_br/src_bi, so both B vectors feed the single accumulator res0
 * (res0_r / res0_i).
 *
 * Each OPn is token-pasted onto '=' to build the assignment operator (empty
 * gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *     res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP4 is emitted in front of the a_r * b_i product (presumably a unary sign).
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4)  \
{                                                      \
    src_a0 = LD_DP(pa0);                               \
    LD_DP2_INC(pb0, 2, src_b0, src_b1);                \
                                                       \
    PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i);   \
                                                       \
    /* 0th and 1st col */                              \
    PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi);     \
    res0_r OP0## = src_a0r * src_br;                   \
    res0_r OP1## = src_a0i * src_bi;                   \
    res0_i OP2## = OP4 src_a0r * src_bi;               \
    res0_i OP3## = src_a0i * src_br;                   \
}
215
/*
 * ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step for a 4x1 tile.  Four vectors are fetched through
 * pa0 with LD_DP4_INC (stride argument 2).  A single B vector is read with
 * LD_DP(pb0); no increment of pb0 is visible in this macro, so advancing it
 * is left to the expansion site.  The A vectors are split pairwise by
 * PCKEVOD_D2_DP into src_a0r/src_a0i and src_a1r/src_a1i; src_b0 is expanded
 * by SPLATI_D2_DP into src_br/src_bi.  Accumulators: res0 and res1 (each
 * with _r and _i parts).
 *
 * Each OPn is token-pasted onto '=' to build the assignment operator (empty
 * gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *     res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP4 is emitted in front of the a_r * b_i product (presumably a unary sign).
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)    \
{                                                        \
    LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);  \
    src_b0 = LD_DP(pb0);                                 \
                                                         \
    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);     \
    PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i);     \
                                                         \
    /* 0th col */                                        \
    SPLATI_D2_DP(src_b0, src_br, src_bi);                \
    res0_r OP0## = src_a0r * src_br;                     \
    res0_r OP1## = src_a0i * src_bi;                     \
    res0_i OP2## = OP4 src_a0r * src_bi;                 \
    res0_i OP3## = src_a0i * src_br;                     \
                                                         \
    res1_r OP0## = src_a1r * src_br;                     \
    res1_r OP1## = src_a1i * src_bi;                     \
    res1_i OP2## = OP4 src_a1r * src_bi;                 \
    res1_i OP3## = src_a1i * src_br;                     \
}
236
/*
 * ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step for a 2x1 tile.  Two vectors are fetched through pa0
 * with LD_DP2_INC (stride argument 2).  A single B vector is read with
 * LD_DP(pb0); no increment of pb0 is visible in this macro, so advancing it
 * is left to the expansion site.  The A pair is split by PCKEVOD_D2_DP into
 * src_a0r/src_a0i; src_b0 is expanded by SPLATI_D2_DP into src_br/src_bi.
 * The only accumulator is res0 (res0_r / res0_i).
 *
 * Each OPn is token-pasted onto '=' to build the assignment operator (empty
 * gives '=', '+' gives '+=', '-' gives '-='):
 *     res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *     res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP4 is emitted in front of the a_r * b_i product (presumably a unary sign).
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4)  \
{                                                      \
    LD_DP2_INC(pa0, 2, src_a0, src_a1);                \
    src_b0 = LD_DP(pb0);                               \
                                                       \
    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);   \
                                                       \
    /* 0th col */                                      \
    SPLATI_D2_DP(src_b0, src_br, src_bi);              \
    res0_r OP0## = src_a0r * src_br;                   \
    res0_r OP1## = src_a0i * src_bi;                   \
    res0_i OP2## = OP4 src_a0r * src_bi;               \
    res0_i OP3## = src_a0i * src_br;                   \
}
251
/*
 * ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar accumulation step for a single element pair.  Reads pa0[0], pa0[1]
 * into a0_r, a0_i and pb0[0], pb0[1] into b0_r, b0_i, then updates res0 and
 * res1.  pa0 and pb0 are not advanced here.
 *
 * Each OPn is token-pasted onto '=' to build the assignment operator (empty
 * gives '=', '+' gives '+=', '-' gives '-='):
 *     res0 OP0= a0_r * b0_r;        res0 OP1= a0_i * b0_i;
 *     res1 OP2= OP4 a0_r * b0_i;    res1 OP3= a0_i * b0_r;
 * OP4 is emitted in front of the a0_r * b0_i product, e.g. a unary '+' or
 * '-'.  With (, -, , +, +) the result is the plain complex product
 * (a0_r + i*a0_i) * (b0_r + i*b0_i) in res0 (real) and res1 (imaginary).
 *
 * Not hygienic: pa0, pb0, a0_r, a0_i, b0_r, b0_i, res0 and res1 must be
 * declared at the expansion site.
 */
#define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)  \
{                                                  \
    /* 0th col */                                  \
    a0_r = pa0[0];                                 \
    a0_i = pa0[1];                                 \
    b0_r = pb0[0];                                 \
    b0_i = pb0[1];                                 \
                                                   \
    res0 OP0## = a0_r * b0_r;                      \
    res0 OP1## = a0_i * b0_i;                      \
    res1 OP2## = OP4 a0_r * b0_i;                  \
    res1 OP3## = a0_i * b0_r;                      \
}
265
/*
 * ZGEMM_SCALE_4X4_MSA
 *
 * Adds the alpha-scaled accumulators res0..res7 to the data addressed by
 * pc0..pc3.  For each pointer: four vectors are loaded with LD_DP4 (stride
 * argument 2), split by PCKEVOD_D2_DP into dst0_r/dst0_i and dst1_r/dst1_i,
 * updated as
 *     dst_r += alpha_r * res_r;    dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;    dst_i += alpha_i * res_r;
 * re-interleaved with ILVRL_D2_DP and written back with ST_DP4_INC
 * (presumably advancing pcN -- confirm in macros_msa.h).
 * Pairing: pc0 uses res0/res1, pc1 res2/res3, pc2 res4/res5, pc3 res6/res7.
 *
 * dst0..dst7 and the dst*_r/dst*_i temporaries are reused: the pc0/pc1
 * stores must stay ahead of the pc2 load that overwrites dst0..dst3.
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_SCALE_4X4_MSA                      \
{                                                \
    LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res0_r;                  \
    dst0_r -= alpha_i * res0_i;                  \
    dst0_i += alpha_r * res0_i;                  \
    dst0_i += alpha_i * res0_r;                  \
                                                 \
    dst1_r += alpha_r * res1_r;                  \
    dst1_r -= alpha_i * res1_i;                  \
    dst1_i += alpha_r * res1_i;                  \
    dst1_i += alpha_i * res1_r;                  \
                                                 \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);      \
                                                 \
    PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i);   \
    PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res2_r;                  \
    dst0_r -= alpha_i * res2_i;                  \
    dst0_i += alpha_r * res2_i;                  \
    dst0_i += alpha_i * res2_r;                  \
                                                 \
    dst1_r += alpha_r * res3_r;                  \
    dst1_r -= alpha_i * res3_i;                  \
    dst1_i += alpha_r * res3_i;                  \
    dst1_i += alpha_i * res3_r;                  \
                                                 \
    ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
    ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
                                                 \
    ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
    ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);  \
                                                 \
    LD_DP4(pc2, 2, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res4_r;                  \
    dst0_r -= alpha_i * res4_i;                  \
    dst0_i += alpha_r * res4_i;                  \
    dst0_i += alpha_i * res4_r;                  \
                                                 \
    dst1_r += alpha_r * res5_r;                  \
    dst1_r -= alpha_i * res5_i;                  \
    dst1_i += alpha_r * res5_i;                  \
    dst1_i += alpha_i * res5_r;                  \
                                                 \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    LD_DP4(pc3, 2, dst4, dst5, dst6, dst7);      \
                                                 \
    PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i);   \
    PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res6_r;                  \
    dst0_r -= alpha_i * res6_i;                  \
    dst0_i += alpha_r * res6_i;                  \
    dst0_i += alpha_i * res6_r;                  \
                                                 \
    dst1_r += alpha_r * res7_r;                  \
    dst1_r -= alpha_i * res7_i;                  \
    dst1_i += alpha_r * res7_i;                  \
    dst1_i += alpha_i * res7_r;                  \
                                                 \
    ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
    ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
                                                 \
    ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2);  \
    ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2);  \
}
346
/*
 * ZGEMM_SCALE_2X4_MSA
 *
 * Adds the alpha-scaled accumulators res0, res2, res4 and res6 to the data
 * addressed by pc0, pc1, pc2 and pc3 respectively.  For each pointer: two
 * vectors are loaded with LD_DP2 (stride argument 2), split by
 * PCKEVOD_D2_DP into dst0_r/dst0_i, updated as
 *     dst_r += alpha_r * res_r;    dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;    dst_i += alpha_i * res_r;
 * re-interleaved with ILVRL_D2_DP and written back with ST_DP2_INC
 * (presumably advancing pcN -- confirm in macros_msa.h).
 *
 * dst0..dst3 and dst0_r/dst0_i are reused: the pc0/pc1 stores must stay
 * ahead of the pc2 load that overwrites dst0/dst1.
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_SCALE_2X4_MSA                     \
{                                               \
    LD_DP2(pc0, 2, dst0, dst1);                 \
                                                \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res0_r;                 \
    dst0_r -= alpha_i * res0_i;                 \
    dst0_i += alpha_r * res0_i;                 \
    dst0_i += alpha_i * res0_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    LD_DP2(pc1, 2, dst2, dst3);                 \
                                                \
    PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res2_r;                 \
    dst0_r -= alpha_i * res2_i;                 \
    dst0_i += alpha_r * res2_i;                 \
    dst0_i += alpha_i * res2_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);    \
                                                \
    ST_DP2_INC(dst0, dst1, pc0, 2);             \
    ST_DP2_INC(dst2, dst3, pc1, 2);             \
                                                \
    LD_DP2(pc2, 2, dst0, dst1);                 \
                                                \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res4_r;                 \
    dst0_r -= alpha_i * res4_i;                 \
    dst0_i += alpha_r * res4_i;                 \
    dst0_i += alpha_i * res4_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    LD_DP2(pc3, 2, dst2, dst3);                 \
                                                \
    PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res6_r;                 \
    dst0_r -= alpha_i * res6_i;                 \
    dst0_i += alpha_r * res6_i;                 \
    dst0_i += alpha_i * res6_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);    \
                                                \
    ST_DP2_INC(dst0, dst1, pc2, 2);             \
    ST_DP2_INC(dst2, dst3, pc3, 2);             \
}
399
/*
 * ZGEMM_SCALE_1X4_MSA
 *
 * Adds the alpha-scaled accumulators res0 and res1 to one vector at each of
 * pc0..pc3.  Vectors are handled in pairs: (pc0, pc1) with res0 and
 * (pc2, pc3) with res1.  Each pair is loaded with LD_DP, split by
 * PCKEVOD_D2_DP into dst0_r/dst0_i, updated as
 *     dst_r += alpha_r * res_r;    dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;    dst_i += alpha_i * res_r;
 * and re-interleaved with ILVRL_D2_DP.  All four vectors are written back
 * with ST_DP at the end; no pointer increment is visible in this macro.
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_SCALE_1X4_MSA                     \
{                                               \
    dst0 = LD_DP(pc0);                          \
    dst1 = LD_DP(pc1);                          \
                                                \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res0_r;                 \
    dst0_r -= alpha_i * res0_i;                 \
    dst0_i += alpha_r * res0_i;                 \
    dst0_i += alpha_i * res0_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    dst2 = LD_DP(pc2);                          \
    dst3 = LD_DP(pc3);                          \
                                                \
    PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res1_r;                 \
    dst0_r -= alpha_i * res1_i;                 \
    dst0_i += alpha_r * res1_i;                 \
    dst0_i += alpha_i * res1_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);    \
                                                \
    ST_DP(dst0, pc0);                           \
    ST_DP(dst1, pc1);                           \
    ST_DP(dst2, pc2);                           \
    ST_DP(dst3, pc3);                           \
}
431
/*
 * ZGEMM_SCALE_4X2_MSA
 *
 * Adds the alpha-scaled accumulators res0..res3 to the data addressed by pc0
 * and pc1.  For each pointer: four vectors are loaded with LD_DP4 (stride
 * argument 2), split by PCKEVOD_D2_DP into dst0_r/dst0_i and
 * dst1_r/dst1_i, updated as
 *     dst_r += alpha_r * res_r;    dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;    dst_i += alpha_i * res_r;
 * re-interleaved with ILVRL_D2_DP and written back with ST_DP4_INC
 * (presumably advancing pcN -- confirm in macros_msa.h).
 * Pairing: pc0 uses res0/res1, pc1 uses res2/res3.
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_SCALE_4X2_MSA                      \
{                                                \
    LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res0_r;                  \
    dst0_r -= alpha_i * res0_i;                  \
    dst0_i += alpha_r * res0_i;                  \
    dst0_i += alpha_i * res0_r;                  \
                                                 \
    dst1_r += alpha_r * res1_r;                  \
    dst1_r -= alpha_i * res1_i;                  \
    dst1_i += alpha_r * res1_i;                  \
    dst1_i += alpha_i * res1_r;                  \
                                                 \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);      \
                                                 \
    PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i);   \
    PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res2_r;                  \
    dst0_r -= alpha_i * res2_i;                  \
    dst0_i += alpha_r * res2_i;                  \
    dst0_i += alpha_i * res2_r;                  \
                                                 \
    dst1_r += alpha_r * res3_r;                  \
    dst1_r -= alpha_i * res3_i;                  \
    dst1_i += alpha_r * res3_i;                  \
    dst1_i += alpha_i * res3_r;                  \
                                                 \
    ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
    ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
                                                 \
    ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
    ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);  \
}
473
/*
 * ZGEMM_SCALE_2X2_MSA
 *
 * Adds the alpha-scaled accumulators res0 and res2 to the data addressed by
 * pc0 and pc1 respectively.  For each pointer: two vectors are loaded with
 * LD_DP2 (stride argument 2), split by PCKEVOD_D2_DP into dst0_r/dst0_i,
 * updated as
 *     dst_r += alpha_r * res_r;    dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;    dst_i += alpha_i * res_r;
 * re-interleaved with ILVRL_D2_DP and written back with ST_DP2_INC
 * (presumably advancing pcN -- confirm in macros_msa.h).  The pc0 store is
 * issued before the pc1 load.
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_SCALE_2X2_MSA                     \
{                                               \
    LD_DP2(pc0, 2, dst0, dst1);                 \
                                                \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res0_r;                 \
    dst0_r -= alpha_i * res0_i;                 \
    dst0_i += alpha_r * res0_i;                 \
    dst0_i += alpha_i * res0_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    ST_DP2_INC(dst0, dst1, pc0, 2);             \
                                                \
    LD_DP2(pc1, 2, dst2, dst3);                 \
                                                \
    PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res2_r;                 \
    dst0_r -= alpha_i * res2_i;                 \
    dst0_i += alpha_r * res2_i;                 \
    dst0_i += alpha_i * res2_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);    \
                                                \
    ST_DP2_INC(dst2, dst3, pc1, 2);             \
}
502
/*
 * ZGEMM_SCALE_1X2_MSA
 *
 * Adds the alpha-scaled accumulator res0 to one vector at pc0 and one at
 * pc1.  Both vectors are loaded with LD_DP, split together by
 * PCKEVOD_D2_DP into dst0_r/dst0_i, updated as
 *     dst_r += alpha_r * res_r;    dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;    dst_i += alpha_i * res_r;
 * re-interleaved with ILVRL_D2_DP and written back with ST_DP.  No pointer
 * increment is visible in this macro.
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_SCALE_1X2_MSA                     \
{                                               \
    dst0 = LD_DP(pc0);                          \
    dst1 = LD_DP(pc1);                          \
                                                \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res0_r;                 \
    dst0_r -= alpha_i * res0_i;                 \
    dst0_i += alpha_r * res0_i;                 \
    dst0_i += alpha_i * res0_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    ST_DP(dst0, pc0);                           \
    ST_DP(dst1, pc1);                           \
}
520
/*
 * ZGEMM_SCALE_4X1_MSA
 *
 * Adds the alpha-scaled accumulators res0 and res1 to the data addressed by
 * pc0.  Four vectors are loaded with LD_DP4 (stride argument 2), split by
 * PCKEVOD_D2_DP into dst0_r/dst0_i and dst1_r/dst1_i, updated as
 *     dst_r += alpha_r * res_r;    dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;    dst_i += alpha_i * res_r;
 * re-interleaved with ILVRL_D2_DP and written back with ST_DP4_INC
 * (presumably advancing pc0 -- confirm in macros_msa.h).
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_SCALE_4X1_MSA                      \
{                                                \
    LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);      \
                                                 \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);   \
    PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i);   \
                                                 \
    dst0_r += alpha_r * res0_r;                  \
    dst0_r -= alpha_i * res0_i;                  \
    dst0_i += alpha_r * res0_i;                  \
    dst0_i += alpha_i * res0_r;                  \
                                                 \
    dst1_r += alpha_r * res1_r;                  \
    dst1_r -= alpha_i * res1_i;                  \
    dst1_i += alpha_r * res1_i;                  \
    dst1_i += alpha_i * res1_r;                  \
                                                 \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
                                                 \
    ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
}
543
/*
 * ZGEMM_SCALE_2X1_MSA
 *
 * Adds the alpha-scaled accumulator res0 to the data addressed by pc0.  Two
 * vectors are loaded with LD_DP2 (stride argument 2), split by
 * PCKEVOD_D2_DP into dst0_r/dst0_i, updated as
 *     dst_r += alpha_r * res_r;    dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;    dst_i += alpha_i * res_r;
 * re-interleaved with ILVRL_D2_DP and written back with ST_DP2_INC
 * (presumably advancing pc0 -- confirm in macros_msa.h).
 *
 * Not hygienic: all identifiers must be declared at the expansion site.
 */
#define ZGEMM_SCALE_2X1_MSA                     \
{                                               \
    LD_DP2(pc0, 2, dst0, dst1);                 \
                                                \
    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
                                                \
    dst0_r += alpha_r * res0_r;                 \
    dst0_r -= alpha_i * res0_i;                 \
    dst0_i += alpha_r * res0_i;                 \
    dst0_i += alpha_i * res0_r;                 \
                                                \
    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
                                                \
    ST_DP2_INC(dst0, dst1, pc0, 2);             \
}
559
/*
 * ZGEMM_SCALE_1X1
 *
 * Scalar update of one element pair at pc0: adds the complex product
 * (alphar + i*alphai) * (res0 + i*res1) to pc0[0] (real part) and pc0[1]
 * (imaginary part):
 *     pc0[0] += alphar * res0 - alphai * res1   (as two separate updates)
 *     pc0[1] += alphar * res1 + alphai * res0   (as two separate updates)
 * pc0 is not advanced here.
 *
 * Note that this macro reads alphar/alphai, whereas the _MSA scale macros in
 * this file read alpha_r/alpha_i.
 *
 * Not hygienic: pc0, alphar, alphai, res0 and res1 must be declared at the
 * expansion site.
 */
#define ZGEMM_SCALE_1X1       \
{                             \
    pc0[0] += alphar * res0;  \
    pc0[0] -= alphai * res1;  \
    pc0[1] += alphar * res1;  \
    pc0[1] += alphai * res0;  \
}
567
/*
 * ZGEMM_TRMM_SCALE_4X4_MSA
 *
 * Multiplies the eight accumulated results res0..res7 (each kept as a split
 * real vector resN_r and imaginary vector resN_i) by the complex scalar held
 * in alpha_r/alpha_i and writes them out through pc0..pc3.
 *
 * For every result N the arithmetic is the complex product alpha * resN:
 *     real = alpha_r * resN_r - alpha_i * resN_i
 *     imag = alpha_r * resN_i + alpha_i * resN_r
 * The values are assigned, not accumulated: nothing is loaded from pc0..pc3.
 *
 * Destination mapping:
 *     res0, res1 -> pc0      res2, res3 -> pc1
 *     res4, res5 -> pc2      res6, res7 -> pc3
 * dst0_r/dst0_i/dst1_r/dst1_i are scratch, and dst0..dst7 are reused for the
 * second half once the first two stores have been issued.
 *
 * NOTE(review): ILVRL_D2_DP and ST_DP4_INC are defined outside this chunk
 * (presumably in macros_msa.h).  ILVRL_D2_DP presumably re-interleaves the
 * split real/imaginary vectors into (re, im) pairs, and ST_DP4_INC presumably
 * stores four vectors and advances the pointer argument - confirm there.
 *
 * Expands to a braced block; CNAME expands it without a trailing semicolon.
 */
568 #define ZGEMM_TRMM_SCALE_4X4_MSA                 \
569 {                                                \
570     dst0_r = alpha_r * res0_r;                   \
571     dst0_r -= alpha_i * res0_i;                  \
572     dst0_i = alpha_r * res0_i;                   \
573     dst0_i += alpha_i * res0_r;                  \
574                                                  \
575     dst1_r = alpha_r * res1_r;                   \
576     dst1_r -= alpha_i * res1_i;                  \
577     dst1_i = alpha_r * res1_i;                   \
578     dst1_i += alpha_i * res1_r;                  \
579                                                  \
580     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
581     ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
582                                                  \
583     dst0_r = alpha_r * res2_r;                   \
584     dst0_r -= alpha_i * res2_i;                  \
585     dst0_i = alpha_r * res2_i;                   \
586     dst0_i += alpha_i * res2_r;                  \
587                                                  \
588     dst1_r = alpha_r * res3_r;                   \
589     dst1_r -= alpha_i * res3_i;                  \
590     dst1_i = alpha_r * res3_i;                   \
591     dst1_i += alpha_i * res3_r;                  \
592                                                  \
593     ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
594     ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
595                                                  \
596     ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
597     ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);  \
598                                                  \
599     dst0_r = alpha_r * res4_r;                   \
600     dst0_r -= alpha_i * res4_i;                  \
601     dst0_i = alpha_r * res4_i;                   \
602     dst0_i += alpha_i * res4_r;                  \
603                                                  \
604     dst1_r = alpha_r * res5_r;                   \
605     dst1_r -= alpha_i * res5_i;                  \
606     dst1_i = alpha_r * res5_i;                   \
607     dst1_i += alpha_i * res5_r;                  \
608                                                  \
609     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
610     ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
611                                                  \
612     dst0_r = alpha_r * res6_r;                   \
613     dst0_r -= alpha_i * res6_i;                  \
614     dst0_i = alpha_r * res6_i;                   \
615     dst0_i += alpha_i * res6_r;                  \
616                                                  \
617     dst1_r = alpha_r * res7_r;                   \
618     dst1_r -= alpha_i * res7_i;                  \
619     dst1_i = alpha_r * res7_i;                   \
620     dst1_i += alpha_i * res7_r;                  \
621                                                  \
622     ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
623     ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
624                                                  \
625     ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2);  \
626     ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2);  \
627 }
628
/*
 * ZGEMM_TRMM_SCALE_2X4_MSA
 *
 * Multiplies the four accumulated results res0, res2, res4 and res6 (split
 * into resN_r / resN_i vectors) by the complex scalar alpha_r/alpha_i and
 * writes them out through pc0, pc1, pc2 and pc3 respectively.
 *
 * Per result the arithmetic is the complex product alpha * resN:
 *     real = alpha_r * resN_r - alpha_i * resN_i
 *     imag = alpha_r * resN_i + alpha_i * resN_r
 * The values are assigned, not accumulated: nothing is loaded from pc0..pc3.
 *
 * Only the even-numbered accumulators are read; res1/res3/res5/res7 are not
 * touched by this macro.  dst0_r/dst0_i are scratch and dst0..dst3 are reused
 * for the pc2/pc3 half after the pc0/pc1 stores.
 *
 * NOTE(review): ILVRL_D2_DP and ST_DP2_INC are defined outside this chunk
 * (presumably in macros_msa.h).  ILVRL_D2_DP presumably re-interleaves split
 * real/imaginary vectors into (re, im) pairs, and ST_DP2_INC presumably stores
 * two vectors and advances the pointer argument - confirm there.
 *
 * Expands to a braced block; CNAME expands it without a trailing semicolon.
 */
629 #define ZGEMM_TRMM_SCALE_2X4_MSA              \
630 {                                             \
631     dst0_r = alpha_r * res0_r;                \
632     dst0_r -= alpha_i * res0_i;               \
633     dst0_i = alpha_r * res0_i;                \
634     dst0_i += alpha_i * res0_r;               \
635                                               \
636     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
637                                               \
638     dst0_r = alpha_r * res2_r;                \
639     dst0_r -= alpha_i * res2_i;               \
640     dst0_i = alpha_r * res2_i;                \
641     dst0_i += alpha_i * res2_r;               \
642                                               \
643     ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);  \
644                                               \
645     ST_DP2_INC(dst0, dst1, pc0, 2);           \
646     ST_DP2_INC(dst2, dst3, pc1, 2);           \
647                                               \
648     dst0_r = alpha_r * res4_r;                \
649     dst0_r -= alpha_i * res4_i;               \
650     dst0_i = alpha_r * res4_i;                \
651     dst0_i += alpha_i * res4_r;               \
652                                               \
653     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
654                                               \
655     dst0_r = alpha_r * res6_r;                \
656     dst0_r -= alpha_i * res6_i;               \
657     dst0_i = alpha_r * res6_i;                \
658     dst0_i += alpha_i * res6_r;               \
659                                               \
660     ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);  \
661                                               \
662     ST_DP2_INC(dst0, dst1, pc2, 2);           \
663     ST_DP2_INC(dst2, dst3, pc3, 2);           \
664 }
665
/*
 * ZGEMM_TRMM_SCALE_1X4_MSA
 *
 * Multiplies the two accumulated results res0 and res1 (split into resN_r /
 * resN_i vectors) by the complex scalar alpha_r/alpha_i and writes one vector
 * to each of pc0, pc1, pc2 and pc3.
 *
 * Per result the arithmetic is the complex product alpha * resN:
 *     real = alpha_r * resN_r - alpha_i * resN_i
 *     imag = alpha_r * resN_i + alpha_i * resN_r
 * The values are assigned, not accumulated: nothing is loaded from pc0..pc3.
 *
 * The two outputs of the res0 interleave go to pc0 and pc1; the two outputs
 * of the res1 interleave go to pc2 and pc3.  The pointers are not advanced by
 * any code visible in this macro; CNAME adds 2 to pc0..pc3 itself right after
 * the expansion.
 *
 * NOTE(review): ILVRL_D2_DP and ST_DP are defined outside this chunk
 * (presumably in macros_msa.h).  ILVRL_D2_DP presumably re-interleaves split
 * real/imaginary vectors into (re, im) pairs and ST_DP presumably stores one
 * vector at the given address - confirm there.
 *
 * Expands to a braced block; CNAME expands it without a trailing semicolon.
 */
666 #define ZGEMM_TRMM_SCALE_1X4_MSA              \
667 {                                             \
668     dst0_r = alpha_r * res0_r;                \
669     dst0_r -= alpha_i * res0_i;               \
670     dst0_i = alpha_r * res0_i;                \
671     dst0_i += alpha_i * res0_r;               \
672                                               \
673     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
674                                               \
675     dst0_r = alpha_r * res1_r;                \
676     dst0_r -= alpha_i * res1_i;               \
677     dst0_i = alpha_r * res1_i;                \
678     dst0_i += alpha_i * res1_r;               \
679                                               \
680     ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);  \
681                                               \
682     ST_DP(dst0, pc0);                         \
683     ST_DP(dst1, pc1);                         \
684     ST_DP(dst2, pc2);                         \
685     ST_DP(dst3, pc3);                         \
686 }
687
/*
 * ZGEMM_TRMM_SCALE_4X2_MSA
 *
 * Multiplies the four accumulated results res0..res3 (split into resN_r /
 * resN_i vectors) by the complex scalar alpha_r/alpha_i and writes them out
 * through pc0 and pc1.
 *
 * Per result the arithmetic is the complex product alpha * resN:
 *     real = alpha_r * resN_r - alpha_i * resN_i
 *     imag = alpha_r * resN_i + alpha_i * resN_r
 * The values are assigned, not accumulated: nothing is loaded from pc0/pc1.
 *
 * Destination mapping: res0, res1 -> pc0 (dst0..dst3);
 *                      res2, res3 -> pc1 (dst4..dst7).
 * dst0_r/dst0_i/dst1_r/dst1_i are scratch.
 *
 * NOTE(review): ILVRL_D2_DP and ST_DP4_INC are defined outside this chunk
 * (presumably in macros_msa.h).  ILVRL_D2_DP presumably re-interleaves split
 * real/imaginary vectors into (re, im) pairs, and ST_DP4_INC presumably stores
 * four vectors and advances the pointer argument - confirm there.
 *
 * Expands to a braced block; CNAME expands it without a trailing semicolon.
 */
688 #define ZGEMM_TRMM_SCALE_4X2_MSA                 \
689 {                                                \
690     dst0_r = alpha_r * res0_r;                   \
691     dst0_r -= alpha_i * res0_i;                  \
692     dst0_i = alpha_r * res0_i;                   \
693     dst0_i += alpha_i * res0_r;                  \
694                                                  \
695     dst1_r = alpha_r * res1_r;                   \
696     dst1_r -= alpha_i * res1_i;                  \
697     dst1_i = alpha_r * res1_i;                   \
698     dst1_i += alpha_i * res1_r;                  \
699                                                  \
700     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
701     ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
702                                                  \
703     dst0_r = alpha_r * res2_r;                   \
704     dst0_r -= alpha_i * res2_i;                  \
705     dst0_i = alpha_r * res2_i;                   \
706     dst0_i += alpha_i * res2_r;                  \
707                                                  \
708     dst1_r = alpha_r * res3_r;                   \
709     dst1_r -= alpha_i * res3_i;                  \
710     dst1_i = alpha_r * res3_i;                   \
711     dst1_i += alpha_i * res3_r;                  \
712                                                  \
713     ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
714     ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
715                                                  \
716     ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
717     ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);  \
718 }
719
/*
 * ZGEMM_TRMM_SCALE_2X2_MSA
 *
 * Multiplies the two accumulated results res0 and res2 (split into resN_r /
 * resN_i vectors) by the complex scalar alpha_r/alpha_i and writes them out
 * through pc0 and pc1 respectively.
 *
 * Per result the arithmetic is the complex product alpha * resN:
 *     real = alpha_r * resN_r - alpha_i * resN_i
 *     imag = alpha_r * resN_i + alpha_i * resN_r
 * The values are assigned, not accumulated: nothing is loaded from pc0/pc1.
 *
 * res1 is not read by this macro.  The pc0 store is issued before the res2
 * computation starts; dst0_r/dst0_i are scratch shared by both halves.
 *
 * NOTE(review): ILVRL_D2_DP and ST_DP2_INC are defined outside this chunk
 * (presumably in macros_msa.h).  ILVRL_D2_DP presumably re-interleaves split
 * real/imaginary vectors into (re, im) pairs, and ST_DP2_INC presumably stores
 * two vectors and advances the pointer argument - confirm there.
 *
 * Expands to a braced block; CNAME expands it without a trailing semicolon.
 */
720 #define ZGEMM_TRMM_SCALE_2X2_MSA              \
721 {                                             \
722     dst0_r = alpha_r * res0_r;                \
723     dst0_r -= alpha_i * res0_i;               \
724     dst0_i = alpha_r * res0_i;                \
725     dst0_i += alpha_i * res0_r;               \
726                                               \
727     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
728                                               \
729     ST_DP2_INC(dst0, dst1, pc0, 2);           \
730                                               \
731     dst0_r = alpha_r * res2_r;                \
732     dst0_r -= alpha_i * res2_i;               \
733     dst0_i = alpha_r * res2_i;                \
734     dst0_i += alpha_i * res2_r;               \
735                                               \
736     ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);  \
737                                               \
738     ST_DP2_INC(dst2, dst3, pc1, 2);           \
739 }
740
/*
 * ZGEMM_TRMM_SCALE_1X2_MSA
 *
 * Multiplies the accumulated result res0 (split into res0_r / res0_i vectors)
 * by the complex scalar alpha_r/alpha_i and writes one vector to pc0 and one
 * to pc1.
 *
 * The arithmetic is the complex product alpha * res0:
 *     real = alpha_r * res0_r - alpha_i * res0_i
 *     imag = alpha_r * res0_i + alpha_i * res0_r
 * The values are assigned, not accumulated: nothing is loaded from pc0/pc1.
 *
 * The pointers are not advanced by any code visible in this macro; CNAME adds
 * 2 to pc0 and pc1 itself right after the expansion.
 *
 * NOTE(review): ILVRL_D2_DP and ST_DP are defined outside this chunk
 * (presumably in macros_msa.h).  ILVRL_D2_DP presumably re-interleaves split
 * real/imaginary vectors into (re, im) pairs and ST_DP presumably stores one
 * vector at the given address - confirm there.
 *
 * Expands to a braced block; CNAME expands it without a trailing semicolon.
 */
741 #define ZGEMM_TRMM_SCALE_1X2_MSA              \
742 {                                             \
743     dst0_r = alpha_r * res0_r;                \
744     dst0_r -= alpha_i * res0_i;               \
745     dst0_i = alpha_r * res0_i;                \
746     dst0_i += alpha_i * res0_r;               \
747                                               \
748     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
749                                               \
750     ST_DP(dst0, pc0);                         \
751     ST_DP(dst1, pc1);                         \
752 }
753
/*
 * ZGEMM_TRMM_SCALE_4X1_MSA
 *
 * Multiplies the two accumulated results res0 and res1 (split into resN_r /
 * resN_i vectors) by the complex scalar alpha_r/alpha_i and writes all four
 * resulting vectors out through pc0.
 *
 * Per result the arithmetic is the complex product alpha * resN:
 *     real = alpha_r * resN_r - alpha_i * resN_i
 *     imag = alpha_r * resN_i + alpha_i * resN_r
 * The values are assigned, not accumulated: nothing is loaded from pc0.
 *
 * NOTE(review): ILVRL_D2_DP and ST_DP4_INC are defined outside this chunk
 * (presumably in macros_msa.h).  ILVRL_D2_DP presumably re-interleaves split
 * real/imaginary vectors into (re, im) pairs, and ST_DP4_INC presumably stores
 * four vectors and advances the pointer argument - confirm there.
 *
 * Expands to a braced block; CNAME expands it without a trailing semicolon.
 */
754 #define ZGEMM_TRMM_SCALE_4X1_MSA                 \
755 {                                                \
756     dst0_r = alpha_r * res0_r;                   \
757     dst0_r -= alpha_i * res0_i;                  \
758     dst0_i = alpha_r * res0_i;                   \
759     dst0_i += alpha_i * res0_r;                  \
760                                                  \
761     dst1_r = alpha_r * res1_r;                   \
762     dst1_r -= alpha_i * res1_i;                  \
763     dst1_i = alpha_r * res1_i;                   \
764     dst1_i += alpha_i * res1_r;                  \
765                                                  \
766     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
767     ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
768                                                  \
769     ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
770 }
771
/*
 * ZGEMM_TRMM_SCALE_2X1_MSA
 *
 * Multiplies the accumulated result res0 (split into res0_r / res0_i vectors)
 * by the complex scalar alpha_r/alpha_i and writes the two resulting vectors
 * out through pc0.
 *
 * The arithmetic is the complex product alpha * res0:
 *     real = alpha_r * res0_r - alpha_i * res0_i
 *     imag = alpha_r * res0_i + alpha_i * res0_r
 * The values are assigned, not accumulated: nothing is loaded from pc0.
 *
 * NOTE(review): ILVRL_D2_DP and ST_DP2_INC are defined outside this chunk
 * (presumably in macros_msa.h).  ILVRL_D2_DP presumably re-interleaves split
 * real/imaginary vectors into (re, im) pairs, and ST_DP2_INC presumably stores
 * two vectors and advances the pointer argument - confirm there.
 *
 * Expands to a braced block; CNAME expands it without a trailing semicolon.
 */
772 #define ZGEMM_TRMM_SCALE_2X1_MSA              \
773 {                                             \
774     dst0_r = alpha_r * res0_r;                \
775     dst0_r -= alpha_i * res0_i;               \
776     dst0_i = alpha_r * res0_i;                \
777     dst0_i += alpha_i * res0_r;               \
778                                               \
779     ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
780                                               \
781     ST_DP2_INC(dst0, dst1, pc0, 2);           \
782 }
783
/*
 * ZGEMM_TRMM_SCALE_1X1
 *
 * Scalar store of one complex element addressed by pc0.  The arithmetic is
 * the complex product (alphar + i*alphai) * (res0 + i*res1):
 *     pc0[0] = alphar * res0 - alphai * res1
 *     pc0[1] = alphar * res1 + alphai * res0
 * so pc0[0]/res0 act as the real parts and pc0[1]/res1 as the imaginary parts.
 *
 * Unlike ZGEMM_SCALE_1X1, the first statement for each component is a plain
 * assignment, so the previous contents of pc0[0] and pc0[1] are overwritten
 * rather than accumulated into.
 *
 * Expands to a braced block and relies on pc0, alphar, alphai, res0 and res1
 * being in scope at the expansion site.  pc0 itself is not advanced here.
 */
784 #define ZGEMM_TRMM_SCALE_1X1  \
785 {                             \
786     pc0[0] = alphar * res0;   \
787     pc0[0] -= alphai * res1;  \
788     pc0[1] = alphar * res1;   \
789     pc0[1] += alphai * res0;  \
790 }
791
792 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
793           FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
794 #ifdef TRMMKERNEL
795           , BLASLONG offset
796 #endif
797           )
798 {
799     BLASLONG i, j, l, temp;
800 #if defined(TRMMKERNEL)
801     BLASLONG off;
802 #endif
803     FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
804     FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i;
805     v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3;
806     v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
807     v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
808     v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i;
809     v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
810     v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
811
812     alpha_r = COPY_DOUBLE_TO_VECTOR(alphar);
813     alpha_i = COPY_DOUBLE_TO_VECTOR(alphai);
814
815 #if defined(TRMMKERNEL) && !defined(LEFT)
816     off = -offset;
817 #endif
818
819     for (j = (n >> 2); j--;)
820     {
821         pc0 = C;
822         pc1 = pc0 + 2 * ldc;
823         pc2 = pc1 + 2 * ldc;
824         pc3 = pc2 + 2 * ldc;
825
826         pa0 = A;
827
828 #if defined(TRMMKERNEL) && defined(LEFT)
829         off = offset;
830 #endif
831
832         for (i = (m >> 2); i--;)
833         {
834 #if defined(TRMMKERNEL)
835 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
836             pb0 = B;
837 #else
838             pa0 += off * 2 * 4;
839             pb0 = B + off * 2 * 4;
840 #endif
841
842 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
843             temp = k - off;
844 #elif defined(LEFT)
845             temp = off + 4; // number of values in A
846 #else
847             temp = off + 4; // number of values in B
848 #endif
849 #else
850             pb0 = B;
851             temp = k;
852 #endif
853
854 #ifdef ENABLE_PREFETCH
855             __asm__ __volatile__(
856                 "pref   0,   64(%[pa0])   \n\t"
857                 "pref   0,   96(%[pa0])   \n\t"
858                 "pref   0,   64(%[pb0])   \n\t"
859                 "pref   0,   96(%[pb0])   \n\t"
860
861                 :
862                 : [pa0] "r" (pa0), [pb0] "r" (pb0)
863             );
864 #endif
865
866 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
867             ZGEMM_KERNEL_4X4_MSA(, -, , +, +);
868 #endif
869 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
870             ZGEMM_KERNEL_4X4_MSA(, +, , +, -);
871 #endif
872 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
873             ZGEMM_KERNEL_4X4_MSA(, +, , -, +);
874 #endif
875 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
876             ZGEMM_KERNEL_4X4_MSA(, -, , -, -);
877 #endif
878
879             for (l = (temp - 1); l--;)
880             {
881 #ifdef ENABLE_PREFETCH
882             __asm__ __volatile__(
883                 "pref   0,   64(%[pa0])   \n\t"
884                 "pref   0,   96(%[pa0])   \n\t"
885                 "pref   0,   64(%[pb0])   \n\t"
886                 "pref   0,   96(%[pb0])   \n\t"
887
888                 :
889                 : [pa0] "r" (pa0), [pb0] "r" (pb0)
890             );
891 #endif
892
893 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
894                 ZGEMM_KERNEL_4X4_MSA(+, -, +, +,);
895 #endif
896 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
897                 ZGEMM_KERNEL_4X4_MSA(+, +, -, +,);
898 #endif
899 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
900                 ZGEMM_KERNEL_4X4_MSA(+, +, +, -,);
901 #endif
902 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
903                 ZGEMM_KERNEL_4X4_MSA(+, -, -, -,);
904 #endif
905             }
906
907 #if defined(TRMMKERNEL)
908             ZGEMM_TRMM_SCALE_4X4_MSA
909 #else
910             ZGEMM_SCALE_4X4_MSA
911 #endif
912
913 #if defined(TRMMKERNEL)
914 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
915             temp = k - off;
916 #ifdef LEFT
917             temp -= 4; // number of values in A
918 #else
919             temp -= 4; // number of values in B
920 #endif
921             pa0 += temp * 2 * 4;
922             pb0 += temp * 2 * 4;
923 #endif
924
925 #ifdef LEFT
926             off += 4; // number of values in A
927 #endif
928 #endif
929         }
930
931         if (m & 2)
932         {
933 #if defined(TRMMKERNEL)
934 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
935             pb0 = B;
936 #else
937             pa0 += off * 2 * 2;
938             pb0 = B + off * 2 * 4;
939 #endif
940
941 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
942             temp = k - off;
943 #elif defined(LEFT)
944             temp = off + 2; // number of values in A
945 #else
946             temp = off + 4; // number of values in B
947 #endif
948 #else
949             pb0 = B;
950             temp = k;
951 #endif
952
953 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
954             ZGEMM_KERNEL_2X4_MSA(, -, , +, +);
955 #endif
956 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
957             ZGEMM_KERNEL_2X4_MSA(, +, , +, -);
958 #endif
959 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
960             ZGEMM_KERNEL_2X4_MSA(, +, , -, +);
961 #endif
962 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
963             ZGEMM_KERNEL_2X4_MSA(, -, , -, -);
964 #endif
965
966             for (l = (temp - 1); l--;)
967             {
968 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
969                 ZGEMM_KERNEL_2X4_MSA(+, -, +, +,);
970 #endif
971 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
972                 ZGEMM_KERNEL_2X4_MSA(+, +, -, +,);
973 #endif
974 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
975                 ZGEMM_KERNEL_2X4_MSA(+, +, +, -,);
976 #endif
977 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
978                 ZGEMM_KERNEL_2X4_MSA(+, -, -, -,);
979 #endif
980             }
981
982 #if defined(TRMMKERNEL)
983             ZGEMM_TRMM_SCALE_2X4_MSA
984 #else
985             ZGEMM_SCALE_2X4_MSA
986 #endif
987
988 #if defined(TRMMKERNEL)
989 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
990             temp = k - off;
991 #ifdef LEFT
992             temp -= 2; // number of values in A
993 #else
994             temp -= 4; // number of values in B
995 #endif
996             pa0 += temp * 2 * 2;
997             pb0 += temp * 2 * 4;
998 #endif
999
1000 #ifdef LEFT
1001             off += 2; // number of values in A
1002 #endif
1003 #endif
1004         }
1005
1006         if (m & 1)
1007         {
1008 #if defined(TRMMKERNEL)
1009 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1010             pb0 = B;
1011 #else
1012             pa0 += off * 2 * 1;
1013             pb0 = B + off * 2 * 4;
1014 #endif
1015
1016 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1017             temp = k - off;
1018 #elif defined(LEFT)
1019             temp = off + 1; // number of values in A
1020 #else
1021             temp = off + 4; // number of values in B
1022 #endif
1023 #else
1024             pb0 = B;
1025             temp = k;
1026 #endif
1027
1028 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1029             ZGEMM_KERNEL_1X4_MSA(, -, , +, +);
1030 #endif
1031 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1032             ZGEMM_KERNEL_1X4_MSA(, +, , +, -);
1033 #endif
1034 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1035             ZGEMM_KERNEL_1X4_MSA(, +, , -, +);
1036 #endif
1037 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1038             ZGEMM_KERNEL_1X4_MSA(, -, , -, -);
1039 #endif
1040
1041             pa0 += 2;
1042
1043             for (l = (temp - 1); l--;)
1044             {
1045 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1046                 ZGEMM_KERNEL_1X4_MSA(+, -, +, +,);
1047 #endif
1048 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1049                 ZGEMM_KERNEL_1X4_MSA(+, +, -, +,);
1050 #endif
1051 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1052                 ZGEMM_KERNEL_1X4_MSA(+, +, +, -,);
1053 #endif
1054 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1055                 ZGEMM_KERNEL_1X4_MSA(+, -, -, -,);
1056 #endif
1057
1058                 pa0 += 2;
1059             }
1060
1061 #if defined(TRMMKERNEL)
1062             ZGEMM_TRMM_SCALE_1X4_MSA
1063 #else
1064             ZGEMM_SCALE_1X4_MSA
1065 #endif
1066             pc0 += 2;
1067             pc1 += 2;
1068             pc2 += 2;
1069             pc3 += 2;
1070
1071 #if defined(TRMMKERNEL)
1072 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1073             temp = k - off;
1074 #ifdef LEFT
1075             temp -= 1; // number of values in A
1076 #else
1077             temp -= 4; // number of values in B
1078 #endif
1079             pa0 += temp * 2 * 1;
1080             pb0 += temp * 2 * 4;
1081 #endif
1082
1083 #ifdef LEFT
1084             off += 1; // number of values in A
1085 #endif
1086 #endif
1087         }
1088
1089 #if defined(TRMMKERNEL) && !defined(LEFT)
1090         off += 4; // number of values in A
1091 #endif
1092
1093         B += (k << 3);
1094         C += (ldc << 3);
1095     }
1096
1097     if (n & 2)
1098     {
1099         pc0 = C;
1100         pc1 = pc0 + 2 * ldc;
1101
1102         pa0 = A;
1103
1104 #if defined(TRMMKERNEL) && defined(LEFT)
1105         off = offset;
1106 #endif
1107
1108         for (i = (m >> 2); i--;)
1109         {
1110 #if defined(TRMMKERNEL)
1111 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1112             pb0 = B;
1113 #else
1114             pa0 += off * 2 * 4;
1115             pb0 = B + off * 2 * 2;
1116 #endif
1117
1118 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1119             temp = k - off;
1120 #elif defined(LEFT)
1121             temp = off + 4; // number of values in A
1122 #else
1123             temp = off + 2; // number of values in B
1124 #endif
1125 #else
1126             pb0 = B;
1127             temp = k;
1128 #endif
1129
1130 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1131             ZGEMM_KERNEL_4X2_MSA(, -, , +, +);
1132 #endif
1133 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1134             ZGEMM_KERNEL_4X2_MSA(, +, , +, -);
1135 #endif
1136 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1137             ZGEMM_KERNEL_4X2_MSA(, +, , -, +);
1138 #endif
1139 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1140             ZGEMM_KERNEL_4X2_MSA(, -, , -, -);
1141 #endif
1142
1143             for (l = (temp - 1); l--;)
1144             {
1145 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1146                 ZGEMM_KERNEL_4X2_MSA(+, -, +, +,);
1147 #endif
1148 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1149                 ZGEMM_KERNEL_4X2_MSA(+, +, -, +,);
1150 #endif
1151 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1152                 ZGEMM_KERNEL_4X2_MSA(+, +, +, -,);
1153 #endif
1154 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1155                 ZGEMM_KERNEL_4X2_MSA(+, -, -, -,);
1156 #endif
1157             }
1158
1159 #if defined(TRMMKERNEL)
1160             ZGEMM_TRMM_SCALE_4X2_MSA
1161 #else
1162             ZGEMM_SCALE_4X2_MSA
1163 #endif
1164
1165 #if defined(TRMMKERNEL)
1166 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1167             temp = k - off;
1168 #ifdef LEFT
1169             temp -= 4; // number of values in A
1170 #else
1171             temp -= 2; // number of values in B
1172 #endif
1173             pa0 += temp * 2 * 4;
1174             pb0 += temp * 2 * 2;
1175 #endif
1176
1177 #ifdef LEFT
1178             off += 4; // number of values in A
1179 #endif
1180 #endif
1181         }
1182
1183         if (m & 2)
1184         {
1185 #if defined(TRMMKERNEL)
1186 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1187             pb0 = B;
1188 #else
1189             pa0 += off * 2 * 2;
1190             pb0 = B + off * 2 * 2;
1191 #endif
1192
1193 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1194             temp = k - off;
1195 #elif defined(LEFT)
1196             temp = off + 2; // number of values in A
1197 #else
1198             temp = off + 2; // number of values in B
1199 #endif
1200 #else
1201             pb0 = B;
1202             temp = k;
1203 #endif
1204
1205 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1206             ZGEMM_KERNEL_2X2_MSA(, -, , +, +);
1207 #endif
1208 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1209             ZGEMM_KERNEL_2X2_MSA(, +, , +, -);
1210 #endif
1211 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1212             ZGEMM_KERNEL_2X2_MSA(, +, , -, +);
1213 #endif
1214 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1215             ZGEMM_KERNEL_2X2_MSA(, -, , -, -);
1216 #endif
1217
1218             for (l = (temp - 1); l--;)
1219             {
1220 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1221                 ZGEMM_KERNEL_2X2_MSA(+, -, +, +,);
1222 #endif
1223 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1224                 ZGEMM_KERNEL_2X2_MSA(+, +, -, +,);
1225 #endif
1226 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1227                 ZGEMM_KERNEL_2X2_MSA(+, +, +, -,);
1228 #endif
1229 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1230                 ZGEMM_KERNEL_2X2_MSA(+, -, -, -,);
1231 #endif
1232             }
1233
1234 #if defined(TRMMKERNEL)
1235             ZGEMM_TRMM_SCALE_2X2_MSA
1236 #else
1237             ZGEMM_SCALE_2X2_MSA
1238 #endif
1239
1240 #if defined(TRMMKERNEL)
1241 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1242             temp = k - off;
1243 #ifdef LEFT
1244             temp -= 2; // number of values in A
1245 #else
1246             temp -= 2; // number of values in B
1247 #endif
1248             pa0 += temp * 2 * 2;
1249             pb0 += temp * 2 * 2;
1250 #endif
1251
1252 #ifdef LEFT
1253             off += 2; // number of values in A
1254 #endif
1255 #endif
1256         }
1257
1258         if (m & 1)
1259         {
1260 #if defined(TRMMKERNEL)
1261 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1262             pb0 = B;
1263 #else
1264             pa0 += off * 2 * 1;
1265             pb0 = B + off * 2 * 2;
1266 #endif
1267
1268 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1269             temp = k - off;
1270 #elif defined(LEFT)
1271             temp = off + 1; // number of values in A
1272 #else
1273             temp = off + 2; // number of values in B
1274 #endif
1275 #else
1276             pb0 = B;
1277             temp = k;
1278 #endif
1279
1280 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1281             ZGEMM_KERNEL_1X2_MSA(, -, , +, +);
1282 #endif
1283 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1284             ZGEMM_KERNEL_1X2_MSA(, +, , +, -);
1285 #endif
1286 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1287             ZGEMM_KERNEL_1X2_MSA(, +, , -, +);
1288 #endif
1289 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1290             ZGEMM_KERNEL_1X2_MSA(, -, , -, -);
1291 #endif
1292
1293             pa0 += 2;
1294
1295             for (l = (temp - 1); l--;)
1296             {
1297 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1298                 ZGEMM_KERNEL_1X2_MSA(+, -, +, +,);
1299 #endif
1300 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1301                 ZGEMM_KERNEL_1X2_MSA(+, +, -, +,);
1302 #endif
1303 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1304                 ZGEMM_KERNEL_1X2_MSA(+, +, +, -,);
1305 #endif
1306 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1307                 ZGEMM_KERNEL_1X2_MSA(+, -, -, -,);
1308 #endif
1309
1310                 pa0 += 2;
1311             }
1312
1313 #if defined(TRMMKERNEL)
1314             ZGEMM_TRMM_SCALE_1X2_MSA
1315 #else
1316             ZGEMM_SCALE_1X2_MSA
1317 #endif
1318             pc0 += 2;
1319             pc1 += 2;
1320
1321 #if defined(TRMMKERNEL)
1322 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1323             temp = k - off;
1324 #ifdef LEFT
1325             temp -= 1; // number of values in A
1326 #else
1327             temp -= 2; // number of values in B
1328 #endif
1329             pa0 += temp * 2 * 1;
1330             pb0 += temp * 2 * 2;
1331 #endif
1332
1333 #ifdef LEFT
1334             off += 1; // number of values in A
1335 #endif
1336 #endif
1337         }
1338
1339 #if defined(TRMMKERNEL) && !defined(LEFT)
1340         off += 2; // number of values in A
1341 #endif
1342
1343         B += (k << 2);
1344         C += (ldc << 2);
1345     }
1346
1347     if (n & 1)
1348     {
1349         pc0 = C;
1350         pa0 = A;
1351
1352 #if defined(TRMMKERNEL) && defined(LEFT)
1353         off = offset;
1354 #endif
1355
1356         for (i = (m >> 2); i--;)
1357         {
1358 #if defined(TRMMKERNEL)
1359 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1360             pb0 = B;
1361 #else
1362             pa0 += off * 2 * 4;
1363             pb0 = B + off * 2 * 1;
1364 #endif
1365
1366 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1367             temp = k - off;
1368 #elif defined(LEFT)
1369             temp = off + 4; // number of values in A
1370 #else
1371             temp = off + 1; // number of values in B
1372 #endif
1373 #else
1374             pb0 = B;
1375             temp = k;
1376 #endif
1377
1378 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1379             ZGEMM_KERNEL_4X1_MSA(, -, , +, +);
1380 #endif
1381 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1382             ZGEMM_KERNEL_4X1_MSA(, +, , +, -);
1383 #endif
1384 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1385             ZGEMM_KERNEL_4X1_MSA(, +, , -, +);
1386 #endif
1387 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1388             ZGEMM_KERNEL_4X1_MSA(, -, , -, -);
1389 #endif
1390
1391             pb0 += 2;
1392
1393             for (l = (temp - 1); l--;)
1394             {
1395 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1396                 ZGEMM_KERNEL_4X1_MSA(+, -, +, +,);
1397 #endif
1398 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1399                 ZGEMM_KERNEL_4X1_MSA(+, +, -, +,);
1400 #endif
1401 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1402                 ZGEMM_KERNEL_4X1_MSA(+, +, +, -,);
1403 #endif
1404 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1405                 ZGEMM_KERNEL_4X1_MSA(+, -, -, -,);
1406 #endif
1407
1408                 pb0 += 2;
1409             }
1410
1411 #if defined(TRMMKERNEL)
1412             ZGEMM_TRMM_SCALE_4X1_MSA
1413 #else
1414             ZGEMM_SCALE_4X1_MSA
1415 #endif
1416
1417 #if defined(TRMMKERNEL)
1418 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1419             temp = k - off;
1420 #ifdef LEFT
1421             temp -= 4; // number of values in A
1422 #else
1423             temp -= 1; // number of values in B
1424 #endif
1425             pa0 += temp * 2 * 4;
1426             pb0 += temp * 2 * 1;
1427 #endif
1428
1429 #ifdef LEFT
1430             off += 4; // number of values in A
1431 #endif
1432 #endif
1433         }
1434
1435         if (m & 2)
1436         {
1437 #if defined(TRMMKERNEL)
1438 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1439             pb0 = B;
1440 #else
1441             pa0 += off * 2 * 2;
1442             pb0 = B + off * 2 * 1;
1443 #endif
1444
1445 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1446             temp = k - off;
1447 #elif defined(LEFT)
1448             temp = off + 2; // number of values in A
1449 #else
1450             temp = off + 1; // number of values in B
1451 #endif
1452 #else
1453             pb0 = B;
1454             temp = k;
1455 #endif
1456
1457 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1458             ZGEMM_KERNEL_2X1_MSA(, -, , +, +);
1459 #endif
1460 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1461             ZGEMM_KERNEL_2X1_MSA(, +, , +, -);
1462 #endif
1463 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1464             ZGEMM_KERNEL_2X1_MSA(, +, , -, +);
1465 #endif
1466 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1467             ZGEMM_KERNEL_2X1_MSA(, -, , -, -);
1468 #endif
1469
1470             pb0 += 2;
1471
1472             for (l = (temp - 1); l--;)
1473             {
1474 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1475                 ZGEMM_KERNEL_2X1_MSA(+, -, +, +,);
1476 #endif
1477 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1478                 ZGEMM_KERNEL_2X1_MSA(+, +, -, +,);
1479 #endif
1480 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1481                 ZGEMM_KERNEL_2X1_MSA(+, +, +, -,);
1482 #endif
1483 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1484                 ZGEMM_KERNEL_2X1_MSA(+, -, -, -,);
1485 #endif
1486
1487                 pb0 += 2;
1488             }
1489
1490 #if defined(TRMMKERNEL)
1491             ZGEMM_TRMM_SCALE_2X1_MSA
1492 #else
1493             ZGEMM_SCALE_2X1_MSA
1494 #endif
1495
1496 #if defined(TRMMKERNEL)
1497 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1498             temp = k - off;
1499 #ifdef LEFT
1500             temp -= 2; // number of values in A
1501 #else
1502             temp -= 1; // number of values in B
1503 #endif
1504             pa0 += temp * 2 * 2;
1505             pb0 += temp * 2 * 1;
1506 #endif
1507
1508 #ifdef LEFT
1509             off += 2; // number of values in A
1510 #endif
1511 #endif
1512         }
1513
1514         if (m & 1)
1515         {
1516 #if defined(TRMMKERNEL)
1517 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1518             pb0 = B;
1519 #else
1520             pa0 += off * 2 * 1;
1521             pb0 = B + off * 2 * 1;
1522 #endif
1523
1524 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1525             temp = k - off;
1526 #elif defined(LEFT)
1527             temp = off + 1; // number of values in A
1528 #else
1529             temp = off + 1; // number of values in B
1530 #endif
1531 #else
1532             pb0 = B;
1533             temp = k;
1534 #endif
1535
1536 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1537             ZGEMM_KERNEL_1X1(, -, , +, +);
1538 #endif
1539 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1540             ZGEMM_KERNEL_1X1(, +, , +, -);
1541 #endif
1542 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1543             ZGEMM_KERNEL_1X1(, +, , -, +);
1544 #endif
1545 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1546             ZGEMM_KERNEL_1X1(, -, , -, -);
1547 #endif
1548
1549             pa0 += 2;
1550             pb0 += 2;
1551
1552             for (l = (temp - 1); l--;)
1553             {
1554 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1555                 ZGEMM_KERNEL_1X1(+, -, +, +,);
1556 #endif
1557 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1558                 ZGEMM_KERNEL_1X1(+, +, -, +,);
1559 #endif
1560 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1561                 ZGEMM_KERNEL_1X1(+, +, +, -,);
1562 #endif
1563 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1564                 ZGEMM_KERNEL_1X1(+, -, -, -,);
1565 #endif
1566
1567                 pa0 += 2;
1568                 pb0 += 2;
1569             }
1570
1571 #if defined(TRMMKERNEL)
1572             ZGEMM_TRMM_SCALE_1X1
1573 #else
1574             ZGEMM_SCALE_1X1
1575 #endif
1576             pc0 += 2;
1577
1578 #if defined(TRMMKERNEL)
1579 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1580             temp = k - off;
1581 #ifdef LEFT
1582             temp -= 1; // number of values in A
1583 #else
1584             temp -= 1; // number of values in B
1585 #endif
1586             pa0 += temp * 2 * 1;
1587             pb0 += temp * 2 * 1;
1588 #endif
1589
1590 #ifdef LEFT
1591             off += 1; // number of values in A
1592 #endif
1593 #endif
1594         }
1595
1596 #if defined(TRMMKERNEL) && !defined(LEFT)
1597         off += 1; // number of values in B
1598 #endif
1599
1600         B += (k << 1);
1601         C += (ldc << 1);
1602     }
1603
1604     return 0;
1605 }