1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/*
 * ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One multiply-accumulate step for the widest tile. It fetches src_a0..src_a3
 * via LD_DP4_INC(pa0, ...) and src_b0..src_b3 via LD_DP4_INC(pb0, ...),
 * splits the A vectors into src_a0r/src_a0i and src_a1r/src_a1i, and then,
 * for each src_bN, updates two accumulator pairs:
 *   src_b0 -> res0, res1    src_b1 -> res2, res3
 *   src_b2 -> res4, res5    src_b3 -> res6, res7
 *
 * Every accumulator pair receives the four products of a complex multiply:
 *   res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *   res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP0..OP3 are token-pasted onto '=' (empty gives '=', '+' gives '+=',
 * '-' gives '-='); OP4 is a sign written in front of the a_r * b_i product.
 *
 * LD_DP4_INC, PCKEVOD_D2_DP and SPLATI_D2_DP are defined in macros_msa.h,
 * which is not visible here. Presumably they load-and-advance, de-interleave
 * real/imaginary lanes, and broadcast each lane of a vector -- TODO confirm.
 * All variables named in the body must already exist at the expansion site.
 */
31 #define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \
33 LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
34 LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
36 PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
37 PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
/* src_b0 feeds res0 and res1 */ \
40 SPLATI_D2_DP(src_b0, src_br, src_bi); \
41 res0_r OP0## = src_a0r * src_br; \
42 res0_r OP1## = src_a0i * src_bi; \
43 res0_i OP2## = OP4 src_a0r * src_bi; \
44 res0_i OP3## = src_a0i * src_br; \
46 res1_r OP0## = src_a1r * src_br; \
47 res1_r OP1## = src_a1i * src_bi; \
48 res1_i OP2## = OP4 src_a1r * src_bi; \
49 res1_i OP3## = src_a1i * src_br; \
/* src_b1 feeds res2 and res3 */ \
52 SPLATI_D2_DP(src_b1, src_br, src_bi); \
53 res2_r OP0## = src_a0r * src_br; \
54 res2_r OP1## = src_a0i * src_bi; \
55 res2_i OP2## = OP4 src_a0r * src_bi; \
56 res2_i OP3## = src_a0i * src_br; \
58 res3_r OP0## = src_a1r * src_br; \
59 res3_r OP1## = src_a1i * src_bi; \
60 res3_i OP2## = OP4 src_a1r * src_bi; \
61 res3_i OP3## = src_a1i * src_br; \
/* src_b2 feeds res4 and res5 */ \
64 SPLATI_D2_DP(src_b2, src_br, src_bi); \
65 res4_r OP0## = src_a0r * src_br; \
66 res4_r OP1## = src_a0i * src_bi; \
67 res4_i OP2## = OP4 src_a0r * src_bi; \
68 res4_i OP3## = src_a0i * src_br; \
70 res5_r OP0## = src_a1r * src_br; \
71 res5_r OP1## = src_a1i * src_bi; \
72 res5_i OP2## = OP4 src_a1r * src_bi; \
73 res5_i OP3## = src_a1i * src_br; \
/* src_b3 feeds res6 and res7 */ \
76 SPLATI_D2_DP(src_b3, src_br, src_bi); \
77 res6_r OP0## = src_a0r * src_br; \
78 res6_r OP1## = src_a0i * src_bi; \
79 res6_i OP2## = OP4 src_a0r * src_bi; \
80 res6_i OP3## = src_a0i * src_br; \
82 res7_r OP0## = src_a1r * src_br; \
83 res7_r OP1## = src_a1i * src_bi; \
84 res7_i OP2## = OP4 src_a1r * src_bi; \
85 res7_i OP3## = src_a1i * src_br; \
/*
 * ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One multiply-accumulate step using two A vectors and four B vectors.
 * src_a0/src_a1 come from LD_DP2_INC(pa0, ...), src_b0..src_b3 from
 * LD_DP4_INC(pb0, ...). A is split into src_a0r/src_a0i, and each src_bN
 * updates a single accumulator pair:
 *   src_b0 -> res0, src_b1 -> res2, src_b2 -> res4, src_b3 -> res6.
 *
 * Per pair:
 *   res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *   res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP0..OP3 are token-pasted onto '=' (empty gives '=', '+' gives '+=',
 * '-' gives '-='); OP4 is a sign in front of the a_r * b_i product.
 *
 * The LD_, PCKEVOD_ and SPLATI_ helpers live in macros_msa.h (not visible
 * here); their exact lane behaviour is assumed from the names -- TODO confirm.
 */
88 #define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4) \
90 LD_DP2_INC(pa0, 2, src_a0, src_a1); \
91 LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
93 PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
/* src_b0 feeds res0 */ \
96 SPLATI_D2_DP(src_b0, src_br, src_bi); \
97 res0_r OP0## = src_a0r * src_br; \
98 res0_r OP1## = src_a0i * src_bi; \
99 res0_i OP2## = OP4 src_a0r * src_bi; \
100 res0_i OP3## = src_a0i * src_br; \
/* src_b1 feeds res2 */ \
103 SPLATI_D2_DP(src_b1, src_br, src_bi); \
104 res2_r OP0## = src_a0r * src_br; \
105 res2_r OP1## = src_a0i * src_bi; \
106 res2_i OP2## = OP4 src_a0r * src_bi; \
107 res2_i OP3## = src_a0i * src_br; \
/* src_b2 feeds res4 */ \
110 SPLATI_D2_DP(src_b2, src_br, src_bi); \
111 res4_r OP0## = src_a0r * src_br; \
112 res4_r OP1## = src_a0i * src_bi; \
113 res4_i OP2## = OP4 src_a0r * src_bi; \
114 res4_i OP3## = src_a0i * src_br; \
/* src_b3 feeds res6 */ \
117 SPLATI_D2_DP(src_b3, src_br, src_bi); \
118 res6_r OP0## = src_a0r * src_br; \
119 res6_r OP1## = src_a0i * src_bi; \
120 res6_i OP2## = OP4 src_a0r * src_bi; \
121 res6_i OP3## = src_a0i * src_br; \
/*
 * ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One multiply-accumulate step using a single A vector and four B vectors.
 * src_a0 is read with LD_DP(pa0); src_b0..src_b3 come from
 * LD_DP4_INC(pb0, ...). The same src_a0 is passed twice to PCKEVOD_D2_DP to
 * produce src_a0r/src_a0i. The B vectors are combined pairwise with
 * PCKEVOD_D2_DP rather than splatted, so one accumulator pair covers two B
 * vectors: res0 for src_b0/src_b1 and res1 for src_b2/src_b3.
 *
 * Per pair:
 *   res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *   res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP0..OP3 are token-pasted onto '=' (empty gives '=', '+' gives '+=',
 * '-' gives '-='); OP4 is a sign in front of the a_r * b_i product.
 *
 * LD_DP, LD_DP4_INC and PCKEVOD_D2_DP are defined in macros_msa.h (not
 * visible here); lane semantics are assumed from the names -- TODO confirm.
 */
124 #define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4) \
126 src_a0 = LD_DP(pa0); \
127 LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
129 PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \
131 /* 0th and 1st col */ \
132 PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \
133 res0_r OP0## = src_a0r * src_br; \
134 res0_r OP1## = src_a0i * src_bi; \
135 res0_i OP2## = OP4 src_a0r * src_bi; \
136 res0_i OP3## = src_a0i * src_br; \
138 /* 2nd and 3rd col */ \
139 PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi); \
140 res1_r OP0## = src_a0r * src_br; \
141 res1_r OP1## = src_a0i * src_bi; \
142 res1_i OP2## = OP4 src_a0r * src_bi; \
143 res1_i OP3## = src_a0i * src_br; \
/*
 * ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One multiply-accumulate step using four A vectors and two B vectors.
 * src_a0..src_a3 come from LD_DP4_INC(pa0, ...), src_b0/src_b1 from
 * LD_DP2_INC(pb0, ...). A is split into src_a0r/src_a0i and
 * src_a1r/src_a1i; src_b0 updates res0 and res1, src_b1 updates res2 and
 * res3.
 *
 * Per pair:
 *   res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *   res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP0..OP3 are token-pasted onto '=' (empty gives '=', '+' gives '+=',
 * '-' gives '-='); OP4 is a sign in front of the a_r * b_i product.
 *
 * The LD_, PCKEVOD_ and SPLATI_ helpers live in macros_msa.h (not visible
 * here); their exact lane behaviour is assumed from the names -- TODO confirm.
 */
146 #define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \
148 LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
149 LD_DP2_INC(pb0, 2, src_b0, src_b1); \
151 PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
152 PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
/* src_b0 feeds res0 and res1 */ \
155 SPLATI_D2_DP(src_b0, src_br, src_bi); \
156 res0_r OP0## = src_a0r * src_br; \
157 res0_r OP1## = src_a0i * src_bi; \
158 res0_i OP2## = OP4 src_a0r * src_bi; \
159 res0_i OP3## = src_a0i * src_br; \
161 res1_r OP0## = src_a1r * src_br; \
162 res1_r OP1## = src_a1i * src_bi; \
163 res1_i OP2## = OP4 src_a1r * src_bi; \
164 res1_i OP3## = src_a1i * src_br; \
/* src_b1 feeds res2 and res3 */ \
167 SPLATI_D2_DP(src_b1, src_br, src_bi); \
168 res2_r OP0## = src_a0r * src_br; \
169 res2_r OP1## = src_a0i * src_bi; \
170 res2_i OP2## = OP4 src_a0r * src_bi; \
171 res2_i OP3## = src_a0i * src_br; \
173 res3_r OP0## = src_a1r * src_br; \
174 res3_r OP1## = src_a1i * src_bi; \
175 res3_i OP2## = OP4 src_a1r * src_bi; \
176 res3_i OP3## = src_a1i * src_br; \
/*
 * ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One multiply-accumulate step using two A vectors and two B vectors, both
 * fetched with LD_DP2_INC (from pa0 and pb0). A is split into
 * src_a0r/src_a0i; src_b0 updates res0 and src_b1 updates res2.
 *
 * Per pair:
 *   res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *   res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP0..OP3 are token-pasted onto '=' (empty gives '=', '+' gives '+=',
 * '-' gives '-='); OP4 is a sign in front of the a_r * b_i product.
 *
 * LD_DP2_INC, PCKEVOD_D2_DP and SPLATI_D2_DP live in macros_msa.h (not
 * visible here); lane semantics are assumed from the names -- TODO confirm.
 */
179 #define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4) \
181 LD_DP2_INC(pa0, 2, src_a0, src_a1); \
182 LD_DP2_INC(pb0, 2, src_b0, src_b1); \
184 PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
/* src_b0 feeds res0 */ \
187 SPLATI_D2_DP(src_b0, src_br, src_bi); \
188 res0_r OP0## = src_a0r * src_br; \
189 res0_r OP1## = src_a0i * src_bi; \
190 res0_i OP2## = OP4 src_a0r * src_bi; \
191 res0_i OP3## = src_a0i * src_br; \
/* src_b1 feeds res2 */ \
194 SPLATI_D2_DP(src_b1, src_br, src_bi); \
195 res2_r OP0## = src_a0r * src_br; \
196 res2_r OP1## = src_a0i * src_bi; \
197 res2_i OP2## = OP4 src_a0r * src_bi; \
198 res2_i OP3## = src_a0i * src_br; \
/*
 * ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One multiply-accumulate step using a single A vector (LD_DP(pa0)) and two
 * B vectors (LD_DP2_INC(pb0, ...)). src_a0 is passed twice to PCKEVOD_D2_DP
 * to form src_a0r/src_a0i, and src_b0/src_b1 are combined by PCKEVOD_D2_DP
 * into src_br/src_bi, so the single pair res0 covers both B vectors.
 *
 *   res0_r OP0= a_r * b_r;        res0_r OP1= a_i * b_i;
 *   res0_i OP2= OP4 a_r * b_i;    res0_i OP3= a_i * b_r;
 * OP0..OP3 are token-pasted onto '=' (empty gives '=', '+' gives '+=',
 * '-' gives '-='); OP4 is a sign in front of the a_r * b_i product.
 *
 * LD_DP, LD_DP2_INC and PCKEVOD_D2_DP live in macros_msa.h (not visible
 * here); lane semantics are assumed from the names -- TODO confirm.
 */
201 #define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4) \
203 src_a0 = LD_DP(pa0); \
204 LD_DP2_INC(pb0, 2, src_b0, src_b1); \
206 PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \
208 /* 0th and 1st col */ \
209 PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \
210 res0_r OP0## = src_a0r * src_br; \
211 res0_r OP1## = src_a0i * src_bi; \
212 res0_i OP2## = OP4 src_a0r * src_bi; \
213 res0_i OP3## = src_a0i * src_br; \
/*
 * ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One multiply-accumulate step using four A vectors
 * (LD_DP4_INC(pa0, ...)) and a single B vector (LD_DP(pb0)). A is split
 * into src_a0r/src_a0i and src_a1r/src_a1i; src_b0 is expanded with
 * SPLATI_D2_DP and updates res0 and res1.
 *
 * Per pair:
 *   res_r OP0= a_r * b_r;        res_r OP1= a_i * b_i;
 *   res_i OP2= OP4 a_r * b_i;    res_i OP3= a_i * b_r;
 * OP0..OP3 are token-pasted onto '=' (empty gives '=', '+' gives '+=',
 * '-' gives '-='); OP4 is a sign in front of the a_r * b_i product.
 *
 * The LD_, PCKEVOD_ and SPLATI_ helpers live in macros_msa.h (not visible
 * here); their exact lane behaviour is assumed from the names -- TODO confirm.
 */
216 #define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
218 LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
219 src_b0 = LD_DP(pb0); \
221 PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
222 PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
/* src_b0 feeds res0 and res1 */ \
225 SPLATI_D2_DP(src_b0, src_br, src_bi); \
226 res0_r OP0## = src_a0r * src_br; \
227 res0_r OP1## = src_a0i * src_bi; \
228 res0_i OP2## = OP4 src_a0r * src_bi; \
229 res0_i OP3## = src_a0i * src_br; \
231 res1_r OP0## = src_a1r * src_br; \
232 res1_r OP1## = src_a1i * src_bi; \
233 res1_i OP2## = OP4 src_a1r * src_bi; \
234 res1_i OP3## = src_a1i * src_br; \
/*
 * ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One multiply-accumulate step using two A vectors (LD_DP2_INC(pa0, ...))
 * and a single B vector (LD_DP(pb0)). A is split into src_a0r/src_a0i,
 * src_b0 is expanded with SPLATI_D2_DP, and the pair res0 is updated:
 *
 *   res0_r OP0= a_r * b_r;        res0_r OP1= a_i * b_i;
 *   res0_i OP2= OP4 a_r * b_i;    res0_i OP3= a_i * b_r;
 * OP0..OP3 are token-pasted onto '=' (empty gives '=', '+' gives '+=',
 * '-' gives '-='); OP4 is a sign in front of the a_r * b_i product.
 *
 * The LD_, PCKEVOD_ and SPLATI_ helpers live in macros_msa.h (not visible
 * here); their exact lane behaviour is assumed from the names -- TODO confirm.
 */
237 #define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4) \
239 LD_DP2_INC(pa0, 2, src_a0, src_a1); \
240 src_b0 = LD_DP(pb0); \
242 PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
245 SPLATI_D2_DP(src_b0, src_br, src_bi); \
246 res0_r OP0## = src_a0r * src_br; \
247 res0_r OP1## = src_a0i * src_bi; \
248 res0_i OP2## = OP4 src_a0r * src_bi; \
249 res0_i OP3## = src_a0i * src_br; \
/*
 * ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar (non-vector) form of the complex multiply-accumulate step. It uses
 * the values a0_r, a0_i, b0_r, b0_i and updates res0 with the two products
 * a0_r * b0_r and a0_i * b0_i, and res1 with a0_r * b0_i and a0_i * b0_r.
 *
 * OP0..OP3 are token-pasted onto '=' (empty gives '=', '+' gives '+=',
 * '-' gives '-='); OP4 is a sign in front of the a0_r * b0_i product.
 */
252 #define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \
260 res0 OP0## = a0_r * b0_r; \
261 res0 OP1## = a0_i * b0_i; \
262 res1 OP2## = OP4 a0_r * b0_i; \
263 res1 OP3## = a0_i * b0_r; \
/*
 * ZGEMM_SCALE_4X4_MSA
 *
 * Adds alpha times the accumulated tile to the data addressed by pc0..pc3.
 * For each pointer, four vectors are fetched with LD_DP4, split into
 * real/imaginary halves (dst0_r/dst0_i, dst1_r/dst1_i) with PCKEVOD_D2_DP,
 * updated as
 *   dst_r += alpha_r * res_r - alpha_i * res_i
 *   dst_i += alpha_r * res_i + alpha_i * res_r
 * and recombined with ILVRL_D2_DP before being written with ST_DP4_INC.
 * Accumulator use: pc0 <- res0, res1; pc1 <- res2, res3;
 *                  pc2 <- res4, res5; pc3 <- res6, res7.
 *
 * dst0..dst7 and dst{0,1}_{r,i} are scratch and are overwritten. The
 * LD_/ST_/PCKEVOD_/ILVRL_ helpers live in macros_msa.h (not visible here);
 * presumably ST_DP4_INC also advances the pointer it stores through --
 * TODO confirm.
 */
266 #define ZGEMM_SCALE_4X4_MSA \
/* pc0 block: res0 and res1 */ \
268 LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
270 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
271 PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
273 dst0_r += alpha_r * res0_r; \
274 dst0_r -= alpha_i * res0_i; \
275 dst0_i += alpha_r * res0_i; \
276 dst0_i += alpha_i * res0_r; \
278 dst1_r += alpha_r * res1_r; \
279 dst1_r -= alpha_i * res1_i; \
280 dst1_i += alpha_r * res1_i; \
281 dst1_i += alpha_i * res1_r; \
283 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
284 ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
/* pc1 block: res2 and res3 */ \
286 LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \
288 PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
289 PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
291 dst0_r += alpha_r * res2_r; \
292 dst0_r -= alpha_i * res2_i; \
293 dst0_i += alpha_r * res2_i; \
294 dst0_i += alpha_i * res2_r; \
296 dst1_r += alpha_r * res3_r; \
297 dst1_r -= alpha_i * res3_i; \
298 dst1_i += alpha_r * res3_i; \
299 dst1_i += alpha_i * res3_r; \
301 ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
302 ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
304 ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
305 ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
/* pc2 block: res4 and res5 */ \
307 LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); \
309 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
310 PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
312 dst0_r += alpha_r * res4_r; \
313 dst0_r -= alpha_i * res4_i; \
314 dst0_i += alpha_r * res4_i; \
315 dst0_i += alpha_i * res4_r; \
317 dst1_r += alpha_r * res5_r; \
318 dst1_r -= alpha_i * res5_i; \
319 dst1_i += alpha_r * res5_i; \
320 dst1_i += alpha_i * res5_r; \
322 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
323 ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
/* pc3 block: res6 and res7 */ \
325 LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); \
327 PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
328 PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
330 dst0_r += alpha_r * res6_r; \
331 dst0_r -= alpha_i * res6_i; \
332 dst0_i += alpha_r * res6_i; \
333 dst0_i += alpha_i * res6_r; \
335 dst1_r += alpha_r * res7_r; \
336 dst1_r -= alpha_i * res7_i; \
337 dst1_i += alpha_r * res7_i; \
338 dst1_i += alpha_i * res7_r; \
340 ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
341 ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
343 ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \
344 ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \
/*
 * ZGEMM_SCALE_2X4_MSA
 *
 * Adds alpha times the accumulated tile to the data addressed by pc0..pc3,
 * two vectors per pointer. Each pair is fetched with LD_DP2, split into
 * dst0_r/dst0_i with PCKEVOD_D2_DP, updated as
 *   dst_r += alpha_r * res_r - alpha_i * res_i
 *   dst_i += alpha_r * res_i + alpha_i * res_r
 * recombined with ILVRL_D2_DP and written with ST_DP2_INC.
 * Accumulator use: pc0 <- res0, pc1 <- res2, pc2 <- res4, pc3 <- res6.
 *
 * dst0..dst3 and dst0_r/dst0_i are scratch and are overwritten. The
 * LD_/ST_/PCKEVOD_/ILVRL_ helpers live in macros_msa.h (not visible here);
 * presumably ST_DP2_INC also advances the pointer -- TODO confirm.
 */
347 #define ZGEMM_SCALE_2X4_MSA \
/* pc0 block: res0 */ \
349 LD_DP2(pc0, 2, dst0, dst1); \
351 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
353 dst0_r += alpha_r * res0_r; \
354 dst0_r -= alpha_i * res0_i; \
355 dst0_i += alpha_r * res0_i; \
356 dst0_i += alpha_i * res0_r; \
358 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
/* pc1 block: res2 */ \
360 LD_DP2(pc1, 2, dst2, dst3); \
362 PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
364 dst0_r += alpha_r * res2_r; \
365 dst0_r -= alpha_i * res2_i; \
366 dst0_i += alpha_r * res2_i; \
367 dst0_i += alpha_i * res2_r; \
369 ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
371 ST_DP2_INC(dst0, dst1, pc0, 2); \
372 ST_DP2_INC(dst2, dst3, pc1, 2); \
/* pc2 block: res4 */ \
374 LD_DP2(pc2, 2, dst0, dst1); \
376 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
378 dst0_r += alpha_r * res4_r; \
379 dst0_r -= alpha_i * res4_i; \
380 dst0_i += alpha_r * res4_i; \
381 dst0_i += alpha_i * res4_r; \
383 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
/* pc3 block: res6 */ \
385 LD_DP2(pc3, 2, dst2, dst3); \
387 PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
389 dst0_r += alpha_r * res6_r; \
390 dst0_r -= alpha_i * res6_i; \
391 dst0_i += alpha_r * res6_i; \
392 dst0_i += alpha_i * res6_r; \
394 ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
396 ST_DP2_INC(dst0, dst1, pc2, 2); \
397 ST_DP2_INC(dst2, dst3, pc3, 2); \
/*
 * ZGEMM_SCALE_1X4_MSA
 *
 * Applies the alpha update to two vector pairs. dst0/dst1 are split into
 * dst0_r/dst0_i with PCKEVOD_D2_DP, updated with res0 as
 *   dst_r += alpha_r * res_r - alpha_i * res_i
 *   dst_i += alpha_r * res_i + alpha_i * res_r
 * and written back into dst0/dst1 by ILVRL_D2_DP. The same sequence is then
 * applied to dst2/dst3 using res1.
 *
 * dst0_r/dst0_i are scratch and are overwritten. PCKEVOD_D2_DP and
 * ILVRL_D2_DP live in macros_msa.h (not visible here); presumably they
 * de-interleave and re-interleave real/imaginary lanes -- TODO confirm.
 */
400 #define ZGEMM_SCALE_1X4_MSA \
/* dst0, dst1 with res0 */ \
405 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
407 dst0_r += alpha_r * res0_r; \
408 dst0_r -= alpha_i * res0_i; \
409 dst0_i += alpha_r * res0_i; \
410 dst0_i += alpha_i * res0_r; \
412 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
/* dst2, dst3 with res1 */ \
417 PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
419 dst0_r += alpha_r * res1_r; \
420 dst0_r -= alpha_i * res1_i; \
421 dst0_i += alpha_r * res1_i; \
422 dst0_i += alpha_i * res1_r; \
424 ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
/*
 * ZGEMM_SCALE_4X2_MSA
 *
 * Adds alpha times the accumulated tile to the data addressed by pc0 and
 * pc1, four vectors per pointer. Each group is fetched with LD_DP4, split
 * into dst0_r/dst0_i and dst1_r/dst1_i with PCKEVOD_D2_DP, updated as
 *   dst_r += alpha_r * res_r - alpha_i * res_i
 *   dst_i += alpha_r * res_i + alpha_i * res_r
 * recombined with ILVRL_D2_DP and written with ST_DP4_INC.
 * Accumulator use: pc0 <- res0, res1; pc1 <- res2, res3.
 *
 * dst0..dst7 and dst{0,1}_{r,i} are scratch and are overwritten. The
 * LD_/ST_/PCKEVOD_/ILVRL_ helpers live in macros_msa.h (not visible here);
 * presumably ST_DP4_INC also advances the pointer -- TODO confirm.
 */
432 #define ZGEMM_SCALE_4X2_MSA \
/* pc0 block: res0 and res1 */ \
434 LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
436 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
437 PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
439 dst0_r += alpha_r * res0_r; \
440 dst0_r -= alpha_i * res0_i; \
441 dst0_i += alpha_r * res0_i; \
442 dst0_i += alpha_i * res0_r; \
444 dst1_r += alpha_r * res1_r; \
445 dst1_r -= alpha_i * res1_i; \
446 dst1_i += alpha_r * res1_i; \
447 dst1_i += alpha_i * res1_r; \
449 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
450 ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
/* pc1 block: res2 and res3 */ \
452 LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \
454 PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
455 PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
457 dst0_r += alpha_r * res2_r; \
458 dst0_r -= alpha_i * res2_i; \
459 dst0_i += alpha_r * res2_i; \
460 dst0_i += alpha_i * res2_r; \
462 dst1_r += alpha_r * res3_r; \
463 dst1_r -= alpha_i * res3_i; \
464 dst1_i += alpha_r * res3_i; \
465 dst1_i += alpha_i * res3_r; \
467 ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
468 ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
470 ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
471 ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
/*
 * ZGEMM_SCALE_2X2_MSA
 *
 * Adds alpha times the accumulated tile to the data addressed by pc0 and
 * pc1, two vectors per pointer. Each pair is fetched with LD_DP2, split
 * into dst0_r/dst0_i with PCKEVOD_D2_DP, updated as
 *   dst_r += alpha_r * res_r - alpha_i * res_i
 *   dst_i += alpha_r * res_i + alpha_i * res_r
 * recombined with ILVRL_D2_DP and written with ST_DP2_INC.
 * Accumulator use: pc0 <- res0, pc1 <- res2.
 *
 * dst0..dst3 and dst0_r/dst0_i are scratch and are overwritten. The
 * LD_/ST_/PCKEVOD_/ILVRL_ helpers live in macros_msa.h (not visible here);
 * presumably ST_DP2_INC also advances the pointer -- TODO confirm.
 */
474 #define ZGEMM_SCALE_2X2_MSA \
/* pc0 block: res0 */ \
476 LD_DP2(pc0, 2, dst0, dst1); \
478 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
480 dst0_r += alpha_r * res0_r; \
481 dst0_r -= alpha_i * res0_i; \
482 dst0_i += alpha_r * res0_i; \
483 dst0_i += alpha_i * res0_r; \
485 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
487 ST_DP2_INC(dst0, dst1, pc0, 2); \
/* pc1 block: res2 */ \
489 LD_DP2(pc1, 2, dst2, dst3); \
491 PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
493 dst0_r += alpha_r * res2_r; \
494 dst0_r -= alpha_i * res2_i; \
495 dst0_i += alpha_r * res2_i; \
496 dst0_i += alpha_i * res2_r; \
498 ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
500 ST_DP2_INC(dst2, dst3, pc1, 2); \
/*
 * ZGEMM_SCALE_1X2_MSA
 *
 * Applies the alpha update to one vector pair. dst0/dst1 are split into
 * dst0_r/dst0_i with PCKEVOD_D2_DP, updated with res0 as
 *   dst_r += alpha_r * res_r - alpha_i * res_i
 *   dst_i += alpha_r * res_i + alpha_i * res_r
 * and written back into dst0/dst1 by ILVRL_D2_DP.
 *
 * dst0_r/dst0_i are scratch and are overwritten. PCKEVOD_D2_DP and
 * ILVRL_D2_DP live in macros_msa.h (not visible here); presumably they
 * de-interleave and re-interleave real/imaginary lanes -- TODO confirm.
 */
503 #define ZGEMM_SCALE_1X2_MSA \
508 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
510 dst0_r += alpha_r * res0_r; \
511 dst0_r -= alpha_i * res0_i; \
512 dst0_i += alpha_r * res0_i; \
513 dst0_i += alpha_i * res0_r; \
515 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
/*
 * ZGEMM_SCALE_4X1_MSA
 *
 * Adds alpha times the accumulated tile to the four vectors addressed by
 * pc0. They are fetched with LD_DP4, split into dst0_r/dst0_i and
 * dst1_r/dst1_i with PCKEVOD_D2_DP, updated with res0 and res1 as
 *   dst_r += alpha_r * res_r - alpha_i * res_i
 *   dst_i += alpha_r * res_i + alpha_i * res_r
 * recombined with ILVRL_D2_DP and written with ST_DP4_INC.
 *
 * dst0..dst3 and dst{0,1}_{r,i} are scratch and are overwritten. The
 * LD_/ST_/PCKEVOD_/ILVRL_ helpers live in macros_msa.h (not visible here);
 * presumably ST_DP4_INC also advances the pointer -- TODO confirm.
 */
521 #define ZGEMM_SCALE_4X1_MSA \
523 LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
525 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
526 PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
528 dst0_r += alpha_r * res0_r; \
529 dst0_r -= alpha_i * res0_i; \
530 dst0_i += alpha_r * res0_i; \
531 dst0_i += alpha_i * res0_r; \
533 dst1_r += alpha_r * res1_r; \
534 dst1_r -= alpha_i * res1_i; \
535 dst1_i += alpha_r * res1_i; \
536 dst1_i += alpha_i * res1_r; \
538 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
539 ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
541 ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
/*
 * ZGEMM_SCALE_2X1_MSA
 *
 * Adds alpha times the accumulated tile to the two vectors addressed by
 * pc0. They are fetched with LD_DP2, split into dst0_r/dst0_i with
 * PCKEVOD_D2_DP, updated with res0 as
 *   dst_r += alpha_r * res_r - alpha_i * res_i
 *   dst_i += alpha_r * res_i + alpha_i * res_r
 * recombined with ILVRL_D2_DP and written with ST_DP2_INC.
 *
 * dst0, dst1 and dst0_r/dst0_i are scratch and are overwritten. The
 * LD_/ST_/PCKEVOD_/ILVRL_ helpers live in macros_msa.h (not visible here);
 * presumably ST_DP2_INC also advances the pointer -- TODO confirm.
 */
544 #define ZGEMM_SCALE_2X1_MSA \
546 LD_DP2(pc0, 2, dst0, dst1); \
548 PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
550 dst0_r += alpha_r * res0_r; \
551 dst0_r -= alpha_i * res0_i; \
552 dst0_i += alpha_r * res0_i; \
553 dst0_i += alpha_i * res0_r; \
555 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
557 ST_DP2_INC(dst0, dst1, pc0, 2); \
/*
 * ZGEMM_SCALE_1X1
 *
 * Scalar form of the alpha update for a single complex element stored as
 * pc0[0] (real part) and pc0[1] (imaginary part):
 *   pc0[0] += alphar * res0 - alphai * res1
 *   pc0[1] += alphar * res1 + alphai * res0
 * Uses the scalar alphar/alphai values rather than the alpha_r/alpha_i
 * vectors used by the _MSA variants.
 */
560 #define ZGEMM_SCALE_1X1 \
562 pc0[0] += alphar * res0; \
563 pc0[0] -= alphai * res1; \
564 pc0[1] += alphar * res1; \
565 pc0[1] += alphai * res0; \
/*
 * ZGEMM_TRMM_SCALE_4X4_MSA
 *
 * TRMM form of the 4x4 alpha update: the result is alpha times the
 * accumulators only, with no prior contents of pc0..pc3 added in (the first
 * statement of each real/imaginary pair is a plain assignment).
 *   dst_r = alpha_r * res_r - alpha_i * res_i
 *   dst_i = alpha_r * res_i + alpha_i * res_r
 * The halves are recombined with ILVRL_D2_DP and written with ST_DP4_INC.
 * Accumulator use: pc0 <- res0, res1; pc1 <- res2, res3;
 *                  pc2 <- res4, res5; pc3 <- res6, res7.
 *
 * dst0..dst7 and dst{0,1}_{r,i} are scratch and are overwritten. ILVRL_D2_DP
 * and ST_DP4_INC live in macros_msa.h (not visible here); presumably the
 * store also advances the pointer -- TODO confirm.
 */
568 #define ZGEMM_TRMM_SCALE_4X4_MSA \
/* pc0 block: res0 and res1 */ \
570 dst0_r = alpha_r * res0_r; \
571 dst0_r -= alpha_i * res0_i; \
572 dst0_i = alpha_r * res0_i; \
573 dst0_i += alpha_i * res0_r; \
575 dst1_r = alpha_r * res1_r; \
576 dst1_r -= alpha_i * res1_i; \
577 dst1_i = alpha_r * res1_i; \
578 dst1_i += alpha_i * res1_r; \
580 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
581 ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
/* pc1 block: res2 and res3 */ \
583 dst0_r = alpha_r * res2_r; \
584 dst0_r -= alpha_i * res2_i; \
585 dst0_i = alpha_r * res2_i; \
586 dst0_i += alpha_i * res2_r; \
588 dst1_r = alpha_r * res3_r; \
589 dst1_r -= alpha_i * res3_i; \
590 dst1_i = alpha_r * res3_i; \
591 dst1_i += alpha_i * res3_r; \
593 ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
594 ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
596 ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
597 ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
/* pc2 block: res4 and res5 */ \
599 dst0_r = alpha_r * res4_r; \
600 dst0_r -= alpha_i * res4_i; \
601 dst0_i = alpha_r * res4_i; \
602 dst0_i += alpha_i * res4_r; \
604 dst1_r = alpha_r * res5_r; \
605 dst1_r -= alpha_i * res5_i; \
606 dst1_i = alpha_r * res5_i; \
607 dst1_i += alpha_i * res5_r; \
609 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
610 ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
/* pc3 block: res6 and res7 */ \
612 dst0_r = alpha_r * res6_r; \
613 dst0_r -= alpha_i * res6_i; \
614 dst0_i = alpha_r * res6_i; \
615 dst0_i += alpha_i * res6_r; \
617 dst1_r = alpha_r * res7_r; \
618 dst1_r -= alpha_i * res7_i; \
619 dst1_i = alpha_r * res7_i; \
620 dst1_i += alpha_i * res7_r; \
622 ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
623 ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
625 ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \
626 ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \
/*
 * ZGEMM_TRMM_SCALE_2X4_MSA
 *
 * TRMM form of the 2x4 alpha update: the result is alpha times the
 * accumulators only (plain assignment, nothing read from pc0..pc3).
 *   dst_r = alpha_r * res_r - alpha_i * res_i
 *   dst_i = alpha_r * res_i + alpha_i * res_r
 * The halves are recombined with ILVRL_D2_DP and written with ST_DP2_INC.
 * Accumulator use: pc0 <- res0, pc1 <- res2, pc2 <- res4, pc3 <- res6.
 *
 * dst0..dst3 and dst0_r/dst0_i are scratch and are overwritten. ILVRL_D2_DP
 * and ST_DP2_INC live in macros_msa.h (not visible here); presumably the
 * store also advances the pointer -- TODO confirm.
 */
629 #define ZGEMM_TRMM_SCALE_2X4_MSA \
/* pc0 block: res0 */ \
631 dst0_r = alpha_r * res0_r; \
632 dst0_r -= alpha_i * res0_i; \
633 dst0_i = alpha_r * res0_i; \
634 dst0_i += alpha_i * res0_r; \
636 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
/* pc1 block: res2 */ \
638 dst0_r = alpha_r * res2_r; \
639 dst0_r -= alpha_i * res2_i; \
640 dst0_i = alpha_r * res2_i; \
641 dst0_i += alpha_i * res2_r; \
643 ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
645 ST_DP2_INC(dst0, dst1, pc0, 2); \
646 ST_DP2_INC(dst2, dst3, pc1, 2); \
/* pc2 block: res4 */ \
648 dst0_r = alpha_r * res4_r; \
649 dst0_r -= alpha_i * res4_i; \
650 dst0_i = alpha_r * res4_i; \
651 dst0_i += alpha_i * res4_r; \
653 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
/* pc3 block: res6 */ \
655 dst0_r = alpha_r * res6_r; \
656 dst0_r -= alpha_i * res6_i; \
657 dst0_i = alpha_r * res6_i; \
658 dst0_i += alpha_i * res6_r; \
660 ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
662 ST_DP2_INC(dst0, dst1, pc2, 2); \
663 ST_DP2_INC(dst2, dst3, pc3, 2); \
/*
 * ZGEMM_TRMM_SCALE_1X4_MSA
 *
 * TRMM form of the 1x4 alpha update: computes alpha times the accumulators
 * with plain assignment.
 *   dst_r = alpha_r * res_r - alpha_i * res_i
 *   dst_i = alpha_r * res_i + alpha_i * res_r
 * res0 produces dst0/dst1 and res1 produces dst2/dst3, each via
 * ILVRL_D2_DP on the real/imaginary halves.
 *
 * dst0_r/dst0_i are scratch and are overwritten. ILVRL_D2_DP lives in
 * macros_msa.h (not visible here); presumably it re-interleaves
 * real/imaginary lanes -- TODO confirm.
 */
666 #define ZGEMM_TRMM_SCALE_1X4_MSA \
/* res0 -> dst0, dst1 */ \
668 dst0_r = alpha_r * res0_r; \
669 dst0_r -= alpha_i * res0_i; \
670 dst0_i = alpha_r * res0_i; \
671 dst0_i += alpha_i * res0_r; \
673 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
/* res1 -> dst2, dst3 */ \
675 dst0_r = alpha_r * res1_r; \
676 dst0_r -= alpha_i * res1_i; \
677 dst0_i = alpha_r * res1_i; \
678 dst0_i += alpha_i * res1_r; \
680 ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
/*
 * ZGEMM_TRMM_SCALE_4X2_MSA
 *
 * TRMM form of the 4x2 alpha update: the result is alpha times the
 * accumulators only (plain assignment, nothing read from pc0/pc1).
 *   dst_r = alpha_r * res_r - alpha_i * res_i
 *   dst_i = alpha_r * res_i + alpha_i * res_r
 * The halves are recombined with ILVRL_D2_DP and written with ST_DP4_INC.
 * Accumulator use: pc0 <- res0, res1; pc1 <- res2, res3.
 *
 * dst0..dst7 and dst{0,1}_{r,i} are scratch and are overwritten. ILVRL_D2_DP
 * and ST_DP4_INC live in macros_msa.h (not visible here); presumably the
 * store also advances the pointer -- TODO confirm.
 */
688 #define ZGEMM_TRMM_SCALE_4X2_MSA \
/* pc0 block: res0 and res1 */ \
690 dst0_r = alpha_r * res0_r; \
691 dst0_r -= alpha_i * res0_i; \
692 dst0_i = alpha_r * res0_i; \
693 dst0_i += alpha_i * res0_r; \
695 dst1_r = alpha_r * res1_r; \
696 dst1_r -= alpha_i * res1_i; \
697 dst1_i = alpha_r * res1_i; \
698 dst1_i += alpha_i * res1_r; \
700 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
701 ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
/* pc1 block: res2 and res3 */ \
703 dst0_r = alpha_r * res2_r; \
704 dst0_r -= alpha_i * res2_i; \
705 dst0_i = alpha_r * res2_i; \
706 dst0_i += alpha_i * res2_r; \
708 dst1_r = alpha_r * res3_r; \
709 dst1_r -= alpha_i * res3_i; \
710 dst1_i = alpha_r * res3_i; \
711 dst1_i += alpha_i * res3_r; \
713 ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
714 ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
716 ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
717 ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
/*
 * ZGEMM_TRMM_SCALE_2X2_MSA
 *
 * TRMM form of the 2x2 alpha update: the result is alpha times the
 * accumulators only (plain assignment, nothing read from pc0/pc1).
 *   dst_r = alpha_r * res_r - alpha_i * res_i
 *   dst_i = alpha_r * res_i + alpha_i * res_r
 * res0 is recombined into dst0/dst1 and stored through pc0; res2 is
 * recombined into dst2/dst3 and stored through pc1 (ST_DP2_INC).
 *
 * dst0..dst3 and dst0_r/dst0_i are scratch and are overwritten. ILVRL_D2_DP
 * and ST_DP2_INC live in macros_msa.h (not visible here); presumably the
 * store also advances the pointer -- TODO confirm.
 */
720 #define ZGEMM_TRMM_SCALE_2X2_MSA \
/* pc0 block: res0 */ \
722 dst0_r = alpha_r * res0_r; \
723 dst0_r -= alpha_i * res0_i; \
724 dst0_i = alpha_r * res0_i; \
725 dst0_i += alpha_i * res0_r; \
727 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
729 ST_DP2_INC(dst0, dst1, pc0, 2); \
/* pc1 block: res2 */ \
731 dst0_r = alpha_r * res2_r; \
732 dst0_r -= alpha_i * res2_i; \
733 dst0_i = alpha_r * res2_i; \
734 dst0_i += alpha_i * res2_r; \
736 ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
738 ST_DP2_INC(dst2, dst3, pc1, 2); \
/*
 * ZGEMM_TRMM_SCALE_1X2_MSA
 *
 * TRMM form of the 1x2 alpha update: computes alpha times res0 with plain
 * assignment,
 *   dst0_r = alpha_r * res0_r - alpha_i * res0_i
 *   dst0_i = alpha_r * res0_i + alpha_i * res0_r
 * and recombines the halves into dst0/dst1 with ILVRL_D2_DP.
 *
 * dst0_r/dst0_i are scratch and are overwritten. ILVRL_D2_DP lives in
 * macros_msa.h (not visible here); presumably it re-interleaves
 * real/imaginary lanes -- TODO confirm.
 */
741 #define ZGEMM_TRMM_SCALE_1X2_MSA \
743 dst0_r = alpha_r * res0_r; \
744 dst0_r -= alpha_i * res0_i; \
745 dst0_i = alpha_r * res0_i; \
746 dst0_i += alpha_i * res0_r; \
748 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
/*
 * ZGEMM_TRMM_SCALE_4X1_MSA
 *
 * TRMM form of the 4x1 alpha update: the result is alpha times res0 and
 * res1 only (plain assignment, nothing read from pc0).
 *   dst_r = alpha_r * res_r - alpha_i * res_i
 *   dst_i = alpha_r * res_i + alpha_i * res_r
 * The halves are recombined into dst0..dst3 with ILVRL_D2_DP and written
 * through pc0 with ST_DP4_INC.
 *
 * dst0..dst3 and dst{0,1}_{r,i} are scratch and are overwritten. ILVRL_D2_DP
 * and ST_DP4_INC live in macros_msa.h (not visible here); presumably the
 * store also advances the pointer -- TODO confirm.
 */
754 #define ZGEMM_TRMM_SCALE_4X1_MSA \
756 dst0_r = alpha_r * res0_r; \
757 dst0_r -= alpha_i * res0_i; \
758 dst0_i = alpha_r * res0_i; \
759 dst0_i += alpha_i * res0_r; \
761 dst1_r = alpha_r * res1_r; \
762 dst1_r -= alpha_i * res1_i; \
763 dst1_i = alpha_r * res1_i; \
764 dst1_i += alpha_i * res1_r; \
766 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
767 ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
769 ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
/*
 * ZGEMM_TRMM_SCALE_2X1_MSA
 *
 * TRMM form of the 2x1 alpha update: the result is alpha times res0 only
 * (plain assignment, nothing read from pc0).
 *   dst0_r = alpha_r * res0_r - alpha_i * res0_i
 *   dst0_i = alpha_r * res0_i + alpha_i * res0_r
 * The halves are recombined into dst0/dst1 with ILVRL_D2_DP and written
 * through pc0 with ST_DP2_INC.
 *
 * dst0, dst1 and dst0_r/dst0_i are scratch and are overwritten. ILVRL_D2_DP
 * and ST_DP2_INC live in macros_msa.h (not visible here); presumably the
 * store also advances the pointer -- TODO confirm.
 */
772 #define ZGEMM_TRMM_SCALE_2X1_MSA \
774 dst0_r = alpha_r * res0_r; \
775 dst0_r -= alpha_i * res0_i; \
776 dst0_i = alpha_r * res0_i; \
777 dst0_i += alpha_i * res0_r; \
779 ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
781 ST_DP2_INC(dst0, dst1, pc0, 2); \
/*
 * ZGEMM_TRMM_SCALE_1X1
 *
 * Scalar TRMM form of the alpha update for a single complex element stored
 * as pc0[0] (real part) and pc0[1] (imaginary part). The previous contents
 * of pc0[0] and pc0[1] are overwritten, not accumulated into:
 *   pc0[0] = alphar * res0 - alphai * res1
 *   pc0[1] = alphar * res1 + alphai * res0
 * Uses the scalar alphar/alphai values rather than the alpha_r/alpha_i
 * vectors used by the _MSA variants.
 */
784 #define ZGEMM_TRMM_SCALE_1X1 \
786 pc0[0] = alphar * res0; \
787 pc0[0] -= alphai * res1; \
788 pc0[1] = alphar * res1; \
789 pc0[1] += alphai * res0; \
792 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
793 FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
799 BLASLONG i, j, l, temp;
800 #if defined(TRMMKERNEL)
803 FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
804 FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i;
805 v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3;
806 v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
807 v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
808 v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i;
809 v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
810 v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
812 alpha_r = COPY_DOUBLE_TO_VECTOR(alphar);
813 alpha_i = COPY_DOUBLE_TO_VECTOR(alphai);
815 #if defined(TRMMKERNEL) && !defined(LEFT)
819 for (j = (n >> 2); j--;)
828 #if defined(TRMMKERNEL) && defined(LEFT)
832 for (i = (m >> 2); i--;)
834 #if defined(TRMMKERNEL)
835 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
839 pb0 = B + off * 2 * 4;
842 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
845 temp = off + 4; // number of values in A
847 temp = off + 4; // number of values in B
854 #ifdef ENABLE_PREFETCH
855 __asm__ __volatile__(
856 "pref 0, 64(%[pa0]) \n\t"
857 "pref 0, 96(%[pa0]) \n\t"
858 "pref 0, 64(%[pb0]) \n\t"
859 "pref 0, 96(%[pb0]) \n\t"
862 : [pa0] "r" (pa0), [pb0] "r" (pb0)
866 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
867 ZGEMM_KERNEL_4X4_MSA(, -, , +, +);
869 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
870 ZGEMM_KERNEL_4X4_MSA(, +, , +, -);
872 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
873 ZGEMM_KERNEL_4X4_MSA(, +, , -, +);
875 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
876 ZGEMM_KERNEL_4X4_MSA(, -, , -, -);
879 for (l = (temp - 1); l--;)
881 #ifdef ENABLE_PREFETCH
882 __asm__ __volatile__(
883 "pref 0, 64(%[pa0]) \n\t"
884 "pref 0, 96(%[pa0]) \n\t"
885 "pref 0, 64(%[pb0]) \n\t"
886 "pref 0, 96(%[pb0]) \n\t"
889 : [pa0] "r" (pa0), [pb0] "r" (pb0)
893 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
894 ZGEMM_KERNEL_4X4_MSA(+, -, +, +,);
896 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
897 ZGEMM_KERNEL_4X4_MSA(+, +, -, +,);
899 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
900 ZGEMM_KERNEL_4X4_MSA(+, +, +, -,);
902 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
903 ZGEMM_KERNEL_4X4_MSA(+, -, -, -,);
907 #if defined(TRMMKERNEL)
908 ZGEMM_TRMM_SCALE_4X4_MSA
913 #if defined(TRMMKERNEL)
914 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
917 temp -= 4; // number of values in A
919 temp -= 4; // number of values in B
926 off += 4; // number of values in A
933 #if defined(TRMMKERNEL)
934 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
938 pb0 = B + off * 2 * 4;
941 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
944 temp = off + 2; // number of values in A
946 temp = off + 4; // number of values in B
953 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
954 ZGEMM_KERNEL_2X4_MSA(, -, , +, +);
956 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
957 ZGEMM_KERNEL_2X4_MSA(, +, , +, -);
959 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
960 ZGEMM_KERNEL_2X4_MSA(, +, , -, +);
962 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
963 ZGEMM_KERNEL_2X4_MSA(, -, , -, -);
966 for (l = (temp - 1); l--;)
968 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
969 ZGEMM_KERNEL_2X4_MSA(+, -, +, +,);
971 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
972 ZGEMM_KERNEL_2X4_MSA(+, +, -, +,);
974 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
975 ZGEMM_KERNEL_2X4_MSA(+, +, +, -,);
977 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
978 ZGEMM_KERNEL_2X4_MSA(+, -, -, -,);
982 #if defined(TRMMKERNEL)
983 ZGEMM_TRMM_SCALE_2X4_MSA
988 #if defined(TRMMKERNEL)
989 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
992 temp -= 2; // number of values in A
994 temp -= 4; // number of values in B
1001 off += 2; // number of values in A
1008 #if defined(TRMMKERNEL)
1009 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1013 pb0 = B + off * 2 * 4;
1016 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1019 temp = off + 1; // number of values in A
1021 temp = off + 4; // number of values in B
1028 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1029 ZGEMM_KERNEL_1X4_MSA(, -, , +, +);
1031 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1032 ZGEMM_KERNEL_1X4_MSA(, +, , +, -);
1034 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1035 ZGEMM_KERNEL_1X4_MSA(, +, , -, +);
1037 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1038 ZGEMM_KERNEL_1X4_MSA(, -, , -, -);
1043 for (l = (temp - 1); l--;)
1045 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1046 ZGEMM_KERNEL_1X4_MSA(+, -, +, +,);
1048 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1049 ZGEMM_KERNEL_1X4_MSA(+, +, -, +,);
1051 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1052 ZGEMM_KERNEL_1X4_MSA(+, +, +, -,);
1054 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1055 ZGEMM_KERNEL_1X4_MSA(+, -, -, -,);
1061 #if defined(TRMMKERNEL)
1062 ZGEMM_TRMM_SCALE_1X4_MSA
1071 #if defined(TRMMKERNEL)
1072 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1075 temp -= 1; // number of values in A
1077 temp -= 4; // number of values in B
1079 pa0 += temp * 2 * 1;
1080 pb0 += temp * 2 * 4;
1084 off += 1; // number of values in A
1089 #if defined(TRMMKERNEL) && !defined(LEFT)
1090 off += 4; // number of values in A
1100 pc1 = pc0 + 2 * ldc;
1104 #if defined(TRMMKERNEL) && defined(LEFT)
1108 for (i = (m >> 2); i--;)
1110 #if defined(TRMMKERNEL)
1111 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1115 pb0 = B + off * 2 * 2;
1118 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1121 temp = off + 4; // number of values in A
1123 temp = off + 2; // number of values in B
1130 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1131 ZGEMM_KERNEL_4X2_MSA(, -, , +, +);
1133 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1134 ZGEMM_KERNEL_4X2_MSA(, +, , +, -);
1136 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1137 ZGEMM_KERNEL_4X2_MSA(, +, , -, +);
1139 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1140 ZGEMM_KERNEL_4X2_MSA(, -, , -, -);
1143 for (l = (temp - 1); l--;)
1145 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1146 ZGEMM_KERNEL_4X2_MSA(+, -, +, +,);
1148 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1149 ZGEMM_KERNEL_4X2_MSA(+, +, -, +,);
1151 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1152 ZGEMM_KERNEL_4X2_MSA(+, +, +, -,);
1154 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1155 ZGEMM_KERNEL_4X2_MSA(+, -, -, -,);
1159 #if defined(TRMMKERNEL)
1160 ZGEMM_TRMM_SCALE_4X2_MSA
1165 #if defined(TRMMKERNEL)
1166 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1169 temp -= 4; // number of values in A
1171 temp -= 2; // number of values in B
1173 pa0 += temp * 2 * 4;
1174 pb0 += temp * 2 * 2;
1178 off += 4; // number of values in A
1185 #if defined(TRMMKERNEL)
1186 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1190 pb0 = B + off * 2 * 2;
1193 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1196 temp = off + 2; // number of values in A
1198 temp = off + 2; // number of values in B
1205 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1206 ZGEMM_KERNEL_2X2_MSA(, -, , +, +);
1208 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1209 ZGEMM_KERNEL_2X2_MSA(, +, , +, -);
1211 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1212 ZGEMM_KERNEL_2X2_MSA(, +, , -, +);
1214 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1215 ZGEMM_KERNEL_2X2_MSA(, -, , -, -);
1218 for (l = (temp - 1); l--;)
1220 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1221 ZGEMM_KERNEL_2X2_MSA(+, -, +, +,);
1223 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1224 ZGEMM_KERNEL_2X2_MSA(+, +, -, +,);
1226 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1227 ZGEMM_KERNEL_2X2_MSA(+, +, +, -,);
1229 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1230 ZGEMM_KERNEL_2X2_MSA(+, -, -, -,);
1234 #if defined(TRMMKERNEL)
1235 ZGEMM_TRMM_SCALE_2X2_MSA
1240 #if defined(TRMMKERNEL)
1241 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1244 temp -= 2; // number of values in A
1246 temp -= 2; // number of values in B
1248 pa0 += temp * 2 * 2;
1249 pb0 += temp * 2 * 2;
1253 off += 2; // number of values in A
1260 #if defined(TRMMKERNEL)
1261 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1265 pb0 = B + off * 2 * 2;
1268 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1271 temp = off + 1; // number of values in A
1273 temp = off + 2; // number of values in B
1280 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1281 ZGEMM_KERNEL_1X2_MSA(, -, , +, +);
1283 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1284 ZGEMM_KERNEL_1X2_MSA(, +, , +, -);
1286 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1287 ZGEMM_KERNEL_1X2_MSA(, +, , -, +);
1289 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1290 ZGEMM_KERNEL_1X2_MSA(, -, , -, -);
1295 for (l = (temp - 1); l--;)
1297 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1298 ZGEMM_KERNEL_1X2_MSA(+, -, +, +,);
1300 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1301 ZGEMM_KERNEL_1X2_MSA(+, +, -, +,);
1303 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1304 ZGEMM_KERNEL_1X2_MSA(+, +, +, -,);
1306 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1307 ZGEMM_KERNEL_1X2_MSA(+, -, -, -,);
1313 #if defined(TRMMKERNEL)
1314 ZGEMM_TRMM_SCALE_1X2_MSA
1321 #if defined(TRMMKERNEL)
1322 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1325 temp -= 1; // number of values in A
1327 temp -= 2; // number of values in B
1329 pa0 += temp * 2 * 1;
1330 pb0 += temp * 2 * 2;
1334 off += 1; // number of values in A
1339 #if defined(TRMMKERNEL) && !defined(LEFT)
1340 off += 2; // number of values in A
1352 #if defined(TRMMKERNEL) && defined(LEFT)
1356 for (i = (m >> 2); i--;)
1358 #if defined(TRMMKERNEL)
1359 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1363 pb0 = B + off * 2 * 1;
1366 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1369 temp = off + 4; // number of values in A
1371 temp = off + 1; // number of values in B
1378 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1379 ZGEMM_KERNEL_4X1_MSA(, -, , +, +);
1381 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1382 ZGEMM_KERNEL_4X1_MSA(, +, , +, -);
1384 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1385 ZGEMM_KERNEL_4X1_MSA(, +, , -, +);
1387 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1388 ZGEMM_KERNEL_4X1_MSA(, -, , -, -);
1393 for (l = (temp - 1); l--;)
1395 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1396 ZGEMM_KERNEL_4X1_MSA(+, -, +, +,);
1398 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1399 ZGEMM_KERNEL_4X1_MSA(+, +, -, +,);
1401 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1402 ZGEMM_KERNEL_4X1_MSA(+, +, +, -,);
1404 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1405 ZGEMM_KERNEL_4X1_MSA(+, -, -, -,);
1411 #if defined(TRMMKERNEL)
1412 ZGEMM_TRMM_SCALE_4X1_MSA
1417 #if defined(TRMMKERNEL)
1418 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1421 temp -= 4; // number of values in A
1423 temp -= 1; // number of values in B
1425 pa0 += temp * 2 * 4;
1426 pb0 += temp * 2 * 1;
1430 off += 4; // number of values in A
1437 #if defined(TRMMKERNEL)
1438 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1442 pb0 = B + off * 2 * 1;
1445 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1448 temp = off + 2; // number of values in A
1450 temp = off + 1; // number of values in B
1457 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1458 ZGEMM_KERNEL_2X1_MSA(, -, , +, +);
1460 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1461 ZGEMM_KERNEL_2X1_MSA(, +, , +, -);
1463 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1464 ZGEMM_KERNEL_2X1_MSA(, +, , -, +);
1466 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1467 ZGEMM_KERNEL_2X1_MSA(, -, , -, -);
1472 for (l = (temp - 1); l--;)
1474 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1475 ZGEMM_KERNEL_2X1_MSA(+, -, +, +,);
1477 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1478 ZGEMM_KERNEL_2X1_MSA(+, +, -, +,);
1480 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1481 ZGEMM_KERNEL_2X1_MSA(+, +, +, -,);
1483 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1484 ZGEMM_KERNEL_2X1_MSA(+, -, -, -,);
1490 #if defined(TRMMKERNEL)
1491 ZGEMM_TRMM_SCALE_2X1_MSA
1496 #if defined(TRMMKERNEL)
1497 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1500 temp -= 2; // number of values in A
1502 temp -= 1; // number of values in B
1504 pa0 += temp * 2 * 2;
1505 pb0 += temp * 2 * 1;
1509 off += 2; // number of values in A
1516 #if defined(TRMMKERNEL)
1517 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1521 pb0 = B + off * 2 * 1;
1524 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1527 temp = off + 1; // number of values in A
1529 temp = off + 1; // number of values in B
1536 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1537 ZGEMM_KERNEL_1X1(, -, , +, +);
1539 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1540 ZGEMM_KERNEL_1X1(, +, , +, -);
1542 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1543 ZGEMM_KERNEL_1X1(, +, , -, +);
1545 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1546 ZGEMM_KERNEL_1X1(, -, , -, -);
1552 for (l = (temp - 1); l--;)
1554 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1555 ZGEMM_KERNEL_1X1(+, -, +, +,);
1557 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1558 ZGEMM_KERNEL_1X1(+, +, -, +,);
1560 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1561 ZGEMM_KERNEL_1X1(+, +, +, -,);
1563 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1564 ZGEMM_KERNEL_1X1(+, -, -, -,);
1571 #if defined(TRMMKERNEL)
1572 ZGEMM_TRMM_SCALE_1X1
1578 #if defined(TRMMKERNEL)
1579 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1582 temp -= 1; // number of values in A
1584 temp -= 1; // number of values in B
1586 pa0 += temp * 2 * 1;
1587 pb0 += temp * 2 * 1;
1591 off += 1; // number of values in A
1596 #if defined(TRMMKERNEL) && !defined(LEFT)
1597 off += 1; // number of values in B