1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/*
 * CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step of the vector kernel that feeds res0..res7
 * (two A register pairs times four B elements).
 *
 * Loads four vectors through pa0 and two through pb0, splits the A data
 * into src_a0r/src_a0i and src_a1r/src_a1i with PCKEVOD_W2_SP, then for
 * each B element (SPLATI_W2_SP on src_b0/src_b1 at index 0 or 2, giving
 * src_br/src_bi) updates one pair of accumulators per A register pair:
 *     resN_r OP0= a_r * b_r;        resN_r OP1= a_i * b_i;
 *     resN_i OP2= (OP4 a_r) * b_i;  resN_i OP3= a_i * b_r;
 *
 * OP0..OP3 are token-pasted in front of '=': an empty argument gives a
 * plain assignment, '+' or '-' gives '+=' or '-='.  OP4 is a unary sign
 * (possibly empty) applied to a_r in the third product.
 *
 * NOTE(review): the _r/_i suffixes presumably mean real/imaginary parts,
 * and the *_INC loaders presumably advance pa0/pb0 -- confirm against
 * macros_msa.h.
 */
31 #define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4) \
33 LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
34 LD_SP2_INC(pb0, 4, src_b0, src_b1); \
36 PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
37 PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
40 SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
41 res0_r OP0## = src_a0r * src_br; \
42 res0_r OP1## = src_a0i * src_bi; \
43 res0_i OP2## = (OP4 src_a0r) * src_bi; \
44 res0_i OP3## = src_a0i * src_br; \
46 res1_r OP0## = src_a1r * src_br; \
47 res1_r OP1## = src_a1i * src_bi; \
48 res1_i OP2## = (OP4 src_a1r) * src_bi; \
49 res1_i OP3## = src_a1i * src_br; \
52 SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
53 res2_r OP0## = src_a0r * src_br; \
54 res2_r OP1## = src_a0i * src_bi; \
55 res2_i OP2## = (OP4 src_a0r) * src_bi; \
56 res2_i OP3## = src_a0i * src_br; \
58 res3_r OP0## = src_a1r * src_br; \
59 res3_r OP1## = src_a1i * src_bi; \
60 res3_i OP2## = (OP4 src_a1r) * src_bi; \
61 res3_i OP3## = src_a1i * src_br; \
64 SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \
65 res4_r OP0## = src_a0r * src_br; \
66 res4_r OP1## = src_a0i * src_bi; \
67 res4_i OP2## = (OP4 src_a0r) * src_bi; \
68 res4_i OP3## = src_a0i * src_br; \
70 res5_r OP0## = src_a1r * src_br; \
71 res5_r OP1## = src_a1i * src_bi; \
72 res5_i OP2## = (OP4 src_a1r) * src_bi; \
73 res5_i OP3## = src_a1i * src_br; \
76 SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \
77 res6_r OP0## = src_a0r * src_br; \
78 res6_r OP1## = src_a0i * src_bi; \
79 res6_i OP2## = (OP4 src_a0r) * src_bi; \
80 res6_i OP3## = src_a0i * src_br; \
82 res7_r OP0## = src_a1r * src_br; \
83 res7_r OP1## = src_a1i * src_bi; \
84 res7_i OP2## = (OP4 src_a1r) * src_bi; \
85 res7_i OP3## = src_a1i * src_br; \
/*
 * CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step of the vector kernel that feeds res0..res3
 * (two A register pairs times two B elements).
 *
 * Loads four vectors through pa0 and a single vector src_b0 from pb0
 * (plain LD_SP; no pb0 update is visible in this macro), splits A with
 * PCKEVOD_W2_SP, and uses SPLATI_W2_SP at index 0 and 2 of src_b0 to get
 * src_br/src_bi for the two B elements.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign applied to a_r in the third product.
 */
88 #define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4) \
90 LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
91 src_b0 = LD_SP(pb0); \
93 PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
94 PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
97 SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
98 res0_r OP0## = src_a0r * src_br; \
99 res0_r OP1## = src_a0i * src_bi; \
100 res0_i OP2## = (OP4 src_a0r) * src_bi; \
101 res0_i OP3## = src_a0i * src_br; \
103 res1_r OP0## = src_a1r * src_br; \
104 res1_r OP1## = src_a1i * src_bi; \
105 res1_i OP2## = (OP4 src_a1r) * src_bi; \
106 res1_i OP3## = src_a1i * src_br; \
109 SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
110 res2_r OP0## = src_a0r * src_br; \
111 res2_r OP1## = src_a0i * src_bi; \
112 res2_i OP2## = (OP4 src_a0r) * src_bi; \
113 res2_i OP3## = src_a0i * src_br; \
115 res3_r OP0## = src_a1r * src_br; \
116 res3_r OP1## = src_a1i * src_bi; \
117 res3_i OP2## = (OP4 src_a1r) * src_bi; \
118 res3_i OP3## = src_a1i * src_br; \
/*
 * CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step of the vector kernel that feeds res0 and res1
 * (two A register pairs times one B element).
 *
 * The single B element is read as one double-sized load from pb0 and
 * reinterpreted as v4f32; src_bi is used as a temporary for that raw value
 * before SPLATI_W2_SP overwrites it with the final src_br/src_bi pair.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign applied to a_r in the third product.
 */
121 #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \
123 LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
124 src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
125 SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
127 PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
128 PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i); \
131 res0_r OP0## = src_a0r * src_br; \
132 res0_r OP1## = src_a0i * src_bi; \
133 res0_i OP2## = (OP4 src_a0r) * src_bi; \
134 res0_i OP3## = src_a0i * src_br; \
136 res1_r OP0## = src_a1r * src_br; \
137 res1_r OP1## = src_a1i * src_bi; \
138 res1_i OP2## = (OP4 src_a1r) * src_bi; \
139 res1_i OP3## = src_a1i * src_br; \
/*
 * CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step of the vector kernel that feeds res0, res2, res4
 * and res6 (one A register pair times four B elements; the odd-numbered
 * accumulators are not touched here).
 *
 * Loads two vectors through pa0 and two through pb0, splits A into
 * src_a0r/src_a0i, and walks the four B elements via SPLATI_W2_SP on
 * src_b0/src_b1 at index 0 and 2.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-=').  OP4 is written without parentheses here; as a unary
 * prefix it still binds to src_a0r before the multiplication.
 */
142 #define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \
144 LD_SP2_INC(pa0, 4, src_a0, src_a1); \
145 LD_SP2_INC(pb0, 4, src_b0, src_b1); \
147 PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
150 SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
151 res0_r OP0## = src_a0r * src_br; \
152 res0_r OP1## = src_a0i * src_bi; \
153 res0_i OP2## = OP4 src_a0r * src_bi; \
154 res0_i OP3## = src_a0i * src_br; \
157 SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
158 res2_r OP0## = src_a0r * src_br; \
159 res2_r OP1## = src_a0i * src_bi; \
160 res2_i OP2## = OP4 src_a0r * src_bi; \
161 res2_i OP3## = src_a0i * src_br; \
164 SPLATI_W2_SP(src_b1, 0, src_br, src_bi); \
165 res4_r OP0## = src_a0r * src_br; \
166 res4_r OP1## = src_a0i * src_bi; \
167 res4_i OP2## = OP4 src_a0r * src_bi; \
168 res4_i OP3## = src_a0i * src_br; \
171 SPLATI_W2_SP(src_b1, 2, src_br, src_bi); \
172 res6_r OP0## = src_a0r * src_br; \
173 res6_r OP1## = src_a0i * src_bi; \
174 res6_i OP2## = OP4 src_a0r * src_bi; \
175 res6_i OP3## = src_a0i * src_br; \
/*
 * CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step of the vector kernel that feeds res0 and res2
 * (one A register pair times two B elements).
 *
 * Loads two vectors through pa0 and one vector src_b0 from pb0 (plain
 * LD_SP; no pb0 update is visible in this macro), then uses SPLATI_W2_SP
 * at index 0 and 2 of src_b0 for the two B elements.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign prefix on src_a0r.
 */
178 #define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \
180 LD_SP2_INC(pa0, 4, src_a0, src_a1); \
181 src_b0 = LD_SP(pb0); \
183 PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
186 SPLATI_W2_SP(src_b0, 0, src_br, src_bi); \
187 res0_r OP0## = src_a0r * src_br; \
188 res0_r OP1## = src_a0i * src_bi; \
189 res0_i OP2## = OP4 src_a0r * src_bi; \
190 res0_i OP3## = src_a0i * src_br; \
193 SPLATI_W2_SP(src_b0, 2, src_br, src_bi); \
194 res2_r OP0## = src_a0r * src_br; \
195 res2_r OP1## = src_a0i * src_bi; \
196 res2_i OP2## = OP4 src_a0r * src_bi; \
197 res2_i OP3## = src_a0i * src_br; \
/*
 * CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)
 *
 * One accumulation step of the vector kernel that feeds res0 only
 * (one A register pair times one B element).
 *
 * The B element is read as one double-sized load from pb0 and
 * reinterpreted as v4f32; src_bi holds that raw value until SPLATI_W2_SP
 * replaces it with the final src_br/src_bi pair.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign prefix on src_a0r.
 */
200 #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
202 LD_SP2_INC(pa0, 4, src_a0, src_a1); \
203 src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
204 SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
206 PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
209 res0_r OP0## = src_a0r * src_br; \
210 res0_r OP1## = src_a0i * src_bi; \
211 res0_i OP2## = OP4 src_a0r * src_bi; \
212 res0_i OP3## = src_a0i * src_br; \
/*
 * CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar accumulation step for two A values (a0, a1) against four B
 * values (b0..b3).  Accumulators come in pairs: the even-numbered one
 * takes the a_r*b_r and a_i*b_i products, the odd-numbered one takes
 * the a_r*b_i and a_i*b_r products.
 *     b0: res0/res1 (a0), res2/res3 (a1)
 *     b1: res4/res5 (a0), res6/res7 (a1)
 *     b2: res8/res9 (a0), res10/res11 (a1)
 *     b3: res12/res13 (a0), res14/res15 (a1)
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign prefix on a_r in the third product.
 *
 * NOTE(review): the assignments of a0_r/a0_i/a1_r/a1_i and bN_r/bN_i are
 * not among the lines visible here -- confirm where they are loaded.
 */
215 #define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4) \
222 res0 OP0## = a0_r * b0_r; \
223 res0 OP1## = a0_i * b0_i; \
224 res1 OP2## = OP4 a0_r * b0_i; \
225 res1 OP3## = a0_i * b0_r; \
229 res2 OP0## = a1_r * b0_r; \
230 res2 OP1## = a1_i * b0_i; \
231 res3 OP2## = OP4 a1_r * b0_i; \
232 res3 OP3## = a1_i * b0_r; \
237 res4 OP0## = a0_r * b1_r; \
238 res4 OP1## = a0_i * b1_i; \
239 res5 OP2## = OP4 a0_r * b1_i; \
240 res5 OP3## = a0_i * b1_r; \
242 res6 OP0## = a1_r * b1_r; \
243 res6 OP1## = a1_i * b1_i; \
244 res7 OP2## = OP4 a1_r * b1_i; \
245 res7 OP3## = a1_i * b1_r; \
250 res8 OP0## = a0_r * b2_r; \
251 res8 OP1## = a0_i * b2_i; \
252 res9 OP2## = OP4 a0_r * b2_i; \
253 res9 OP3## = a0_i * b2_r; \
255 res10 OP0## = a1_r * b2_r; \
256 res10 OP1## = a1_i * b2_i; \
257 res11 OP2## = OP4 a1_r * b2_i; \
258 res11 OP3## = a1_i * b2_r; \
263 res12 OP0## = a0_r * b3_r; \
264 res12 OP1## = a0_i * b3_i; \
265 res13 OP2## = OP4 a0_r * b3_i; \
266 res13 OP3## = a0_i * b3_r; \
268 res14 OP0## = a1_r * b3_r; \
269 res14 OP1## = a1_i * b3_i; \
270 res15 OP2## = OP4 a1_r * b3_i; \
271 res15 OP3## = a1_i * b3_r; \
/*
 * CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar accumulation step for two A values (a0, a1) against two B
 * values (b0, b1):
 *     b0: res0/res1 (a0), res2/res3 (a1)
 *     b1: res4/res5 (a0), res6/res7 (a1)
 * The even accumulator of each pair takes a_r*b_r and a_i*b_i, the odd
 * one takes a_r*b_i and a_i*b_r.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign prefix on a_r in the third product.
 *
 * NOTE(review): the loads of the a*_r/a*_i and b*_r/b*_i scalars are not
 * among the lines visible here.
 */
274 #define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4) \
282 res0 OP0## = a0_r * b0_r; \
283 res0 OP1## = a0_i * b0_i; \
284 res1 OP2## = OP4 a0_r * b0_i; \
285 res1 OP3## = a0_i * b0_r; \
289 res2 OP0## = a1_r * b0_r; \
290 res2 OP1## = a1_i * b0_i; \
291 res3 OP2## = OP4 a1_r * b0_i; \
292 res3 OP3## = a1_i * b0_r; \
297 res4 OP0## = a0_r * b1_r; \
298 res4 OP1## = a0_i * b1_i; \
299 res5 OP2## = OP4 a0_r * b1_i; \
300 res5 OP3## = a0_i * b1_r; \
302 res6 OP0## = a1_r * b1_r; \
303 res6 OP1## = a1_i * b1_i; \
304 res7 OP2## = OP4 a1_r * b1_i; \
305 res7 OP3## = a1_i * b1_r; \
/*
 * CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar accumulation step for two A values (a0, a1) against one B
 * value (b0): res0/res1 receive the a0 products, res2/res3 the a1
 * products.  The even accumulator takes a_r*b_r and a_i*b_i, the odd
 * one takes a_r*b_i and a_i*b_r.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign prefix on a_r in the third product.
 *
 * NOTE(review): the loads of the a*_r/a*_i and b0_r/b0_i scalars are not
 * among the lines visible here.
 */
308 #define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4) \
316 res0 OP0## = a0_r * b0_r; \
317 res0 OP1## = a0_i * b0_i; \
318 res1 OP2## = OP4 a0_r * b0_i; \
319 res1 OP3## = a0_i * b0_r; \
323 res2 OP0## = a1_r * b0_r; \
324 res2 OP1## = a1_i * b0_i; \
325 res3 OP2## = OP4 a1_r * b0_i; \
326 res3 OP3## = a1_i * b0_r; \
/*
 * CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar accumulation step for one A value (a0) against four B values:
 *     b0 -> res0/res1, b1 -> res2/res3, b2 -> res4/res5, b3 -> res6/res7
 * The even accumulator of each pair takes a_r*b_r and a_i*b_i, the odd
 * one takes a_r*b_i and a_i*b_r.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign prefix on a0_r in the third product.
 *
 * NOTE(review): the loads of a0_r/a0_i and bN_r/bN_i are not among the
 * lines visible here.
 */
329 #define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4) \
337 res0 OP0## = a0_r * b0_r; \
338 res0 OP1## = a0_i * b0_i; \
339 res1 OP2## = OP4 a0_r * b0_i; \
340 res1 OP3## = a0_i * b0_r; \
345 res2 OP0## = a0_r * b1_r; \
346 res2 OP1## = a0_i * b1_i; \
347 res3 OP2## = OP4 a0_r * b1_i; \
348 res3 OP3## = a0_i * b1_r; \
353 res4 OP0## = a0_r * b2_r; \
354 res4 OP1## = a0_i * b2_i; \
355 res5 OP2## = OP4 a0_r * b2_i; \
356 res5 OP3## = a0_i * b2_r; \
361 res6 OP0## = a0_r * b3_r; \
362 res6 OP1## = a0_i * b3_i; \
363 res7 OP2## = OP4 a0_r * b3_i; \
364 res7 OP3## = a0_i * b3_r; \
/*
 * CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar accumulation step for one A value (a0) against two B values:
 * b0 -> res0/res1 and b1 -> res2/res3.  The even accumulator of each pair
 * takes a_r*b_r and a_i*b_i, the odd one takes a_r*b_i and a_i*b_r.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign prefix on a0_r in the third product.
 *
 * NOTE(review): the loads of a0_r/a0_i and bN_r/bN_i are not among the
 * lines visible here.
 */
367 #define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4) \
375 res0 OP0## = a0_r * b0_r; \
376 res0 OP1## = a0_i * b0_i; \
377 res1 OP2## = OP4 a0_r * b0_i; \
378 res1 OP3## = a0_i * b0_r; \
383 res2 OP0## = a0_r * b1_r; \
384 res2 OP1## = a0_i * b1_i; \
385 res3 OP2## = OP4 a0_r * b1_i; \
386 res3 OP3## = a0_i * b1_r; \
/*
 * CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)
 *
 * Scalar accumulation step for one A value against one B value:
 * res0 takes a0_r*b0_r and a0_i*b0_i, res1 takes a0_r*b0_i and a0_i*b0_r.
 *
 * OP0..OP3 are pasted in front of '=' (empty -> '=', '+' -> '+=',
 * '-' -> '-='); OP4 is a unary sign prefix on a0_r in the third product.
 *
 * NOTE(review): the loads of a0_r/a0_i and b0_r/b0_i are not among the
 * lines visible here.
 */
389 #define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \
397 res0 OP0## = a0_r * b0_r; \
398 res0 OP1## = a0_i * b0_i; \
399 res1 OP2## = OP4 a0_r * b0_i; \
400 res1 OP3## = a0_i * b0_r; \
/*
 * CGEMM_SCALE_8X4_MSA
 *
 * Adds alpha times the vector accumulators res0..res7 to the four output
 * pointers pc0..pc3 (res0/res1 -> pc0, res2/res3 -> pc1, res4/res5 -> pc2,
 * res6/res7 -> pc3).
 *
 * For each pointer: load four vectors, split them into dst0_r/dst0_i and
 * dst1_r/dst1_i with PCKEVOD_W2_SP, then apply, as separate statements,
 *     dst_r += alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;   dst_i += alpha_i * res_r;
 * and finally re-interleave with ILVRL_W2_SP and store through
 * ST_SP4_INC.
 *
 * NOTE(review): ST_SP4_INC presumably advances the pcN pointer -- confirm
 * against macros_msa.h.
 */
403 #define CGEMM_SCALE_8X4_MSA \
405 LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
407 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
408 PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
410 dst0_r += alpha_r * res0_r; \
411 dst0_r -= alpha_i * res0_i; \
412 dst0_i += alpha_r * res0_i; \
413 dst0_i += alpha_i * res0_r; \
415 dst1_r += alpha_r * res1_r; \
416 dst1_r -= alpha_i * res1_i; \
417 dst1_i += alpha_r * res1_i; \
418 dst1_i += alpha_i * res1_r; \
420 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
421 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
423 ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
425 LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \
427 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
428 PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
430 dst0_r += alpha_r * res2_r; \
431 dst0_r -= alpha_i * res2_i; \
432 dst0_i += alpha_r * res2_i; \
433 dst0_i += alpha_i * res2_r; \
435 dst1_r += alpha_r * res3_r; \
436 dst1_r -= alpha_i * res3_i; \
437 dst1_i += alpha_r * res3_i; \
438 dst1_i += alpha_i * res3_r; \
440 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
441 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
443 ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
445 LD_SP4(pc2, 4, dst0, dst1, dst2, dst3); \
447 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
448 PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
450 dst0_r += alpha_r * res4_r; \
451 dst0_r -= alpha_i * res4_i; \
452 dst0_i += alpha_r * res4_i; \
453 dst0_i += alpha_i * res4_r; \
455 dst1_r += alpha_r * res5_r; \
456 dst1_r -= alpha_i * res5_i; \
457 dst1_i += alpha_r * res5_i; \
458 dst1_i += alpha_i * res5_r; \
460 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
461 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
463 ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \
465 LD_SP4(pc3, 4, dst0, dst1, dst2, dst3); \
467 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
468 PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
470 dst0_r += alpha_r * res6_r; \
471 dst0_r -= alpha_i * res6_i; \
472 dst0_i += alpha_r * res6_i; \
473 dst0_i += alpha_i * res6_r; \
475 dst1_r += alpha_r * res7_r; \
476 dst1_r -= alpha_i * res7_i; \
477 dst1_i += alpha_r * res7_i; \
478 dst1_i += alpha_i * res7_r; \
480 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
481 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
483 ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \
/*
 * CGEMM_SCALE_8X2_MSA
 *
 * Adds alpha times the vector accumulators res0..res3 to the two output
 * pointers pc0 (res0/res1) and pc1 (res2/res3).
 *
 * For each pointer: load four vectors, split them with PCKEVOD_W2_SP,
 * apply
 *     dst_r += alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;   dst_i += alpha_i * res_r;
 * then re-interleave with ILVRL_W2_SP and store through ST_SP4_INC.
 */
486 #define CGEMM_SCALE_8X2_MSA \
488 LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
490 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
491 PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
493 dst0_r += alpha_r * res0_r; \
494 dst0_r -= alpha_i * res0_i; \
495 dst0_i += alpha_r * res0_i; \
496 dst0_i += alpha_i * res0_r; \
498 dst1_r += alpha_r * res1_r; \
499 dst1_r -= alpha_i * res1_i; \
500 dst1_i += alpha_r * res1_i; \
501 dst1_i += alpha_i * res1_r; \
503 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
504 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
506 ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
508 LD_SP4(pc1, 4, dst0, dst1, dst2, dst3); \
510 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
511 PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
513 dst0_r += alpha_r * res2_r; \
514 dst0_r -= alpha_i * res2_i; \
515 dst0_i += alpha_r * res2_i; \
516 dst0_i += alpha_i * res2_r; \
518 dst1_r += alpha_r * res3_r; \
519 dst1_r -= alpha_i * res3_i; \
520 dst1_i += alpha_r * res3_i; \
521 dst1_i += alpha_i * res3_r; \
523 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
524 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
526 ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
/*
 * CGEMM_SCALE_8X1_MSA
 *
 * Adds alpha times the vector accumulators res0/res1 to the data at pc0.
 * Loads four vectors, splits them with PCKEVOD_W2_SP, applies
 *     dst_r += alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i += alpha_r * res_i;   dst_i += alpha_i * res_r;
 * then re-interleaves with ILVRL_W2_SP and stores through ST_SP4_INC.
 */
529 #define CGEMM_SCALE_8X1_MSA \
531 LD_SP4(pc0, 4, dst0, dst1, dst2, dst3); \
533 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
534 PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i); \
536 dst0_r += alpha_r * res0_r; \
537 dst0_r -= alpha_i * res0_i; \
538 dst0_i += alpha_r * res0_i; \
539 dst0_i += alpha_i * res0_r; \
541 dst1_r += alpha_r * res1_r; \
542 dst1_r -= alpha_i * res1_i; \
543 dst1_i += alpha_r * res1_i; \
544 dst1_i += alpha_i * res1_r; \
546 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
547 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
549 ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
/*
 * CGEMM_SCALE_4X4_MSA
 *
 * Adds alpha times the vector accumulators res0, res2, res4 and res6 to
 * the data at pc0, pc1, pc2 and pc3 respectively.
 *
 * For each pointer: load two vectors, split them into dst0_r/dst0_i with
 * PCKEVOD_W2_SP, apply
 *     dst0_r += alpha_r * res_r;   dst0_r -= alpha_i * res_i;
 *     dst0_i += alpha_r * res_i;   dst0_i += alpha_i * res_r;
 * then re-interleave with ILVRL_W2_SP and store through ST_SP2_INC.
 */
552 #define CGEMM_SCALE_4X4_MSA \
554 LD_SP2(pc0, 4, dst0, dst1); \
556 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
558 dst0_r += alpha_r * res0_r; \
559 dst0_r -= alpha_i * res0_i; \
560 dst0_i += alpha_r * res0_i; \
561 dst0_i += alpha_i * res0_r; \
563 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
565 ST_SP2_INC(dst0, dst1, pc0, 4); \
567 LD_SP2(pc1, 4, dst0, dst1); \
569 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
571 dst0_r += alpha_r * res2_r; \
572 dst0_r -= alpha_i * res2_i; \
573 dst0_i += alpha_r * res2_i; \
574 dst0_i += alpha_i * res2_r; \
576 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
578 ST_SP2_INC(dst0, dst1, pc1, 4); \
580 LD_SP2(pc2, 4, dst0, dst1); \
582 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
584 dst0_r += alpha_r * res4_r; \
585 dst0_r -= alpha_i * res4_i; \
586 dst0_i += alpha_r * res4_i; \
587 dst0_i += alpha_i * res4_r; \
589 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
591 ST_SP2_INC(dst0, dst1, pc2, 4); \
593 LD_SP2(pc3, 4, dst0, dst1); \
595 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
597 dst0_r += alpha_r * res6_r; \
598 dst0_r -= alpha_i * res6_i; \
599 dst0_i += alpha_r * res6_i; \
600 dst0_i += alpha_i * res6_r; \
602 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
604 ST_SP2_INC(dst0, dst1, pc3, 4); \
/*
 * CGEMM_SCALE_4X2_MSA
 *
 * Adds alpha times the vector accumulators res0 and res2 to the data at
 * pc0 and pc1 respectively.  Per pointer: load two vectors, split with
 * PCKEVOD_W2_SP, apply
 *     dst0_r += alpha_r * res_r;   dst0_r -= alpha_i * res_i;
 *     dst0_i += alpha_r * res_i;   dst0_i += alpha_i * res_r;
 * then re-interleave with ILVRL_W2_SP and store through ST_SP2_INC.
 */
607 #define CGEMM_SCALE_4X2_MSA \
609 LD_SP2(pc0, 4, dst0, dst1); \
611 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
613 dst0_r += alpha_r * res0_r; \
614 dst0_r -= alpha_i * res0_i; \
615 dst0_i += alpha_r * res0_i; \
616 dst0_i += alpha_i * res0_r; \
618 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
620 ST_SP2_INC(dst0, dst1, pc0, 4); \
622 LD_SP2(pc1, 4, dst0, dst1); \
624 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
626 dst0_r += alpha_r * res2_r; \
627 dst0_r -= alpha_i * res2_i; \
628 dst0_i += alpha_r * res2_i; \
629 dst0_i += alpha_i * res2_r; \
631 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
633 ST_SP2_INC(dst0, dst1, pc1, 4); \
/*
 * CGEMM_SCALE_4X1_MSA
 *
 * Adds alpha times the vector accumulator res0 to the data at pc0: load
 * two vectors, split with PCKEVOD_W2_SP, apply
 *     dst0_r += alpha_r * res0_r;   dst0_r -= alpha_i * res0_i;
 *     dst0_i += alpha_r * res0_i;   dst0_i += alpha_i * res0_r;
 * then re-interleave with ILVRL_W2_SP and store through ST_SP2_INC.
 */
636 #define CGEMM_SCALE_4X1_MSA \
638 LD_SP2(pc0, 4, dst0, dst1); \
640 PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i); \
642 dst0_r += alpha_r * res0_r; \
643 dst0_r -= alpha_i * res0_i; \
644 dst0_i += alpha_r * res0_i; \
645 dst0_i += alpha_i * res0_r; \
647 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
649 ST_SP2_INC(dst0, dst1, pc0, 4); \
/*
 * CGEMM_SCALE_2X4
 *
 * Scalar update of two interleaved value pairs at each of pc0..pc3 from
 * the accumulators res0..res15, scaled by (alphar, alphai):
 *     pcN[0] += alphar * re - alphai * im     (written as two statements)
 *     pcN[1] += alphar * im + alphai * re
 * and likewise for pcN[2]/pcN[3] with the next accumulator pair.
 * pc0 uses res0..res3, pc1 res4..res7, pc2 res8..res11, pc3 res12..res15.
 * The pointers themselves are not modified here.
 */
652 #define CGEMM_SCALE_2X4 \
655 pc0[0] += alphar * res0; \
656 pc0[0] -= alphai * res1; \
657 pc0[1] += alphar * res1; \
658 pc0[1] += alphai * res0; \
659 pc0[2] += alphar * res2; \
660 pc0[2] -= alphai * res3; \
661 pc0[3] += alphar * res3; \
662 pc0[3] += alphai * res2; \
665 pc1[0] += alphar * res4; \
666 pc1[0] -= alphai * res5; \
667 pc1[1] += alphar * res5; \
668 pc1[1] += alphai * res4; \
669 pc1[2] += alphar * res6; \
670 pc1[2] -= alphai * res7; \
671 pc1[3] += alphar * res7; \
672 pc1[3] += alphai * res6; \
675 pc2[0] += alphar * res8; \
676 pc2[0] -= alphai * res9; \
677 pc2[1] += alphar * res9; \
678 pc2[1] += alphai * res8; \
679 pc2[2] += alphar * res10; \
680 pc2[2] -= alphai * res11; \
681 pc2[3] += alphar * res11; \
682 pc2[3] += alphai * res10; \
685 pc3[0] += alphar * res12; \
686 pc3[0] -= alphai * res13; \
687 pc3[1] += alphar * res13; \
688 pc3[1] += alphai * res12; \
689 pc3[2] += alphar * res14; \
690 pc3[2] -= alphai * res15; \
691 pc3[3] += alphar * res15; \
692 pc3[3] += alphai * res14; \
/*
 * CGEMM_SCALE_2X2
 *
 * Scalar update of two interleaved value pairs at pc0 (res0..res3) and at
 * pc1 (res4..res7), scaled by (alphar, alphai):
 *     pcN[0] += alphar * re - alphai * im     (written as two statements)
 *     pcN[1] += alphar * im + alphai * re
 * and likewise for pcN[2]/pcN[3].  The pointers are not modified here.
 */
695 #define CGEMM_SCALE_2X2 \
698 pc0[0] += alphar * res0; \
699 pc0[0] -= alphai * res1; \
700 pc0[1] += alphar * res1; \
701 pc0[1] += alphai * res0; \
702 pc0[2] += alphar * res2; \
703 pc0[2] -= alphai * res3; \
704 pc0[3] += alphar * res3; \
705 pc0[3] += alphai * res2; \
708 pc1[0] += alphar * res4; \
709 pc1[0] -= alphai * res5; \
710 pc1[1] += alphar * res5; \
711 pc1[1] += alphai * res4; \
712 pc1[2] += alphar * res6; \
713 pc1[2] -= alphai * res7; \
714 pc1[3] += alphar * res7; \
715 pc1[3] += alphai * res6; \
/*
 * CGEMM_SCALE_2X1
 *
 * Scalar update of two interleaved value pairs at pc0, scaled by
 * (alphar, alphai): pc0[0]/pc0[1] from res0/res1 and pc0[2]/pc0[3] from
 * res2/res3, using
 *     pc0[k]   += alphar * re - alphai * im   (written as two statements)
 *     pc0[k+1] += alphar * im + alphai * re
 * The pointer is not modified here.
 */
718 #define CGEMM_SCALE_2X1 \
720 pc0[0] += alphar * res0; \
721 pc0[0] -= alphai * res1; \
722 pc0[1] += alphar * res1; \
723 pc0[1] += alphai * res0; \
725 pc0[2] += alphar * res2; \
726 pc0[2] -= alphai * res3; \
727 pc0[3] += alphar * res3; \
728 pc0[3] += alphai * res2; \
/*
 * CGEMM_SCALE_1X4
 *
 * Scalar update of one interleaved value pair at each of pc0..pc3,
 * scaled by (alphar, alphai): pc0 from res0/res1, pc1 from res2/res3,
 * pc2 from res4/res5, pc3 from res6/res7, using
 *     pcN[0] += alphar * re - alphai * im     (written as two statements)
 *     pcN[1] += alphar * im + alphai * re
 * The pointers are not modified here.
 */
731 #define CGEMM_SCALE_1X4 \
733 pc0[0] += alphar * res0; \
734 pc0[0] -= alphai * res1; \
735 pc0[1] += alphar * res1; \
736 pc0[1] += alphai * res0; \
738 pc1[0] += alphar * res2; \
739 pc1[0] -= alphai * res3; \
740 pc1[1] += alphar * res3; \
741 pc1[1] += alphai * res2; \
743 pc2[0] += alphar * res4; \
744 pc2[0] -= alphai * res5; \
745 pc2[1] += alphar * res5; \
746 pc2[1] += alphai * res4; \
748 pc3[0] += alphar * res6; \
749 pc3[0] -= alphai * res7; \
750 pc3[1] += alphar * res7; \
751 pc3[1] += alphai * res6; \
/*
 * CGEMM_SCALE_1X2
 *
 * Scalar update of one interleaved value pair at pc0 (from res0/res1) and
 * one at pc1 (from res2/res3), scaled by (alphar, alphai):
 *     pcN[0] += alphar * re - alphai * im     (written as two statements)
 *     pcN[1] += alphar * im + alphai * re
 *
 * res2/res3 are accumulated by CGEMM_KERNEL_1X2 from the same A value as
 * res0/res1, so they belong at offsets 0/1 of pc1, exactly as in
 * CGEMM_SCALE_1X4.  The previous version wrote them to pc1[2]/pc1[3],
 * i.e. one element pair further on, leaving pc1[0]/pc1[1] untouched.
 * The pointers are not modified here.
 */
#define CGEMM_SCALE_1X2          \
{                                \
    pc0[0] += alphar * res0;     \
    pc0[0] -= alphai * res1;     \
    pc0[1] += alphar * res1;     \
    pc0[1] += alphai * res0;     \
                                 \
    pc1[0] += alphar * res2;     \
    pc1[0] -= alphai * res3;     \
    pc1[1] += alphar * res3;     \
    pc1[1] += alphai * res2;     \
}
/*
 * CGEMM_SCALE_1X1
 *
 * Scalar update of one interleaved value pair at pc0 from res0/res1,
 * scaled by (alphar, alphai):
 *     pc0[0] += alphar * res0 - alphai * res1  (written as two statements)
 *     pc0[1] += alphar * res1 + alphai * res0
 * The pointer is not modified here.
 */
767 #define CGEMM_SCALE_1X1 \
769 pc0[0] += alphar * res0; \
770 pc0[0] -= alphai * res1; \
771 pc0[1] += alphar * res1; \
772 pc0[1] += alphai * res0; \
/*
 * CGEMM_TRMM_SCALE_8X4_MSA
 *
 * Overwriting counterpart of CGEMM_SCALE_8X4_MSA: nothing is loaded from
 * pc0..pc3.  For each pointer the result is computed from scratch as
 *     dst_r = alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i = alpha_r * res_i;   dst_i += alpha_i * res_r;
 * re-interleaved with ILVRL_W2_SP and stored through ST_SP4_INC
 * (res0/res1 -> pc0, res2/res3 -> pc1, res4/res5 -> pc2, res6/res7 -> pc3).
 */
775 #define CGEMM_TRMM_SCALE_8X4_MSA \
777 dst0_r = alpha_r * res0_r; \
778 dst0_r -= alpha_i * res0_i; \
779 dst0_i = alpha_r * res0_i; \
780 dst0_i += alpha_i * res0_r; \
782 dst1_r = alpha_r * res1_r; \
783 dst1_r -= alpha_i * res1_i; \
784 dst1_i = alpha_r * res1_i; \
785 dst1_i += alpha_i * res1_r; \
787 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
788 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
790 ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
792 dst0_r = alpha_r * res2_r; \
793 dst0_r -= alpha_i * res2_i; \
794 dst0_i = alpha_r * res2_i; \
795 dst0_i += alpha_i * res2_r; \
797 dst1_r = alpha_r * res3_r; \
798 dst1_r -= alpha_i * res3_i; \
799 dst1_i = alpha_r * res3_i; \
800 dst1_i += alpha_i * res3_r; \
802 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
803 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
805 ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
807 dst0_r = alpha_r * res4_r; \
808 dst0_r -= alpha_i * res4_i; \
809 dst0_i = alpha_r * res4_i; \
810 dst0_i += alpha_i * res4_r; \
812 dst1_r = alpha_r * res5_r; \
813 dst1_r -= alpha_i * res5_i; \
814 dst1_i = alpha_r * res5_i; \
815 dst1_i += alpha_i * res5_r; \
817 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
818 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
820 ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4); \
822 dst0_r = alpha_r * res6_r; \
823 dst0_r -= alpha_i * res6_i; \
824 dst0_i = alpha_r * res6_i; \
825 dst0_i += alpha_i * res6_r; \
827 dst1_r = alpha_r * res7_r; \
828 dst1_r -= alpha_i * res7_i; \
829 dst1_i = alpha_r * res7_i; \
830 dst1_i += alpha_i * res7_r; \
832 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
833 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
835 ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4); \
/*
 * CGEMM_TRMM_SCALE_8X2_MSA
 *
 * Overwriting counterpart of CGEMM_SCALE_8X2_MSA: nothing is loaded from
 * pc0/pc1.  Per pointer the result is computed as
 *     dst_r = alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i = alpha_r * res_i;   dst_i += alpha_i * res_r;
 * re-interleaved with ILVRL_W2_SP and stored through ST_SP4_INC
 * (res0/res1 -> pc0, res2/res3 -> pc1).
 */
838 #define CGEMM_TRMM_SCALE_8X2_MSA \
840 dst0_r = alpha_r * res0_r; \
841 dst0_r -= alpha_i * res0_i; \
842 dst0_i = alpha_r * res0_i; \
843 dst0_i += alpha_i * res0_r; \
845 dst1_r = alpha_r * res1_r; \
846 dst1_r -= alpha_i * res1_i; \
847 dst1_i = alpha_r * res1_i; \
848 dst1_i += alpha_i * res1_r; \
850 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
851 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
853 ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
855 dst0_r = alpha_r * res2_r; \
856 dst0_r -= alpha_i * res2_i; \
857 dst0_i = alpha_r * res2_i; \
858 dst0_i += alpha_i * res2_r; \
860 dst1_r = alpha_r * res3_r; \
861 dst1_r -= alpha_i * res3_i; \
862 dst1_i = alpha_r * res3_i; \
863 dst1_i += alpha_i * res3_r; \
865 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
866 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
868 ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4); \
/*
 * CGEMM_TRMM_SCALE_8X1_MSA
 *
 * Overwriting counterpart of CGEMM_SCALE_8X1_MSA: nothing is loaded from
 * pc0.  Computes
 *     dst_r = alpha_r * res_r;   dst_r -= alpha_i * res_i;
 *     dst_i = alpha_r * res_i;   dst_i += alpha_i * res_r;
 * for res0 and res1, re-interleaves with ILVRL_W2_SP and stores through
 * ST_SP4_INC at pc0.
 */
871 #define CGEMM_TRMM_SCALE_8X1_MSA \
873 dst0_r = alpha_r * res0_r; \
874 dst0_r -= alpha_i * res0_i; \
875 dst0_i = alpha_r * res0_i; \
876 dst0_i += alpha_i * res0_r; \
878 dst1_r = alpha_r * res1_r; \
879 dst1_r -= alpha_i * res1_i; \
880 dst1_i = alpha_r * res1_i; \
881 dst1_i += alpha_i * res1_r; \
883 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
884 ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3); \
886 ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4); \
/*
 * CGEMM_TRMM_SCALE_4X4_MSA
 *
 * Overwriting counterpart of CGEMM_SCALE_4X4_MSA: nothing is loaded from
 * pc0..pc3.  Per pointer the result is computed as
 *     dst0_r = alpha_r * res_r;   dst0_r -= alpha_i * res_i;
 *     dst0_i = alpha_r * res_i;   dst0_i += alpha_i * res_r;
 * re-interleaved with ILVRL_W2_SP and stored through ST_SP2_INC
 * (res0 -> pc0, res2 -> pc1, res4 -> pc2, res6 -> pc3).
 */
889 #define CGEMM_TRMM_SCALE_4X4_MSA \
891 dst0_r = alpha_r * res0_r; \
892 dst0_r -= alpha_i * res0_i; \
893 dst0_i = alpha_r * res0_i; \
894 dst0_i += alpha_i * res0_r; \
896 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
898 ST_SP2_INC(dst0, dst1, pc0, 4); \
900 dst0_r = alpha_r * res2_r; \
901 dst0_r -= alpha_i * res2_i; \
902 dst0_i = alpha_r * res2_i; \
903 dst0_i += alpha_i * res2_r; \
905 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
907 ST_SP2_INC(dst0, dst1, pc1, 4); \
909 dst0_r = alpha_r * res4_r; \
910 dst0_r -= alpha_i * res4_i; \
911 dst0_i = alpha_r * res4_i; \
912 dst0_i += alpha_i * res4_r; \
914 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
916 ST_SP2_INC(dst0, dst1, pc2, 4); \
918 dst0_r = alpha_r * res6_r; \
919 dst0_r -= alpha_i * res6_i; \
920 dst0_i = alpha_r * res6_i; \
921 dst0_i += alpha_i * res6_r; \
923 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
925 ST_SP2_INC(dst0, dst1, pc3, 4); \
/*
 * CGEMM_TRMM_SCALE_4X2_MSA
 *
 * Overwriting counterpart of CGEMM_SCALE_4X2_MSA: nothing is loaded from
 * pc0/pc1.  Per pointer the result is computed as
 *     dst0_r = alpha_r * res_r;   dst0_r -= alpha_i * res_i;
 *     dst0_i = alpha_r * res_i;   dst0_i += alpha_i * res_r;
 * re-interleaved with ILVRL_W2_SP and stored through ST_SP2_INC
 * (res0 -> pc0, res2 -> pc1).
 */
928 #define CGEMM_TRMM_SCALE_4X2_MSA \
930 dst0_r = alpha_r * res0_r; \
931 dst0_r -= alpha_i * res0_i; \
932 dst0_i = alpha_r * res0_i; \
933 dst0_i += alpha_i * res0_r; \
935 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
937 ST_SP2_INC(dst0, dst1, pc0, 4); \
939 dst0_r = alpha_r * res2_r; \
940 dst0_r -= alpha_i * res2_i; \
941 dst0_i = alpha_r * res2_i; \
942 dst0_i += alpha_i * res2_r; \
944 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
946 ST_SP2_INC(dst0, dst1, pc1, 4); \
/*
 * CGEMM_TRMM_SCALE_4X1_MSA
 *
 * Overwriting counterpart of CGEMM_SCALE_4X1_MSA: nothing is loaded from
 * pc0.  Computes
 *     dst0_r = alpha_r * res0_r;   dst0_r -= alpha_i * res0_i;
 *     dst0_i = alpha_r * res0_i;   dst0_i += alpha_i * res0_r;
 * re-interleaves with ILVRL_W2_SP and stores through ST_SP2_INC at pc0.
 */
949 #define CGEMM_TRMM_SCALE_4X1_MSA \
951 dst0_r = alpha_r * res0_r; \
952 dst0_r -= alpha_i * res0_i; \
953 dst0_i = alpha_r * res0_i; \
954 dst0_i += alpha_i * res0_r; \
956 ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1); \
958 ST_SP2_INC(dst0, dst1, pc0, 4); \
/*
 * CGEMM_TRMM_SCALE_2X4
 *
 * Overwriting counterpart of CGEMM_SCALE_2X4: each target element is
 * first assigned and then adjusted, so the previous contents of
 * pc0..pc3 are not read:
 *     pcN[k]   = alphar * re;   pcN[k]   -= alphai * im;
 *     pcN[k+1] = alphar * im;   pcN[k+1] += alphai * re;
 * pc0 uses res0..res3, pc1 res4..res7, pc2 res8..res11, pc3 res12..res15.
 * The pointers are not modified here.
 */
961 #define CGEMM_TRMM_SCALE_2X4 \
964 pc0[0] = alphar * res0; \
965 pc0[0] -= alphai * res1; \
966 pc0[1] = alphar * res1; \
967 pc0[1] += alphai * res0; \
968 pc0[2] = alphar * res2; \
969 pc0[2] -= alphai * res3; \
970 pc0[3] = alphar * res3; \
971 pc0[3] += alphai * res2; \
974 pc1[0] = alphar * res4; \
975 pc1[0] -= alphai * res5; \
976 pc1[1] = alphar * res5; \
977 pc1[1] += alphai * res4; \
978 pc1[2] = alphar * res6; \
979 pc1[2] -= alphai * res7; \
980 pc1[3] = alphar * res7; \
981 pc1[3] += alphai * res6; \
984 pc2[0] = alphar * res8; \
985 pc2[0] -= alphai * res9; \
986 pc2[1] = alphar * res9; \
987 pc2[1] += alphai * res8; \
988 pc2[2] = alphar * res10; \
989 pc2[2] -= alphai * res11; \
990 pc2[3] = alphar * res11; \
991 pc2[3] += alphai * res10; \
994 pc3[0] = alphar * res12; \
995 pc3[0] -= alphai * res13; \
996 pc3[1] = alphar * res13; \
997 pc3[1] += alphai * res12; \
998 pc3[2] = alphar * res14; \
999 pc3[2] -= alphai * res15; \
1000 pc3[3] = alphar * res15; \
1001 pc3[3] += alphai * res14; \
/*
 * CGEMM_TRMM_SCALE_2X2
 *
 * Overwriting counterpart of CGEMM_SCALE_2X2: each target element is
 * first assigned and then adjusted, so the previous contents of pc0/pc1
 * are not read:
 *     pcN[k]   = alphar * re;   pcN[k]   -= alphai * im;
 *     pcN[k+1] = alphar * im;   pcN[k+1] += alphai * re;
 * pc0 uses res0..res3, pc1 uses res4..res7.  The pointers are not
 * modified here.
 */
1004 #define CGEMM_TRMM_SCALE_2X2 \
1007 pc0[0] = alphar * res0; \
1008 pc0[0] -= alphai * res1; \
1009 pc0[1] = alphar * res1; \
1010 pc0[1] += alphai * res0; \
1011 pc0[2] = alphar * res2; \
1012 pc0[2] -= alphai * res3; \
1013 pc0[3] = alphar * res3; \
1014 pc0[3] += alphai * res2; \
1017 pc1[0] = alphar * res4; \
1018 pc1[0] -= alphai * res5; \
1019 pc1[1] = alphar * res5; \
1020 pc1[1] += alphai * res4; \
1021 pc1[2] = alphar * res6; \
1022 pc1[2] -= alphai * res7; \
1023 pc1[3] = alphar * res7; \
1024 pc1[3] += alphai * res6; \
/*
 * CGEMM_TRMM_SCALE_2X1
 *
 * Overwriting counterpart of CGEMM_SCALE_2X1: pc0[0..3] are assigned
 * from res0..res3 without reading their previous contents:
 *     pc0[k]   = alphar * re;   pc0[k]   -= alphai * im;
 *     pc0[k+1] = alphar * im;   pc0[k+1] += alphai * re;
 * The pointer is not modified here.
 */
1027 #define CGEMM_TRMM_SCALE_2X1 \
1029 pc0[0] = alphar * res0; \
1030 pc0[0] -= alphai * res1; \
1031 pc0[1] = alphar * res1; \
1032 pc0[1] += alphai * res0; \
1034 pc0[2] = alphar * res2; \
1035 pc0[2] -= alphai * res3; \
1036 pc0[3] = alphar * res3; \
1037 pc0[3] += alphai * res2; \
/*
 * CGEMM_TRMM_SCALE_1X4
 *
 * Overwriting counterpart of CGEMM_SCALE_1X4: one interleaved value pair
 * is assigned at each of pc0..pc3 without reading the previous contents:
 *     pcN[0] = alphar * re;   pcN[0] -= alphai * im;
 *     pcN[1] = alphar * im;   pcN[1] += alphai * re;
 * pc0 uses res0/res1, pc1 res2/res3, pc2 res4/res5, pc3 res6/res7.
 * The pointers are not modified here.
 */
1040 #define CGEMM_TRMM_SCALE_1X4 \
1042 pc0[0] = alphar * res0; \
1043 pc0[0] -= alphai * res1; \
1044 pc0[1] = alphar * res1; \
1045 pc0[1] += alphai * res0; \
1047 pc1[0] = alphar * res2; \
1048 pc1[0] -= alphai * res3; \
1049 pc1[1] = alphar * res3; \
1050 pc1[1] += alphai * res2; \
1052 pc2[0] = alphar * res4; \
1053 pc2[0] -= alphai * res5; \
1054 pc2[1] = alphar * res5; \
1055 pc2[1] += alphai * res4; \
1057 pc3[0] = alphar * res6; \
1058 pc3[0] -= alphai * res7; \
1059 pc3[1] = alphar * res7; \
1060 pc3[1] += alphai * res6; \
/*
 * CGEMM_TRMM_SCALE_1X2
 *
 * Overwriting scalar store of one interleaved value pair at pc0 (from
 * res0/res1) and one at pc1 (from res2/res3), scaled by (alphar, alphai).
 * Each element is first assigned and then adjusted, so the previous
 * contents are not read:
 *     pcN[0] = alphar * re;   pcN[0] -= alphai * im;
 *     pcN[1] = alphar * im;   pcN[1] += alphai * re;
 *
 * res2/res3 are accumulated by CGEMM_KERNEL_1X2 from the same A value as
 * res0/res1, so they belong at offsets 0/1 of pc1, exactly as in
 * CGEMM_TRMM_SCALE_1X4.  The previous version stored them at
 * pc1[2]/pc1[3], leaving pc1[0]/pc1[1] unwritten and clobbering the
 * following element pair.  The pointers are not modified here.
 */
#define CGEMM_TRMM_SCALE_1X2     \
{                                \
    pc0[0] = alphar * res0;      \
    pc0[0] -= alphai * res1;     \
    pc0[1] = alphar * res1;      \
    pc0[1] += alphai * res0;     \
                                 \
    pc1[0] = alphar * res2;      \
    pc1[0] -= alphai * res3;     \
    pc1[1] = alphar * res3;      \
    pc1[1] += alphai * res2;     \
}
/*
 * CGEMM_TRMM_SCALE_1X1
 *
 * Overwriting counterpart of CGEMM_SCALE_1X1: pc0[0]/pc0[1] are assigned
 * from res0/res1 without reading their previous contents:
 *     pc0[0] = alphar * res0;   pc0[0] -= alphai * res1;
 *     pc0[1] = alphar * res1;   pc0[1] += alphai * res0;
 * The pointer is not modified here.
 */
1076 #define CGEMM_TRMM_SCALE_1X1 \
1078 pc0[0] = alphar * res0; \
1079 pc0[0] -= alphai * res1; \
1080 pc0[1] = alphar * res1; \
1081 pc0[1] += alphai * res0; \
1084 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
1085 FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
1091 BLASLONG i, j, l, temp;
1092 #if defined(TRMMKERNEL)
1095 FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
1096 FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
1097 FLOAT res8, res9, res10, res11, res12, res13, res14, res15;
1098 FLOAT a0_r, a1_r, a0_i, a1_i, b0_i, b1_i, b2_i, b3_i;
1099 FLOAT b0_r, b1_r, b2_r, b3_r;
1100 v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1;
1101 v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
1102 v4f32 dst0, dst1, dst2, dst3, alpha_r, alpha_i;
1103 v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
1104 v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
1105 v4f32 dst0_r, dst0_i, dst1_r, dst1_i;
1107 alpha_r = COPY_FLOAT_TO_VECTOR(alphar);
1108 alpha_i = COPY_FLOAT_TO_VECTOR(alphai);
1110 #if defined(TRMMKERNEL) && !defined(LEFT)
1114 for (j = (n >> 2); j--;)
1117 pc1 = pc0 + 2 * ldc;
1118 pc2 = pc1 + 2 * ldc;
1119 pc3 = pc2 + 2 * ldc;
1121 #if defined(TRMMKERNEL) && defined(LEFT)
1127 for (i = (m >> 3); i--;)
1129 #if defined(TRMMKERNEL)
1130 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1134 pb0 = B + off * 2 * 4;
1137 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1140 temp = off + 8; // number of values in A
1142 temp = off + 4; // number of values in B
1149 #ifdef ENABLE_PREFETCH
1150 __asm__ __volatile__(
1151 "pref 0, 64(%[pa0]) \n\t"
1152 "pref 0, 96(%[pa0]) \n\t"
1153 "pref 0, 32(%[pb0]) \n\t"
1156 : [pa0] "r" (pa0), [pb0] "r" (pb0)
1160 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1161 CGEMM_KERNEL_8X4_MSA(, -, , +, +);
1163 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1164 CGEMM_KERNEL_8X4_MSA(, +, , +, -);
1166 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1167 CGEMM_KERNEL_8X4_MSA(, +, , -, +);
1169 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1170 CGEMM_KERNEL_8X4_MSA(, -, , -, -);
1173 for (l = (temp - 1); l--;)
1175 #ifdef ENABLE_PREFETCH
1176 __asm__ __volatile__(
1177 "pref 0, 64(%[pa0]) \n\t"
1178 "pref 0, 96(%[pa0]) \n\t"
1179 "pref 0, 32(%[pb0]) \n\t"
1182 : [pa0] "r" (pa0), [pb0] "r" (pb0)
1186 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1187 CGEMM_KERNEL_8X4_MSA(+, -, +, +,);
1189 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1190 CGEMM_KERNEL_8X4_MSA(+, +, -, +,);
1192 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1193 CGEMM_KERNEL_8X4_MSA(+, +, +, -,);
1195 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1196 CGEMM_KERNEL_8X4_MSA(+, -, -, -,);
1200 #if defined(TRMMKERNEL)
1201 CGEMM_TRMM_SCALE_8X4_MSA
1206 #if defined(TRMMKERNEL)
1207 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1210 temp -= 8; // number of values in A
1212 temp -= 4; // number of values in B
1214 pa0 += temp * 2 * 8;
1215 pb0 += temp * 2 * 4;
1219 off += 8; // number of values in A
1226 #if defined(TRMMKERNEL)
1227 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1231 pb0 = B + off * 2 * 4;
1234 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1237 temp = off + 4; // number of values in A
1239 temp = off + 4; // number of values in B
1246 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1247 CGEMM_KERNEL_4X4_MSA(, -, , +, +);
1249 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1250 CGEMM_KERNEL_4X4_MSA(, +, , +, -);
1252 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1253 CGEMM_KERNEL_4X4_MSA(, +, , -, +);
1255 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1256 CGEMM_KERNEL_4X4_MSA(, -, , -, -);
1259 for (l = (temp - 1); l--;)
1261 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1262 CGEMM_KERNEL_4X4_MSA(+, -, +, +,);
1264 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1265 CGEMM_KERNEL_4X4_MSA(+, +, -, +,);
1267 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1268 CGEMM_KERNEL_4X4_MSA(+, +, +, -,);
1270 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1271 CGEMM_KERNEL_4X4_MSA(+, -, -, -,);
1275 #if defined(TRMMKERNEL)
1276 CGEMM_TRMM_SCALE_4X4_MSA
1281 #if defined(TRMMKERNEL)
1282 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1285 temp -= 4; // number of values in A
1287 temp -= 4; // number of values in B
1289 pa0 += temp * 2 * 4;
1290 pb0 += temp * 2 * 4;
1294 off += 4; // number of values in A
1301 #if defined(TRMMKERNEL)
1302 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1306 pb0 = B + off * 2 * 4;
1309 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1312 temp = off + 2; // number of values in A
1314 temp = off + 4; // number of values in B
1321 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1322 CGEMM_KERNEL_2X4(, -, , +, +);
1324 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1325 CGEMM_KERNEL_2X4(, +, , +, -);
1327 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1328 CGEMM_KERNEL_2X4(, +, , -, +);
1330 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1331 CGEMM_KERNEL_2X4(, -, , -, -);
1337 for (l = (temp - 1); l--;)
1339 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1340 CGEMM_KERNEL_2X4(+, -, +, +,);
1342 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1343 CGEMM_KERNEL_2X4(+, +, -, +,);
1345 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1346 CGEMM_KERNEL_2X4(+, +, +, -,);
1348 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1349 CGEMM_KERNEL_2X4(+, -, -, -,);
1356 #if defined(TRMMKERNEL)
1357 CGEMM_TRMM_SCALE_2X4
1366 #if defined(TRMMKERNEL)
1367 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1370 temp -= 2; // number of values in A
1372 temp -= 4; // number of values in B
1374 pa0 += temp * 2 * 2;
1375 pb0 += temp * 2 * 4;
1379 off += 2; // number of values in A
1386 #if defined(TRMMKERNEL)
1387 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1391 pb0 = B + off * 2 * 4;
1394 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1397 temp = off + 1; // number of values in A
1399 temp = off + 4; // number of values in B
1406 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1407 CGEMM_KERNEL_1X4(, -, , +, +);
1409 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1410 CGEMM_KERNEL_1X4(, +, , +, -);
1412 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1413 CGEMM_KERNEL_1X4(, +, , -, +);
1415 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1416 CGEMM_KERNEL_1X4(, -, , -, -);
1422 for (l = (temp - 1); l--;)
1424 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1425 CGEMM_KERNEL_1X4(+, -, +, +,);
1427 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1428 CGEMM_KERNEL_1X4(+, +, -, +,);
1430 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1431 CGEMM_KERNEL_1X4(+, +, +, -,);
1433 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1434 CGEMM_KERNEL_1X4(+, -, -, -,);
1441 #if defined(TRMMKERNEL)
1442 CGEMM_TRMM_SCALE_1X4
1451 #if defined(TRMMKERNEL)
1452 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1455 temp -= 1; // number of values in A
1457 temp -= 4; // number of values in B
1459 pa0 += temp * 2 * 1;
1460 pb0 += temp * 2 * 4;
1464 off += 1; // number of values in A
1469 #if defined(TRMMKERNEL) && !defined(LEFT)
1470 off += 4; // number of values in A
1480 pc1 = pc0 + 2 * ldc;
1482 #if defined(TRMMKERNEL) && defined(LEFT)
1488 for (i = (m >> 3); i--;)
1490 #if defined(TRMMKERNEL)
1491 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1495 pb0 = B + off * 2 * 2;
1498 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1501 temp = off + 8; // number of values in A
1503 temp = off + 2; // number of values in B
1510 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1511 CGEMM_KERNEL_8X2_MSA(, -, , +, +);
1513 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1514 CGEMM_KERNEL_8X2_MSA(, +, , +, -);
1516 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1517 CGEMM_KERNEL_8X2_MSA(, +, , -, +);
1519 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1520 CGEMM_KERNEL_8X2_MSA(, -, , -, -);
1525 for (l = (temp - 1); l--;)
1527 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1528 CGEMM_KERNEL_8X2_MSA(+, -, +, +,);
1530 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1531 CGEMM_KERNEL_8X2_MSA(+, +, -, +,);
1533 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1534 CGEMM_KERNEL_8X2_MSA(+, +, +, -,);
1536 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1537 CGEMM_KERNEL_8X2_MSA(+, -, -, -,);
1543 #if defined(TRMMKERNEL)
1544 CGEMM_TRMM_SCALE_8X2_MSA
1549 #if defined(TRMMKERNEL)
1550 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1553 temp -= 8; // number of values in A
1555 temp -= 2; // number of values in B
1557 pa0 += temp * 2 * 8;
1558 pb0 += temp * 2 * 2;
1562 off += 8; // number of values in A
1569 #if defined(TRMMKERNEL)
1570 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1574 pb0 = B + off * 2 * 2;
1577 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1580 temp = off + 4; // number of values in A
1582 temp = off + 2; // number of values in B
1589 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1590 CGEMM_KERNEL_4X2_MSA(, -, , +, +);
1592 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1593 CGEMM_KERNEL_4X2_MSA(, +, , +, -);
1595 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1596 CGEMM_KERNEL_4X2_MSA(, +, , -, +);
1598 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1599 CGEMM_KERNEL_4X2_MSA(, -, , -, -);
1604 for (l = (temp - 1); l--;)
1606 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1607 CGEMM_KERNEL_4X2_MSA(+, -, +, +,);
1609 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1610 CGEMM_KERNEL_4X2_MSA(+, +, -, +,);
1612 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1613 CGEMM_KERNEL_4X2_MSA(+, +, +, -,);
1615 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1616 CGEMM_KERNEL_4X2_MSA(+, -, -, -,);
1622 #if defined(TRMMKERNEL)
1623 CGEMM_TRMM_SCALE_4X2_MSA
1628 #if defined(TRMMKERNEL)
1629 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1632 temp -= 4; // number of values in A
1634 temp -= 2; // number of values in B
1636 pa0 += temp * 2 * 4;
1637 pb0 += temp * 2 * 2;
1641 off += 4; // number of values in A
1648 #if defined(TRMMKERNEL)
1649 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1653 pb0 = B + off * 2 * 2;
1656 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1659 temp = off + 2; // number of values in A
1661 temp = off + 2; // number of values in B
1668 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1669 CGEMM_KERNEL_2X2(, -, , +, +);
1671 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1672 CGEMM_KERNEL_2X2(, +, , +, -);
1674 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1675 CGEMM_KERNEL_2X2(, +, , -, +);
1677 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1678 CGEMM_KERNEL_2X2(, -, , -, -);
1684 for (l = (temp - 1); l--;)
1686 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1687 CGEMM_KERNEL_2X2(+, -, +, +,);
1689 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1690 CGEMM_KERNEL_2X2(+, +, -, +,);
1692 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1693 CGEMM_KERNEL_2X2(+, +, +, -,);
1695 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1696 CGEMM_KERNEL_2X2(+, -, -, -,);
1703 #if defined(TRMMKERNEL)
1704 CGEMM_TRMM_SCALE_2X2
1711 #if defined(TRMMKERNEL)
1712 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1715 temp -= 2; // number of values in A
1717 temp -= 2; // number of values in B
1719 pa0 += temp * 2 * 2;
1720 pb0 += temp * 2 * 2;
1724 off += 2; // number of values in A
1731 #if defined(TRMMKERNEL)
1732 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1736 pb0 = B + off * 2 * 2;
1739 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1742 temp = off + 1; // number of values in A
1744 temp = off + 2; // number of values in B
1751 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1752 CGEMM_KERNEL_1X2(, -, , +, +);
1754 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1755 CGEMM_KERNEL_1X2(, +, , +, -);
1757 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1758 CGEMM_KERNEL_1X2(, +, , -, +);
1760 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1761 CGEMM_KERNEL_1X2(, -, , -, -);
1767 for (l = (temp - 1); l--;)
1769 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1770 CGEMM_KERNEL_1X2(+, -, +, +,);
1772 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1773 CGEMM_KERNEL_1X2(+, +, -, +,);
1775 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1776 CGEMM_KERNEL_1X2(+, +, +, -,);
1778 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1779 CGEMM_KERNEL_1X2(+, -, -, -,);
1786 #if defined(TRMMKERNEL)
1787 CGEMM_TRMM_SCALE_1X2
1794 #if defined(TRMMKERNEL)
1795 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1798 temp -= 1; // number of values in A
1800 temp -= 2; // number of values in B
1802 pa0 += temp * 2 * 1;
1803 pb0 += temp * 2 * 2;
1807 off += 1; // number of values in A
1812 #if defined(TRMMKERNEL) && !defined(LEFT)
1813 off += 2; // number of values in A
1824 #if defined(TRMMKERNEL) && defined(LEFT)
1830 for (i = (m >> 3); i--;)
1832 #if defined(TRMMKERNEL)
1833 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1837 pb0 = B + off * 2 * 1;
1840 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1843 temp = off + 8; // number of values in A
1845 temp = off + 1; // number of values in B
1852 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1853 CGEMM_KERNEL_8X1_MSA(, -, , +, +);
1855 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1856 CGEMM_KERNEL_8X1_MSA(, +, , +, -);
1858 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1859 CGEMM_KERNEL_8X1_MSA(, +, , -, +);
1861 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1862 CGEMM_KERNEL_8X1_MSA(, -, , -, -);
1867 for (l = (temp - 1); l--;)
1869 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1870 CGEMM_KERNEL_8X1_MSA(+, -, +, +,);
1872 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1873 CGEMM_KERNEL_8X1_MSA(+, +, -, +,);
1875 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1876 CGEMM_KERNEL_8X1_MSA(+, +, +, -,);
1878 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1879 CGEMM_KERNEL_8X1_MSA(+, -, -, -,);
1885 #if defined(TRMMKERNEL)
1886 CGEMM_TRMM_SCALE_8X1_MSA
1891 #if defined(TRMMKERNEL)
1892 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1895 temp -= 8; // number of values in A
1897 temp -= 1; // number of values in B
1899 pa0 += temp * 2 * 8;
1900 pb0 += temp * 2 * 1;
1904 off += 8; // number of values in A
1911 #if defined(TRMMKERNEL)
1912 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1916 pb0 = B + off * 2 * 1;
1919 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1922 temp = off + 4; // number of values in A
1924 temp = off + 1; // number of values in B
1931 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1932 CGEMM_KERNEL_4X1_MSA(, -, , +, +);
1934 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1935 CGEMM_KERNEL_4X1_MSA(, +, , +, -);
1937 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1938 CGEMM_KERNEL_4X1_MSA(, +, , -, +);
1940 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1941 CGEMM_KERNEL_4X1_MSA(, -, , -, -);
1946 for (l = (temp - 1); l--;)
1948 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1949 CGEMM_KERNEL_4X1_MSA(+, -, +, +,);
1951 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1952 CGEMM_KERNEL_4X1_MSA(+, +, -, +,);
1954 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1955 CGEMM_KERNEL_4X1_MSA(+, +, +, -,);
1957 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1958 CGEMM_KERNEL_4X1_MSA(+, -, -, -,);
1964 #if defined(TRMMKERNEL)
1965 CGEMM_TRMM_SCALE_4X1_MSA
1970 #if defined(TRMMKERNEL)
1971 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1974 temp -= 4; // number of values in A
1976 temp -= 1; // number of values in B
1978 pa0 += temp * 2 * 4;
1979 pb0 += temp * 2 * 1;
1983 off += 4; // number of values in A
1990 #if defined(TRMMKERNEL)
1991 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1995 pb0 = B + off * 2 * 1;
1998 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2001 temp = off + 2; // number of values in A
2003 temp = off + 1; // number of values in B
2010 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2011 CGEMM_KERNEL_2X1(, -, , +, +);
2013 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2014 CGEMM_KERNEL_2X1(, +, , +, -);
2016 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2017 CGEMM_KERNEL_2X1(, +, , -, +);
2019 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2020 CGEMM_KERNEL_2X1(, -, , -, -);
2026 for (l = (temp - 1); l--;)
2028 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2029 CGEMM_KERNEL_2X1(+, -, +, +,);
2031 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2032 CGEMM_KERNEL_2X1(+, +, -, +,);
2034 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2035 CGEMM_KERNEL_2X1(+, +, +, -,);
2037 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2038 CGEMM_KERNEL_2X1(+, -, -, -,);
2045 #if defined(TRMMKERNEL)
2046 CGEMM_TRMM_SCALE_2X1
2052 #if defined(TRMMKERNEL)
2053 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2056 temp -= 2; // number of values in A
2058 temp -= 1; // number of values in B
2060 pa0 += temp * 2 * 2;
2061 pb0 += temp * 2 * 1;
2065 off += 2; // number of values in A
2072 #if defined(TRMMKERNEL)
2073 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2077 pb0 = B + off * 2 * 1;
2080 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2083 temp = off + 1; // number of values in A
2085 temp = off + 1; // number of values in B
2092 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2093 CGEMM_KERNEL_1X1(, -, , +, +);
2095 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2096 CGEMM_KERNEL_1X1(, +, , +, -);
2098 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2099 CGEMM_KERNEL_1X1(, +, , -, +);
2101 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2102 CGEMM_KERNEL_1X1(, -, , -, -);
2108 for (l = (temp - 1); l--;)
2110 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2111 CGEMM_KERNEL_1X1(+, -, +, +,);
2113 #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2114 CGEMM_KERNEL_1X1(+, +, -, +,);
2116 #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2117 CGEMM_KERNEL_1X1(+, +, +, -,);
2119 #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2120 CGEMM_KERNEL_1X1(+, -, -, -,);
2127 #if defined(TRMMKERNEL)
2128 CGEMM_TRMM_SCALE_1X1
2134 #if defined(TRMMKERNEL)
2135 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2138 temp -= 1; // number of values in A
2140 temp -= 1; // number of values in B
2142 pa0 += temp * 2 * 1;
2143 pb0 += temp * 2 * 1;
2147 off += 1; // number of values in A
2152 #if defined(TRMMKERNEL) && !defined(LEFT)
2153 off += 1; // number of values in A