1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
32 FLOAT *C, BLASLONG ldc
38 BLASLONG i, j, l, temp;
39 #if defined(TRMMKERNEL)
42 FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
44 FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
45 FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
46 FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7;
47 v4f32 v_alpha = {alpha, alpha, alpha, alpha};
48 v4f32 src_a0, src_a1, src_b, src_b0, src_b1;
49 v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
50 v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
51 v4f32 res8, res9, res10, res11, res12, res13, res14, res15;
53 #if defined(TRMMKERNEL) && !defined(LEFT)
57 for (j = (n >> 3); j--;)
68 #if defined(TRMMKERNEL) && defined(LEFT)
73 for (i = (m >> 3); i--;)
75 #if defined(TRMMKERNEL)
76 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
83 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
86 temp = off + 8; // number of values in A
88 temp = off + 8; // number of values in B
94 #ifdef ENABLE_PREFETCH
96 "pref 0, 32(%[pa0]) \n\t"
97 "pref 0, 32(%[pb0]) \n\t"
100 : [pa0] "r" (pa0), [pb0] "r" (pb0)
104 LD_SP2_INC(pa0, 4, src_a0, src_a1);
105 LD_SP2_INC(pb0, 4, src_b0, src_b1);
107 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
108 res0 = src_a0 * src_b;
109 res1 = src_a1 * src_b;
111 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
112 res2 = src_a0 * src_b;
113 res3 = src_a1 * src_b;
115 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
116 res4 = src_a0 * src_b;
117 res5 = src_a1 * src_b;
119 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
120 res6 = src_a0 * src_b;
121 res7 = src_a1 * src_b;
123 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
124 res8 = src_a0 * src_b;
125 res9 = src_a1 * src_b;
127 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
128 res10 = src_a0 * src_b;
129 res11 = src_a1 * src_b;
131 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
132 res12 = src_a0 * src_b;
133 res13 = src_a1 * src_b;
135 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
136 res14 = src_a0 * src_b;
137 res15 = src_a1 * src_b;
139 for (l = ((temp - 1) >> 1); l--;)
141 #ifdef ENABLE_PREFETCH
142 __asm__ __volatile__(
143 "pref 0, 64(%[pa0]) \n\t"
144 "pref 0, 96(%[pa0]) \n\t"
145 "pref 0, 64(%[pb0]) \n\t"
146 "pref 0, 96(%[pb0]) \n\t"
149 : [pa0] "r" (pa0), [pb0] "r" (pb0)
153 LD_SP2_INC(pa0, 4, src_a0, src_a1);
154 LD_SP2_INC(pb0, 4, src_b0, src_b1);
156 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
157 res0 += src_a0 * src_b;
158 res1 += src_a1 * src_b;
160 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
161 res2 += src_a0 * src_b;
162 res3 += src_a1 * src_b;
164 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
165 res4 += src_a0 * src_b;
166 res5 += src_a1 * src_b;
168 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
169 res6 += src_a0 * src_b;
170 res7 += src_a1 * src_b;
172 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
173 res8 += src_a0 * src_b;
174 res9 += src_a1 * src_b;
176 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
177 res10 += src_a0 * src_b;
178 res11 += src_a1 * src_b;
180 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
181 res12 += src_a0 * src_b;
182 res13 += src_a1 * src_b;
184 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
185 res14 += src_a0 * src_b;
186 res15 += src_a1 * src_b;
188 LD_SP2_INC(pa0, 4, src_a0, src_a1);
189 LD_SP2_INC(pb0, 4, src_b0, src_b1);
191 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
192 res0 += src_a0 * src_b;
193 res1 += src_a1 * src_b;
195 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
196 res2 += src_a0 * src_b;
197 res3 += src_a1 * src_b;
199 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
200 res4 += src_a0 * src_b;
201 res5 += src_a1 * src_b;
203 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
204 res6 += src_a0 * src_b;
205 res7 += src_a1 * src_b;
207 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
208 res8 += src_a0 * src_b;
209 res9 += src_a1 * src_b;
211 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
212 res10 += src_a0 * src_b;
213 res11 += src_a1 * src_b;
215 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
216 res12 += src_a0 * src_b;
217 res13 += src_a1 * src_b;
219 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
220 res14 += src_a0 * src_b;
221 res15 += src_a1 * src_b;
226 LD_SP2_INC(pa0, 4, src_a0, src_a1);
227 LD_SP2_INC(pb0, 4, src_b0, src_b1);
229 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
230 res0 += src_a0 * src_b;
231 res1 += src_a1 * src_b;
233 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
234 res2 += src_a0 * src_b;
235 res3 += src_a1 * src_b;
237 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
238 res4 += src_a0 * src_b;
239 res5 += src_a1 * src_b;
241 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
242 res6 += src_a0 * src_b;
243 res7 += src_a1 * src_b;
245 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
246 res8 += src_a0 * src_b;
247 res9 += src_a1 * src_b;
249 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
250 res10 += src_a0 * src_b;
251 res11 += src_a1 * src_b;
253 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
254 res12 += src_a0 * src_b;
255 res13 += src_a1 * src_b;
257 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
258 res14 += src_a0 * src_b;
259 res15 += src_a1 * src_b;
262 #if defined(TRMMKERNEL)
263 dst0 = res0 * v_alpha;
264 dst1 = res1 * v_alpha;
265 dst2 = res2 * v_alpha;
266 dst3 = res3 * v_alpha;
267 dst4 = res4 * v_alpha;
268 dst5 = res5 * v_alpha;
269 dst6 = res6 * v_alpha;
270 dst7 = res7 * v_alpha;
272 LD_SP2(pc0, 4, dst0, dst1);
273 LD_SP2(pc1, 4, dst2, dst3);
274 LD_SP2(pc2, 4, dst4, dst5);
275 LD_SP2(pc3, 4, dst6, dst7);
277 dst0 += res0 * v_alpha;
278 dst1 += res1 * v_alpha;
279 dst2 += res2 * v_alpha;
280 dst3 += res3 * v_alpha;
281 dst4 += res4 * v_alpha;
282 dst5 += res5 * v_alpha;
283 dst6 += res6 * v_alpha;
284 dst7 += res7 * v_alpha;
286 ST_SP2_INC(dst0, dst1, pc0, 4);
287 ST_SP2_INC(dst2, dst3, pc1, 4);
288 ST_SP2_INC(dst4, dst5, pc2, 4);
289 ST_SP2_INC(dst6, dst7, pc3, 4);
291 #if defined(TRMMKERNEL)
292 dst0 = res8 * v_alpha;
293 dst1 = res9 * v_alpha;
294 dst2 = res10 * v_alpha;
295 dst3 = res11 * v_alpha;
296 dst4 = res12 * v_alpha;
297 dst5 = res13 * v_alpha;
298 dst6 = res14 * v_alpha;
299 dst7 = res15 * v_alpha;
301 LD_SP2(pc4, 4, dst0, dst1);
302 LD_SP2(pc5, 4, dst2, dst3);
303 LD_SP2(pc6, 4, dst4, dst5);
304 LD_SP2(pc7, 4, dst6, dst7);
306 dst0 += res8 * v_alpha;
307 dst1 += res9 * v_alpha;
308 dst2 += res10 * v_alpha;
309 dst3 += res11 * v_alpha;
310 dst4 += res12 * v_alpha;
311 dst5 += res13 * v_alpha;
312 dst6 += res14 * v_alpha;
313 dst7 += res15 * v_alpha;
315 ST_SP2_INC(dst0, dst1, pc4, 4);
316 ST_SP2_INC(dst2, dst3, pc5, 4);
317 ST_SP2_INC(dst4, dst5, pc6, 4);
318 ST_SP2_INC(dst6, dst7, pc7, 4);
320 #if defined(TRMMKERNEL)
321 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
324 temp -= 8; // number of values in A
326 temp -= 8; // number of values in B
333 off += 8; // number of values in A
340 #if defined(TRMMKERNEL)
341 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
348 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
351 temp = off + 4; // number of values in A
353 temp = off + 8; // number of values in B
361 LD_SP2_INC(pb0, 4, src_b0, src_b1);
363 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
364 res0 = src_a0 * src_b;
366 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
367 res1 = src_a0 * src_b;
369 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
370 res2 = src_a0 * src_b;
372 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
373 res3 = src_a0 * src_b;
375 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
376 res4 = src_a0 * src_b;
378 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
379 res5 = src_a0 * src_b;
381 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
382 res6 = src_a0 * src_b;
384 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
385 res7 = src_a0 * src_b;
389 for (l = ((temp - 1) >> 1); l--;)
392 LD_SP2_INC(pb0, 4, src_b0, src_b1);
394 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
395 res0 += src_a0 * src_b;
397 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
398 res1 += src_a0 * src_b;
400 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
401 res2 += src_a0 * src_b;
403 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
404 res3 += src_a0 * src_b;
406 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
407 res4 += src_a0 * src_b;
409 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
410 res5 += src_a0 * src_b;
412 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
413 res6 += src_a0 * src_b;
415 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
416 res7 += src_a0 * src_b;
421 LD_SP2_INC(pb0, 4, src_b0, src_b1);
423 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
424 res0 += src_a0 * src_b;
426 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
427 res1 += src_a0 * src_b;
429 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
430 res2 += src_a0 * src_b;
432 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
433 res3 += src_a0 * src_b;
435 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
436 res4 += src_a0 * src_b;
438 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
439 res5 += src_a0 * src_b;
441 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
442 res6 += src_a0 * src_b;
444 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
445 res7 += src_a0 * src_b;
453 LD_SP2_INC(pb0, 4, src_b0, src_b1);
455 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
456 res0 += src_a0 * src_b;
458 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
459 res1 += src_a0 * src_b;
461 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
462 res2 += src_a0 * src_b;
464 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
465 res3 += src_a0 * src_b;
467 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
468 res4 += src_a0 * src_b;
470 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
471 res5 += src_a0 * src_b;
473 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
474 res6 += src_a0 * src_b;
476 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
477 res7 += src_a0 * src_b;
482 #if defined(TRMMKERNEL)
483 dst0 = res0 * v_alpha;
484 dst1 = res1 * v_alpha;
485 dst2 = res2 * v_alpha;
486 dst3 = res3 * v_alpha;
493 dst0 += res0 * v_alpha;
494 dst1 += res1 * v_alpha;
495 dst2 += res2 * v_alpha;
496 dst3 += res3 * v_alpha;
503 #if defined(TRMMKERNEL)
504 dst0 = res4 * v_alpha;
505 dst1 = res5 * v_alpha;
506 dst2 = res6 * v_alpha;
507 dst3 = res7 * v_alpha;
514 dst0 += res4 * v_alpha;
515 dst1 += res5 * v_alpha;
516 dst2 += res6 * v_alpha;
517 dst3 += res7 * v_alpha;
533 #if defined(TRMMKERNEL)
534 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
537 temp -= 4; // number of values in A
539 temp -= 8; // number of values in B
546 off += 4; // number of values in A
553 #if defined(TRMMKERNEL)
554 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
561 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
564 temp = off + 2; // number of values in A
566 temp = off + 8; // number of values in B
611 for (l = ((temp - 1) >> 1); l--;)
736 tmp10 = alpha * tmp10;
737 tmp12 = alpha * tmp12;
738 tmp14 = alpha * tmp14;
740 #if defined(TRMMKERNEL)
764 tmp11 = alpha * tmp11;
765 tmp13 = alpha * tmp13;
766 tmp15 = alpha * tmp15;
768 #if defined(TRMMKERNEL)
796 #if defined(TRMMKERNEL)
797 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
800 temp -= 2; // number of values in A
802 temp -= 8; // number of values in B
809 off += 2; // number of values in A
816 #if defined(TRMMKERNEL)
817 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
824 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
827 temp = off + 1; // number of values in A
829 temp = off + 8; // number of values in B
864 for (l = ((temp - 1) >> 1); l--;)
963 #if defined(TRMMKERNEL)
991 #if defined(TRMMKERNEL)
992 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
995 temp -= 1; // number of values in A
997 temp -= 8; // number of values in B
1004 off += 1; // number of values in A
1009 #if defined(TRMMKERNEL) && !defined(LEFT)
1010 off += 8; // number of values in B
1024 #if defined(TRMMKERNEL) && defined(LEFT)
1030 for (i = (m >> 3); i--;)
1032 #if defined(TRMMKERNEL)
1033 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1040 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1043 temp = off + 8; // number of values in A
1045 temp = off + 4; // number of values in B
1052 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1053 src_b0 = LD_SP(pb0);
1055 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1056 res0 = src_a0 * src_b;
1057 res1 = src_a1 * src_b;
1059 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1060 res2 = src_a0 * src_b;
1061 res3 = src_a1 * src_b;
1063 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1064 res4 = src_a0 * src_b;
1065 res5 = src_a1 * src_b;
1067 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1068 res6 = src_a0 * src_b;
1069 res7 = src_a1 * src_b;
1073 for (l = ((temp - 1) >> 1); l--;)
1075 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1076 src_b0 = LD_SP(pb0);
1078 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1079 res0 += src_a0 * src_b;
1080 res1 += src_a1 * src_b;
1082 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1083 res2 += src_a0 * src_b;
1084 res3 += src_a1 * src_b;
1086 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1087 res4 += src_a0 * src_b;
1088 res5 += src_a1 * src_b;
1090 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1091 res6 += src_a0 * src_b;
1092 res7 += src_a1 * src_b;
1096 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1097 src_b0 = LD_SP(pb0);
1099 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1100 res0 += src_a0 * src_b;
1101 res1 += src_a1 * src_b;
1103 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1104 res2 += src_a0 * src_b;
1105 res3 += src_a1 * src_b;
1107 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1108 res4 += src_a0 * src_b;
1109 res5 += src_a1 * src_b;
1111 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1112 res6 += src_a0 * src_b;
1113 res7 += src_a1 * src_b;
1120 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1121 src_b0 = LD_SP(pb0);
1123 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1124 res0 += src_a0 * src_b;
1125 res1 += src_a1 * src_b;
1127 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1128 res2 += src_a0 * src_b;
1129 res3 += src_a1 * src_b;
1131 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1132 res4 += src_a0 * src_b;
1133 res5 += src_a1 * src_b;
1135 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1136 res6 += src_a0 * src_b;
1137 res7 += src_a1 * src_b;
1142 #if defined(TRMMKERNEL)
1143 dst0 = res0 * v_alpha;
1144 dst1 = res1 * v_alpha;
1145 dst2 = res2 * v_alpha;
1146 dst3 = res3 * v_alpha;
1147 dst4 = res4 * v_alpha;
1148 dst5 = res5 * v_alpha;
1149 dst6 = res6 * v_alpha;
1150 dst7 = res7 * v_alpha;
1152 LD_SP2(pc0, 4, dst0, dst1);
1153 LD_SP2(pc1, 4, dst2, dst3);
1154 LD_SP2(pc2, 4, dst4, dst5);
1155 LD_SP2(pc3, 4, dst6, dst7);
1157 dst0 += res0 * v_alpha;
1158 dst1 += res1 * v_alpha;
1159 dst2 += res2 * v_alpha;
1160 dst3 += res3 * v_alpha;
1161 dst4 += res4 * v_alpha;
1162 dst5 += res5 * v_alpha;
1163 dst6 += res6 * v_alpha;
1164 dst7 += res7 * v_alpha;
1166 ST_SP2_INC(dst0, dst1, pc0, 4);
1167 ST_SP2_INC(dst2, dst3, pc1, 4);
1168 ST_SP2_INC(dst4, dst5, pc2, 4);
1169 ST_SP2_INC(dst6, dst7, pc3, 4);
1171 #if defined(TRMMKERNEL)
1172 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1175 temp -= 8; // number of values in A
1177 temp -= 4; // number of values in B
1184 off += 8; // number of values in A
1191 #if defined(TRMMKERNEL)
1192 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1199 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1202 temp = off + 4; // number of values in A
1204 temp = off + 4; // number of values in B
1211 src_a0 = LD_SP(pa0);
1212 src_b0 = LD_SP(pb0);
1214 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1215 res0 = src_a0 * src_b;
1217 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1218 res1 = src_a0 * src_b;
1220 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1221 res2 = src_a0 * src_b;
1223 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1224 res3 = src_a0 * src_b;
1229 for (l = ((temp - 1) >> 1); l--;)
1231 src_a0 = LD_SP(pa0);
1232 src_b0 = LD_SP(pb0);
1234 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1235 res0 += src_a0 * src_b;
1237 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1238 res1 += src_a0 * src_b;
1240 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1241 res2 += src_a0 * src_b;
1243 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1244 res3 += src_a0 * src_b;
1249 src_a0 = LD_SP(pa0);
1250 src_b0 = LD_SP(pb0);
1252 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1253 res0 += src_a0 * src_b;
1255 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1256 res1 += src_a0 * src_b;
1258 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1259 res2 += src_a0 * src_b;
1261 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1262 res3 += src_a0 * src_b;
1270 src_a0 = LD_SP(pa0);
1271 src_b0 = LD_SP(pb0);
1273 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1274 res0 += src_a0 * src_b;
1276 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1277 res1 += src_a0 * src_b;
1279 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1280 res2 += src_a0 * src_b;
1282 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1283 res3 += src_a0 * src_b;
1289 #if defined(TRMMKERNEL)
1290 dst0 = res0 * v_alpha;
1291 dst1 = res1 * v_alpha;
1292 dst2 = res2 * v_alpha;
1293 dst3 = res3 * v_alpha;
1300 dst0 += res0 * v_alpha;
1301 dst1 += res1 * v_alpha;
1302 dst2 += res2 * v_alpha;
1303 dst3 += res3 * v_alpha;
1315 #if defined(TRMMKERNEL)
1316 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1319 temp -= 4; // number of values in A
1321 temp -= 4; // number of values in B
1328 off += 4; // number of values in A
1335 #if defined(TRMMKERNEL)
1336 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1343 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1346 temp = off + 2; // number of values in A
1348 temp = off + 4; // number of values in B
1377 for (l = ((temp - 1) >> 1); l--;)
1449 tmp0 = alpha * tmp0;
1450 tmp2 = alpha * tmp2;
1451 tmp4 = alpha * tmp4;
1452 tmp6 = alpha * tmp6;
1454 #if defined(TRMMKERNEL)
1465 tmp1 = alpha * tmp1;
1466 tmp3 = alpha * tmp3;
1467 tmp5 = alpha * tmp5;
1468 tmp7 = alpha * tmp7;
1470 #if defined(TRMMKERNEL)
1486 #if defined(TRMMKERNEL)
1487 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1490 temp -= 2; // number of values in A
1492 temp -= 4; // number of values in B
1499 off += 2; // number of values in A
1506 #if defined(TRMMKERNEL)
1507 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1514 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1517 temp = off + 1; // number of values in A
1519 temp = off + 4; // number of values in B
1542 for (l = ((temp - 1) >> 1); l--;)
1596 tmp0 = alpha * tmp0;
1597 tmp1 = alpha * tmp1;
1598 tmp2 = alpha * tmp2;
1599 tmp3 = alpha * tmp3;
1601 #if defined(TRMMKERNEL)
1617 #if defined(TRMMKERNEL)
1618 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1621 temp -= 1; // number of values in A
1623 temp -= 4; // number of values in B
1630 off += 1; // number of values in A
1635 #if defined(TRMMKERNEL) && !defined(LEFT)
1636 off += 4; // number of values in B
1648 #if defined(TRMMKERNEL) && defined(LEFT)
1654 for (i = (m >> 3); i--;)
1656 #if defined(TRMMKERNEL)
1657 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1664 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1667 temp = off + 8; // number of values in A
1669 temp = off + 2; // number of values in B
1676 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1680 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1681 res0 = src_a0 * src_b;
1682 res1 = src_a1 * src_b;
1684 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1685 res2 = src_a0 * src_b;
1686 res3 = src_a1 * src_b;
1690 for (l = ((temp - 1) >> 1); l--;)
1692 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1696 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1697 res0 += src_a0 * src_b;
1698 res1 += src_a1 * src_b;
1700 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1701 res2 += src_a0 * src_b;
1702 res3 += src_a1 * src_b;
1706 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1710 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1711 res0 += src_a0 * src_b;
1712 res1 += src_a1 * src_b;
1714 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1715 res2 += src_a0 * src_b;
1716 res3 += src_a1 * src_b;
1723 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1727 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1728 res0 += src_a0 * src_b;
1729 res1 += src_a1 * src_b;
1731 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1732 res2 += src_a0 * src_b;
1733 res3 += src_a1 * src_b;
1738 #if defined(TRMMKERNEL)
1739 dst0 = res0 * v_alpha;
1740 dst1 = res1 * v_alpha;
1741 dst2 = res2 * v_alpha;
1742 dst3 = res3 * v_alpha;
1744 LD_SP2(pc0, 4, dst0, dst1);
1745 LD_SP2(pc1, 4, dst2, dst3);
1747 dst0 += res0 * v_alpha;
1748 dst1 += res1 * v_alpha;
1749 dst2 += res2 * v_alpha;
1750 dst3 += res3 * v_alpha;
1752 ST_SP2_INC(dst0, dst1, pc0, 4);
1753 ST_SP2_INC(dst2, dst3, pc1, 4);
1755 #if defined(TRMMKERNEL)
1756 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1759 temp -= 8; // number of values in A
1761 temp -= 2; // number of values in B
1768 off += 8; // number of values in A
1775 #if defined(TRMMKERNEL)
1776 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1783 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1786 temp = off + 4; // number of values in A
1788 temp = off + 2; // number of values in B
1795 src_a0 = LD_SP(pa0);
1799 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1800 res0 = src_a0 * src_b;
1802 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1803 res1 = src_a0 * src_b;
1808 for (l = ((temp - 1) >> 1); l--;)
1810 src_a0 = LD_SP(pa0);
1814 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1815 res0 += src_a0 * src_b;
1817 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1818 res1 += src_a0 * src_b;
1823 src_a0 = LD_SP(pa0);
1827 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1828 res0 += src_a0 * src_b;
1830 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1831 res1 += src_a0 * src_b;
1839 src_a0 = LD_SP(pa0);
1843 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1844 res0 += src_a0 * src_b;
1846 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1847 res1 += src_a0 * src_b;
1853 #if defined(TRMMKERNEL)
1854 dst0 = res0 * v_alpha;
1855 dst1 = res1 * v_alpha;
1860 dst0 += res0 * v_alpha;
1861 dst1 += res1 * v_alpha;
1869 #if defined(TRMMKERNEL)
1870 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1873 temp -= 4; // number of values in A
1875 temp -= 2; // number of values in B
1882 off += 4; // number of values in A
1889 #if defined(TRMMKERNEL)
1890 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1897 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1900 temp = off + 2; // number of values in A
1902 temp = off + 2; // number of values in B
1923 for (l = ((temp - 1) >> 1); l--;)
1971 tmp0 = alpha * tmp0;
1972 tmp1 = alpha * tmp1;
1973 tmp2 = alpha * tmp2;
1974 tmp3 = alpha * tmp3;
1976 #if defined(TRMMKERNEL)
1990 #if defined(TRMMKERNEL)
1991 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1994 temp -= 2; // number of values in A
1996 temp -= 2; // number of values in B
2003 off += 2; // number of values in A
2010 #if defined(TRMMKERNEL)
2011 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2018 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2021 temp = off + 1; // number of values in A
2023 temp = off + 2; // number of values in B
2040 for (l = ((temp - 1) >> 1); l--;)
2076 tmp0 = alpha * tmp0;
2077 tmp1 = alpha * tmp1;
2079 #if defined(TRMMKERNEL)
2089 #if defined(TRMMKERNEL)
2090 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2093 temp -= 1; // number of values in A
2095 temp -= 2; // number of values in B
2102 off += 1; // number of values in A
2107 #if defined(TRMMKERNEL) && !defined(LEFT)
2108 off += 2; // number of values in B
2119 #if defined(TRMMKERNEL) && defined(LEFT)
2125 for (i = (m >> 3); i--;)
2127 #if defined(TRMMKERNEL)
2128 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2135 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2138 temp = off + 8; // number of values in A
2140 temp = off + 1; // number of values in B
2147 LD_SP2_INC(pa0, 4, src_a0, src_a1);
2150 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2151 res0 = src_a0 * src_b;
2152 res1 = src_a1 * src_b;
2156 for (l = ((temp - 1) >> 1); l--;)
2158 LD_SP2_INC(pa0, 4, src_a0, src_a1);
2161 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2162 res0 += src_a0 * src_b;
2163 res1 += src_a1 * src_b;
2167 LD_SP2_INC(pa0, 4, src_a0, src_a1);
2170 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2171 res0 += src_a0 * src_b;
2172 res1 += src_a1 * src_b;
2179 LD_SP2_INC(pa0, 4, src_a0, src_a1);
2182 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2183 res0 += src_a0 * src_b;
2184 res1 += src_a1 * src_b;
2189 #if defined(TRMMKERNEL)
2190 dst0 = res0 * v_alpha;
2191 dst1 = res1 * v_alpha;
2193 LD_SP2(pc0, 4, dst0, dst1);
2195 dst0 += res0 * v_alpha;
2196 dst1 += res1 * v_alpha;
2198 ST_SP2_INC(dst0, dst1, pc0, 4);
2200 #if defined(TRMMKERNEL)
2201 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2204 temp -= 8; // number of values in A
2206 temp -= 1; // number of values in B
2213 off += 8; // number of values in A
2220 #if defined(TRMMKERNEL)
2221 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2228 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2231 temp = off + 4; // number of values in A
2233 temp = off + 1; // number of values in B
2240 src_a0 = LD_SP(pa0);
2243 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2244 res0 = src_a0 * src_b;
2249 for (l = ((temp - 1) >> 1); l--;)
2251 src_a0 = LD_SP(pa0);
2254 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2255 res0 += src_a0 * src_b;
2260 src_a0 = LD_SP(pa0);
2263 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2264 res0 += src_a0 * src_b;
2272 src_a0 = LD_SP(pa0);
2275 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2276 res0 += src_a0 * src_b;
2282 #if defined(TRMMKERNEL)
2283 dst0 = res0 * v_alpha;
2287 dst0 += res0 * v_alpha;
2293 #if defined(TRMMKERNEL)
2294 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2297 temp -= 4; // number of values in A
2299 temp -= 1; // number of values in B
2306 off += 4; // number of values in A
2313 #if defined(TRMMKERNEL)
2314 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2321 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2324 temp = off + 2; // number of values in A
2326 temp = off + 1; // number of values in B
2343 for (l = ((temp - 1) >> 1); l--;)
2379 tmp0 = alpha * tmp0;
2380 tmp1 = alpha * tmp1;
2382 #if defined(TRMMKERNEL)
2391 #if defined(TRMMKERNEL)
2392 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2395 temp -= 2; // number of values in A
2397 temp -= 1; // number of values in B
2404 off += 2; // number of values in A
2411 #if defined(TRMMKERNEL)
2412 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2419 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2422 temp = off + 1; // number of values in A
2424 temp = off + 1; // number of values in B
2438 for (l = ((temp - 1) >> 1); l--;)
2465 #if defined(TRMMKERNEL)
2466 pc0[0] = alpha * tmp0;
2468 pc0[0] += alpha * tmp0;