fix build error
[platform/upstream/openblas.git] / kernel / mips / sgemm_kernel_8x8_msa.c
1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
31 int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
32           FLOAT *C, BLASLONG ldc
33 #ifdef TRMMKERNEL
34           , BLASLONG offset
35 #endif
36           )
37 {
38     BLASLONG i, j, l, temp;
39 #if defined(TRMMKERNEL)
40     BLASLONG off;
41 #endif
42     FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
43     FLOAT *pa0, *pb0;
44     FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
45     FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
46     FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7;
47     v4f32 v_alpha = {alpha, alpha, alpha, alpha};
48     v4f32 src_a0, src_a1, src_b, src_b0, src_b1;
49     v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
50     v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
51     v4f32 res8, res9, res10, res11, res12, res13, res14, res15;
52
53 #if defined(TRMMKERNEL) && !defined(LEFT)
54     off = -offset;
55 #endif
56
57     for (j = (n >> 3); j--;)
58     {
59         pc0 = C;
60         pc1 = pc0 + ldc;
61         pc2 = pc1 + ldc;
62         pc3 = pc2 + ldc;
63         pc4 = pc3 + ldc;
64         pc5 = pc4 + ldc;
65         pc6 = pc5 + ldc;
66         pc7 = pc6 + ldc;
67
68 #if defined(TRMMKERNEL) && defined(LEFT)
69         off = offset;
70 #endif
71
72         pa0 = A;
73         for (i = (m >> 3); i--;)
74         {
75 #if defined(TRMMKERNEL)
76 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
77             pb0 = B;
78 #else
79             pa0 += off * 8;
80             pb0 = B + off * 8;
81 #endif
82
83 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
84             temp = k - off;
85 #elif defined(LEFT)
86             temp = off + 8; // number of values in A
87 #else
88             temp = off + 8; // number of values in B
89 #endif
90 #else
91             pb0 = B;
92             temp = k;
93 #endif
94 #ifdef ENABLE_PREFETCH
95             __asm__ __volatile__(
96                 "pref   0,   32(%[pa0])   \n\t"
97                 "pref   0,   32(%[pb0])   \n\t"
98
99                 :
100                 : [pa0] "r" (pa0), [pb0] "r" (pb0)
101             );
102 #endif
103
104             LD_SP2_INC(pa0, 4, src_a0, src_a1);
105             LD_SP2_INC(pb0, 4, src_b0, src_b1);
106
107             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
108             res0 = src_a0 * src_b;
109             res1 = src_a1 * src_b;
110
111             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
112             res2 = src_a0 * src_b;
113             res3 = src_a1 * src_b;
114
115             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
116             res4 = src_a0 * src_b;
117             res5 = src_a1 * src_b;
118
119             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
120             res6 = src_a0 * src_b;
121             res7 = src_a1 * src_b;
122
123             src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
124             res8 = src_a0 * src_b;
125             res9 = src_a1 * src_b;
126
127             src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
128             res10 = src_a0 * src_b;
129             res11 = src_a1 * src_b;
130
131             src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
132             res12 = src_a0 * src_b;
133             res13 = src_a1 * src_b;
134
135             src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
136             res14 = src_a0 * src_b;
137             res15 = src_a1 * src_b;
138
139             for (l = ((temp - 1) >> 1); l--;)
140             {
141 #ifdef ENABLE_PREFETCH
142             __asm__ __volatile__(
143                 "pref   0,   64(%[pa0])   \n\t"
144                 "pref   0,   96(%[pa0])   \n\t"
145                 "pref   0,   64(%[pb0])   \n\t"
146                 "pref   0,   96(%[pb0])   \n\t"
147
148                 :
149                 : [pa0] "r" (pa0), [pb0] "r" (pb0)
150             );
151 #endif
152
153                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
154                 LD_SP2_INC(pb0, 4, src_b0, src_b1);
155
156                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
157                 res0 += src_a0 * src_b;
158                 res1 += src_a1 * src_b;
159
160                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
161                 res2 += src_a0 * src_b;
162                 res3 += src_a1 * src_b;
163
164                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
165                 res4 += src_a0 * src_b;
166                 res5 += src_a1 * src_b;
167
168                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
169                 res6 += src_a0 * src_b;
170                 res7 += src_a1 * src_b;
171
172                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
173                 res8 += src_a0 * src_b;
174                 res9 += src_a1 * src_b;
175
176                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
177                 res10 += src_a0 * src_b;
178                 res11 += src_a1 * src_b;
179
180                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
181                 res12 += src_a0 * src_b;
182                 res13 += src_a1 * src_b;
183
184                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
185                 res14 += src_a0 * src_b;
186                 res15 += src_a1 * src_b;
187
188                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
189                 LD_SP2_INC(pb0, 4, src_b0, src_b1);
190
191                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
192                 res0 += src_a0 * src_b;
193                 res1 += src_a1 * src_b;
194
195                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
196                 res2 += src_a0 * src_b;
197                 res3 += src_a1 * src_b;
198
199                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
200                 res4 += src_a0 * src_b;
201                 res5 += src_a1 * src_b;
202
203                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
204                 res6 += src_a0 * src_b;
205                 res7 += src_a1 * src_b;
206
207                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
208                 res8 += src_a0 * src_b;
209                 res9 += src_a1 * src_b;
210
211                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
212                 res10 += src_a0 * src_b;
213                 res11 += src_a1 * src_b;
214
215                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
216                 res12 += src_a0 * src_b;
217                 res13 += src_a1 * src_b;
218
219                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
220                 res14 += src_a0 * src_b;
221                 res15 += src_a1 * src_b;
222             }
223
224             if ((temp - 1) & 1)
225             {
226                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
227                 LD_SP2_INC(pb0, 4, src_b0, src_b1);
228
229                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
230                 res0 += src_a0 * src_b;
231                 res1 += src_a1 * src_b;
232
233                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
234                 res2 += src_a0 * src_b;
235                 res3 += src_a1 * src_b;
236
237                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
238                 res4 += src_a0 * src_b;
239                 res5 += src_a1 * src_b;
240
241                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
242                 res6 += src_a0 * src_b;
243                 res7 += src_a1 * src_b;
244
245                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
246                 res8 += src_a0 * src_b;
247                 res9 += src_a1 * src_b;
248
249                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
250                 res10 += src_a0 * src_b;
251                 res11 += src_a1 * src_b;
252
253                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
254                 res12 += src_a0 * src_b;
255                 res13 += src_a1 * src_b;
256
257                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
258                 res14 += src_a0 * src_b;
259                 res15 += src_a1 * src_b;
260             }
261
262 #if defined(TRMMKERNEL)
263             dst0 = res0 * v_alpha;
264             dst1 = res1 * v_alpha;
265             dst2 = res2 * v_alpha;
266             dst3 = res3 * v_alpha;
267             dst4 = res4 * v_alpha;
268             dst5 = res5 * v_alpha;
269             dst6 = res6 * v_alpha;
270             dst7 = res7 * v_alpha;
271 #else
272             LD_SP2(pc0, 4, dst0, dst1);
273             LD_SP2(pc1, 4, dst2, dst3);
274             LD_SP2(pc2, 4, dst4, dst5);
275             LD_SP2(pc3, 4, dst6, dst7);
276
277             dst0 += res0 * v_alpha;
278             dst1 += res1 * v_alpha;
279             dst2 += res2 * v_alpha;
280             dst3 += res3 * v_alpha;
281             dst4 += res4 * v_alpha;
282             dst5 += res5 * v_alpha;
283             dst6 += res6 * v_alpha;
284             dst7 += res7 * v_alpha;
285 #endif
286             ST_SP2_INC(dst0, dst1, pc0, 4);
287             ST_SP2_INC(dst2, dst3, pc1, 4);
288             ST_SP2_INC(dst4, dst5, pc2, 4);
289             ST_SP2_INC(dst6, dst7, pc3, 4);
290
291 #if defined(TRMMKERNEL)
292             dst0 = res8 * v_alpha;
293             dst1 = res9 * v_alpha;
294             dst2 = res10 * v_alpha;
295             dst3 = res11 * v_alpha;
296             dst4 = res12 * v_alpha;
297             dst5 = res13 * v_alpha;
298             dst6 = res14 * v_alpha;
299             dst7 = res15 * v_alpha;
300 #else
301             LD_SP2(pc4, 4, dst0, dst1);
302             LD_SP2(pc5, 4, dst2, dst3);
303             LD_SP2(pc6, 4, dst4, dst5);
304             LD_SP2(pc7, 4, dst6, dst7);
305
306             dst0 += res8 * v_alpha;
307             dst1 += res9 * v_alpha;
308             dst2 += res10 * v_alpha;
309             dst3 += res11 * v_alpha;
310             dst4 += res12 * v_alpha;
311             dst5 += res13 * v_alpha;
312             dst6 += res14 * v_alpha;
313             dst7 += res15 * v_alpha;
314 #endif
315             ST_SP2_INC(dst0, dst1, pc4, 4);
316             ST_SP2_INC(dst2, dst3, pc5, 4);
317             ST_SP2_INC(dst4, dst5, pc6, 4);
318             ST_SP2_INC(dst6, dst7, pc7, 4);
319
320 #if defined(TRMMKERNEL)
321 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
322             temp = k - off;
323 #ifdef LEFT
324             temp -= 8; // number of values in A
325 #else
326             temp -= 8; // number of values in B
327 #endif
328             pa0 += temp * 8;
329             pb0 += temp * 8;
330 #endif
331
332 #ifdef LEFT
333             off += 8; // number of values in A
334 #endif
335 #endif
336         }
337
338         if (m & 4)
339         {
340 #if defined(TRMMKERNEL)
341 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
342             pb0 = B;
343 #else
344             pa0 += off * 4;
345             pb0 = B + off * 8;
346 #endif
347
348 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
349             temp = k - off;
350 #elif defined(LEFT)
351             temp = off + 4; // number of values in A
352 #else
353             temp = off + 8; // number of values in B
354 #endif
355 #else
356             pb0 = B;
357             temp = k;
358 #endif
359
360             src_a0 = LD_SP(pa0);
361             LD_SP2_INC(pb0, 4, src_b0, src_b1);
362
363             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
364             res0 = src_a0 * src_b;
365
366             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
367             res1 = src_a0 * src_b;
368
369             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
370             res2 = src_a0 * src_b;
371
372             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
373             res3 = src_a0 * src_b;
374
375             src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
376             res4 = src_a0 * src_b;
377
378             src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
379             res5 = src_a0 * src_b;
380
381             src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
382             res6 = src_a0 * src_b;
383
384             src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
385             res7 = src_a0 * src_b;
386
387             pa0 += 4;
388
389             for (l = ((temp - 1) >> 1); l--;)
390             {
391                 src_a0 = LD_SP(pa0);
392                 LD_SP2_INC(pb0, 4, src_b0, src_b1);
393
394                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
395                 res0 += src_a0 * src_b;
396
397                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
398                 res1 += src_a0 * src_b;
399
400                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
401                 res2 += src_a0 * src_b;
402
403                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
404                 res3 += src_a0 * src_b;
405
406                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
407                 res4 += src_a0 * src_b;
408
409                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
410                 res5 += src_a0 * src_b;
411
412                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
413                 res6 += src_a0 * src_b;
414
415                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
416                 res7 += src_a0 * src_b;
417
418                 pa0 += 4;
419
420                 src_a0 = LD_SP(pa0);
421                 LD_SP2_INC(pb0, 4, src_b0, src_b1);
422
423                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
424                 res0 += src_a0 * src_b;
425
426                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
427                 res1 += src_a0 * src_b;
428
429                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
430                 res2 += src_a0 * src_b;
431
432                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
433                 res3 += src_a0 * src_b;
434
435                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
436                 res4 += src_a0 * src_b;
437
438                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
439                 res5 += src_a0 * src_b;
440
441                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
442                 res6 += src_a0 * src_b;
443
444                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
445                 res7 += src_a0 * src_b;
446
447                 pa0 += 4;
448             }
449
450             if ((temp - 1) & 1)
451             {
452                 src_a0 = LD_SP(pa0);
453                 LD_SP2_INC(pb0, 4, src_b0, src_b1);
454
455                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
456                 res0 += src_a0 * src_b;
457
458                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
459                 res1 += src_a0 * src_b;
460
461                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
462                 res2 += src_a0 * src_b;
463
464                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
465                 res3 += src_a0 * src_b;
466
467                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
468                 res4 += src_a0 * src_b;
469
470                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
471                 res5 += src_a0 * src_b;
472
473                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
474                 res6 += src_a0 * src_b;
475
476                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
477                 res7 += src_a0 * src_b;
478
479                 pa0 += 4;
480             }
481
482 #if defined(TRMMKERNEL)
483             dst0 = res0 * v_alpha;
484             dst1 = res1 * v_alpha;
485             dst2 = res2 * v_alpha;
486             dst3 = res3 * v_alpha;
487 #else
488             dst0 = LD_SP(pc0);
489             dst1 = LD_SP(pc1);
490             dst2 = LD_SP(pc2);
491             dst3 = LD_SP(pc3);
492
493             dst0 += res0 * v_alpha;
494             dst1 += res1 * v_alpha;
495             dst2 += res2 * v_alpha;
496             dst3 += res3 * v_alpha;
497 #endif
498             ST_SP(dst0, pc0);
499             ST_SP(dst1, pc1);
500             ST_SP(dst2, pc2);
501             ST_SP(dst3, pc3);
502
503 #if defined(TRMMKERNEL)
504             dst0 = res4 * v_alpha;
505             dst1 = res5 * v_alpha;
506             dst2 = res6 * v_alpha;
507             dst3 = res7 * v_alpha;
508 #else
509             dst0 = LD_SP(pc4);
510             dst1 = LD_SP(pc5);
511             dst2 = LD_SP(pc6);
512             dst3 = LD_SP(pc7);
513
514             dst0 += res4 * v_alpha;
515             dst1 += res5 * v_alpha;
516             dst2 += res6 * v_alpha;
517             dst3 += res7 * v_alpha;
518 #endif
519             ST_SP(dst0, pc4);
520             ST_SP(dst1, pc5);
521             ST_SP(dst2, pc6);
522             ST_SP(dst3, pc7);
523
524             pc0 += 4;
525             pc1 += 4;
526             pc2 += 4;
527             pc3 += 4;
528             pc4 += 4;
529             pc5 += 4;
530             pc6 += 4;
531             pc7 += 4;
532
533 #if defined(TRMMKERNEL)
534 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
535             temp = k - off;
536 #ifdef LEFT
537             temp -= 4; // number of values in A
538 #else
539             temp -= 8; // number of values in B
540 #endif
541             pa0 += temp * 4;
542             pb0 += temp * 8;
543 #endif
544
545 #ifdef LEFT
546             off += 4; // number of values in A
547 #endif
548 #endif
549         }
550
551         if (m & 2)
552         {
553 #if defined(TRMMKERNEL)
554 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
555             pb0 = B;
556 #else
557             pa0 += off * 2;
558             pb0 = B + off * 8;
559 #endif
560
561 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
562             temp = k - off;
563 #elif defined(LEFT)
564             temp = off + 2; // number of values in A
565 #else
566             temp = off + 8; // number of values in B
567 #endif
568 #else
569             pb0 = B;
570             temp = k;
571 #endif
572
573             a0 = pa0[0];
574             b0 = pb0[0];
575             tmp0 = a0 * b0;
576
577             a1 = pa0[1];
578             tmp1 = a1 * b0;
579
580             b1 = pb0[1];
581             tmp2 = a0 * b1;
582             tmp3 = a1 * b1;
583
584             b2 = pb0[2];
585             tmp4 = a0 * b2;
586             tmp5 = a1 * b2;
587
588             b3 = pb0[3];
589             tmp6 = a0 * b3;
590             tmp7 = a1 * b3;
591
592             b4 = pb0[4];
593             tmp8 = a0 * b4;
594             tmp9 = a1 * b4;
595
596             b5 = pb0[5];
597             tmp10 = a0 * b5;
598             tmp11 = a1 * b5;
599
600             b6 = pb0[6];
601             tmp12 = a0 * b6;
602             tmp13 = a1 * b6;
603
604             b7 = pb0[7];
605             tmp14 = a0 * b7;
606             tmp15 = a1 * b7;
607
608             pa0 += 2;
609             pb0 += 8;
610
611             for (l = ((temp - 1) >> 1); l--;)
612             {
613                 a0 = pa0[0];
614                 b0 = pb0[0];
615                 tmp0 += a0 * b0;
616
617                 a1 = pa0[1];
618                 tmp1 += a1 * b0;
619
620                 b1 = pb0[1];
621                 tmp2 += a0 * b1;
622                 tmp3 += a1 * b1;
623
624                 b2 = pb0[2];
625                 tmp4 += a0 * b2;
626                 tmp5 += a1 * b2;
627
628                 b3 = pb0[3];
629                 tmp6 += a0 * b3;
630                 tmp7 += a1 * b3;
631
632                 b4 = pb0[4];
633                 tmp8 += a0 * b4;
634                 tmp9 += a1 * b4;
635
636                 b5 = pb0[5];
637                 tmp10 += a0 * b5;
638                 tmp11 += a1 * b5;
639
640                 b6 = pb0[6];
641                 tmp12 += a0 * b6;
642                 tmp13 += a1 * b6;
643
644                 b7 = pb0[7];
645                 tmp14 += a0 * b7;
646                 tmp15 += a1 * b7;
647
648                 pa0 += 2;
649                 pb0 += 8;
650
651                 a0 = pa0[0];
652                 b0 = pb0[0];
653                 tmp0 += a0 * b0;
654
655                 a1 = pa0[1];
656                 tmp1 += a1 * b0;
657
658                 b1 = pb0[1];
659                 tmp2 += a0 * b1;
660                 tmp3 += a1 * b1;
661
662                 b2 = pb0[2];
663                 tmp4 += a0 * b2;
664                 tmp5 += a1 * b2;
665
666                 b3 = pb0[3];
667                 tmp6 += a0 * b3;
668                 tmp7 += a1 * b3;
669
670                 b4 = pb0[4];
671                 tmp8 += a0 * b4;
672                 tmp9 += a1 * b4;
673
674                 b5 = pb0[5];
675                 tmp10 += a0 * b5;
676                 tmp11 += a1 * b5;
677
678                 b6 = pb0[6];
679                 tmp12 += a0 * b6;
680                 tmp13 += a1 * b6;
681
682                 b7 = pb0[7];
683                 tmp14 += a0 * b7;
684                 tmp15 += a1 * b7;
685
686                 pa0 += 2;
687                 pb0 += 8;
688             }
689
690             if ((temp - 1) & 1)
691             {
692                 a0 = pa0[0];
693                 b0 = pb0[0];
694                 tmp0 += a0 * b0;
695
696                 a1 = pa0[1];
697                 tmp1 += a1 * b0;
698
699                 b1 = pb0[1];
700                 tmp2 += a0 * b1;
701                 tmp3 += a1 * b1;
702
703                 b2 = pb0[2];
704                 tmp4 += a0 * b2;
705                 tmp5 += a1 * b2;
706
707                 b3 = pb0[3];
708                 tmp6 += a0 * b3;
709                 tmp7 += a1 * b3;
710
711                 b4 = pb0[4];
712                 tmp8 += a0 * b4;
713                 tmp9 += a1 * b4;
714
715                 b5 = pb0[5];
716                 tmp10 += a0 * b5;
717                 tmp11 += a1 * b5;
718
719                 b6 = pb0[6];
720                 tmp12 += a0 * b6;
721                 tmp13 += a1 * b6;
722
723                 b7 = pb0[7];
724                 tmp14 += a0 * b7;
725                 tmp15 += a1 * b7;
726
727                 pa0 += 2;
728                 pb0 += 8;
729             }
730
731             tmp0 = alpha * tmp0;
732             tmp2 = alpha * tmp2;
733             tmp4 = alpha * tmp4;
734             tmp6 = alpha * tmp6;
735             tmp8 = alpha * tmp8;
736             tmp10 = alpha * tmp10;
737             tmp12 = alpha * tmp12;
738             tmp14 = alpha * tmp14;
739
740 #if defined(TRMMKERNEL)
741             pc0[0] = tmp0;
742             pc1[0] = tmp2;
743             pc2[0] = tmp4;
744             pc3[0] = tmp6;
745             pc4[0] = tmp8;
746             pc5[0] = tmp10;
747             pc6[0] = tmp12;
748             pc7[0] = tmp14;
749 #else
750             pc0[0] += tmp0;
751             pc1[0] += tmp2;
752             pc2[0] += tmp4;
753             pc3[0] += tmp6;
754             pc4[0] += tmp8;
755             pc5[0] += tmp10;
756             pc6[0] += tmp12;
757             pc7[0] += tmp14;
758 #endif
759             tmp1 = alpha * tmp1;
760             tmp3 = alpha * tmp3;
761             tmp5 = alpha * tmp5;
762             tmp7 = alpha * tmp7;
763             tmp9 = alpha * tmp9;
764             tmp11 = alpha * tmp11;
765             tmp13 = alpha * tmp13;
766             tmp15 = alpha * tmp15;
767
768 #if defined(TRMMKERNEL)
769             pc0[1] = tmp1;
770             pc1[1] = tmp3;
771             pc2[1] = tmp5;
772             pc3[1] = tmp7;
773             pc4[1] = tmp9;
774             pc5[1] = tmp11;
775             pc6[1] = tmp13;
776             pc7[1] = tmp15;
777 #else
778             pc0[1] += tmp1;
779             pc1[1] += tmp3;
780             pc2[1] += tmp5;
781             pc3[1] += tmp7;
782             pc4[1] += tmp9;
783             pc5[1] += tmp11;
784             pc6[1] += tmp13;
785             pc7[1] += tmp15;
786 #endif
787             pc0 += 2;
788             pc1 += 2;
789             pc2 += 2;
790             pc3 += 2;
791             pc4 += 2;
792             pc5 += 2;
793             pc6 += 2;
794             pc7 += 2;
795
796 #if defined(TRMMKERNEL)
797 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
798             temp = k - off;
799 #ifdef LEFT
800             temp -= 2; // number of values in A
801 #else
802             temp -= 8; // number of values in B
803 #endif
804             pa0 += temp * 2;
805             pb0 += temp * 8;
806 #endif
807
808 #ifdef LEFT
809             off += 2; // number of values in A
810 #endif
811 #endif
812         }
813
814         if (m & 1)
815         {
816 #if defined(TRMMKERNEL)
817 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
818             pb0 = B;
819 #else
820             pa0 += off * 1;
821             pb0 = B + off * 8;
822 #endif
823
824 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
825             temp = k - off;
826 #elif defined(LEFT)
827             temp = off + 1; // number of values in A
828 #else
829             temp = off + 8; // number of values in B
830 #endif
831 #else
832             pb0 = B;
833             temp = k;
834 #endif
835
836             a0 = pa0[0];
837             b0 = pb0[0];
838             tmp0 = a0 * b0;
839
840             b1 = pb0[1];
841             tmp1 = a0 * b1;
842
843             b2 = pb0[2];
844             tmp2 = a0 * b2;
845
846             b3 = pb0[3];
847             tmp3 = a0 * b3;
848
849             b4 = pb0[4];
850             tmp4 = a0 * b4;
851
852             b5 = pb0[5];
853             tmp5 = a0 * b5;
854
855             b6 = pb0[6];
856             tmp6 = a0 * b6;
857
858             b7 = pb0[7];
859             tmp7 = a0 * b7;
860
861             pa0 += 1;
862             pb0 += 8;
863
864             for (l = ((temp - 1) >> 1); l--;)
865             {
866                 a0 = pa0[0];
867                 b0 = pb0[0];
868                 tmp0 += a0 * b0;
869
870                 b1 = pb0[1];
871                 tmp1  += a0 * b1;
872
873                 b2 = pb0[2];
874                 tmp2 += a0 * b2;
875
876                 b3 = pb0[3];
877                 tmp3 += a0 * b3;
878
879                 b4 = pb0[4];
880                 tmp4 += a0 * b4;
881
882                 b5 = pb0[5];
883                 tmp5 += a0 * b5;
884
885                 b6 = pb0[6];
886                 tmp6 += a0 * b6;
887
888                 b7 = pb0[7];
889                 tmp7 += a0 * b7;
890
891                 pa0 += 1;
892                 pb0 += 8;
893
894                 a0 = pa0[0];
895                 b0 = pb0[0];
896                 tmp0 += a0 * b0;
897
898                 b1 = pb0[1];
899                 tmp1  += a0 * b1;
900
901                 b2 = pb0[2];
902                 tmp2 += a0 * b2;
903
904                 b3 = pb0[3];
905                 tmp3 += a0 * b3;
906
907                 b4 = pb0[4];
908                 tmp4 += a0 * b4;
909
910                 b5 = pb0[5];
911                 tmp5 += a0 * b5;
912
913                 b6 = pb0[6];
914                 tmp6 += a0 * b6;
915
916                 b7 = pb0[7];
917                 tmp7 += a0 * b7;
918
919                 pa0 += 1;
920                 pb0 += 8;
921             }
922
923             if ((temp - 1) & 1)
924             {
925                 a0 = pa0[0];
926                 b0 = pb0[0];
927                 tmp0 += a0 * b0;
928
929                 b1 = pb0[1];
930                 tmp1 += a0 * b1;
931
932                 b2 = pb0[2];
933                 tmp2 += a0 * b2;
934
935                 b3 = pb0[3];
936                 tmp3 += a0 * b3;
937
938                 b4 = pb0[4];
939                 tmp4 += a0 * b4;
940
941                 b5 = pb0[5];
942                 tmp5 += a0 * b5;
943
944                 b6 = pb0[6];
945                 tmp6 += a0 * b6;
946
947                 b7 = pb0[7];
948                 tmp7 += a0 * b7;
949
950                 pa0 += 1;
951                 pb0 += 8;
952             }
953
954             tmp0 = alpha * tmp0;
955             tmp1 = alpha * tmp1;
956             tmp2 = alpha * tmp2;
957             tmp3 = alpha * tmp3;
958             tmp4 = alpha * tmp4;
959             tmp5 = alpha * tmp5;
960             tmp6 = alpha * tmp6;
961             tmp7 = alpha * tmp7;
962
963 #if defined(TRMMKERNEL)
964             pc0[0] = tmp0;
965             pc1[0] = tmp1;
966             pc2[0] = tmp2;
967             pc3[0] = tmp3;
968             pc4[0] = tmp4;
969             pc5[0] = tmp5;
970             pc6[0] = tmp6;
971             pc7[0] = tmp7;
972 #else
973             pc0[0] += tmp0;
974             pc1[0] += tmp1;
975             pc2[0] += tmp2;
976             pc3[0] += tmp3;
977             pc4[0] += tmp4;
978             pc5[0] += tmp5;
979             pc6[0] += tmp6;
980             pc7[0] += tmp7;
981 #endif
982             pc0 += 1;
983             pc1 += 1;
984             pc2 += 1;
985             pc3 += 1;
986             pc4 += 1;
987             pc5 += 1;
988             pc6 += 1;
989             pc7 += 1;
990
991 #if defined(TRMMKERNEL)
992 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
993             temp = k - off;
994 #ifdef LEFT
995             temp -= 1; // number of values in A
996 #else
997             temp -= 8; // number of values in B
998 #endif
999             pa0 += temp * 1;
1000             pb0 += temp * 8;
1001 #endif
1002
1003 #ifdef LEFT
1004             off += 1; // number of values in A
1005 #endif
1006 #endif
1007         }
1008
1009 #if defined(TRMMKERNEL) && !defined(LEFT)
1010         off += 8; // number of values in A
1011 #endif
1012
1013         B += (k << 3);
1014         C += (ldc << 3);
1015     }
1016
1017     if (n & 4)
1018     {
1019         pc0 = C;
1020         pc1 = pc0 + ldc;
1021         pc2 = pc1 + ldc;
1022         pc3 = pc2 + ldc;
1023
1024 #if defined(TRMMKERNEL) && defined(LEFT)
1025         off = offset;
1026 #endif
1027
1028         pa0 = A;
1029
1030         for (i = (m >> 3); i--;)
1031         {
1032 #if defined(TRMMKERNEL)
1033 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1034             pb0 = B;
1035 #else
1036             pa0 += off * 8;
1037             pb0 = B + off * 4;
1038 #endif
1039
1040 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1041             temp = k - off;
1042 #elif defined(LEFT)
1043             temp = off + 8; // number of values in A
1044 #else
1045             temp = off + 4; // number of values in B
1046 #endif
1047 #else
1048             pb0 = B;
1049             temp = k;
1050 #endif
1051
1052             LD_SP2_INC(pa0, 4, src_a0, src_a1);
1053             src_b0 = LD_SP(pb0);
1054
1055             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1056             res0 = src_a0 * src_b;
1057             res1 = src_a1 * src_b;
1058
1059             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1060             res2 = src_a0 * src_b;
1061             res3 = src_a1 * src_b;
1062
1063             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1064             res4 = src_a0 * src_b;
1065             res5 = src_a1 * src_b;
1066
1067             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1068             res6 = src_a0 * src_b;
1069             res7 = src_a1 * src_b;
1070
1071             pb0 += 4;
1072
1073             for (l = ((temp - 1) >> 1); l--;)
1074             {
1075                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1076                 src_b0 = LD_SP(pb0);
1077
1078                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1079                 res0 += src_a0 * src_b;
1080                 res1 += src_a1 * src_b;
1081
1082                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1083                 res2 += src_a0 * src_b;
1084                 res3 += src_a1 * src_b;
1085
1086                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1087                 res4 += src_a0 * src_b;
1088                 res5 += src_a1 * src_b;
1089
1090                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1091                 res6 += src_a0 * src_b;
1092                 res7 += src_a1 * src_b;
1093
1094                 pb0 += 4;
1095
1096                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1097                 src_b0 = LD_SP(pb0);
1098
1099                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1100                 res0 += src_a0 * src_b;
1101                 res1 += src_a1 * src_b;
1102
1103                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1104                 res2 += src_a0 * src_b;
1105                 res3 += src_a1 * src_b;
1106
1107                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1108                 res4 += src_a0 * src_b;
1109                 res5 += src_a1 * src_b;
1110
1111                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1112                 res6 += src_a0 * src_b;
1113                 res7 += src_a1 * src_b;
1114
1115                 pb0 += 4;
1116             }
1117
1118             if ((temp - 1) & 1)
1119             {
1120                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1121                 src_b0 = LD_SP(pb0);
1122
1123                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1124                 res0 += src_a0 * src_b;
1125                 res1 += src_a1 * src_b;
1126
1127                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1128                 res2 += src_a0 * src_b;
1129                 res3 += src_a1 * src_b;
1130
1131                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1132                 res4 += src_a0 * src_b;
1133                 res5 += src_a1 * src_b;
1134
1135                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1136                 res6 += src_a0 * src_b;
1137                 res7 += src_a1 * src_b;
1138
1139                 pb0 += 4;
1140             }
1141
1142 #if defined(TRMMKERNEL)
1143             dst0 = res0 * v_alpha;
1144             dst1 = res1 * v_alpha;
1145             dst2 = res2 * v_alpha;
1146             dst3 = res3 * v_alpha;
1147             dst4 = res4 * v_alpha;
1148             dst5 = res5 * v_alpha;
1149             dst6 = res6 * v_alpha;
1150             dst7 = res7 * v_alpha;
1151 #else
1152             LD_SP2(pc0, 4, dst0, dst1);
1153             LD_SP2(pc1, 4, dst2, dst3);
1154             LD_SP2(pc2, 4, dst4, dst5);
1155             LD_SP2(pc3, 4, dst6, dst7);
1156
1157             dst0 += res0 * v_alpha;
1158             dst1 += res1 * v_alpha;
1159             dst2 += res2 * v_alpha;
1160             dst3 += res3 * v_alpha;
1161             dst4 += res4 * v_alpha;
1162             dst5 += res5 * v_alpha;
1163             dst6 += res6 * v_alpha;
1164             dst7 += res7 * v_alpha;
1165 #endif
1166             ST_SP2_INC(dst0, dst1, pc0, 4);
1167             ST_SP2_INC(dst2, dst3, pc1, 4);
1168             ST_SP2_INC(dst4, dst5, pc2, 4);
1169             ST_SP2_INC(dst6, dst7, pc3, 4);
1170
1171 #if defined(TRMMKERNEL)
1172 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1173             temp = k - off;
1174 #ifdef LEFT
1175             temp -= 8; // number of values in A
1176 #else
1177             temp -= 4; // number of values in B
1178 #endif
1179             pa0 += temp * 8;
1180             pb0 += temp * 4;
1181 #endif
1182
1183 #ifdef LEFT
1184             off += 8; // number of values in A
1185 #endif
1186 #endif
1187         }
1188
1189         if (m & 4)
1190         {
1191 #if defined(TRMMKERNEL)
1192 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1193             pb0 = B;
1194 #else
1195             pa0 += off * 4;
1196             pb0 = B + off * 4;
1197 #endif
1198
1199 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1200             temp = k - off;
1201 #elif defined(LEFT)
1202             temp = off + 4; // number of values in A
1203 #else
1204             temp = off + 4; // number of values in B
1205 #endif
1206 #else
1207             pb0 = B;
1208             temp = k;
1209 #endif
1210
1211             src_a0 = LD_SP(pa0);
1212             src_b0 = LD_SP(pb0);
1213
1214             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1215             res0 = src_a0 * src_b;
1216
1217             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1218             res1 = src_a0 * src_b;
1219
1220             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1221             res2 = src_a0 * src_b;
1222
1223             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1224             res3 = src_a0 * src_b;
1225
1226             pa0 += 4;
1227             pb0 += 4;
1228
1229             for (l = ((temp - 1) >> 1); l--;)
1230             {
1231                 src_a0 = LD_SP(pa0);
1232                 src_b0 = LD_SP(pb0);
1233
1234                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1235                 res0 += src_a0 * src_b;
1236
1237                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1238                 res1 += src_a0 * src_b;
1239
1240                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1241                 res2 += src_a0 * src_b;
1242
1243                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1244                 res3 += src_a0 * src_b;
1245
1246                 pa0 += 4;
1247                 pb0 += 4;
1248
1249                 src_a0 = LD_SP(pa0);
1250                 src_b0 = LD_SP(pb0);
1251
1252                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1253                 res0 += src_a0 * src_b;
1254
1255                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1256                 res1 += src_a0 * src_b;
1257
1258                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1259                 res2 += src_a0 * src_b;
1260
1261                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1262                 res3 += src_a0 * src_b;
1263
1264                 pa0 += 4;
1265                 pb0 += 4;
1266             }
1267
1268             if ((temp - 1) & 1)
1269             {
1270                 src_a0 = LD_SP(pa0);
1271                 src_b0 = LD_SP(pb0);
1272
1273                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1274                 res0 += src_a0 * src_b;
1275
1276                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1277                 res1 += src_a0 * src_b;
1278
1279                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
1280                 res2 += src_a0 * src_b;
1281
1282                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
1283                 res3 += src_a0 * src_b;
1284
1285                 pa0 += 4;
1286                 pb0 += 4;
1287             }
1288
1289 #if defined(TRMMKERNEL)
1290             dst0 = res0 * v_alpha;
1291             dst1 = res1 * v_alpha;
1292             dst2 = res2 * v_alpha;
1293             dst3 = res3 * v_alpha;
1294 #else
1295             dst0 = LD_SP(pc0);
1296             dst1 = LD_SP(pc1);
1297             dst2 = LD_SP(pc2);
1298             dst3 = LD_SP(pc3);
1299
1300             dst0 += res0 * v_alpha;
1301             dst1 += res1 * v_alpha;
1302             dst2 += res2 * v_alpha;
1303             dst3 += res3 * v_alpha;
1304 #endif
1305             ST_SP(dst0, pc0);
1306             ST_SP(dst1, pc1);
1307             ST_SP(dst2, pc2);
1308             ST_SP(dst3, pc3);
1309
1310             pc0 += 4;
1311             pc1 += 4;
1312             pc2 += 4;
1313             pc3 += 4;
1314
1315 #if defined(TRMMKERNEL)
1316 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1317             temp = k - off;
1318 #ifdef LEFT
1319             temp -= 4; // number of values in A
1320 #else
1321             temp -= 4; // number of values in B
1322 #endif
1323             pa0 += temp * 4;
1324             pb0 += temp * 4;
1325 #endif
1326
1327 #ifdef LEFT
1328             off += 4; // number of values in A
1329 #endif
1330 #endif
1331         }
1332
1333         if (m & 2)
1334         {
1335 #if defined(TRMMKERNEL)
1336 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1337             pb0 = B;
1338 #else
1339             pa0 += off * 2;
1340             pb0 = B + off * 4;
1341 #endif
1342
1343 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1344             temp = k - off;
1345 #elif defined(LEFT)
1346             temp = off + 2; // number of values in A
1347 #else
1348             temp = off + 4; // number of values in B
1349 #endif
1350 #else
1351             pb0 = B;
1352             temp = k;
1353 #endif
1354
1355             a0 = pa0[0];
1356             b0 = pb0[0];
1357             tmp0 = a0 * b0;
1358
1359             a1 = pa0[1];
1360             tmp1 = a1 * b0;
1361
1362             b1 = pb0[1];
1363             tmp2 = a0 * b1;
1364             tmp3 = a1 * b1;
1365
1366             b2 = pb0[2];
1367             tmp4 = a0 * b2;
1368             tmp5 = a1 * b2;
1369
1370             b3 = pb0[3];
1371             tmp6 = a0 * b3;
1372             tmp7 = a1 * b3;
1373
1374             pa0 += 2;
1375             pb0 += 4;
1376
1377             for (l = ((temp - 1) >> 1); l--;)
1378             {
1379                 a0 = pa0[0];
1380                 b0 = pb0[0];
1381                 tmp0 += a0 * b0;
1382
1383                 a1 = pa0[1];
1384                 tmp1 += a1 * b0;
1385
1386                 b1 = pb0[1];
1387                 tmp2  += a0 * b1;
1388                 tmp3  += a1 * b1;
1389
1390                 b2 = pb0[2];
1391                 tmp4 += a0 * b2;
1392                 tmp5 += a1 * b2;
1393
1394                 b3 = pb0[3];
1395                 tmp6 += a0 * b3;
1396                 tmp7 += a1 * b3;
1397
1398                 pa0 += 2;
1399                 pb0 += 4;
1400
1401                 a0 = pa0[0];
1402                 b0 = pb0[0];
1403                 tmp0 += a0 * b0;
1404
1405                 a1 = pa0[1];
1406                 tmp1 += a1 * b0;
1407
1408                 b1 = pb0[1];
1409                 tmp2  += a0 * b1;
1410                 tmp3  += a1 * b1;
1411
1412                 b2 = pb0[2];
1413                 tmp4 += a0 * b2;
1414                 tmp5 += a1 * b2;
1415
1416                 b3 = pb0[3];
1417                 tmp6 += a0 * b3;
1418                 tmp7 += a1 * b3;
1419
1420                 pa0 += 2;
1421                 pb0 += 4;
1422             }
1423
1424             if ((temp - 1) & 1)
1425             {
1426                 a0 = pa0[0];
1427                 b0 = pb0[0];
1428                 tmp0 += a0 * b0;
1429
1430                 a1 = pa0[1];
1431                 tmp1 += a1 * b0;
1432
1433                 b1 = pb0[1];
1434                 tmp2  += a0 * b1;
1435                 tmp3  += a1 * b1;
1436
1437                 b2 = pb0[2];
1438                 tmp4 += a0 * b2;
1439                 tmp5 += a1 * b2;
1440
1441                 b3 = pb0[3];
1442                 tmp6 += a0 * b3;
1443                 tmp7 += a1 * b3;
1444
1445                 pa0 += 2;
1446                 pb0 += 4;
1447             }
1448
1449             tmp0 = alpha * tmp0;
1450             tmp2 = alpha * tmp2;
1451             tmp4 = alpha * tmp4;
1452             tmp6 = alpha * tmp6;
1453
1454 #if defined(TRMMKERNEL)
1455             pc0[0] = tmp0;
1456             pc1[0] = tmp2;
1457             pc2[0] = tmp4;
1458             pc3[0] = tmp6;
1459 #else
1460             pc0[0] += tmp0;
1461             pc1[0] += tmp2;
1462             pc2[0] += tmp4;
1463             pc3[0] += tmp6;
1464 #endif
1465             tmp1 = alpha * tmp1;
1466             tmp3 = alpha * tmp3;
1467             tmp5 = alpha * tmp5;
1468             tmp7 = alpha * tmp7;
1469
1470 #if defined(TRMMKERNEL)
1471             pc0[1] = tmp1;
1472             pc1[1] = tmp3;
1473             pc2[1] = tmp5;
1474             pc3[1] = tmp7;
1475 #else
1476             pc0[1] += tmp1;
1477             pc1[1] += tmp3;
1478             pc2[1] += tmp5;
1479             pc3[1] += tmp7;
1480 #endif
1481             pc0 += 2;
1482             pc1 += 2;
1483             pc2 += 2;
1484             pc3 += 2;
1485
1486 #if defined(TRMMKERNEL)
1487 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1488             temp = k - off;
1489 #ifdef LEFT
1490             temp -= 2; // number of values in A
1491 #else
1492             temp -= 4; // number of values in B
1493 #endif
1494             pa0 += temp * 2;
1495             pb0 += temp * 4;
1496 #endif
1497
1498 #ifdef LEFT
1499             off += 2; // number of values in A
1500 #endif
1501 #endif
1502         }
1503
1504         if (m & 1)
1505         {
1506 #if defined(TRMMKERNEL)
1507 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1508             pb0 = B;
1509 #else
1510             pa0 += off * 1;
1511             pb0 = B + off * 4;
1512 #endif
1513
1514 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1515             temp = k - off;
1516 #elif defined(LEFT)
1517             temp = off + 1; // number of values in A
1518 #else
1519             temp = off + 4; // number of values in B
1520 #endif
1521 #else
1522             pb0 = B;
1523             temp = k;
1524 #endif
1525
1526             a0 = pa0[0];
1527             b0 = pb0[0];
1528             tmp0 = a0 * b0;
1529
1530             b1 = pb0[1];
1531             tmp1 = a0 * b1;
1532
1533             b2 = pb0[2];
1534             tmp2 = a0 * b2;
1535
1536             b3 = pb0[3];
1537             tmp3 = a0 * b3;
1538
1539             pa0 += 1;
1540             pb0 += 4;
1541
1542             for (l = ((temp - 1) >> 1); l--;)
1543             {
1544                 a0 = pa0[0];
1545                 b0 = pb0[0];
1546                 tmp0 += a0 * b0;
1547
1548                 b1 = pb0[1];
1549                 tmp1  += a0 * b1;
1550
1551                 b2 = pb0[2];
1552                 tmp2 += a0 * b2;
1553
1554                 b3 = pb0[3];
1555                 tmp3 += a0 * b3;
1556
1557                 pa0 += 1;
1558                 pb0 += 4;
1559
1560                 a0 = pa0[0];
1561                 b0 = pb0[0];
1562                 tmp0 += a0 * b0;
1563
1564                 b1 = pb0[1];
1565                 tmp1  += a0 * b1;
1566
1567                 b2 = pb0[2];
1568                 tmp2 += a0 * b2;
1569
1570                 b3 = pb0[3];
1571                 tmp3 += a0 * b3;
1572
1573                 pa0 += 1;
1574                 pb0 += 4;
1575             }
1576
1577             if ((temp - 1) & 1)
1578             {
1579                 a0 = pa0[0];
1580                 b0 = pb0[0];
1581                 tmp0 += a0 * b0;
1582
1583                 b1 = pb0[1];
1584                 tmp1  += a0 * b1;
1585
1586                 b2 = pb0[2];
1587                 tmp2 += a0 * b2;
1588
1589                 b3 = pb0[3];
1590                 tmp3 += a0 * b3;
1591
1592                 pa0 += 1;
1593                 pb0 += 4;
1594             }
1595
1596             tmp0 = alpha * tmp0;
1597             tmp1 = alpha * tmp1;
1598             tmp2 = alpha * tmp2;
1599             tmp3 = alpha * tmp3;
1600
1601 #if defined(TRMMKERNEL)
1602             pc0[0] = tmp0;
1603             pc1[0] = tmp1;
1604             pc2[0] = tmp2;
1605             pc3[0] = tmp3;
1606 #else
1607             pc0[0] += tmp0;
1608             pc1[0] += tmp1;
1609             pc2[0] += tmp2;
1610             pc3[0] += tmp3;
1611 #endif
1612             pc0 += 1;
1613             pc1 += 1;
1614             pc2 += 1;
1615             pc3 += 1;
1616
1617 #if defined(TRMMKERNEL)
1618 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1619             temp = k - off;
1620 #ifdef LEFT
1621             temp -= 1; // number of values in A
1622 #else
1623             temp -= 4; // number of values in B
1624 #endif
1625             pa0 += temp * 1;
1626             pb0 += temp * 4;
1627 #endif
1628
1629 #ifdef LEFT
1630             off += 1; // number of values in A
1631 #endif
1632 #endif
1633         }
1634
1635 #if defined(TRMMKERNEL) && !defined(LEFT)
1636         off += 4; // number of values in B
1637 #endif
1638
1639         B += (k << 2);
1640         C += (ldc << 2);
1641     }
1642
1643     if (n & 2)
1644     {
1645         pc0 = C;
1646         pc1 = pc0 + ldc;
1647
1648 #if defined(TRMMKERNEL) && defined(LEFT)
1649         off = offset;
1650 #endif
1651
1652         pa0 = A;
1653
1654         for (i = (m >> 3); i--;)
1655         {
1656 #if defined(TRMMKERNEL)
1657 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1658             pb0 = B;
1659 #else
1660             pa0 += off * 8;
1661             pb0 = B + off * 2;
1662 #endif
1663
1664 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1665             temp = k - off;
1666 #elif defined(LEFT)
1667             temp = off + 8; // number of values in A
1668 #else
1669             temp = off + 2; // number of values in B
1670 #endif
1671 #else
1672             pb0 = B;
1673             temp = k;
1674 #endif
1675
1676             LD_SP2_INC(pa0, 4, src_a0, src_a1);
1677             src_b0[0] = pb0[0];
1678             src_b0[1] = pb0[1];
1679
1680             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1681             res0 = src_a0 * src_b;
1682             res1 = src_a1 * src_b;
1683
1684             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1685             res2 = src_a0 * src_b;
1686             res3 = src_a1 * src_b;
1687
1688             pb0 += 2;
1689
1690             for (l = ((temp - 1) >> 1); l--;)
1691             {
1692                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1693                 src_b0[0] = pb0[0];
1694                 src_b0[1] = pb0[1];
1695
1696                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1697                 res0 += src_a0 * src_b;
1698                 res1 += src_a1 * src_b;
1699
1700                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1701                 res2 += src_a0 * src_b;
1702                 res3 += src_a1 * src_b;
1703
1704                 pb0 += 2;
1705
1706                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1707                 src_b0[0] = pb0[0];
1708                 src_b0[1] = pb0[1];
1709
1710                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1711                 res0 += src_a0 * src_b;
1712                 res1 += src_a1 * src_b;
1713
1714                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1715                 res2 += src_a0 * src_b;
1716                 res3 += src_a1 * src_b;
1717
1718                 pb0 += 2;
1719             }
1720
1721             if ((temp - 1) & 1)
1722             {
1723                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
1724                 src_b0[0] = pb0[0];
1725                 src_b0[1] = pb0[1];
1726
1727                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1728                 res0 += src_a0 * src_b;
1729                 res1 += src_a1 * src_b;
1730
1731                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1732                 res2 += src_a0 * src_b;
1733                 res3 += src_a1 * src_b;
1734
1735                 pb0 += 2;
1736             }
1737
1738 #if defined(TRMMKERNEL)
1739             dst0 = res0 * v_alpha;
1740             dst1 = res1 * v_alpha;
1741             dst2 = res2 * v_alpha;
1742             dst3 = res3 * v_alpha;
1743 #else
1744             LD_SP2(pc0, 4, dst0, dst1);
1745             LD_SP2(pc1, 4, dst2, dst3);
1746
1747             dst0 += res0 * v_alpha;
1748             dst1 += res1 * v_alpha;
1749             dst2 += res2 * v_alpha;
1750             dst3 += res3 * v_alpha;
1751 #endif
1752             ST_SP2_INC(dst0, dst1, pc0, 4);
1753             ST_SP2_INC(dst2, dst3, pc1, 4);
1754
1755 #if defined(TRMMKERNEL)
1756 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1757             temp = k - off;
1758 #ifdef LEFT
1759             temp -= 8; // number of values in A
1760 #else
1761             temp -= 2; // number of values in B
1762 #endif
1763             pa0 += temp * 8;
1764             pb0 += temp * 2;
1765 #endif
1766
1767 #ifdef LEFT
1768             off += 8; // number of values in A
1769 #endif
1770 #endif
1771         }
1772
1773         if (m & 4)
1774         {
1775 #if defined(TRMMKERNEL)
1776 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1777             pb0 = B;
1778 #else
1779             pa0 += off * 4;
1780             pb0 = B + off * 2;
1781 #endif
1782
1783 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1784             temp = k - off;
1785 #elif defined(LEFT)
1786             temp = off + 4; // number of values in A
1787 #else
1788             temp = off + 2; // number of values in B
1789 #endif
1790 #else
1791             pb0 = B;
1792             temp = k;
1793 #endif
1794
1795             src_a0 = LD_SP(pa0);
1796             src_b0[0] = pb0[0];
1797             src_b0[1] = pb0[1];
1798
1799             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1800             res0 = src_a0 * src_b;
1801
1802             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1803             res1 = src_a0 * src_b;
1804
1805             pa0 += 4;
1806             pb0 += 2;
1807
1808             for (l = ((temp - 1) >> 1); l--;)
1809             {
1810                 src_a0 = LD_SP(pa0);
1811                 src_b0[0] = pb0[0];
1812                 src_b0[1] = pb0[1];
1813
1814                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1815                 res0 += src_a0 * src_b;
1816
1817                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1818                 res1 += src_a0 * src_b;
1819
1820                 pa0 += 4;
1821                 pb0 += 2;
1822
1823                 src_a0 = LD_SP(pa0);
1824                 src_b0[0] = pb0[0];
1825                 src_b0[1] = pb0[1];
1826
1827                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1828                 res0 += src_a0 * src_b;
1829
1830                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1831                 res1 += src_a0 * src_b;
1832
1833                 pa0 += 4;
1834                 pb0 += 2;
1835             }
1836
1837             if ((temp - 1) & 1)
1838             {
1839                 src_a0 = LD_SP(pa0);
1840                 src_b0[0] = pb0[0];
1841                 src_b0[1] = pb0[1];
1842
1843                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
1844                 res0 += src_a0 * src_b;
1845
1846                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
1847                 res1 += src_a0 * src_b;
1848
1849                 pa0 += 4;
1850                 pb0 += 2;
1851             }
1852
1853 #if defined(TRMMKERNEL)
1854             dst0 = res0 * v_alpha;
1855             dst1 = res1 * v_alpha;
1856 #else
1857             dst0 = LD_SP(pc0);
1858             dst1 = LD_SP(pc1);
1859
1860             dst0 += res0 * v_alpha;
1861             dst1 += res1 * v_alpha;
1862 #endif
1863             ST_SP(dst0, pc0);
1864             ST_SP(dst1, pc1);
1865
1866             pc0 += 4;
1867             pc1 += 4;
1868
1869 #if defined(TRMMKERNEL)
1870 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1871             temp = k - off;
1872 #ifdef LEFT
1873             temp -= 4; // number of values in A
1874 #else
1875             temp -= 2; // number of values in B
1876 #endif
1877             pa0 += temp * 4;
1878             pb0 += temp * 2;
1879 #endif
1880
1881 #ifdef LEFT
1882             off += 4; // number of values in A
1883 #endif
1884 #endif
1885         }
1886
1887         if (m & 2)
1888         {
1889 #if defined(TRMMKERNEL)
1890 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1891             pb0 = B;
1892 #else
1893             pa0 += off * 2;
1894             pb0 = B + off * 2;
1895 #endif
1896
1897 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1898             temp = k - off;
1899 #elif defined(LEFT)
1900             temp = off + 2; // number of values in A
1901 #else
1902             temp = off + 2; // number of values in B
1903 #endif
1904 #else
1905             pb0 = B;
1906             temp = k;
1907 #endif
1908
1909             a0 = pa0[0];
1910             b0 = pb0[0];
1911             tmp0 = a0 * b0;
1912
1913             a1 = pa0[1];
1914             tmp1 = a1 * b0;
1915
1916             b1 = pb0[1];
1917             tmp2 = a0 * b1;
1918             tmp3 = a1 * b1;
1919
1920             pa0 += 2;
1921             pb0 += 2;
1922
1923             for (l = ((temp - 1) >> 1); l--;)
1924             {
1925                 a0 = pa0[0];
1926                 b0 = pb0[0];
1927                 tmp0 += a0 * b0;
1928
1929                 a1 = pa0[1];
1930                 tmp1 += a1 * b0;
1931
1932                 b1 = pb0[1];
1933                 tmp2  += a0 * b1;
1934                 tmp3  += a1 * b1;
1935
1936                 pa0 += 2;
1937                 pb0 += 2;
1938
1939                 a0 = pa0[0];
1940                 b0 = pb0[0];
1941                 tmp0 += a0 * b0;
1942
1943                 a1 = pa0[1];
1944                 tmp1 += a1 * b0;
1945
1946                 b1 = pb0[1];
1947                 tmp2  += a0 * b1;
1948                 tmp3  += a1 * b1;
1949
1950                 pa0 += 2;
1951                 pb0 += 2;
1952             }
1953
1954             if ((temp - 1) & 1)
1955             {
1956                 a0 = pa0[0];
1957                 b0 = pb0[0];
1958                 tmp0 += a0 * b0;
1959
1960                 a1 = pa0[1];
1961                 tmp1 += a1 * b0;
1962
1963                 b1 = pb0[1];
1964                 tmp2  += a0 * b1;
1965                 tmp3  += a1 * b1;
1966
1967                 pa0 += 2;
1968                 pb0 += 2;
1969             }
1970
1971             tmp0 = alpha * tmp0;
1972             tmp1 = alpha * tmp1;
1973             tmp2 = alpha * tmp2;
1974             tmp3 = alpha * tmp3;
1975
1976 #if defined(TRMMKERNEL)
1977             pc0[0] = tmp0;
1978             pc1[0] = tmp2;
1979             pc0[1] = tmp1;
1980             pc1[1] = tmp3;
1981 #else
1982             pc0[0] += tmp0;
1983             pc1[0] += tmp2;
1984             pc0[1] += tmp1;
1985             pc1[1] += tmp3;
1986 #endif
1987             pc0 += 2;
1988             pc1 += 2;
1989
1990 #if defined(TRMMKERNEL)
1991 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
1992             temp = k - off;
1993 #ifdef LEFT
1994             temp -= 2; // number of values in A
1995 #else
1996             temp -= 2; // number of values in B
1997 #endif
1998             pa0 += temp * 2;
1999             pb0 += temp * 2;
2000 #endif
2001
2002 #ifdef LEFT
2003             off += 2; // number of values in A
2004 #endif
2005 #endif
2006         }
2007
2008         if (m & 1)
2009         {
2010 #if defined(TRMMKERNEL)
2011 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2012             pb0 = B;
2013 #else
2014             pa0 += off * 1;
2015             pb0 = B + off * 2;
2016 #endif
2017
2018 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2019             temp = k - off;
2020 #elif defined(LEFT)
2021             temp = off + 1; // number of values in A
2022 #else
2023             temp = off + 2; // number of values in B
2024 #endif
2025 #else
2026             pb0 = B;
2027             temp = k;
2028 #endif
2029
2030             a0 = pa0[0];
2031             b0 = pb0[0];
2032             tmp0 = a0 * b0;
2033
2034             b1 = pb0[1];
2035             tmp1 = a0 * b1;
2036
2037             pa0 += 1;
2038             pb0 += 2;
2039
2040             for (l = ((temp - 1) >> 1); l--;)
2041             {
2042                 a0 = pa0[0];
2043                 b0 = pb0[0];
2044                 tmp0 += a0 * b0;
2045
2046                 b1 = pb0[1];
2047                 tmp1  += a0 * b1;
2048
2049                 pa0 += 1;
2050                 pb0 += 2;
2051
2052                 a0 = pa0[0];
2053                 b0 = pb0[0];
2054                 tmp0 += a0 * b0;
2055
2056                 b1 = pb0[1];
2057                 tmp1  += a0 * b1;
2058
2059                 pa0 += 1;
2060                 pb0 += 2;
2061             }
2062
2063             if ((temp - 1) & 1)
2064             {
2065                 a0 = pa0[0];
2066                 b0 = pb0[0];
2067                 tmp0 += a0 * b0;
2068
2069                 b1 = pb0[1];
2070                 tmp1  += a0 * b1;
2071
2072                 pa0 += 1;
2073                 pb0 += 2;
2074             }
2075
2076             tmp0 = alpha * tmp0;
2077             tmp1 = alpha * tmp1;
2078
2079 #if defined(TRMMKERNEL)
2080             pc0[0] = tmp0;
2081             pc1[0] = tmp1;
2082 #else
2083             pc0[0] += tmp0;
2084             pc1[0] += tmp1;
2085 #endif
2086             pc0 += 1;
2087             pc1 += 1;
2088
2089 #if defined(TRMMKERNEL)
2090 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2091             temp = k - off;
2092 #ifdef LEFT
2093             temp -= 1; // number of values in A
2094 #else
2095             temp -= 2; // number of values in B
2096 #endif
2097             pa0 += temp * 1;
2098             pb0 += temp * 2;
2099 #endif
2100
2101 #ifdef LEFT
2102             off += 1; // number of values in A
2103 #endif
2104 #endif
2105         }
2106
2107 #if defined(TRMMKERNEL) && !defined(LEFT)
2108         off += 2; // number of values in B
2109 #endif
2110
2111         B += (k << 1);
2112         C += (ldc << 1);
2113     }
2114
2115     if (n & 1)
2116     {
2117         pc0 = C;
2118
2119 #if defined(TRMMKERNEL) && defined(LEFT)
2120         off = offset;
2121 #endif
2122
2123         pa0 = A;
2124
2125         for (i = (m >> 3); i--;)
2126         {
2127 #if defined(TRMMKERNEL)
2128 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2129             pb0 = B;
2130 #else
2131             pa0 += off * 8;
2132             pb0 = B + off * 1;
2133 #endif
2134
2135 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2136             temp = k - off;
2137 #elif defined(LEFT)
2138             temp = off + 8; // number of values in A
2139 #else
2140             temp = off + 1; // number of values in B
2141 #endif
2142 #else
2143             pb0 = B;
2144             temp = k;
2145 #endif
2146
2147             LD_SP2_INC(pa0, 4, src_a0, src_a1);
2148             src_b0[0] = pb0[0];
2149
2150             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2151             res0 = src_a0 * src_b;
2152             res1 = src_a1 * src_b;
2153
2154             pb0 += 1;
2155
2156             for (l = ((temp - 1) >> 1); l--;)
2157             {
2158                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
2159                 src_b0[0] = pb0[0];
2160
2161                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2162                 res0 += src_a0 * src_b;
2163                 res1 += src_a1 * src_b;
2164
2165                 pb0 += 1;
2166
2167                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
2168                 src_b0[0] = pb0[0];
2169
2170                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2171                 res0 += src_a0 * src_b;
2172                 res1 += src_a1 * src_b;
2173
2174                 pb0 += 1;
2175             }
2176
2177             if ((temp - 1) & 1)
2178             {
2179                 LD_SP2_INC(pa0, 4, src_a0, src_a1);
2180                 src_b0[0] = pb0[0];
2181
2182                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2183                 res0 += src_a0 * src_b;
2184                 res1 += src_a1 * src_b;
2185
2186                 pb0 += 1;
2187             }
2188
2189 #if defined(TRMMKERNEL)
2190             dst0 = res0 * v_alpha;
2191             dst1 = res1 * v_alpha;
2192 #else
2193             LD_SP2(pc0, 4, dst0, dst1);
2194
2195             dst0 += res0 * v_alpha;
2196             dst1 += res1 * v_alpha;
2197 #endif
2198             ST_SP2_INC(dst0, dst1, pc0, 4);
2199
2200 #if defined(TRMMKERNEL)
2201 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2202             temp = k - off;
2203 #ifdef LEFT
2204             temp -= 8; // number of values in A
2205 #else
2206             temp -= 1; // number of values in B
2207 #endif
2208             pa0 += temp * 8;
2209             pb0 += temp * 1;
2210 #endif
2211
2212 #ifdef LEFT
2213             off += 8; // number of values in A
2214 #endif
2215 #endif
2216         }
2217
2218         if (m & 4)
2219         {
2220 #if defined(TRMMKERNEL)
2221 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2222             pb0 = B;
2223 #else
2224             pa0 += off * 4;
2225             pb0 = B + off * 1;
2226 #endif
2227
2228 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2229             temp = k - off;
2230 #elif defined(LEFT)
2231             temp = off + 4; // number of values in A
2232 #else
2233             temp = off + 1; // number of values in B
2234 #endif
2235 #else
2236             pb0 = B;
2237             temp = k;
2238 #endif
2239
2240             src_a0 = LD_SP(pa0);
2241             src_b0[0] = pb0[0];
2242
2243             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2244             res0 = src_a0 * src_b;
2245
2246             pa0 += 4;
2247             pb0 += 1;
2248
2249             for (l = ((temp - 1) >> 1); l--;)
2250             {
2251                 src_a0 = LD_SP(pa0);
2252                 src_b0[0] = pb0[0];
2253
2254                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2255                 res0 += src_a0 * src_b;
2256
2257                 pa0 += 4;
2258                 pb0 += 1;
2259
2260                 src_a0 = LD_SP(pa0);
2261                 src_b0[0] = pb0[0];
2262
2263                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2264                 res0 += src_a0 * src_b;
2265
2266                 pa0 += 4;
2267                 pb0 += 1;
2268             }
2269
2270             if ((temp - 1) & 1)
2271             {
2272                 src_a0 = LD_SP(pa0);
2273                 src_b0[0] = pb0[0];
2274
2275                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
2276                 res0 += src_a0 * src_b;
2277
2278                 pa0 += 4;
2279                 pb0 += 1;
2280             }
2281
2282 #if defined(TRMMKERNEL)
2283             dst0 = res0 * v_alpha;
2284 #else
2285             dst0 = LD_SP(pc0);
2286
2287             dst0 += res0 * v_alpha;
2288 #endif
2289             ST_SP(dst0, pc0);
2290
2291             pc0 += 4;
2292
2293 #if defined(TRMMKERNEL)
2294 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2295             temp = k - off;
2296 #ifdef LEFT
2297             temp -= 4; // number of values in A
2298 #else
2299             temp -= 1; // number of values in B
2300 #endif
2301             pa0 += temp * 4;
2302             pb0 += temp * 1;
2303 #endif
2304
2305 #ifdef LEFT
2306             off += 4; // number of values in A
2307 #endif
2308 #endif
2309         }
2310
2311         if (m & 2)
2312         {
2313 #if defined(TRMMKERNEL)
2314 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2315             pb0 = B;
2316 #else
2317             pa0 += off * 2;
2318             pb0 = B + off * 1;
2319 #endif
2320
2321 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2322             temp = k - off;
2323 #elif defined(LEFT)
2324             temp = off + 2; // number of values in A
2325 #else
2326             temp = off + 1; // number of values in B
2327 #endif
2328 #else
2329             pb0 = B;
2330             temp = k;
2331 #endif
2332
2333             a0 = pa0[0];
2334             b0 = pb0[0];
2335             tmp0 = a0 * b0;
2336
2337             a1 = pa0[1];
2338             tmp1 = a1 * b0;
2339
2340             pa0 += 2;
2341             pb0 += 1;
2342
2343             for (l = ((temp - 1) >> 1); l--;)
2344             {
2345                 a0 = pa0[0];
2346                 b0 = pb0[0];
2347                 tmp0 += a0 * b0;
2348
2349                 a1 = pa0[1];
2350                 tmp1 += a1 * b0;
2351
2352                 pa0 += 2;
2353                 pb0 += 1;
2354
2355                 a0 = pa0[0];
2356                 b0 = pb0[0];
2357                 tmp0 += a0 * b0;
2358
2359                 a1 = pa0[1];
2360                 tmp1 += a1 * b0;
2361
2362                 pa0 += 2;
2363                 pb0 += 1;
2364             }
2365
2366             if ((temp - 1) & 1)
2367             {
2368                 a0 = pa0[0];
2369                 b0 = pb0[0];
2370                 tmp0 += a0 * b0;
2371
2372                 a1 = pa0[1];
2373                 tmp1 += a1 * b0;
2374
2375                 pa0 += 2;
2376                 pb0 += 1;
2377             }
2378
2379             tmp0 = alpha * tmp0;
2380             tmp1 = alpha * tmp1;
2381
2382 #if defined(TRMMKERNEL)
2383             pc0[0] = tmp0;
2384             pc0[1] = tmp1;
2385 #else
2386             pc0[0] += tmp0;
2387             pc0[1] += tmp1;
2388 #endif
2389             pc0 += 2;
2390
2391 #if defined(TRMMKERNEL)
2392 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2393             temp = k - off;
2394 #ifdef LEFT
2395             temp -= 2; // number of values in A
2396 #else
2397             temp -= 1; // number of values in B
2398 #endif
2399             pa0 += temp * 2;
2400             pb0 += temp * 1;
2401 #endif
2402
2403 #ifdef LEFT
2404             off += 2; // number of values in A
2405 #endif
2406 #endif
2407         }
2408
2409         if (m & 1)
2410         {
2411 #if defined(TRMMKERNEL)
2412 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2413             pb0 = B;
2414 #else
2415             pa0 += off * 1;
2416             pb0 = B + off * 1;
2417 #endif
2418
2419 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2420             temp = k - off;
2421 #elif defined(LEFT)
2422             temp = off + 1; // number of values in A
2423 #else
2424             temp = off + 1; // number of values in B
2425 #endif
2426 #else
2427             pb0 = B;
2428             temp = k;
2429 #endif
2430
2431             a0 = pa0[0];
2432             b0 = pb0[0];
2433             tmp0 = a0 * b0;
2434
2435             pa0 += 1;
2436             pb0 += 1;
2437
2438             for (l = ((temp - 1) >> 1); l--;)
2439             {
2440                 a0 = pa0[0];
2441                 b0 = pb0[0];
2442                 tmp0 += a0 * b0;
2443
2444                 pa0 += 1;
2445                 pb0 += 1;
2446
2447                 a0 = pa0[0];
2448                 b0 = pb0[0];
2449                 tmp0 += a0 * b0;
2450
2451                 pa0 += 1;
2452                 pb0 += 1;
2453             }
2454
2455             if ((temp - 1) & 1)
2456             {
2457                 a0 = pa0[0];
2458                 b0 = pb0[0];
2459                 tmp0 += a0 * b0;
2460
2461                 pa0 += 1;
2462                 pb0 += 1;
2463             }
2464
2465 #if defined(TRMMKERNEL)
2466             pc0[0] = alpha * tmp0;
2467 #else
2468             pc0[0] += alpha * tmp0;
2469 #endif
2470         }
2471     }
2472
2473     return 0;
2474 }