1 /*********************************************************************************
2 Copyright (c) 2020, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 **********************************************************************************/
/*
 * GCC vector-extension types used by this kernel.
 *   vec_t  - 16-byte vector of unsigned char; the kernel casts its A/B panel
 *            pointers to vec_t * before handing them to the MMA builtins.
 *   v4sf_t - 16-byte vector of FLOAT.
 *   v2sf_t - 8-byte vector of FLOAT.
 * NOTE(review): FLOAT is defined outside this excerpt. The two-element
 * initializers used with v4sf_t elsewhere in this file (e.g. { alpha, alpha })
 * suggest FLOAT is double, i.e. two lanes per 16 bytes despite the "4sf"
 * name - confirm.
 */
30 typedef unsigned char vec_t __attribute__ ((vector_size (16)));
31 typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
32 typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
/*
 * SAVE_ACC(ACC, J) - store variant.
 * Unpacks accumulator ACC into result[] with __builtin_mma_disassemble_acc,
 * then overwrites the four v4sf_t vectors located at CO[k*ldc + J] for
 * k = 0..3. The vectors are written in reverse order: result[3] goes to
 * k = 0 and result[0] goes to k = 3. Each vector is multiplied by alpha and
 * assigned with '=', so the previous contents of the destination are not
 * read.
 * Relies on CO, ldc, alpha, result and rowC being in scope where the macro
 * is expanded. The expansion is several bare statements (no do/while
 * wrapper), so it must not be used as the unbraced body of an if/else/loop.
 * NOTE(review): a second definition of this name using '+=' appears further
 * down; the preprocessor conditional that selects between the two is not
 * visible in this excerpt - confirm.
 */
35 #define SAVE_ACC(ACC, J) \
36 __builtin_mma_disassemble_acc (result, ACC); \
37 rowC = (v4sf_t *) &CO[0* ldc+J]; \
38 rowC[0] = result[3] * alpha; \
39 rowC = (v4sf_t *) &CO[1*ldc+J]; \
40 rowC[0] = result[2] * alpha; \
41 rowC = (v4sf_t *) &CO[2*ldc+J]; \
42 rowC[0] = result[1] * alpha; \
43 rowC = (v4sf_t *) &CO[3*ldc+J]; \
44 rowC[0] = result[0] * alpha;
/*
 * SAVE_ACC1(ACC, J) - store variant.
 * Unpacks accumulator ACC into result[] with __builtin_mma_disassemble_acc,
 * then overwrites the four v4sf_t vectors located at CO[k*ldc + J] for
 * k = 4..7. The vectors are written in reverse order: result[3] goes to
 * k = 4 and result[0] goes to k = 7. Each vector is multiplied by alpha and
 * assigned with '=', so the previous contents of the destination are not
 * read.
 * Relies on CO, ldc, alpha, result and rowC being in scope where the macro
 * is expanded. The expansion is several bare statements (no do/while
 * wrapper), so it must not be used as the unbraced body of an if/else/loop.
 * NOTE(review): a second definition of this name using '+=' appears further
 * down; the preprocessor conditional that selects between the two is not
 * visible in this excerpt - confirm.
 */
45 #define SAVE_ACC1(ACC, J) \
46 __builtin_mma_disassemble_acc (result, ACC); \
47 rowC = (v4sf_t *) &CO[4* ldc+J]; \
48 rowC[0] = result[3] * alpha; \
49 rowC = (v4sf_t *) &CO[5*ldc+J]; \
50 rowC[0] = result[2] * alpha; \
51 rowC = (v4sf_t *) &CO[6*ldc+J]; \
52 rowC[0] = result[1] * alpha; \
53 rowC = (v4sf_t *) &CO[7*ldc+J]; \
54 rowC[0] = result[0] * alpha;
/*
 * SAVE2x4_ACC(ACC, J) - store variant.
 * Unpacks accumulator ACC into result[] with __builtin_mma_disassemble_acc,
 * then overwrites only two v4sf_t vectors: result[3] * alpha goes to
 * CO[0*ldc + J] and result[2] * alpha goes to CO[1*ldc + J]. result[1] and
 * result[0] are not stored. Assignment uses '=', so the previous contents of
 * the destination are not read.
 * Relies on CO, ldc, alpha, result and rowC being in scope where the macro
 * is expanded. The expansion is several bare statements (no do/while
 * wrapper), so it must not be used as the unbraced body of an if/else/loop.
 * NOTE(review): a second definition of this name using '+=' appears further
 * down; the preprocessor conditional that selects between the two is not
 * visible in this excerpt - confirm.
 */
55 #define SAVE2x4_ACC(ACC, J) \
56 __builtin_mma_disassemble_acc (result, ACC); \
57 rowC = (v4sf_t *) &CO[0* ldc+J]; \
58 rowC[0] = result[3] * alpha; \
59 rowC = (v4sf_t *) &CO[1* ldc+J]; \
60 rowC[0] = result[2] * alpha;
/*
 * SAVE_ACC(ACC, J) - accumulate variant.
 * Unpacks accumulator ACC into result[] with __builtin_mma_disassemble_acc,
 * then adds result[3 - k] * alpha into the v4sf_t vector at CO[k*ldc + J]
 * for k = 0..3 (note the reversed result[] order). The '+=' means the
 * existing destination contents are read and updated, not replaced.
 * Relies on CO, ldc, alpha, result and rowC being in scope where the macro
 * is expanded. The expansion is several bare statements (no do/while
 * wrapper), so it must not be used as the unbraced body of an if/else/loop.
 * NOTE(review): an earlier definition of this name uses plain '='; the
 * preprocessor conditional that selects between the two is not visible in
 * this excerpt - confirm.
 */
62 #define SAVE_ACC(ACC, J) \
63 __builtin_mma_disassemble_acc (result, ACC); \
64 rowC = (v4sf_t *) &CO[0* ldc+J]; \
65 rowC[0] += result[3] * alpha; \
66 rowC = (v4sf_t *) &CO[1*ldc+J]; \
67 rowC[0] += result[2] * alpha; \
68 rowC = (v4sf_t *) &CO[2*ldc+J]; \
69 rowC[0] += result[1] * alpha; \
70 rowC = (v4sf_t *) &CO[3*ldc+J]; \
71 rowC[0] += result[0] * alpha;
/*
 * SAVE_ACC1(ACC, J) - accumulate variant.
 * Unpacks accumulator ACC into result[] with __builtin_mma_disassemble_acc,
 * then adds result[7 - k] * alpha into the v4sf_t vector at CO[k*ldc + J]
 * for k = 4..7 (result[3] goes to k = 4, result[0] to k = 7). The '+=' means
 * the existing destination contents are read and updated, not replaced.
 * Relies on CO, ldc, alpha, result and rowC being in scope where the macro
 * is expanded. The expansion is several bare statements (no do/while
 * wrapper), so it must not be used as the unbraced body of an if/else/loop.
 * NOTE(review): an earlier definition of this name uses plain '='; the
 * preprocessor conditional that selects between the two is not visible in
 * this excerpt - confirm.
 */
72 #define SAVE_ACC1(ACC, J) \
73 __builtin_mma_disassemble_acc (result, ACC); \
74 rowC = (v4sf_t *) &CO[4* ldc+J]; \
75 rowC[0] += result[3] * alpha; \
76 rowC = (v4sf_t *) &CO[5*ldc+J]; \
77 rowC[0] += result[2] * alpha; \
78 rowC = (v4sf_t *) &CO[6*ldc+J]; \
79 rowC[0] += result[1] * alpha; \
80 rowC = (v4sf_t *) &CO[7*ldc+J]; \
81 rowC[0] += result[0] * alpha;
/*
 * SAVE2x4_ACC(ACC, J) - accumulate variant.
 * Unpacks accumulator ACC into result[] with __builtin_mma_disassemble_acc,
 * then adds result[3] * alpha into the v4sf_t vector at CO[0*ldc + J] and
 * result[2] * alpha into the one at CO[1*ldc + J]. result[1] and result[0]
 * are not used. The '+=' means the existing destination contents are read
 * and updated, not replaced.
 * Relies on CO, ldc, alpha, result and rowC being in scope where the macro
 * is expanded. The expansion is several bare statements (no do/while
 * wrapper), so it must not be used as the unbraced body of an if/else/loop.
 * NOTE(review): an earlier definition of this name uses plain '='; the
 * preprocessor conditional that selects between the two is not visible in
 * this excerpt - confirm.
 */
82 #define SAVE2x4_ACC(ACC, J) \
83 __builtin_mma_disassemble_acc (result, ACC); \
84 rowC = (v4sf_t *) &CO[0* ldc+J]; \
85 rowC[0] += result[3] * alpha; \
86 rowC = (v4sf_t *) &CO[1* ldc+J]; \
87 rowC[0] += result[2] * alpha;
/*
 * SET_ACC_ZERO4(): applies __builtin_mma_xxsetaccz to acc0, acc1, acc2 and
 * acc3, clearing those four accumulators. The four names are hard-coded, so
 * variables called acc0..acc3 must be declared in the scope where the macro
 * is expanded. Expands to four bare statements (no do/while wrapper).
 */
90 #define SET_ACC_ZERO4() \
91 __builtin_mma_xxsetaccz (&acc0); \
92 __builtin_mma_xxsetaccz (&acc1); \
93 __builtin_mma_xxsetaccz (&acc2); \
94 __builtin_mma_xxsetaccz (&acc3);
/*
 * SET_ACC_ZERO8(): applies __builtin_mma_xxsetaccz to acc0 through acc7,
 * clearing all eight accumulators. The eight names are hard-coded, so
 * variables called acc0..acc7 must be declared in the scope where the macro
 * is expanded. Expands to eight bare statements (no do/while wrapper).
 */
96 #define SET_ACC_ZERO8() \
97 __builtin_mma_xxsetaccz (&acc0); \
98 __builtin_mma_xxsetaccz (&acc1); \
99 __builtin_mma_xxsetaccz (&acc2); \
100 __builtin_mma_xxsetaccz (&acc3); \
101 __builtin_mma_xxsetaccz (&acc4); \
102 __builtin_mma_xxsetaccz (&acc5); \
103 __builtin_mma_xxsetaccz (&acc6); \
104 __builtin_mma_xxsetaccz (&acc7);
/*
 * PREFETCH1(x, y): issue a PowerPC "dcbt" (data cache block touch) hint for
 * the address formed from x and y. Both arguments are passed in registers;
 * the call sites in this file pass a pointer as x and a small byte offset
 * (0 or 128) as y.
 *
 * Operand constraints: in "dcbt RA,RB" the effective address is
 * (RA|0) + (RB), i.e. r0 in the RA slot is read as the literal value zero.
 * The first operand therefore uses "b" (base register, never r0) so the
 * pointer can never be silently dropped from the address; the second operand
 * has no such restriction and uses "r".
 *
 * The "memory" clobber keeps the compiler from moving memory accesses across
 * the hint. The definition keeps its trailing semicolon, so "PREFETCH1(a, b);"
 * expands to the asm statement followed by an empty statement.
 */
106 #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
108 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
109 #define REFRESH_TEMP_BK(x, y) \
112 #define REFRESH_TEMP_BK(x, y) \
115 #define REFRESH_TEMP_BK(x, y) \
118 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
119 #define REFRESH_POINTERS(x, y) \
121 REFRESH_TEMP_BK(x, y)
123 #define REFRESH_POINTERS(x, y) \
126 REFRESH_TEMP_BK(x, y)
130 #define REFRESH_OFF(x) \
133 #define REFRESH_OFF(x)
137 #define UPDATE_TEMP(x, y) \
140 #define UPDATE_TEMP(x, y) \
144 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
145 #define REFRESH_TMP_AFTER_SAVE(x, y) \
151 #define REFRESH_TMP_AFTER_SAVE(x, y)
154 #define REFRESH_AFTER_SAVE(x,y) \
155 REFRESH_TMP_AFTER_SAVE(x, y) \
157 /*************************************************************************************
159 *************************************************************************************/
161 CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
162 FLOAT * C, BLASLONG ldc
170 #if defined(TRMMKERNEL)
173 #if defined(TRMMKERNEL) && !defined(LEFT)
176 v4sf_t valpha = { alpha, alpha };
178 for (i1 = 0; i1 < N; i1++)
183 #if defined(TRMMKERNEL) && defined(LEFT)
192 for (j = 0; j < i; j++)
195 #if defined(TRMMKERNEL)
196 REFRESH_POINTERS (16, 4);
205 PREFETCH1 (CO + ldc, 0);
206 PREFETCH1 (CO + ldc + ldc, 0);
207 PREFETCH1 (CO + ldc + ldc + ldc, 0);
209 PREFETCH1 (CO + ldc, 128);
210 PREFETCH1 (CO + ldc + ldc, 128);
211 PREFETCH1 (CO + ldc + ldc + ldc, 128);
212 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
214 for (l = 0; l < temp; l++)
216 vec_t *rowA = (vec_t *) & AO[l << 4];
218 vec_t *rb = (vec_t *) & BO[l << 2];
219 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
220 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
221 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
222 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
223 __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
224 __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
225 __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
226 __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
227 __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
234 SAVE_ACC (&acc6, 12);
235 SAVE_ACC (&acc5, 10);
236 SAVE_ACC (&acc7, 14);
239 #if defined(TRMMKERNEL)
240 REFRESH_AFTER_SAVE (16, 4)
245 for (j = 0; j < i; j++)
248 #if defined(TRMMKERNEL)
249 REFRESH_POINTERS (8, 4);
256 __vector_quad acc0, acc1, acc2, acc3;
259 for (l = 0; l < temp; l++)
261 vec_t *rowA = (vec_t *) & AO[l << 3];
263 vec_t *rb = (vec_t *) & BO[l << 2];
264 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
265 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
266 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
267 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
268 __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
277 #if defined(TRMMKERNEL)
278 REFRESH_AFTER_SAVE (8, 4)
282 for (j = 0; j < i; j++)
285 #if defined(TRMMKERNEL)
286 REFRESH_POINTERS (4, 4);
293 __vector_quad acc0, acc1;
294 __builtin_mma_xxsetaccz (&acc0);
295 __builtin_mma_xxsetaccz (&acc1);
297 for (l = 0; l < temp; l++)
299 vec_t *rowA = (vec_t *) & AO[l << 2];
301 vec_t *rb = (vec_t *) & BO[l << 2];
302 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
303 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
304 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
311 #if defined(TRMMKERNEL)
312 REFRESH_AFTER_SAVE (4, 4)
316 for (j = 0; j < i; j++)
319 #if defined(TRMMKERNEL)
320 REFRESH_POINTERS (2, 4);
328 __builtin_mma_xxsetaccz (&acc0);
330 for (l = 0; l < temp; l++)
332 vec_t *rowA = (vec_t *) & AO[l << 1];
334 vec_t *rb = (vec_t *) & BO[l << 2];
335 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
336 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
342 #if defined(TRMMKERNEL)
343 REFRESH_AFTER_SAVE (2, 4)
347 for (j = 0; j < i; j++)
350 #if defined(TRMMKERNEL)
351 REFRESH_POINTERS (1, 4);
358 v4sf_t t1 = { 0, 0 };
359 for (l = 0; l < temp; l++)
361 v4sf_t rowA = { AO[l], AO[l] };
362 v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] };
363 v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] };
369 #if defined(TRMMKERNEL)
377 CO[2 * ldc] += t1[0];
378 CO[3 * ldc] += t1[1];
383 #if defined(TRMMKERNEL)
384 REFRESH_AFTER_SAVE (1, 4)
387 #if defined(TRMMKERNEL) && !defined(LEFT)
388 off += 4; // number of values in A
393 for (i1 = 0; i1 < N; i1++)
396 #if defined(TRMMKERNEL) && defined(LEFT)
405 for (j = 0; j < i; j++)
408 #if defined(TRMMKERNEL)
409 REFRESH_POINTERS (16, 2);
416 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
419 for (l = 0; l < temp; l++)
421 FLOAT t[4] = { 0, 0, 0, 0 };
422 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
424 vec_t *rb = (vec_t *) & t[0];
425 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
426 vec_t *rowA = (vec_t *) & AO[l << 4];
427 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
428 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
429 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
430 __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
431 __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]);
432 __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]);
433 __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]);
434 __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]);
436 SAVE2x4_ACC (&acc0, 0);
437 SAVE2x4_ACC (&acc1, 2);
438 SAVE2x4_ACC (&acc2, 4);
439 SAVE2x4_ACC (&acc3, 6);
440 SAVE2x4_ACC (&acc4, 8);
441 SAVE2x4_ACC (&acc5, 10);
442 SAVE2x4_ACC (&acc6, 12);
443 SAVE2x4_ACC (&acc7, 14);
447 #if defined(TRMMKERNEL)
448 REFRESH_AFTER_SAVE (16, 2)
452 for (j = 0; j < i; j++)
455 #if defined(TRMMKERNEL)
456 REFRESH_POINTERS (8, 2);
463 __vector_quad acc0, acc1, acc2, acc3;
466 for (l = 0; l < temp; l++)
468 FLOAT t[4] = { 0, 0, 0, 0 };
469 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
471 vec_t *rb = (vec_t *) & t[0];
472 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
473 vec_t *rowA = (vec_t *) & AO[l << 3];
474 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
475 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
476 __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
477 __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]);
479 SAVE2x4_ACC (&acc0, 0);
480 SAVE2x4_ACC (&acc1, 2);
481 SAVE2x4_ACC (&acc2, 4);
482 SAVE2x4_ACC (&acc3, 6);
486 #if defined(TRMMKERNEL)
487 REFRESH_AFTER_SAVE (8, 2)
491 for (j = 0; j < i; j++)
494 #if defined(TRMMKERNEL)
495 REFRESH_POINTERS (4, 2);
502 __vector_quad acc0, acc1;
503 __builtin_mma_xxsetaccz (&acc0);
504 __builtin_mma_xxsetaccz (&acc1);
506 for (l = 0; l < temp; l++)
508 FLOAT t[4] = { 0, 0, 0, 0 };
509 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
511 vec_t *rb = (vec_t *) & t[0];
512 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
513 vec_t *rowA = (vec_t *) & AO[l << 2];
514 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
515 __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
517 SAVE2x4_ACC (&acc0, 0);
518 SAVE2x4_ACC (&acc1, 2);
522 #if defined(TRMMKERNEL)
523 REFRESH_AFTER_SAVE (4, 2)
527 for (j = 0; j < i; j++)
530 #if defined(TRMMKERNEL)
531 REFRESH_POINTERS (2, 2);
539 __builtin_mma_xxsetaccz (&acc0);
541 for (l = 0; l < temp; l++)
543 FLOAT t[4] = { 0, 0, 0, 0 };
544 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
546 vec_t *rb = (vec_t *) & t[0];
547 __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
548 vec_t *rowA = (vec_t *) & AO[l << 1];
549 __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
551 SAVE2x4_ACC (&acc0, 0);
555 #if defined(TRMMKERNEL)
556 REFRESH_AFTER_SAVE (2, 2)
560 for (j = 0; j < i; j++)
563 #if defined(TRMMKERNEL)
564 REFRESH_POINTERS (1, 2);
571 for (l = 0; l < temp; l++)
573 v4sf_t rowA = { AO[l], AO[l] };
574 v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] };
578 #if defined(TRMMKERNEL)
588 #if defined(TRMMKERNEL)
589 REFRESH_AFTER_SAVE (1, 2)
592 #if defined(TRMMKERNEL) && !defined(LEFT)
593 off += 2; // number of values in A
598 for (i1 = 0; i1 < N; i1++)
601 #if defined(TRMMKERNEL) && defined(LEFT)
613 #if defined(TRMMKERNEL)
614 REFRESH_POINTERS (16, 1)
621 v4sf_t t1 = { 0, 0 };
622 v4sf_t t2 = { 0, 0 };
623 v4sf_t t3 = { 0, 0 };
624 v4sf_t t4 = { 0, 0 };
625 v4sf_t t5 = { 0, 0 };
626 v4sf_t t6 = { 0, 0 };
627 v4sf_t t7 = { 0, 0 };
628 for (l = 0; l < temp; l++)
630 v4sf_t rowB = { BO[l], BO[l] };
631 v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] };
632 v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] };
633 v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] };
634 v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] };
635 v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] };
636 v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] };
637 v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] };
638 v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] };
656 #if defined(TRMMKERNEL)
695 #if defined(TRMMKERNEL)
696 REFRESH_AFTER_SAVE (16, 1)
702 #if defined(TRMMKERNEL)
703 REFRESH_POINTERS (8, 1)
710 v4sf_t t1 = { 0, 0 };
711 v4sf_t t2 = { 0, 0 };
712 v4sf_t t3 = { 0, 0 };
713 for (l = 0; l < temp; l++)
715 v4sf_t rowB = { BO[l], BO[l] };
716 v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] };
717 v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] };
718 v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] };
719 v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] };
729 #if defined(TRMMKERNEL)
752 #if defined(TRMMKERNEL)
753 REFRESH_AFTER_SAVE (8, 1)
759 #if defined(TRMMKERNEL)
760 REFRESH_POINTERS (4, 1)
767 v4sf_t t1 = { 0, 0 };
768 for (l = 0; l < temp; l++)
770 v4sf_t rowB = { BO[l], BO[l] };
771 v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] };
772 v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] };
778 #if defined(TRMMKERNEL)
793 #if defined(TRMMKERNEL)
794 REFRESH_AFTER_SAVE (4, 1)
800 #if defined(TRMMKERNEL)
801 REFRESH_POINTERS (2, 1)
808 for (l = 0; l < temp; l++)
810 v4sf_t rowB = { BO[l], BO[l] };
811 v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] };
815 #if defined(TRMMKERNEL)
826 #if defined(TRMMKERNEL)
827 REFRESH_AFTER_SAVE (2, 1)
833 #if defined(TRMMKERNEL)
834 REFRESH_POINTERS (1, 1)
841 for (l = 0; l < temp; l++)
847 #if defined(TRMMKERNEL)
854 #if defined(TRMMKERNEL)
855 REFRESH_AFTER_SAVE (1, 1)
858 #if defined(TRMMKERNEL) && !defined(LEFT)
859 off += 1; // number of values in A