1 /*********************************************************************************
2 Copyright (c) 2020, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 **********************************************************************************/
/* 16-byte vector of unsigned char; the rowA/rowB operand pointers in this
   file are cast to this type. */
30 typedef unsigned char vec_t __attribute__ ((vector_size (16)));
/* 16-byte vector of FLOAT elements (four lanes if FLOAT is a 4-byte float,
   as the name suggests -- FLOAT is defined outside this file; confirm). */
31 typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
/* 8-byte vector of FLOAT elements (two lanes under the same assumption). */
32 typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
33 #if defined(TRMMKERNEL)
/*
 * SAVE_ACC (overwriting form, defined under TRMMKERNEL): unpack accumulator
 * ACC into result[] with __builtin_mma_disassemble_acc, then overwrite the
 * four v4sf_t vectors at CO[r * ldc + J], r = 0..3, with
 * result[3 - r] * alpha.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument (e.g. a conditional) cannot
 *      re-associate with the surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC, CO, ldc and alpha from the expansion site.  Expands to
 * several statements with no do/while wrapper, so it must not be used as
 * the unbraced body of an if or a loop.
 */
#define SAVE_ACC(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \
  rowC[0] = result[3] * alpha; \
  rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \
  rowC[0] = result[2] * alpha; \
  rowC = (v4sf_t *) &CO[2 * ldc + (J)]; \
  rowC[0] = result[1] * alpha; \
  rowC = (v4sf_t *) &CO[3 * ldc + (J)]; \
  rowC[0] = result[0] * alpha;
/*
 * SAVE_ACC1 (overwriting form): unpack accumulator ACC into result[] with
 * __builtin_mma_disassemble_acc, then overwrite the four v4sf_t vectors at
 * CO[r * ldc + J], r = 4..7, with result[7 - r] * alpha.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument cannot re-associate with the
 *      surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC, CO, ldc and alpha from the expansion site.  Expands to
 * several statements with no do/while wrapper, so it must not be used as
 * the unbraced body of an if or a loop.
 */
#define SAVE_ACC1(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v4sf_t *) &CO[4 * ldc + (J)]; \
  rowC[0] = result[3] * alpha; \
  rowC = (v4sf_t *) &CO[5 * ldc + (J)]; \
  rowC[0] = result[2] * alpha; \
  rowC = (v4sf_t *) &CO[6 * ldc + (J)]; \
  rowC[0] = result[1] * alpha; \
  rowC = (v4sf_t *) &CO[7 * ldc + (J)]; \
  rowC[0] = result[0] * alpha;
/*
 * SAVE4x2_ACC (overwriting form): unpack accumulator ACC into result[] with
 * __builtin_mma_disassemble_acc, then overwrite the four v2sf_t (half-width)
 * vectors at CO[r * ldc + J], r = 0..3, with result[6 - 2 * r] * alpha.
 * Only the even-indexed result[] entries (6, 4, 2, 0) are stored.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument cannot re-associate with the
 *      surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC (as a v2sf_t pointer), CO, ldc and alpha from the
 * expansion site.  Expands to several statements with no do/while wrapper,
 * so it must not be used as the unbraced body of an if or a loop.
 */
#define SAVE4x2_ACC(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v2sf_t *) &CO[0 * ldc + (J)]; \
  rowC[0] = result[6] * alpha; \
  rowC = (v2sf_t *) &CO[1 * ldc + (J)]; \
  rowC[0] = result[4] * alpha; \
  rowC = (v2sf_t *) &CO[2 * ldc + (J)]; \
  rowC[0] = result[2] * alpha; \
  rowC = (v2sf_t *) &CO[3 * ldc + (J)]; \
  rowC[0] = result[0] * alpha;
/*
 * SAVE4x2_ACC1 (overwriting form): unpack accumulator ACC into result[] with
 * __builtin_mma_disassemble_acc, then overwrite the four v2sf_t (half-width)
 * vectors at CO[r * ldc + J], r = 4..7, with result[6 - 2 * (r - 4)] * alpha.
 * Only the even-indexed result[] entries (6, 4, 2, 0) are stored.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument cannot re-associate with the
 *      surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC (as a v2sf_t pointer), CO, ldc and alpha from the
 * expansion site.  Expands to several statements with no do/while wrapper,
 * so it must not be used as the unbraced body of an if or a loop.
 */
#define SAVE4x2_ACC1(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v2sf_t *) &CO[4 * ldc + (J)]; \
  rowC[0] = result[6] * alpha; \
  rowC = (v2sf_t *) &CO[5 * ldc + (J)]; \
  rowC[0] = result[4] * alpha; \
  rowC = (v2sf_t *) &CO[6 * ldc + (J)]; \
  rowC[0] = result[2] * alpha; \
  rowC = (v2sf_t *) &CO[7 * ldc + (J)]; \
  rowC[0] = result[0] * alpha;
/*
 * SAVE2x4_ACC (overwriting form): unpack accumulator ACC into result[] with
 * __builtin_mma_disassemble_acc, then overwrite the two v4sf_t vectors at
 * CO[0 * ldc + J] and CO[1 * ldc + J] with result[3] * alpha and
 * result[2] * alpha respectively.  result[1] and result[0] are not stored.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument cannot re-associate with the
 *      surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC, CO, ldc and alpha from the expansion site.  Expands to
 * several statements with no do/while wrapper, so it must not be used as
 * the unbraced body of an if or a loop.
 */
#define SAVE2x4_ACC(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \
  rowC[0] = result[3] * alpha; \
  rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \
  rowC[0] = result[2] * alpha;
/*
 * SAVE_ACC (accumulating form; presumably the non-TRMMKERNEL branch -- the
 * preprocessor conditional selecting it is not visible next to this
 * definition, confirm): unpack accumulator ACC into result[] with
 * __builtin_mma_disassemble_acc, then add result[3 - r] * alpha to the
 * v4sf_t vector at CO[r * ldc + J] for r = 0..3.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument cannot re-associate with the
 *      surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC, CO, ldc and alpha from the expansion site.  Expands to
 * several statements with no do/while wrapper, so it must not be used as
 * the unbraced body of an if or a loop.
 */
#define SAVE_ACC(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \
  rowC[0] += result[3] * alpha; \
  rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \
  rowC[0] += result[2] * alpha; \
  rowC = (v4sf_t *) &CO[2 * ldc + (J)]; \
  rowC[0] += result[1] * alpha; \
  rowC = (v4sf_t *) &CO[3 * ldc + (J)]; \
  rowC[0] += result[0] * alpha;
/*
 * SAVE_ACC1 (accumulating form; presumably the non-TRMMKERNEL branch -- the
 * preprocessor conditional selecting it is not visible next to this
 * definition, confirm): unpack accumulator ACC into result[] with
 * __builtin_mma_disassemble_acc, then add result[7 - r] * alpha to the
 * v4sf_t vector at CO[r * ldc + J] for r = 4..7.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument cannot re-associate with the
 *      surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC, CO, ldc and alpha from the expansion site.  Expands to
 * several statements with no do/while wrapper, so it must not be used as
 * the unbraced body of an if or a loop.
 */
#define SAVE_ACC1(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v4sf_t *) &CO[4 * ldc + (J)]; \
  rowC[0] += result[3] * alpha; \
  rowC = (v4sf_t *) &CO[5 * ldc + (J)]; \
  rowC[0] += result[2] * alpha; \
  rowC = (v4sf_t *) &CO[6 * ldc + (J)]; \
  rowC[0] += result[1] * alpha; \
  rowC = (v4sf_t *) &CO[7 * ldc + (J)]; \
  rowC[0] += result[0] * alpha;
/*
 * SAVE4x2_ACC (accumulating form; presumably the non-TRMMKERNEL branch --
 * the preprocessor conditional selecting it is not visible next to this
 * definition, confirm): unpack accumulator ACC into result[] with
 * __builtin_mma_disassemble_acc, then add result[6 - 2 * r] * alpha to the
 * v2sf_t (half-width) vector at CO[r * ldc + J] for r = 0..3.  Only the
 * even-indexed result[] entries (6, 4, 2, 0) are used.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument cannot re-associate with the
 *      surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC (as a v2sf_t pointer), CO, ldc and alpha from the
 * expansion site.  Expands to several statements with no do/while wrapper,
 * so it must not be used as the unbraced body of an if or a loop.
 */
#define SAVE4x2_ACC(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v2sf_t *) &CO[0 * ldc + (J)]; \
  rowC[0] += result[6] * alpha; \
  rowC = (v2sf_t *) &CO[1 * ldc + (J)]; \
  rowC[0] += result[4] * alpha; \
  rowC = (v2sf_t *) &CO[2 * ldc + (J)]; \
  rowC[0] += result[2] * alpha; \
  rowC = (v2sf_t *) &CO[3 * ldc + (J)]; \
  rowC[0] += result[0] * alpha;
/*
 * SAVE4x2_ACC1 (accumulating form; presumably the non-TRMMKERNEL branch --
 * the preprocessor conditional selecting it is not visible next to this
 * definition, confirm): unpack accumulator ACC into result[] with
 * __builtin_mma_disassemble_acc, then add result[6 - 2 * (r - 4)] * alpha
 * to the v2sf_t (half-width) vector at CO[r * ldc + J] for r = 4..7.  Only
 * the even-indexed result[] entries (6, 4, 2, 0) are used.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument cannot re-associate with the
 *      surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC (as a v2sf_t pointer), CO, ldc and alpha from the
 * expansion site.  Expands to several statements with no do/while wrapper,
 * so it must not be used as the unbraced body of an if or a loop.
 */
#define SAVE4x2_ACC1(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v2sf_t *) &CO[4 * ldc + (J)]; \
  rowC[0] += result[6] * alpha; \
  rowC = (v2sf_t *) &CO[5 * ldc + (J)]; \
  rowC[0] += result[4] * alpha; \
  rowC = (v2sf_t *) &CO[6 * ldc + (J)]; \
  rowC[0] += result[2] * alpha; \
  rowC = (v2sf_t *) &CO[7 * ldc + (J)]; \
  rowC[0] += result[0] * alpha;
/*
 * SAVE2x4_ACC (accumulating form; presumably the non-TRMMKERNEL branch --
 * the preprocessor conditional selecting it is not visible next to this
 * definition, confirm): unpack accumulator ACC into result[] with
 * __builtin_mma_disassemble_acc, then add result[3] * alpha to the v4sf_t
 * vector at CO[0 * ldc + J] and result[2] * alpha to the one at
 * CO[1 * ldc + J].  result[1] and result[0] are not used.
 *
 * ACC  accumulator pointer handed to the builtin.
 * J    element offset added to each ldc-strided base.  It is parenthesized
 *      so that an expression argument cannot re-associate with the
 *      surrounding "r * ldc +" arithmetic.
 *
 * Uses result, rowC, CO, ldc and alpha from the expansion site.  Expands to
 * several statements with no do/while wrapper, so it must not be used as
 * the unbraced body of an if or a loop.
 */
#define SAVE2x4_ACC(ACC, J) \
  __builtin_mma_disassemble_acc (result, ACC); \
  rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \
  rowC[0] += result[3] * alpha; \
  rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \
  rowC[0] += result[2] * alpha;
/*
 * KERNEL(i, j): issue eight __builtin_mma_xvf32gerpp updates, one per
 * accumulator acc0..acc7.  Even-numbered accumulators take rowB[i], odd ones
 * rowB[i + 1]; each consecutive accumulator pair shares one rowA operand,
 * stepping through rowA[j], rowA[j + 1], rowA[j + 2], rowA[j + 3].
 *
 * i, j  base indices into rowB / rowA.  Both are parenthesized so that an
 *       expression argument cannot re-associate with the "+ n" offsets.
 *
 * Uses acc0..acc7, rowA and rowB from the expansion site.  Expands to
 * several statements with no do/while wrapper, so it must not be used as
 * the unbraced body of an if or a loop.
 */
#define KERNEL(i, j) \
  __builtin_mma_xvf32gerpp (&acc0, rowB[(i)], rowA[(j)]); \
  __builtin_mma_xvf32gerpp (&acc1, rowB[(i) + 1], rowA[(j)]); \
  __builtin_mma_xvf32gerpp (&acc2, rowB[(i)], rowA[(j) + 1]); \
  __builtin_mma_xvf32gerpp (&acc3, rowB[(i) + 1], rowA[(j) + 1]); \
  __builtin_mma_xvf32gerpp (&acc4, rowB[(i)], rowA[(j) + 2]); \
  __builtin_mma_xvf32gerpp (&acc5, rowB[(i) + 1], rowA[(j) + 2]); \
  __builtin_mma_xvf32gerpp (&acc6, rowB[(i)], rowA[(j) + 3]); \
  __builtin_mma_xvf32gerpp (&acc7, rowB[(i) + 1], rowA[(j) + 3]);
/* SET_ACC_ZERO4(): call __builtin_mma_xxsetaccz on acc0..acc3 (the "set
   accumulator to zero" builtin), clearing the first four accumulators.
   acc0..acc3 must be in scope at the expansion site.  Expands to four
   statements with no do/while wrapper, so do not use it as the unbraced
   body of an if or a loop. */
137 #define SET_ACC_ZERO4() \
138 __builtin_mma_xxsetaccz (&acc0); \
139 __builtin_mma_xxsetaccz (&acc1); \
140 __builtin_mma_xxsetaccz (&acc2); \
141 __builtin_mma_xxsetaccz (&acc3);
/* SET_ACC_ZERO8(): call __builtin_mma_xxsetaccz on acc0..acc7 (the "set
   accumulator to zero" builtin), clearing all eight accumulators.
   acc0..acc7 must be in scope at the expansion site.  Expands to eight
   statements with no do/while wrapper, so do not use it as the unbraced
   body of an if or a loop. */
143 #define SET_ACC_ZERO8() \
144 __builtin_mma_xxsetaccz (&acc0); \
145 __builtin_mma_xxsetaccz (&acc1); \
146 __builtin_mma_xxsetaccz (&acc2); \
147 __builtin_mma_xxsetaccz (&acc3); \
148 __builtin_mma_xxsetaccz (&acc4); \
149 __builtin_mma_xxsetaccz (&acc5); \
150 __builtin_mma_xxsetaccz (&acc6); \
151 __builtin_mma_xxsetaccz (&acc7);
/*
 * PREFETCH1(x, y): emit a Power "dcbt" (data cache block touch) hint for the
 * address x + y.
 *
 * dcbt RA,RB forms its effective address as (RA|0) + (RB): when RA is
 * encoded as r0 the instruction uses the constant 0 instead of the register
 * contents.  x is the first (RA) operand, so it takes the "b" constraint
 * (address base register, never r0); y is the RB operand and may live in any
 * general register ("r").
 *
 * The "memory" clobber keeps the compiler from moving memory accesses across
 * the hint.  The expansion already ends in a semicolon.
 */
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");
155 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
156 #define REFRESH_TEMP_BK(x, y) \
159 #define REFRESH_TEMP_BK(x, y) \
162 #define REFRESH_TEMP_BK(x, y) \
165 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
166 #define REFRESH_POINTERS(x, y) \
168 REFRESH_TEMP_BK(x, y)
170 #define REFRESH_POINTERS(x, y) \
173 REFRESH_TEMP_BK(x, y)
177 #define REFRESH_OFF(x) \
180 #define REFRESH_OFF(x)
184 #define UPDATE_TEMP(x, y) \
187 #define UPDATE_TEMP(x, y) \
191 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
192 #define REFRESH_TMP_AFTER_SAVE(x, y) \
198 #define REFRESH_TMP_AFTER_SAVE(x, y)
201 #define REFRESH_AFTER_SAVE(x,y) \
202 REFRESH_TMP_AFTER_SAVE(x, y) \
204 /*************************************************************************************
206 *************************************************************************************/
208 CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
209 FLOAT * C, BLASLONG ldc
217 #if defined(TRMMKERNEL)
220 #if defined(TRMMKERNEL) && !defined(LEFT)
224 v4sf_t valpha = { alpha, alpha, alpha, alpha };
226 for (i1 = 0; i1 < N; i1++)
231 #if defined(TRMMKERNEL) && defined(LEFT)
240 for (j = 0; j < i; j++)
243 #if defined(TRMMKERNEL)
244 REFRESH_POINTERS (16, 8);
251 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
254 BLASLONG K = temp / 64;
255 for (l = 0; l < K; l++)
257 vec_t *rowA = (vec_t *) & AO[0];
258 vec_t *rowB = (vec_t *) & BO[0];
326 if ((temp & 63) >> 5)
328 vec_t *rowA = (vec_t *) & AO[0];
329 vec_t *rowB = (vec_t *) & BO[0];
365 if ((temp & 31) >> 4)
367 vec_t *rowA = (vec_t *) & AO[0];
368 vec_t *rowB = (vec_t *) & BO[0];
388 if ((temp & 15) >> 3)
390 vec_t *rowA = (vec_t *) & AO[0];
391 vec_t *rowB = (vec_t *) & BO[0];
405 vec_t *rowA = (vec_t *) & AO[0];
406 vec_t *rowB = (vec_t *) & BO[0];
416 vec_t *rowA = (vec_t *) & AO[0];
417 vec_t *rowB = (vec_t *) & BO[0];
425 vec_t *rowA = (vec_t *) & AO[0];
426 vec_t *rowB = (vec_t *) & BO[0];
433 SAVE_ACC1 (&acc1, 0);
434 SAVE_ACC1 (&acc3, 4);
436 SAVE_ACC (&acc6, 12);
437 SAVE_ACC1 (&acc5, 8);
438 SAVE_ACC1 (&acc7, 12);
439 #if defined(TRMMKERNEL)
440 REFRESH_AFTER_SAVE (16, 8)
445 for (j = 0; j < i; j++)
448 #if defined(TRMMKERNEL)
449 REFRESH_POINTERS (8, 8);
456 __vector_quad acc0, acc1, acc2, acc3;
459 for (l = 0; l < temp; l++)
461 vec_t *rowA = (vec_t *) & AO[l << 3];
462 vec_t *rowB = (vec_t *) & BO[l << 3];
463 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
464 __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
465 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]);
466 __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]);
470 SAVE_ACC1 (&acc1, 0);
471 SAVE_ACC1 (&acc3, 4);
475 #if defined(TRMMKERNEL)
476 REFRESH_AFTER_SAVE (8, 8)
480 for (j = 0; j < i; j++)
483 #if defined(TRMMKERNEL)
484 REFRESH_POINTERS (4, 8);
491 __vector_quad acc0, acc1;
492 __builtin_mma_xxsetaccz (&acc0);
493 __builtin_mma_xxsetaccz (&acc1);
495 for (l = 0; l < temp; l++)
497 vec_t *rowA = (vec_t *) & AO[l << 2];
498 vec_t *rowB = (vec_t *) & BO[l << 3];
499 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
500 __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
503 SAVE_ACC1 (&acc1, 0);
507 #if defined(TRMMKERNEL)
508 REFRESH_AFTER_SAVE (4, 8)
512 for (j = 0; j < i; j++)
515 #if defined(TRMMKERNEL)
516 REFRESH_POINTERS (2, 8);
524 __vector_quad acc0, acc1;
525 __builtin_mma_xxsetaccz (&acc0);
526 __builtin_mma_xxsetaccz (&acc1);
528 for (l = 0; l < temp; l++)
531 t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
532 vec_t *rowA = (vec_t *) & t[0];
533 vec_t *rowB = (vec_t *) & BO[l << 3];
534 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
535 __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
537 SAVE4x2_ACC (&acc0, 0);
538 SAVE4x2_ACC1 (&acc1, 0);
542 #if defined(TRMMKERNEL)
543 REFRESH_AFTER_SAVE (2, 8)
547 for (j = 0; j < i; j++)
550 #if defined(TRMMKERNEL)
551 REFRESH_POINTERS (1, 8);
557 v4sf_t t = { 0, 0, 0, 0 };
558 v4sf_t t1 = { 0, 0, 0, 0 };
559 for (l = 0; l < temp; l++)
561 v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
562 v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2],
566 { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6],
574 #if defined(TRMMKERNEL)
588 CO[4 * ldc] += t1[0];
589 CO[5 * ldc] += t1[1];
590 CO[6 * ldc] += t1[2];
591 CO[7 * ldc] += t1[3];
596 #if defined(TRMMKERNEL)
597 REFRESH_AFTER_SAVE (1, 8)
600 #if defined(TRMMKERNEL) && !defined(LEFT)
601 off += 8; // number of values in A
607 for (i1 = 0; i1 < N; i1++)
610 #if defined(TRMMKERNEL) && defined(LEFT)
618 #if !defined(TRMMKERNEL)
620 for (j = 0; j < i; j++)
627 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
630 for (l = 0; l < k; l++)
632 vec_t *rowA = (vec_t *) & AO[l << 4];
633 vec_t *rowA1 = (vec_t *) & A1[l << 4];
634 vec_t *rowB = (vec_t *) & BO[l << 2];
635 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
636 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
637 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
638 __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
639 __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
640 __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
641 __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
642 __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
664 for (j = 0; j < i; j++)
667 #if defined(TRMMKERNEL)
668 REFRESH_POINTERS (16, 4);
675 __vector_quad acc0, acc1, acc2, acc3;
678 for (l = 0; l < temp; l++)
680 vec_t *rowA = (vec_t *) & AO[l << 4];
681 vec_t *rowB = (vec_t *) & BO[l << 2];
682 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
683 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
684 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
685 __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
696 #if defined(TRMMKERNEL)
697 REFRESH_AFTER_SAVE (16, 4)
701 for (j = 0; j < i; j++)
704 #if defined(TRMMKERNEL)
705 REFRESH_POINTERS (8, 4);
712 __vector_quad acc0, acc1;
713 __builtin_mma_xxsetaccz (&acc0);
714 __builtin_mma_xxsetaccz (&acc1);
716 for (l = 0; l < temp; l++)
718 vec_t *rowA = (vec_t *) & AO[l << 3];
719 vec_t *rowB = (vec_t *) & BO[l << 2];
720 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
721 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
728 #if defined(TRMMKERNEL)
729 REFRESH_AFTER_SAVE (8, 4)
733 for (j = 0; j < i; j++)
736 #if defined(TRMMKERNEL)
737 REFRESH_POINTERS (4, 4);
745 __builtin_mma_xxsetaccz (&acc0);
747 for (l = 0; l < temp; l++)
749 vec_t *rowA = (vec_t *) & AO[l << 2];
750 vec_t *rowB = (vec_t *) & BO[l << 2];
751 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
757 #if defined(TRMMKERNEL)
758 REFRESH_AFTER_SAVE (4, 4)
762 for (j = 0; j < i; j++)
765 #if defined(TRMMKERNEL)
766 REFRESH_POINTERS (2, 4);
774 __builtin_mma_xxsetaccz (&acc0);
776 for (l = 0; l < temp; l++)
779 t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
780 vec_t *rowA = (vec_t *) & t[0];
781 vec_t *rowB = (vec_t *) & BO[l << 2];
782 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
784 SAVE4x2_ACC (&acc0, 0);
788 #if defined(TRMMKERNEL)
789 REFRESH_AFTER_SAVE (2, 4)
793 for (j = 0; j < i; j++)
796 #if defined(TRMMKERNEL)
797 REFRESH_POINTERS (1, 4)
803 v4sf_t t = { 0, 0, 0, 0 };
804 for (l = 0; l < temp; l++)
806 v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] };
807 v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2],
813 #if defined(TRMMKERNEL)
827 #if defined(TRMMKERNEL)
828 REFRESH_AFTER_SAVE (1, 4)
831 #if defined(TRMMKERNEL) && !defined(LEFT)
832 off += 4; // number of values in A
838 for (i1 = 0; i1 < N; i1++)
841 #if defined(TRMMKERNEL) && defined(LEFT)
849 #if !defined(TRMMKERNEL)
851 for (j = 0; j < i; j++)
858 __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
861 for (l = 0; l < k; l++)
864 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
865 vec_t *rowB = (vec_t *) & t[0];
866 vec_t *rowA = (vec_t *) & AO[l << 4];
867 vec_t *rowA1 = (vec_t *) & A1[l << 4];
868 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
869 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
870 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
871 __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
872 __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]);
873 __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]);
874 __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]);
875 __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]);
877 SAVE2x4_ACC (&acc0, 0);
878 SAVE2x4_ACC (&acc1, 4);
879 SAVE2x4_ACC (&acc2, 8);
880 SAVE2x4_ACC (&acc3, 12);
882 SAVE2x4_ACC (&acc4, 0);
883 SAVE2x4_ACC (&acc5, 4);
884 SAVE2x4_ACC (&acc6, 8);
885 SAVE2x4_ACC (&acc7, 12);
894 for (j = 0; j < i; j++)
899 __vector_quad acc0, acc1, acc2, acc3;
902 #if defined(TRMMKERNEL)
903 REFRESH_POINTERS (16, 2)
908 for (l = 0; l < temp; l++)
911 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
912 vec_t *rowB = (vec_t *) & t[0];
913 vec_t *rowA = (vec_t *) & AO[l << 4];
914 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
915 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
916 __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
917 __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]);
919 SAVE2x4_ACC (&acc0, 0);
920 SAVE2x4_ACC (&acc1, 4);
921 SAVE2x4_ACC (&acc2, 8);
922 SAVE2x4_ACC (&acc3, 12);
926 #if defined(TRMMKERNEL)
927 REFRESH_AFTER_SAVE (16, 2)
931 for (j = 0; j < i; j++)
936 __vector_quad acc0, acc1;
937 __builtin_mma_xxsetaccz (&acc0);
938 __builtin_mma_xxsetaccz (&acc1);
939 #if defined(TRMMKERNEL)
940 REFRESH_POINTERS (8, 2)
946 for (l = 0; l < temp; l++)
949 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
950 vec_t *rowB = (vec_t *) & t[0];
951 vec_t *rowA = (vec_t *) & AO[l << 3];
952 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
953 __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
955 SAVE2x4_ACC (&acc0, 0);
956 SAVE2x4_ACC (&acc1, 4);
960 #if defined(TRMMKERNEL)
961 REFRESH_AFTER_SAVE (8, 2)
965 for (j = 0; j < i; j++)
971 __builtin_mma_xxsetaccz (&acc0);
972 #if defined(TRMMKERNEL)
973 REFRESH_POINTERS (4, 2)
979 for (l = 0; l < temp; l++)
982 t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
983 vec_t *rowB = (vec_t *) & t[0];
984 vec_t *rowA = (vec_t *) & AO[l << 2];
985 __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
987 SAVE2x4_ACC (&acc0, 0);
991 #if defined(TRMMKERNEL)
992 REFRESH_AFTER_SAVE (4, 2)
996 for (j = 0; j < i; j++)
1000 #if defined(TRMMKERNEL)
1001 REFRESH_POINTERS (2, 2)
1006 v4sf_t t = { 0, 0, 0, 0 };
1007 for (l = 0; l < (temp << 1); l += 2)
1009 v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] };
1010 v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] };
1014 #if defined(TRMMKERNEL)
1017 CO[0 * ldc + 1] = t[2];
1018 CO[1 * ldc + 1] = t[3];
1020 CO[0 * ldc] += t[0];
1021 CO[1 * ldc] += t[1];
1022 CO[0 * ldc + 1] += t[2];
1023 CO[1 * ldc + 1] += t[3];
1028 #if defined(TRMMKERNEL)
1029 REFRESH_AFTER_SAVE (2, 2)
1033 for (j = 0; j < i; j++)
1037 #if defined(TRMMKERNEL)
1038 REFRESH_POINTERS (1, 2)
1043 v4sf_t t = { 0, 0, 0, 0 };
1044 for (l = 0; l < temp; l++)
1046 v4sf_t rowA = { AO[l], AO[l], 0, 0 };
1047 v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 };
1051 #if defined(TRMMKERNEL)
1055 CO[0 * ldc] += t[0];
1056 CO[1 * ldc] += t[1];
1061 #if defined(TRMMKERNEL)
1062 REFRESH_AFTER_SAVE (1, 2)
1065 #if defined(TRMMKERNEL) && !defined(LEFT)
1066 off += 2; // number of values in A
1072 for (i1 = 0; i1 < N; i1++)
1075 #if defined(TRMMKERNEL) && defined(LEFT)
1088 #if defined(TRMMKERNEL)
1089 REFRESH_POINTERS (16, 1)
1095 v4sf_t t = { 0, 0, 0, 0 };
1096 v4sf_t t1 = { 0, 0, 0, 0 };
1097 v4sf_t t2 = { 0, 0, 0, 0 };
1098 v4sf_t t3 = { 0, 0, 0, 0 };
1099 for (l = 0; l < temp; l++)
1101 v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
1102 v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2],
1106 { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6],
1110 { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10],
1114 { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14],
1126 #if defined(TRMMKERNEL)
1165 #if defined(TRMMKERNEL)
1166 REFRESH_AFTER_SAVE (16, 1)
1173 v4sf_t t = { 0, 0, 0, 0 };
1174 v4sf_t t1 = { 0, 0, 0, 0 };
1175 #if defined(TRMMKERNEL)
1176 REFRESH_POINTERS (8, 1)
1182 for (l = 0; l < temp; l++)
1184 v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
1185 v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2],
1189 { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6],
1197 #if defined(TRMMKERNEL)
1220 #if defined(TRMMKERNEL)
1221 REFRESH_AFTER_SAVE (8, 1)
1228 v4sf_t t = { 0, 0, 0, 0 };
1229 #if defined(TRMMKERNEL)
1230 REFRESH_POINTERS (4, 1)
1236 for (l = 0; l < temp; l++)
1238 v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] };
1239 v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2],
1245 #if defined(TRMMKERNEL)
1260 #if defined(TRMMKERNEL)
1261 REFRESH_AFTER_SAVE (4, 1)
1268 #if defined(TRMMKERNEL)
1269 REFRESH_POINTERS (2, 1)
1275 v4sf_t t = { 0, 0, 0, 0 };
1276 for (l = 0; l < temp; l++)
1278 v4sf_t rowB = { BO[l], BO[l], 0, 0 };
1279 v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 };
1283 #if defined(TRMMKERNEL)
1294 #if defined(TRMMKERNEL)
1295 REFRESH_AFTER_SAVE (2, 1)
1301 #if defined(TRMMKERNEL)
1302 REFRESH_POINTERS (1, 1)
1310 for (l = 0; l < temp; l++)
1316 #if defined(TRMMKERNEL)
1323 #if defined(TRMMKERNEL)
1324 REFRESH_AFTER_SAVE (1, 1)
1328 #if defined(TRMMKERNEL) && !defined(LEFT)
1329 off += 1; // number of values in A