rowC[0] += result[1] * alpha;
#endif
-#define SET_ACC_ZERO4() \
- __builtin_mma_xxsetaccz (&acc0); \
- __builtin_mma_xxsetaccz (&acc1); \
- __builtin_mma_xxsetaccz (&acc2); \
- __builtin_mma_xxsetaccz (&acc3);
-
-#define SET_ACC_ZERO8() \
- __builtin_mma_xxsetaccz (&acc0); \
- __builtin_mma_xxsetaccz (&acc1); \
- __builtin_mma_xxsetaccz (&acc2); \
- __builtin_mma_xxsetaccz (&acc3); \
- __builtin_mma_xxsetaccz (&acc4); \
- __builtin_mma_xxsetaccz (&acc5); \
- __builtin_mma_xxsetaccz (&acc6); \
- __builtin_mma_xxsetaccz (&acc7);
-
/* PREFETCH1(x, y): emit a PowerPC `dcbt` (data cache block touch)
   instruction on the operand pair (x, y) through inline asm, i.e. a
   cache-prefetch hint for the addressed block.  x is passed with the "r"
   constraint and y with the "b" constraint.  The asm is volatile and
   declares a "memory" clobber.
   NOTE: the expansion already ends in ';', so a call written as
   `PREFETCH1 (p, 128);` expands to the asm statement followed by an
   empty statement.  */
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
PREFETCH1 (CO + ldc + ldc, 128);
PREFETCH1 (CO + ldc + ldc + ldc, 128);
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- SET_ACC_ZERO8 ();
- for (l = 0; l < temp; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
+ __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
+ __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
+ __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
+ __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
+ for (l = 1; l < temp; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 4];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 4];
+ rb = (vec_t *) & BO[l << 2];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3;
- SET_ACC_ZERO4 ();
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
+ for (l = 1; l < temp; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 3];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 3];
+ rb = (vec_t *) & BO[l << 2];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1;
- __builtin_mma_xxsetaccz (&acc0);
- __builtin_mma_xxsetaccz (&acc1);
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+ for (l = 1; l < temp; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 2];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 2];
+ rb = (vec_t *) & BO[l << 2];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0;
- __builtin_mma_xxsetaccz (&acc0);
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ for (l = 1; l < temp; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 1];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 1];
+ rb = (vec_t *) & BO[l << 2];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
}
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- SET_ACC_ZERO8 ();
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ FLOAT t[4] = { 0, 0, 0, 0 };
+ t[0] = BO[0], t[1] = BO[1];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & t[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rowA = (vec_t *) & AO[0];
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
+ __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
+ __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
+ __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
+ __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
+ for (l = 1; l < temp; l++)
{
- FLOAT t[4] = { 0, 0, 0, 0 };
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & t[0];
+ rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- vec_t *rowA = (vec_t *) & AO[l << 4];
+ rowA = (vec_t *) & AO[l << 4];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3;
- SET_ACC_ZERO4 ();
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ FLOAT t[4] = { 0, 0, 0, 0 };
+ t[0] = BO[0], t[1] = BO[1];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & t[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rowA = (vec_t *) & AO[0];
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
+ __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
+ for (l = 1; l < temp; l++)
{
- FLOAT t[4] = { 0, 0, 0, 0 };
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & t[0];
+ rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- vec_t *rowA = (vec_t *) & AO[l << 3];
+ rowA = (vec_t *) & AO[l << 3];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1;
- __builtin_mma_xxsetaccz (&acc0);
- __builtin_mma_xxsetaccz (&acc1);
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ FLOAT t[4] = { 0, 0, 0, 0 };
+ t[0] = BO[0], t[1] = BO[1];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & t[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rowA = (vec_t *) & AO[0];
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
+ for (l = 1; l < temp; l++)
{
- FLOAT t[4] = { 0, 0, 0, 0 };
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & t[0];
+ rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- vec_t *rowA = (vec_t *) & AO[l << 2];
+ rowA = (vec_t *) & AO[l << 2];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
}
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0;
- __builtin_mma_xxsetaccz (&acc0);
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ FLOAT t[4] = { 0, 0, 0, 0 };
+ t[0] = BO[0], t[1] = BO[1];
+ __vector_pair rowB;
+ vec_t *rb = (vec_t *) & t[0];
+ __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rowA = (vec_t *) & AO[0];
+ __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
+ for (l = 1; l < temp; l++)
{
- FLOAT t[4] = { 0, 0, 0, 0 };
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- __vector_pair rowB;
- vec_t *rb = (vec_t *) & t[0];
+ rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- vec_t *rowA = (vec_t *) & AO[l << 1];
+ rowA = (vec_t *) & AO[l << 1];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
}
SAVE2x4_ACC (&acc0, 0);
__builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \
__builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \
__builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]);
-#define SET_ACC_ZERO4() \
- __builtin_mma_xxsetaccz (&acc0); \
- __builtin_mma_xxsetaccz (&acc1); \
- __builtin_mma_xxsetaccz (&acc2); \
- __builtin_mma_xxsetaccz (&acc3);
-
-#define SET_ACC_ZERO8() \
- __builtin_mma_xxsetaccz (&acc0); \
- __builtin_mma_xxsetaccz (&acc1); \
- __builtin_mma_xxsetaccz (&acc2); \
- __builtin_mma_xxsetaccz (&acc3); \
- __builtin_mma_xxsetaccz (&acc4); \
- __builtin_mma_xxsetaccz (&acc5); \
- __builtin_mma_xxsetaccz (&acc6); \
- __builtin_mma_xxsetaccz (&acc7);
/* PREFETCH1(x, y): emit a PowerPC `dcbt` (data cache block touch)
   instruction on the operand pair (x, y) through inline asm, i.e. a
   cache-prefetch hint for the addressed block.  x is passed with the "r"
   constraint and y with the "b" constraint.  The asm is volatile and
   declares a "memory" clobber.
   NOTE: the expansion already ends in ';', so a call written as
   `PREFETCH1 (p, 128);` expands to the asm statement followed by an
   empty statement.  */
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- SET_ACC_ZERO8 ();
BLASLONG l = 0;
+ vec_t *rowA1 = (vec_t *) & AO[0];
+ vec_t *rowB1 = (vec_t *) & BO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB1[0], rowA1[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB1[1], rowA1[0]);
+ __builtin_mma_xvf32ger (&acc2, rowB1[0], rowA1[1]);
+ __builtin_mma_xvf32ger (&acc3, rowB1[1], rowA1[1]);
+ __builtin_mma_xvf32ger (&acc4, rowB1[0], rowA1[2]);
+ __builtin_mma_xvf32ger (&acc5, rowB1[1], rowA1[2]);
+ __builtin_mma_xvf32ger (&acc6, rowB1[0], rowA1[3]);
+ __builtin_mma_xvf32ger (&acc7, rowB1[1], rowA1[3]);
+ AO += 16;
+ BO += 8;
+ temp--;
BLASLONG K = temp / 64;
for (l = 0; l < K; l++)
{
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3;
- SET_ACC_ZERO4 ();
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
+ __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[1]);
+ __builtin_mma_xvf32ger (&acc3, rowB[1], rowA[1]);
+ for (l = 1; l < temp; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 3];
- vec_t *rowB = (vec_t *) & BO[l << 3];
+ rowA = (vec_t *) & AO[l << 3];
+ rowB = (vec_t *) & BO[l << 3];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
__builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1;
- __builtin_mma_xxsetaccz (&acc0);
- __builtin_mma_xxsetaccz (&acc1);
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
+ for (l = 1; l < temp; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 2];
- vec_t *rowB = (vec_t *) & BO[l << 3];
+ rowA = (vec_t *) & AO[l << 2];
+ rowB = (vec_t *) & BO[l << 3];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
__builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
}
v2sf_t *rowC;
v2sf_t result[8];
__vector_quad acc0, acc1;
- __builtin_mma_xxsetaccz (&acc0);
- __builtin_mma_xxsetaccz (&acc1);
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ FLOAT t[4] = { 0 };
+ t[0] = AO[0], t[1] = AO[1];
+ vec_t *rowA = (vec_t *) & t[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]);
+ for (l = 1; l < temp; l++)
{
- FLOAT t[4] = { 0 };
t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
- vec_t *rowA = (vec_t *) & t[0];
- vec_t *rowB = (vec_t *) & BO[l << 3];
+ rowA = (vec_t *) & t[0];
+ rowB = (vec_t *) & BO[l << 3];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
__builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]);
}
FLOAT *A1;
A1 = AO + (16 * k);
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- SET_ACC_ZERO8 ();
BLASLONG l = 0;
- for (l = 0; l < k; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowA1 = (vec_t *) & A1[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+ __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
+ __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
+ __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
+ __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
+ __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
+ __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
+ for (l = 1; l < k; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 4];
- vec_t *rowA1 = (vec_t *) & A1[l << 4];
- vec_t *rowB = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 4];
+ rowA1 = (vec_t *) & A1[l << 4];
+ rowB = (vec_t *) & BO[l << 2];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3;
- SET_ACC_ZERO4 ();
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+ __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
+ __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
+ for (l = 1; l < temp; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 4];
- vec_t *rowB = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 4];
+ rowB = (vec_t *) & BO[l << 2];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1;
- __builtin_mma_xxsetaccz (&acc0);
- __builtin_mma_xxsetaccz (&acc1);
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+ for (l = 1; l < temp; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 3];
- vec_t *rowB = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 3];
+ rowB = (vec_t *) & BO[l << 2];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
}
v4sf_t *rowC;
__vector_quad acc0;
v4sf_t result[4];
- __builtin_mma_xxsetaccz (&acc0);
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ for (l = 1; l < temp; l++)
{
- vec_t *rowA = (vec_t *) & AO[l << 2];
- vec_t *rowB = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & AO[l << 2];
+ rowB = (vec_t *) & BO[l << 2];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
}
SAVE_ACC (&acc0, 0);
v2sf_t *rowC;
v2sf_t result[8];
__vector_quad acc0;
- __builtin_mma_xxsetaccz (&acc0);
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ FLOAT t[4] = { 0 };
+ t[0] = AO[0], t[1] = AO[1];
+ vec_t *rowA = (vec_t *) & t[0];
+ vec_t *rowB = (vec_t *) & BO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ for (l = 1; l < temp; l++)
{
- FLOAT t[4] = { 0 };
t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1];
- vec_t *rowA = (vec_t *) & t[0];
- vec_t *rowB = (vec_t *) & BO[l << 2];
+ rowA = (vec_t *) & t[0];
+ rowB = (vec_t *) & BO[l << 2];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
}
SAVE4x2_ACC (&acc0, 0);
FLOAT *A1;
A1 = AO + (16 * k);
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
- SET_ACC_ZERO8 ();
BLASLONG l = 0;
- for (l = 0; l < k; l++)
+ FLOAT t[4] = { 0 };
+ t[0] = BO[0], t[1] = BO[1];
+ vec_t *rowB = (vec_t *) & t[0];
+ vec_t *rowA = (vec_t *) & AO[0];
+ vec_t *rowA1 = (vec_t *) & A1[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+ __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
+ __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
+ __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]);
+ __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]);
+ __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]);
+ __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]);
+ for (l = 1; l < k; l++)
{
- FLOAT t[4] = { 0 };
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- vec_t *rowB = (vec_t *) & t[0];
- vec_t *rowA = (vec_t *) & AO[l << 4];
- vec_t *rowA1 = (vec_t *) & A1[l << 4];
+ rowB = (vec_t *) & t[0];
+ rowA = (vec_t *) & AO[l << 4];
+ rowA1 = (vec_t *) & A1[l << 4];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3;
- SET_ACC_ZERO4 ();
BLASLONG l = 0;
#if defined(TRMMKERNEL)
REFRESH_POINTERS (16, 2)
BO = B;
temp = k;
#endif
- for (l = 0; l < temp; l++)
+ FLOAT t[4] = { 0 };
+ t[0] = BO[0], t[1] = BO[1];
+ vec_t *rowB = (vec_t *) & t[0];
+ vec_t *rowA = (vec_t *) & AO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+ __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]);
+ __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]);
+ for (l = 1; l < temp; l++)
{
- FLOAT t[4] = { 0 };
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- vec_t *rowB = (vec_t *) & t[0];
- vec_t *rowA = (vec_t *) & AO[l << 4];
+ rowB = (vec_t *) & t[0];
+ rowA = (vec_t *) & AO[l << 4];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
__builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]);
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0, acc1;
- __builtin_mma_xxsetaccz (&acc0);
- __builtin_mma_xxsetaccz (&acc1);
#if defined(TRMMKERNEL)
REFRESH_POINTERS (8, 2)
#else
temp = k;
#endif
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ FLOAT t[4] = { 0 };
+ t[0] = BO[0], t[1] = BO[1];
+ vec_t *rowB = (vec_t *) & t[0];
+ vec_t *rowA = (vec_t *) & AO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]);
+ for (l = 1; l < temp; l++)
{
- FLOAT t[4] = { 0 };
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- vec_t *rowB = (vec_t *) & t[0];
- vec_t *rowA = (vec_t *) & AO[l << 3];
+ rowB = (vec_t *) & t[0];
+ rowA = (vec_t *) & AO[l << 3];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
__builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]);
}
v4sf_t *rowC;
v4sf_t result[4];
__vector_quad acc0;
- __builtin_mma_xxsetaccz (&acc0);
#if defined(TRMMKERNEL)
REFRESH_POINTERS (4, 2)
#else
temp = k;
#endif
BLASLONG l = 0;
- for (l = 0; l < temp; l++)
+ FLOAT t[4] = { 0 };
+ t[0] = BO[0], t[1] = BO[1];
+ vec_t *rowB = (vec_t *) & t[0];
+ vec_t *rowA = (vec_t *) & AO[0];
+ __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]);
+ for (l = 1; l < temp; l++)
{
- FLOAT t[4] = { 0 };
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- vec_t *rowB = (vec_t *) & t[0];
- vec_t *rowA = (vec_t *) & AO[l << 2];
+ rowB = (vec_t *) & t[0];
+ rowA = (vec_t *) & AO[l << 2];
__builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]);
}
SAVE2x4_ACC (&acc0, 0);