/* Byte vector type; the kernel code below reinterprets the AO and BO
   panels through pointers to this type before passing elements to the
   MMA builtins. */
typedef __vector unsigned char vec_t;
/* Vector of FLOAT elements occupying 16 bytes in total. */
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
-typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
/* Compatibility shims: when the compiler does not report the
   __builtin_vsx_{assemble,disassemble}_pair builtins, map those names
   onto the __builtin_mma_* spellings so the code below can use the
   vsx names unconditionally.
   NOTE(review): presumably the mma-prefixed names are the ones shipped
   by older compilers -- confirm against the GCC/Clang release notes.
   NOTE(review): this relies on the preprocessor supporting
   __has_builtin; verify that every supported compiler provides it. */
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+
+#if !__has_builtin(__builtin_vsx_disassemble_pair)
+#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
+#endif
#ifdef TRMMKERNEL
#define SAVE_ACC(ACC, J) \
vec_t *rowA = (vec_t *) & AO[0];
vec_t *rb = (vec_t *) & BO[0];
__vector_pair rowB, rowB1;
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
{
rowA = (vec_t *) & AO[l << 3];
rb = (vec_t *) & BO[l << 3];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
vec_t *rb = (vec_t *) & BO[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
{
rowA = (vec_t *) & AO[l << 2];
rb = (vec_t *) & BO[l << 3];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
vec_t *rb = (vec_t *) & BO[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 1];
rb = (vec_t *) & BO[l << 3];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
- __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
}
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
vec_t *rb = (vec_t *) & BO[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
{
rowA = (vec_t *) & AO[l << 3];
rb = (vec_t *) & BO[l << 2];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
vec_t *rb = (vec_t *) & BO[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 2];
rb = (vec_t *) & BO[l << 2];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
}
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
vec_t *rb = (vec_t *) & BO[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 1];
rb = (vec_t *) & BO[l << 2];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
}
SAVE_ACC (&acc0, 0);
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3;
BLASLONG l = 0;
- FLOAT t[4] = { 0, 0, 0, 0 };
- t[0] = BO[0], t[1] = BO[1];
__vector_pair rowB;
- vec_t *rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
vec_t *rowA = (vec_t *) & AO[0];
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
for (l = 1; l < temp; l++)
{
- t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ rb = (vec_t *) & BO[l << 1];
+ __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
rowA = (vec_t *) & AO[l << 3];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
v4sf_t result[4];
__vector_quad acc0, acc1;
BLASLONG l = 0;
- FLOAT t[4] = { 0, 0, 0, 0 };
- t[0] = BO[0], t[1] = BO[1];
__vector_pair rowB;
- vec_t *rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
vec_t *rowA = (vec_t *) & AO[0];
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
for (l = 1; l < temp; l++)
{
- t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ rb = (vec_t *) & BO[l << 1];
+ __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
rowA = (vec_t *) & AO[l << 2];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
v4sf_t result[4];
__vector_quad acc0;
BLASLONG l = 0;
- FLOAT t[4] = { 0, 0, 0, 0 };
- t[0] = BO[0], t[1] = BO[1];
__vector_pair rowB;
- vec_t *rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ vec_t *rb = (vec_t *) & BO[0];
+ __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
vec_t *rowA = (vec_t *) & AO[0];
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
for (l = 1; l < temp; l++)
{
- t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
- rb = (vec_t *) & t[0];
- __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
+ rb = (vec_t *) & BO[l << 1];
+ __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
rowA = (vec_t *) & AO[l << 1];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
}