/*
 * GCC vector-extension types built from FLOAT (FLOAT is defined outside
 * this view): v4sf_t is 16 bytes wide, v2sf_t is 8 bytes wide.
 * NOTE(review): the names suggest 4 and 2 single-precision lanes, which
 * holds only if FLOAT is a 4-byte float - confirm.
 */
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
-vector char mask =
- { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe,
- 0xf
-};
-
/*
* The BFLOAT16 xvbf16ger2pp instruction needs a 4×2 matrix of
* bfloat16 floating-point values as input. Hence this
* merging is needed on the A and B matrices.
*/
-#define MERGE_ROW(x) vec_perm(x, x, mask)
/*
 * MERGE_HIGH / MERGE_LOW reinterpret x and y as `vector short`, interleave
 * their 16-bit elements with vec_mergeh / vec_mergel respectively, and cast
 * the result to vec_t.
 * Each argument and the whole expansion are parenthesized so the macros stay
 * correct when passed compound expressions or used inside larger expressions.
 */
#define MERGE_HIGH(x, y) ((vec_t) vec_mergeh ((vector short) (x), (vector short) (y)))
#define MERGE_LOW(x, y) ((vec_t) vec_mergel ((vector short) (x), (vector short) (y)))
l = (k / 2) << 4;
vec_t *rowA = (vec_t *) & (AO[l << 1]);
vec_t *rowB = (vec_t *) & (BO[l]);
- vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]);
- vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]);
+ vec_t rowB_h = MERGE_HIGH (rowB[0], vzero);
+ vec_t rowB_l = MERGE_LOW (rowB[0], vzero);
vec_t rowA_h = MERGE_HIGH (rowA[0], vzero);
vec_t rowA_l = MERGE_LOW (rowA[0], vzero);
vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero);
l = (k / 2) << 4;
vec_t *rowA = (vec_t *) & (AO[l]);
vec_t *rowB = (vec_t *) & (BO[l]);
- vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]);
- vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]);
+ vec_t rowB_h = MERGE_HIGH (rowB[0], vzero);
+ vec_t rowB_l = MERGE_LOW (rowB[0], vzero);
vec_t rowA_h = MERGE_HIGH (rowA[0], vzero);
vec_t rowA_l = MERGE_LOW (rowA[0], vzero);
MMA (&acc0, rowB_h, rowA_h);
vector short rowA =
{ AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 };
vec_t *rowB = (vec_t *) & (BO[l << 1]);
- MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA);
- MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA);
+ MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA);
+ MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA);
}
SAVE_ACC (&acc0, 0);
SAVE_ACC1 (&acc1, 0);
l = (k / 2) << 2;
vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 };
vec_t *rowB = (vec_t *) & (BO[(l << 2)]);
- MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA);
- MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA);
+ MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA);
+ MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA);
}
SAVE4x2_ACC (&acc0, 0);
SAVE4x2_ACC1 (&acc1, 0);
l = (k / 2) << 1;
vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 };
vec_t *rowB = (vec_t *) & (BO[(l << 3)]);
- MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA);
- MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA);
+ MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA);
+ MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA);
}
SAVE4x2_ACC (&acc0, 0);
SAVE4x2_ACC1 (&acc1, 0);
l = (k / 2) << 3;
vec_t *rowA = (vec_t *) & (AO[(l << 2)]);
vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]);
- vec_t *rowB = (vec_t *) & (BO[l]);
- vec_t rowB_mrg = MERGE_ROW (rowB[0]);
- MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero));
- MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero));
- MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero));
- MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero));
- MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero));
- MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero));
- MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero));
- MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero));
+ vector short rowB_mrg =
+ { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 };
+ MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero));
+ MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero));
+ MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero));
+ MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero));
+ MMA (&acc4, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[0], vzero));
+ MMA (&acc5, (vec_t)rowB_mrg, MERGE_LOW (rowA1[0], vzero));
+ MMA (&acc6, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[1], vzero));
+ MMA (&acc7, (vec_t)rowB_mrg, MERGE_LOW (rowA1[1], vzero));
}
SAVE_ACC (&acc0, 0);
if (k > 1)
l = (k / 2) << 3;
vec_t *rowA = (vec_t *) & (AO[(l << 2)]);
- vec_t *rowB = (vec_t *) & (BO[l]);
- vec_t rowB_mrg = MERGE_ROW (rowB[0]);
- MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero));
- MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero));
- MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero));
- MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero));
+ vector short rowB_mrg =
+ { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 };
+ MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero));
+ MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero));
+ MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero));
+ MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero));
}
SAVE_ACC (&acc0, 0);
l = (k / 2) << 3;
vec_t *rowA = (vec_t *) & (AO[l << 1]);
vec_t *rowB = (vec_t *) & (BO[l]);
- vec_t rowB_mrg = MERGE_ROW (rowB[0]);
- MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero));
- MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero));
+ vector short rowB_mrg =
+ { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 };
+ MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero));
+ MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero));
}
SAVE_ACC (&acc0, 0);
SAVE_ACC (&acc1, 4);
l = (k / 2) << 3;
vector short rowA =
{ AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 };
- vec_t *rowB = (vec_t *) & (BO[l]);
- MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA);
+ vector short rowB_mrg =
+ { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 };
+ MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA);
}
SAVE_ACC (&acc0, 0);
CO += 4;
if (k > 1)
l = (k / 2) << 2;
vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 };
- vec_t *rowB = (vec_t *) & (BO[l << 1]);
- MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA);
+ vector short rowB_mrg =
+ { BO[(l<<1)], 0, BO[(l<<1) + 1], 0, BO[(l<<1) + 2], 0,
+ BO[(l<<1) + 3], 0
+ };
+ MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA);
}
SAVE4x2_ACC (&acc0, 0);
CO += 2;
if (k > 1)
l = (k / 2) << 1;
vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 };
- vec_t *rowB = (vec_t *) & (BO[l << 2]);
- MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA);
+ vector short rowB_mrg =
+ { BO[(l<<2) + 0], 0, BO[(l<<2) + 1], 0, BO[(l <<2) + 2], 0,
+ BO[(l<<2) + 3], 0
+ };
+ MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA);
}
SAVE4x2_ACC (&acc0, 0);
AO += k;
vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 };
vec_t *rowA = (vec_t *) & (AO[l << 3]);
vec_t *rowA1 = (vec_t *) & (A1[l << 3]);
- MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2]));
- MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2]));
- MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3]));
- MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3]));
- MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2]));
- MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2]));
- MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3]));
- MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3]));
+ MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero));
+ MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero));
+ MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero));
+ MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero));
+ MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], vzero));
+ MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], vzero));
+ MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], vzero));
+ MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], vzero));
}
SAVE2x4_ACC (&acc0, 0);
SAVE2x4_ACC (&acc1, 4);
l = (k / 2) << 2;
vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 };
vec_t *rowA = (vec_t *) & (AO[l << 3]);
- MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2]));
- MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2]));
- MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3]));
- MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3]));
+ MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero ));
+ MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero));
+ MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero));
+ MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero));
}
SAVE2x4_ACC (&acc0, 0);
SAVE2x4_ACC (&acc1, 4);
l = (k / 2) << 2;
vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 };
vec_t *rowA = (vec_t *) & (AO[(l << 2)]);
- MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1]));
- MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1]));
+ MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero));
+ MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero));
}
SAVE2x4_ACC (&acc0, 0);
SAVE2x4_ACC (&acc1, 4);
if (k > 1)
l = (k / 2) << 2;
vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 };
- vec_t *rowA = (vec_t *) & (AO[l << 1]);
- MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0]));
+ vector short rowA =
+ { AO[(l << 1)], 0, AO[(l << 1) + 1] , 0 , AO[(l<<1) + 2],
+ 0, AO[(l << 1) + 3], 0 };
+ MMA (&acc0, (vec_t) rowB, (vec_t)(rowA));
}
SAVE2x4_ACC (&acc0, 0);
CO += 4;
l = (k / 2) << 1;
vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 };
vec_t *rowA = (vec_t *) & (AO[(l << 4)]);
- MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2]));
- MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2]));
- MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3]));
- MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3]));
+ MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero));
+ MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero));
+ MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero));
+ MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero));
}
rowC = (v4sf_t *) &CO[0];
__builtin_mma_disassemble_acc ((void *)result, &acc0);
l = (k / 2) << 1;
vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 };
vec_t *rowA = (vec_t *) & (AO[(l << 3)]);
- MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1]));
- MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1]));
+ MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero));
+ MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero));
}
rowC = (v4sf_t *) &CO[0];
__builtin_mma_disassemble_acc ((void *)result, &acc0);
if (k > 1)
l = (k / 2) << 1;
vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 };
- vec_t *rowA = (vec_t *) & (AO[(l << 2)]);
- MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0]));
+ vector short rowA =
+ { AO[(l << 2)], 0, AO[(l << 2) + 1] , 0 ,
+ AO[(l << 2) + 2], 0, AO[(l << 2) + 3], 0 };
+ MMA (&acc0, (vec_t) rowB, (vec_t)(rowA));
}
rowC = (v4sf_t *) &CO[0];
__builtin_mma_disassemble_acc ((void *)result, &acc0);