a -= 64;
b -= 32;
- res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
- res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
- res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
- res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
- res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
- res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
- res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
- res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
- res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8);
- res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8);
- res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9);
- res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9);
- res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10);
- res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10);
- res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11);
- res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11);
+ ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
+ ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
+ ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
+ ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
+ ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
src_a54 = __msa_cast_to_vector_double(*(a + 54));
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
ST_DP(res_c6, b + 24);
ST_DP(res_c15, b + 30);
ST_DP(res_c14, b + 26);
- src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
- src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
- src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14);
- src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14);
+ ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
+ ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
ST_DP(src_c3, c + 6);
ST_DP(src_c7, c_nxt1line + 6);
ST_DP(src_c11, c_nxt2line + 6);
ST_DP(res_c12, b + 18);
ST_DP(res_c13, b + 22);
- src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
- src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
- src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12);
- src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12);
+ ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
+ ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
ST_DP(src_c2, c + 4);
ST_DP(src_c6, c_nxt1line + 4);
ST_DP(src_c10, c_nxt2line + 4);
src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1);
src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0);
- src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
- src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
- src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10);
- src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
+ ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
ST_DP(src_c1, c + 2);
ST_DP(src_c5, c_nxt1line + 2);
ST_DP(src_c9, c_nxt2line + 2);
ST_DP(res_c1, b + 4);
ST_DP(res_c9, b + 6);
- src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
- src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
- src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8);
- src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8);
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
+ ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);
ST_DP(src_c0, c);
ST_DP(src_c4, c_nxt1line);
src_c7 -= src_a3 * src_b;
}
- res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
- res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
- res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
- res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
- res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
- res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
- res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
- res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
+ ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
src_a56 = LD_DP(a - 8);
src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1);
ST_DP(res_c1, b - 14);
ST_DP(res_c0, b - 16);
- src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
- src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
- src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
- src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
- src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
- src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
- src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
- src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
+ ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
+ ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2);
ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2);
if (bk > 0)
{
- int i;
+ BLASLONG i;
FLOAT *aa = a, *bb = b;
- FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0;
for (i = bk; i--; )
{
- a0 = aa[0];
- a1 = aa[1];
- a2 = aa[2];
- a3 = aa[3];
- a4 = aa[4];
- a5 = aa[5];
- a6 = aa[6];
- a7 = aa[7];
-
- b0 = bb[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
- c2 -= a2 * b0;
- c3 -= a3 * b0;
- c4 -= a4 * b0;
- c5 -= a5 * b0;
- c6 -= a6 * b0;
- c7 -= a7 * b0;
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c2 -= aa[2] * bb[0];
+ c3 -= aa[3] * bb[0];
+ c4 -= aa[4] * bb[0];
+ c5 -= aa[5] * bb[0];
+ c6 -= aa[6] * bb[0];
+ c7 -= aa[7] * bb[0];
aa += 8;
bb += 1;
if (bk > 0)
{
BLASLONG i;
- FLOAT *aa = a + 16, *bb = b + 16;
+ FLOAT *aa = a, *bb = b;
v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
for (i = bk; i--;)
}
}
- res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
- res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
- res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
- res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
- res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4);
- res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4);
- res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5);
- res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5);
+ a -= 16;
+ b -= 16;
+
+ ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7);
src_a14 = LD_DP(a + 14);
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
ST_DP(res_c4, b + 2);
ST_DP(res_c0, b + 0);
- src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
- src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
- src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
- src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
- src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
- src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
- src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
- src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
+ ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6);
+ ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c2, src_c3, c + ldc, 2);
if (bk > 0)
{
BLASLONG i;
- FLOAT *aa = a + 16, *bb = b + 8;
+ FLOAT *aa = a, *bb = b;
v2f64 src_a0, src_a1, src_b, src_b0;
for (i = bk; i--;)
}
}
- res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
- res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
- res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
- res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
+ a -= 16;
+ b -= 8;
+
+ ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
src_a14 = LD_DP(a + 14);
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1);
ST_DP(res_c1, b + 2);
ST_DP(res_c0, b + 0);
- src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
- src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
- src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
- src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c2, src_c3, c + ldc, 2);
static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
- FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15;
- FLOAT c0, c1, c2, c3;
+ FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15, c0, c1, c2, c3;
c0 = *(c + 0);
c1 = *(c + 1);
if (bk > 0)
{
BLASLONG i;
- FLOAT *aa = a + 16, *bb = b + 4;
- FLOAT a0, a1, a2, a3, b0;
+ FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
- a0 = aa[0];
- a1 = aa[1];
- a2 = aa[2];
- a3 = aa[3];
-
- b0 = bb[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
- c2 -= a2 * b0;
- c3 -= a3 * b0;
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c2 -= aa[2] * bb[0];
+ c3 -= aa[3] * bb[0];
aa += 4;
bb += 1;
}
}
+ a -= 16;
+ b -= 4;
+
a0 = *(a + 0);
a4 = *(a + 4);
a5 = *(a + 5);
if (bk > 0)
{
BLASLONG i;
- FLOAT *aa = a + 4, *bb = b + 8;
- FLOAT a0, a1, b0, b1, b2, b3;
+ FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
- a0 = aa[0];
- a1 = aa[1];
-
- b0 = bb[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
-
- b1 = bb[1];
- c0_nxt1 -= a0 * b1;
- c1_nxt1 -= a1 * b1;
-
- b2 = bb[2];
- c0_nxt2 -= a0 * b2;
- c1_nxt2 -= a1 * b2;
-
- b3 = bb[3];
- c0_nxt3 -= a0 * b3;
- c1_nxt3 -= a1 * b3;
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c0_nxt1 -= aa[0] * bb[1];
+ c1_nxt1 -= aa[1] * bb[1];
+ c0_nxt2 -= aa[0] * bb[2];
+ c1_nxt2 -= aa[1] * bb[2];
+ c0_nxt3 -= aa[0] * bb[3];
+ c1_nxt3 -= aa[1] * bb[3];
aa += 2;
bb += 4;
}
}
+ a -= 4;
+ b -= 8;
+
a0 = *(a + 0);
a2 = *(a + 2);
a3 = *(a + 3);
*(c + 0) = c0;
*(c + 1) = c1;
-
*(c + 0 + ldc) = c0_nxt1;
*(c + 1 + ldc) = c1_nxt1;
-
*(c + 0 + 2 * ldc) = c0_nxt2;
*(c + 1 + 2 * ldc) = c1_nxt2;
-
*(c + 0 + 3 * ldc) = c0_nxt3;
*(c + 1 + 3 * ldc) = c1_nxt3;
}
if (bk > 0)
{
BLASLONG i;
- FLOAT *aa = a + 4, *bb = b + 4;
- FLOAT a0, a1, b0, b1;
+ FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
- a0 = aa[0];
- a1 = aa[1];
-
- b0 = bb[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
- b1 = bb[1];
- c0_nxt -= a0 * b1;
- c1_nxt -= a1 * b1;
+ c0_nxt -= aa[0] * bb[1];
+ c1_nxt -= aa[1] * bb[1];
aa += 2;
bb += 2;
}
}
+ a -= 4;
+ b -= 4;
+
a0 = *(a + 0);
a2 = *(a + 2);
a3 = *(a + 3);
if (bk > 0)
{
BLASLONG i;
- FLOAT a0, a1, b0;
- FLOAT *aa = a + 4, *bb = b + 2;
+ FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
- a0 = aa[0];
- a1 = aa[1];
-
- b0 = bb[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
aa += 2;
bb += 1;
}
}
- a0 = *(a + 0);
- a2 = *(a + 2);
- a3 = *(a + 3);
+ a0 = *(a - 4);
+ a2 = *(a - 2);
+ a3 = *(a - 1);
c1 *= a3;
c0 -= c1 * a2;
c0 *= a0;
- *(b + 0) = c0;
- *(b + 1) = c1;
+ *(b - 2) = c0;
+ *(b - 1) = c1;
*(c + 0) = c0;
*(c + 1) = c1;
static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT a0;
- FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
+ FLOAT c0, c1, c2, c3;
- a0 = *a;
c0 = *(c + 0);
- c0_nxt1 = *(c + 1 * ldc);
- c0_nxt2 = *(c + 2 * ldc);
- c0_nxt3 = *(c + 3 * ldc);
+ c1 = *(c + 1 * ldc);
+ c2 = *(c + 2 * ldc);
+ c3 = *(c + 3 * ldc);
if (bk > 0)
{
BLASLONG i;
- FLOAT *aa = a + 1, *bb = b + 4;
+ FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
c0 -= aa[0] * bb[0];
- c0_nxt1 -= aa[0] * bb[1];
- c0_nxt2 -= aa[0] * bb[2];
- c0_nxt3 -= aa[0] * bb[3];
+ c1 -= aa[0] * bb[1];
+ c2 -= aa[0] * bb[2];
+ c3 -= aa[0] * bb[3];
aa += 1;
bb += 4;
}
}
- c0 *= a0;
- c0_nxt1 *= a0;
- c0_nxt2 *= a0;
- c0_nxt3 *= a0;
+ c0 *= *(a - 1);
+ c1 *= *(a - 1);
+ c2 *= *(a - 1);
+ c3 *= *(a - 1);
*(c + 0 * ldc) = c0;
- *(c + 1 * ldc) = c0_nxt1;
- *(c + 2 * ldc) = c0_nxt2;
- *(c + 3 * ldc) = c0_nxt3;
-
- *(b + 0) = c0;
- *(b + 1) = c0_nxt1;
- *(b + 2) = c0_nxt2;
- *(b + 3) = c0_nxt3;
+ *(c + 1 * ldc) = c1;
+ *(c + 2 * ldc) = c2;
+ *(c + 3 * ldc) = c3;
+
+ *(b - 4) = c0;
+ *(b - 3) = c1;
+ *(b - 2) = c2;
+ *(b - 1) = c3;
}
static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc)
bb = b + 4 * kk;
cc = c + (m - 1);
- dsolve_1x4_ln_msa(aa - 1, bb - 4, cc, ldc, k - kk);
+ dsolve_1x4_ln_msa(aa, bb, cc, ldc, k - kk);
kk -= 1;
}
bb = b + 4 * kk;
cc = c + ((m & -2) - 2);
- dsolve_2x4_ln_msa(aa - 4, bb - 8, cc, ldc, k - kk);
+ dsolve_2x4_ln_msa(aa, bb, cc, ldc, k - kk);
kk -= 2;
}
bb = b + 4 * kk;
cc = c + ((m & -4) - 4);
- dsolve_4x4_ln_msa(aa - 16, bb - 16, cc, ldc, k - kk);
+ dsolve_4x4_ln_msa(aa, bb, cc, ldc, k - kk);
kk -= 4;
}
aa = a + ((m & -2) - 2) * k;
cc = c + ((m & -2) - 2);
- dsolve_2x2_ln_msa(aa + kk * 2 - 4, b + kk * 2 - 4, cc, ldc, k - kk);
+ dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, k - kk);
kk -= 2;
}
aa = a + ((m & -4) - 4) * k;
cc = c + ((m & -4) - 4);
- dsolve_4x2_ln_msa(aa + kk * 4 - 16, b + kk * 2 - 8, cc, ldc, k - kk);
+ dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, k - kk);
kk -= 4;
}
aa = a + ((m & -2) - 2) * k + kk * 2;
cc = c + ((m & -2) - 2);
- dsolve_2x1_ln_msa(aa - 4, b + kk - 2, cc, k - kk);
+ dsolve_2x1_ln_msa(aa, b + kk, cc, k - kk);
kk -= 2;
}
aa = a + ((m & -4) - 4) * k;
cc = c + ((m & -4) - 4);
- dsolve_4x1_ln_msa(aa + 4 * kk - 16, b + kk - 4, cc, k - kk);
+ dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, k - kk);
kk -= 4;
}
LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
b += 4;
}
- res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
- res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
- res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
- res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
- res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
- res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
- res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
- res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
- res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8);
- res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8);
- res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9);
- res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9);
- res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10);
- res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10);
- res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11);
- res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11);
+ ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
+ ILVRL_D2_DP(src_c12, src_c8, res_c8, res_c9);
+ ILVRL_D2_DP(src_c13, src_c9, res_c10, res_c11);
+ ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
+ ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
src_a0 = LD_DP(a + 0);
src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
ST_DP(res_c1, b + 4);
ST_DP(res_c9, b + 6);
- src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
- src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
- src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8);
- src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8);
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
+ ILVRL_D2_DP(res_c9, res_c8, src_c8, src_c12);
ST_DP(src_c0, c);
ST_DP(src_c4, c_nxt1line);
ST_DP(res_c3, b + 12);
ST_DP(res_c11, b + 14);
- src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
- src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
- src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10);
- src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
+ ILVRL_D2_DP(res_c11, res_c10, src_c9, src_c13);
src_a36 = LD_DP(a + 36);
src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1);
ST_DP(res_c5, b + 20);
ST_DP(res_c13, b + 22);
- src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
- src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
- src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12);
- src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12);
+ ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
+ ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
src_a63 = __msa_cast_to_vector_double(*(a + 63));
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
ST_DP(res_c7, b + 28);
ST_DP(res_c15, b + 30);
- src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
- src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
- src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14);
- src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14);
+ ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
+ ILVRL_D2_DP(res_c15, res_c14, src_c11, src_c15);
ST_DP(src_c3, c + 6);
ST_DP(src_c7, c_nxt1line + 6);
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
+ v2f64 src_b, src_b0, src_b1;
- for (i = bk; i--;)
+ LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(b);
+
+ a += 8;
+ b += 2;
+
+ for (i = (bk - 1); i--;)
{
- LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
- src_b0 = LD_DP(b);
+ LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
+ src_b1 = LD_DP(b);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
+ src_a0 = src_a4;
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b1;
+
a += 8;
b += 2;
}
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
}
- res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0);
- res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0);
- res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1);
- res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1);
- res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2);
- res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2);
- res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3);
- res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3);
+ ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c5, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c2, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c3, res_c6, res_c7);
src_a0 = LD_DP(a + 0);
src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
ST_DP(res_c2, b + 4);
ST_DP(res_c3, b + 6);
- src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
- src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
- src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
- src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c4);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c5);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c4, src_c5, c + ldc, 2);
ST_DP(res_c6, b + 12);
ST_DP(res_c7, b + 14);
- src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
- src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
- src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
- src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+ ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
+ ILVRL_D2_DP(res_c7, res_c6, src_c3, src_c7);
ST_DP2(src_c2, src_c3, c + 4, 2);
ST_DP2(src_c6, src_c7, c + 4 + ldc, 2);
{
FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
- FLOAT a45, a46, a47, a54, a55, a63;
- FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
+ FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7;
c0 = *(c + 0);
c1 = *(c + 1);
c6 = *(c + 6);
c7 = *(c + 7);
- if (bk > 0)
+ if (bk)
{
- int i;
- FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0;
+ BLASLONG i;
for (i = bk; i--; )
{
- a0 = a[0];
- a1 = a[1];
- a2 = a[2];
- a3 = a[3];
- a4 = a[4];
- a5 = a[5];
- a6 = a[6];
- a7 = a[7];
-
- b0 = b[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
- c2 -= a2 * b0;
- c3 -= a3 * b0;
- c4 -= a4 * b0;
- c5 -= a5 * b0;
- c6 -= a6 * b0;
- c7 -= a7 * b0;
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c2 -= a[2] * b[0];
+ c3 -= a[3] * b[0];
+ c4 -= a[4] * b[0];
+ c5 -= a[5] * b[0];
+ c6 -= a[6] * b[0];
+ c7 -= a[7] * b[0];
a += 8;
b += 1;
LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
}
}
- res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
- res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
- res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
- res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
- res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4);
- res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4);
- res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5);
- res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5);
+ ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
+ ILVRL_D2_DP(src_c6, src_c4, res_c4, res_c5);
+ ILVRL_D2_DP(src_c7, src_c5, res_c6, res_c7);
src_a0 = LD_DP(a + 0);
src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
ST_DP(res_c3, b + 12);
ST_DP(res_c7, b + 14);
- src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
- src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
- src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4);
- src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4);
-
- src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
- src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
- src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6);
- src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6);
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
+ ILVRL_D2_DP(res_c5, res_c4, src_c4, src_c6);
+ ILVRL_D2_DP(res_c7, res_c6, src_c5, src_c7);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c2, src_c3, c + ldc, 2);
LD_DP2(c, 2, src_c0, src_c1);
LD_DP2(c + ldc, 2, src_c2, src_c3);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_b, src_b0;
}
}
- res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0);
- res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0);
- res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1);
- res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1);
+ ILVRL_D2_DP(src_c2, src_c0, res_c0, res_c1);
+ ILVRL_D2_DP(src_c3, src_c1, res_c2, res_c3);
src_a0 = LD_DP(a + 0);
src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1);
ST_DP(res_c2, b + 4);
ST_DP(res_c3, b + 6);
- src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0);
- src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2);
- src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0);
- src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2);
+ ILVRL_D2_DP(res_c1, res_c0, src_c0, src_c2);
+ ILVRL_D2_DP(res_c3, res_c2, src_c1, src_c3);
ST_DP2(src_c0, src_c1, c, 2);
ST_DP2(src_c2, src_c3, c + ldc, 2);
static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
- FLOAT c0, c1, c2, c3;
- FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15;
+ FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3;
c0 = *(c + 0);
c1 = *(c + 1);
c2 = *(c + 2);
c3 = *(c + 3);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- FLOAT a0, a1, a2, a3, b0;
for (i = bk; i--;)
{
- a0 = a[0];
- a1 = a[1];
- a2 = a[2];
- a3 = a[3];
-
- b0 = b[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
- c2 -= a2 * b0;
- c3 -= a3 * b0;
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c2 -= a[2] * b[0];
+ c3 -= a[3] * b[0];
a += 4;
b += 1;
static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT a0, a1, a3;
- FLOAT c0, c1, c0_nxt1, c1_nxt1;
+ FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1;
FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
c0 = *(c + 0);
c0_nxt3 = *(c + 3 * ldc);
c1_nxt3 = *(c + 1 + 3 * ldc);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- FLOAT a0, a1, b0, b1, b2, b3;
for (i = bk; i--;)
{
- a0 = a[0];
- a1 = a[1];
-
- b0 = b[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
-
- b1 = b[1];
- c0_nxt1 -= a0 * b1;
- c1_nxt1 -= a1 * b1;
-
- b2 = b[2];
- c0_nxt2 -= a0 * b2;
- c1_nxt2 -= a1 * b2;
-
- b3 = b[3];
- c0_nxt3 -= a0 * b3;
- c1_nxt3 -= a1 * b3;
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c0_nxt1 -= a[0] * b[1];
+ c1_nxt1 -= a[1] * b[1];
+ c0_nxt2 -= a[0] * b[2];
+ c1_nxt2 -= a[1] * b[2];
+ c0_nxt3 -= a[0] * b[3];
+ c1_nxt3 -= a[1] * b[3];
a += 2;
b += 4;
*(c + 0) = c0;
*(c + 1) = c1;
-
*(c + 0 + ldc) = c0_nxt1;
*(c + 1 + ldc) = c1_nxt1;
-
*(c + 0 + 2 * ldc) = c0_nxt2;
*(c + 1 + 2 * ldc) = c1_nxt2;
-
*(c + 0 + 3 * ldc) = c0_nxt3;
*(c + 1 + 3 * ldc) = c1_nxt3;
}
static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT a0, a1, a3;
- FLOAT c0, c1, c0_nxt, c1_nxt;
+ FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt;
c0 = *(c + 0);
c1 = *(c + 1);
c0_nxt = *(c + ldc);
c1_nxt = *(c + 1 + ldc);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- FLOAT a0, a1, b0, b1;
for (i = bk; i--;)
{
- a0 = a[0];
- a1 = a[1];
-
- b0 = b[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
- b1 = b[1];
- c0_nxt -= a0 * b1;
- c1_nxt -= a1 * b1;
+ c0_nxt -= a[0] * b[1];
+ c1_nxt -= a[1] * b[1];
a += 2;
b += 2;
c0 = *(c + 0);
c1 = *(c + 1);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- FLOAT a0, a1, b0;
for (i = bk; i--;)
{
- a0 = a[0];
- a1 = a[1];
-
- b0 = b[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
a += 2;
b += 1;
static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT a0;
- FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
+ FLOAT c0, c1, c2, c3;
c0 = *(c + 0);
- c0_nxt1 = *(c + 1 * ldc);
- c0_nxt2 = *(c + 2 * ldc);
- c0_nxt3 = *(c + 3 * ldc);
+ c1 = *(c + 1 * ldc);
+ c2 = *(c + 2 * ldc);
+ c3 = *(c + 3 * ldc);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
for (i = bk; i--;)
{
c0 -= a[0] * b[0];
- c0_nxt1 -= a[0] * b[1];
- c0_nxt2 -= a[0] * b[2];
- c0_nxt3 -= a[0] * b[3];
+ c1 -= a[0] * b[1];
+ c2 -= a[0] * b[2];
+ c3 -= a[0] * b[3];
a += 1;
b += 4;
}
}
- a0 = *a;
-
- c0 *= a0;
- c0_nxt1 *= a0;
- c0_nxt2 *= a0;
- c0_nxt3 *= a0;
+ c0 *= *a;
+ c1 *= *a;
+ c2 *= *a;
+ c3 *= *a;
*(c + 0 * ldc) = c0;
- *(c + 1 * ldc) = c0_nxt1;
- *(c + 2 * ldc) = c0_nxt2;
- *(c + 3 * ldc) = c0_nxt3;
+ *(c + 1 * ldc) = c1;
+ *(c + 2 * ldc) = c2;
+ *(c + 3 * ldc) = c3;
*(b + 0) = c0;
- *(b + 1) = c0_nxt1;
- *(b + 2) = c0_nxt2;
- *(b + 3) = c0_nxt3;
+ *(b + 1) = c1;
+ *(b + 2) = c2;
+ *(b + 3) = c3;
}
static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT c0, c0_nxt;
+ FLOAT c0, c1;
c0 = *c;
- c0_nxt = *(c + ldc);
+ c1 = *(c + ldc);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
for (i = bk; i--;)
{
c0 -= *a * b[0];
- c0_nxt -= *a * b[1];
+ c1 -= *a * b[1];
a += 1;
b += 2;
}
c0 *= *a;
- c0_nxt *= *a;
+ c1 *= *a;
*(b + 0) = c0;
- *(b + 1) = c0_nxt;
+ *(b + 1) = c1;
*(c + 0) = c0;
- *(c + ldc) = c0_nxt;
+ *(c + ldc) = c1;
}
static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
- v2f64 src_b0, src_b1, src_b3;
+ v2f64 src_b0, src_b1, src_b3, src_b;
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
- for (i = bk; i--;)
+ LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(b);
+
+ a += 8;
+ b += 2;
+
+ for (i = (bk - 1); i--;)
{
- LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
- src_b0 = LD_DP(b);
+ LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
+ src_b1 = LD_DP(b);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c6 -= src_a2 * src_b;
src_c7 -= src_a3 * src_b;
+ src_a0 = src_a4;
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b1;
+
a += 8;
b += 2;
}
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
}
src_b0 = LD_DP(b + 0);
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_a2, src_a3, src_b;
LD_DP2(c + 2 * ldc, 2, src_c4, src_c5);
LD_DP2(c + 3 * ldc, 2, src_c6, src_c7);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
LD_DP2(c, 2, src_c0, src_c1);
LD_DP2(c + ldc, 2, src_c2, src_c3);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
v2f64 src_a0, src_a1, src_b, src_b0;
static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
- FLOAT b0, c0, c1, c2, c3;
+ FLOAT c0, c1, c2, c3;
c0 = *(c + 0);
c1 = *(c + 1);
c2 = *(c + 2);
c3 = *(c + 3);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- FLOAT a0, a1, a2, a3;
for (i = bk; i--;)
{
- a0 = a[0];
- a1 = a[1];
- a2 = a[2];
- a3 = a[3];
-
- b0 = b[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
- c2 -= a2 * b0;
- c3 -= a3 * b0;
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c2 -= a[2] * b[0];
+ c3 -= a[3] * b[0];
a += 4;
b += 1;
}
}
- b0 = *b;
-
- c0 *= b0;
- c1 *= b0;
- c2 *= b0;
- c3 *= b0;
+ c0 *= *b;
+ c1 *= *b;
+ c2 *= *b;
+ c3 *= *b;
*(a + 0) = c0;
*(a + 1) = c1;
c0_nxt3 = *(c + 0 + 3 * ldc);
c1_nxt3 = *(c + 1 + 3 * ldc);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- FLOAT a0, a1, b0, b1, b2, b3;
for (i = bk; i--;)
{
- a0 = a[0];
- a1 = a[1];
-
- b0 = b[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
-
- b1 = b[1];
- c0_nxt1 -= a0 * b1;
- c1_nxt1 -= a1 * b1;
-
- b2 = b[2];
- c0_nxt2 -= a0 * b2;
- c1_nxt2 -= a1 * b2;
-
- b3 = b[3];
- c0_nxt3 -= a0 * b3;
- c1_nxt3 -= a1 * b3;
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
+ c0_nxt1 -= a[0] * b[1];
+ c1_nxt1 -= a[1] * b[1];
+ c0_nxt2 -= a[0] * b[2];
+ c1_nxt2 -= a[1] * b[2];
+ c0_nxt3 -= a[0] * b[3];
+ c1_nxt3 -= a[1] * b[3];
a += 2;
b += 4;
*(a + 7) = c1_nxt3;
*(c + 0) = c0;
- *(c + 1 * ldc) = c0_nxt1;
- *(c + 2 * ldc) = c0_nxt2;
- *(c + 3 * ldc) = c0_nxt3;
-
*(c + 1) = c1;
+ *(c + 1 * ldc) = c0_nxt1;
*(c + 1 + 1 * ldc) = c1_nxt1;
+ *(c + 2 * ldc) = c0_nxt2;
*(c + 1 + 2 * ldc) = c1_nxt2;
+ *(c + 3 * ldc) = c0_nxt3;
*(c + 1 + 3 * ldc) = c1_nxt3;
}
c0 = *(c + 0);
c1 = *(c + 1);
-
c0_nxt = *(c + 0 + ldc);
c1_nxt = *(c + 1 + ldc);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- FLOAT a0, a1, b0, b1;
for (i = bk; i--;)
{
- a0 = a[0];
- a1 = a[1];
-
- b0 = b[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
- b1 = b[1];
- c0_nxt -= a0 * b1;
- c1_nxt -= a1 * b1;
+ c0_nxt -= a[0] * b[1];
+ c1_nxt -= a[1] * b[1];
a += 2;
b += 2;
*(c + 0) = c0;
*(c + 1) = c1;
- *(c + ldc) = c0_nxt;
+ *(c + 0 + ldc) = c0_nxt;
*(c + 1 + ldc) = c1_nxt;
}
c0 = *(c + 0);
c1 = *(c + 1);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
- FLOAT a0, a1, b0;
for (i = bk; i--;)
{
- a0 = a[0];
- a1 = a[1];
-
- b0 = b[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
+ c0 -= a[0] * b[0];
+ c1 -= a[1] * b[0];
a += 2;
b += 1;
static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15;
- FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
+ FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3;
c0 = *(c + 0);
- c0_nxt1 = *(c + 1 * ldc);
- c0_nxt2 = *(c + 2 * ldc);
- c0_nxt3 = *(c + 3 * ldc);
+ c1 = *(c + 1 * ldc);
+ c2 = *(c + 2 * ldc);
+ c3 = *(c + 3 * ldc);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
for (i = bk; i--;)
{
c0 -= a[0] * b[0];
- c0_nxt1 -= a[0] * b[1];
- c0_nxt2 -= a[0] * b[2];
- c0_nxt3 -= a[0] * b[3];
+ c1 -= a[0] * b[1];
+ c2 -= a[0] * b[2];
+ c3 -= a[0] * b[3];
a += 1;
b += 4;
c0 *= b0;
- c0_nxt1 -= c0 * b1;
- c0_nxt1 *= b5;
+ c1 -= c0 * b1;
+ c1 *= b5;
- c0_nxt2 -= c0 * b2;
- c0_nxt2 -= c0_nxt1 * b6;
- c0_nxt2 *= b10;
+ c2 -= c0 * b2;
+ c2 -= c1 * b6;
+ c2 *= b10;
- c0_nxt3 -= c0 * b3;
- c0_nxt3 -= c0_nxt1 * b7;
- c0_nxt3 -= c0_nxt2 * b11;
- c0_nxt3 *= b15;
+ c3 -= c0 * b3;
+ c3 -= c1 * b7;
+ c3 -= c2 * b11;
+ c3 *= b15;
*(a + 0) = c0;
- *(a + 1) = c0_nxt1;
- *(a + 2) = c0_nxt2;
- *(a + 3) = c0_nxt3;
+ *(a + 1) = c1;
+ *(a + 2) = c2;
+ *(a + 3) = c3;
*(c + 0) = c0;
- *(c + 1 * ldc) = c0_nxt1;
- *(c + 2 * ldc) = c0_nxt2;
- *(c + 3 * ldc) = c0_nxt3;
+ *(c + 1 * ldc) = c1;
+ *(c + 2 * ldc) = c2;
+ *(c + 3 * ldc) = c3;
}
static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT b0, b1, b3, c0, c0_nxt;
+ FLOAT b0, b1, b3, c0, c1;
c0 = *c;
- c0_nxt = *(c + ldc);
+ c1 = *(c + ldc);
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
for (i = bk; i--;)
{
c0 -= *a * b[0];
- c0_nxt -= *a * b[1];
+ c1 -= *a * b[1];
a += 1;
b += 2;
c0 *= b0;
- c0_nxt -= c0 * b1;
- c0_nxt *= b3;
+ c1 -= c0 * b1;
+ c1 *= b3;
*(a + 0) = c0;
- *(a + 1) = c0_nxt;
+ *(a + 1) = c1;
*(c + 0) = c0;
- *(c + ldc) = c0_nxt;
+ *(c + ldc) = c1;
}
static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
- if (bk > 0)
+ if (bk)
{
BLASLONG i;
ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
-static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk)
+static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 src_b0, src_b2, src_b3;
if (bk > 0)
{
- v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0;
+ BLASLONG i;
+ FLOAT *pba = a, *pbb = b;
+ v2f64 src_b, src_b1, src_a0, src_a1, src_a2, src_a3;
+ v2f64 src_a4, src_a5, src_a6, src_a7;
+
+ LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(pbb);
- LD_DP4(a + 16, 2, src_a0, src_a1, src_a2, src_a3);
- src_b0 = LD_DP(b + 4);
+ for (i = bk - 1; i--;)
+ {
+ pba += 8;
+ pbb += 2;
+
+ LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
+ src_b1 = LD_DP(pbb);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_a0 = src_a4;
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b1;
+ }
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c7 -= src_a3 * src_b;
}
+ a -= 16;
+ b -= 4;
+
src_b0 = __msa_cast_to_vector_double(*(b + 0));
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b2 = LD_DP(b + 2);
ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2);
}
-static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
+static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3;
v2f64 src_b0;
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
+ v2f64 src_b1;
+
+ LD_DP4(aa, 2, src_a0, src_a1, src_a2, src_a3);
+ src_b0 = LD_DP(bb);
+
+ aa += 8;
+ bb += 1;
+
+ for (i = (bk - 1); i--;)
+ {
+ LD_DP4(aa, 2, src_a4, src_a5, src_a6, src_a7);
+ src_b1 = LD_DP(bb);
+
+ src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a2 * src_b0;
+ src_c3 -= src_a3 * src_b0;
+
+ src_a0 = src_a4;
+ src_a1 = src_a5;
+ src_a2 = src_a6;
+ src_a3 = src_a7;
+ src_b0 = src_b1;
+
+ aa += 8;
+ bb += 1;
+ }
+
+ src_b0 = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a2 * src_b0;
+ src_c3 -= src_a3 * src_b0;
+ }
+
+ a -= 8;
+ b -= 1;
+
src_b0 = __msa_cast_to_vector_double(*b);
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
if (bk > 0)
{
BLASLONG i;
- FLOAT *aa = a + 16, *bb = b + 16;
+ FLOAT *aa = a, *bb = b;
v2f64 src_a0, src_a1, src_b, src_b0, src_b1;
for (i = bk; i--;)
}
}
+ a -= 16;
+ b -= 16;
+
src_b12 = LD_DP(b + 12);
src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1);
src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0);
ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
-static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk)
+static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
if (bk > 0)
{
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
v2f64 src_a0, src_a1, src_b, src_b0;
- LD_DP2(a + 8, 2, src_a0, src_a1);
- src_b0 = LD_DP(b + 4);
+ for (i = bk; i--;)
+ {
+ LD_DP2(aa, 2, src_a0, src_a1);
+ src_b0 = LD_DP(bb);
- src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
- src_c0 -= src_a0 * src_b;
- src_c1 -= src_a1 * src_b;
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
- src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
- src_c2 -= src_a0 * src_b;
- src_c3 -= src_a1 * src_b;
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c2 -= src_a0 * src_b;
+ src_c3 -= src_a1 * src_b;
+
+ aa += 4;
+ bb += 2;
+ }
}
+ a -= 8;
+ b -= 4;
+
src_b0 = __msa_cast_to_vector_double(*(b + 0));
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b2 = LD_DP(b + 2);
ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2);
}
-static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
+static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
FLOAT b0, c0, c1, c2, c3;
- b0 = *(b + 0);
-
c0 = *(c + 0);
c1 = *(c + 1);
c2 = *(c + 2);
c3 = *(c + 3);
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+
+ for (i = bk; i--;)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c2 -= aa[2] * bb[0];
+ c3 -= aa[3] * bb[0];
+
+ aa += 4;
+ bb += 1;
+ }
+ }
+
+ a -= 4;
+
+ b0 = *(b - 1);
+
c0 *= b0;
c1 *= b0;
c2 *= b0;
if (bk > 0)
{
BLASLONG i;
- FLOAT *aa = a + 8, *bb = b + 16;
- FLOAT a0, a1, b0, b1, b2, b3;
+ FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
- a0 = aa[0];
- a1 = aa[1];
-
- b0 = bb[0];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
-
- b1 = bb[1];
- c0_nxt1 -= a0 * b1;
- c1_nxt1 -= a1 * b1;
-
- b2 = bb[2];
- c0_nxt2 -= a0 * b2;
- c1_nxt2 -= a1 * b2;
-
- b3 = bb[3];
- c0_nxt3 -= a0 * b3;
- c1_nxt3 -= a1 * b3;
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+ c0_nxt1 -= aa[0] * bb[1];
+ c1_nxt1 -= aa[1] * bb[1];
+ c0_nxt2 -= aa[0] * bb[2];
+ c1_nxt2 -= aa[1] * bb[2];
+ c0_nxt3 -= aa[0] * bb[3];
+ c1_nxt3 -= aa[1] * bb[3];
aa += 2;
bb += 4;
}
}
+ a -= 8;
+ b -= 16;
+
b0 = *b;
b4 = *(b + 4);
b5 = *(b + 5);
*(c + 0) = c0;
*(c + 1) = c1;
-
*(c + 0 + 1 * ldc) = c0_nxt1;
*(c + 1 + 1 * ldc) = c1_nxt1;
-
*(c + 0 + 2 * ldc) = c0_nxt2;
*(c + 1 + 2 * ldc) = c1_nxt2;
-
*(c + 0 + 3 * ldc) = c0_nxt3;
*(c + 1 + 3 * ldc) = c1_nxt3;
}
-static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk)
+static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT b0, b2, b3;
- FLOAT c0, c1, c0_nxt, c1_nxt;
+ FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt;
c0 = *(c + 0);
c1 = *(c + 1);
-
c0_nxt = *(c + 0 + ldc);
c1_nxt = *(c + 1 + ldc);
if (bk > 0)
{
- FLOAT a0, a1, b0, b1;
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
- a0 = a[4];
- a1 = a[5];
+ for (i = bk; i--;)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
- b0 = b[4];
- c0 -= a0 * b0;
- c1 -= a1 * b0;
+ c0_nxt -= aa[0] * bb[1];
+ c1_nxt -= aa[1] * bb[1];
- b1 = b[5];
- c0_nxt -= a0 * b1;
- c1_nxt -= a1 * b1;
+ aa += 2;
+ bb += 2;
+ }
}
+ a -= 4;
+ b -= 4;
+
b3 = *(b + 3);
b2 = *(b + 2);
b0 = *b;
*(c + 1 + ldc) = c1_nxt;
}
-static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c)
+static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
{
FLOAT b0, c0, c1;
c0 = *(c + 0);
c1 = *(c + 1);
- b0 = *b;
+ if (bk > 0)
+ {
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+
+ for (i = bk; i--;)
+ {
+ c0 -= aa[0] * bb[0];
+ c1 -= aa[1] * bb[0];
+
+ aa += 2;
+ bb += 1;
+ }
+ }
+
+ b0 = *(b - 1);
c0 *= b0;
c1 *= b0;
- *(a + 0) = c0;
- *(a + 1) = c1;
+ *(a - 2) = c0;
+ *(a - 1) = c1;
*(c + 0) = c0;
*(c + 1) = c1;
static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
- FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3;
+ FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15, c0, c1, c2, c3;
c0 = *(c + 0);
- c0_nxt1 = *(c + 1 * ldc);
- c0_nxt2 = *(c + 2 * ldc);
- c0_nxt3 = *(c + 3 * ldc);
+ c1 = *(c + 1 * ldc);
+ c2 = *(c + 2 * ldc);
+ c3 = *(c + 3 * ldc);
if (bk > 0)
{
BLASLONG i;
- FLOAT *aa = a + 4, *bb = b + 16;
+ FLOAT *aa = a, *bb = b;
for (i = bk; i--;)
{
c0 -= aa[0] * bb[0];
- c0_nxt1 -= aa[0] * bb[1];
- c0_nxt2 -= aa[0] * bb[2];
- c0_nxt3 -= aa[0] * bb[3];
+ c1 -= aa[0] * bb[1];
+ c2 -= aa[0] * bb[2];
+ c3 -= aa[0] * bb[3];
aa += 1;
bb += 4;
}
}
+ a -= 4;
+ b -= 16;
+
b0 = *b;
b4 = *(b + 4);
b5 = *(b + 5);
b14 = *(b + 14);
b15 = *(b + 15);
- c0_nxt3 *= b15;
+ c3 *= b15;
- c0_nxt2 -= c0_nxt3 * b14;
- c0_nxt2 *= b10;
+ c2 -= c3 * b14;
+ c2 *= b10;
- c0_nxt1 -= c0_nxt3 * b13;
- c0_nxt1 -= c0_nxt2 * b9;
- c0_nxt1 *= b5;
+ c1 -= c3 * b13;
+ c1 -= c2 * b9;
+ c1 *= b5;
- c0 -= c0_nxt3 * b12;
- c0 -= c0_nxt2 * b8;
- c0 -= c0_nxt1 * b4;
+ c0 -= c3 * b12;
+ c0 -= c2 * b8;
+ c0 -= c1 * b4;
c0 *= b0;
*(a + 0) = c0;
- *(a + 1) = c0_nxt1;
- *(a + 2) = c0_nxt2;
- *(a + 3) = c0_nxt3;
-
- *(c) = c0;
- *(c + 1 * ldc) = c0_nxt1;
- *(c + 2 * ldc) = c0_nxt2;
- *(c + 3 * ldc) = c0_nxt3;
+ *(a + 1) = c1;
+ *(a + 2) = c2;
+ *(a + 3) = c3;
+
+ *(c + 0 * ldc) = c0;
+ *(c + 1 * ldc) = c1;
+ *(c + 2 * ldc) = c2;
+ *(c + 3 * ldc) = c3;
}
static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- FLOAT b0, b2, b3, c0, c0_nxt;
+ FLOAT b0, b2, b3, c0, c1;
c0 = *(c + 0);
- c0_nxt = *(c + ldc);
+ c1 = *(c + ldc);
if (bk > 0)
{
- c0 -= a[2] * b[4];
- c0_nxt -= a[2] * b[5];
+ BLASLONG i;
+ FLOAT *aa = a, *bb = b;
+
+ for (i = bk; i--;)
+ {
+ c0 -= *aa * bb[0];
+ c1 -= *aa * bb[1];
+
+ aa += 1;
+ bb += 2;
+ }
}
+ a -= 2;
+ b -= 4;
+
b3 = *(b + 3);
b2 = *(b + 2);
b0 = *b;
- c0_nxt *= b3;
+ c1 *= b3;
- c0 -= c0_nxt * b2;
+ c0 -= c1 * b2;
c0 *= b0;
*(a + 0) = c0;
- *(a + 1) = c0_nxt;
+ *(a + 1) = c1;
*(c + 0) = c0;
- *(c + ldc) = c0_nxt;
+ *(c + ldc) = c1;
+}
+
+static void dsolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
+{
+ if (bk > 0)
+ {
+ BLASLONG i;
+
+ for (i = 0; i < bk; i++)
+ {
+ *c -= a[i] * b[i];
+ }
+ }
+
+ *c *= *(b - 1);
+ *(a - 1) = *c;
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
aa = a;
c -= ldc;
b -= k;
- bb = b + (kk - 1);
+ bb = b + kk;
cc = c;
for (i = (m >> 3); i--;)
{
- dsolve_8x1_rt_msa(aa + 8 * kk - 8, bb, cc);
+ dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, k - kk);
aa += 8 * k;
cc += 8;
{
if (m & 4)
{
- dsolve_4x1_rt_msa(aa + 4 * kk - 4, bb, cc);
+ dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, k - kk);
aa += 4 * k;
cc += 4;
if (m & 2)
{
- dsolve_2x1_rt_msa(aa + 2 * kk - 2, bb, cc);
+ dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, k - kk);
aa += 2 * k;
cc += 2;
if (m & 1)
{
- *cc *= *bb;
- *(aa + kk - 1) = *cc;
+ dsolve_1x1_rt_msa(aa + kk, bb, cc, k - kk);
aa += k;
cc += 1;
for (i = (m >> 3); i--;)
{
- dsolve_8x2_rt_msa(aa + 8 * kk - 16, bb - 4, cc, ldc, k - kk);
+ dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, k - kk);
aa += 8 * k;
cc += 8;
{
if (m & 4)
{
- dsolve_4x2_rt_msa(aa + 4 * kk - 8, bb - 4, cc, ldc, k - kk);
+ dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, k - kk);
aa += 4 * k;
cc += 4;
if (m & 2)
{
- dsolve_2x2_rt_msa(aa + 2 * kk - 4, bb - 4, cc, ldc, k - kk);
+ dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, k - kk);
aa += 2 * k;
cc += 2;
if (m & 1)
{
- dsolve_1x2_rt_msa(aa + kk - 2, bb - 4, cc, ldc, k - kk);
+ dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, k - kk);
+
+ aa += k;
+ cc += 1;
}
}
{
if (m & 4)
{
- dsolve_4x4_rt_msa(aa + kk * 4 - 16, bb - 16, cc, ldc, k - kk);
+ dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, k - kk);
aa += 4 * k;
cc += 4;
if (m & 2)
{
- dsolve_2x4_rt_msa(aa + kk * 2 - 8, bb - 16, cc, ldc, k - kk);
+ dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, k - kk);
aa += 2 * k;
cc += 2;
if (m & 1)
{
- dsolve_1x4_rt_msa(aa + kk - 4, bb - 16, cc, ldc, k - kk);
+ dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, k - kk);
aa += k;
cc += 1;
ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
+/* Description : Interleave both left and right half of input vectors
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of double word elements from 'in0' and 'in1' are
+ interleaved and written to 'out0'; left half of double word
+ elements from 'in0' and 'in1' are interleaved and written to 'out1'
+*/
+#define ILVRL_D2(RTYPE, in0, in1, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
+ out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
+}
+#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
+
#endif /* __MACROS_MSA_H__ */