#include "common.h"
#include "macros_msa.h"
-static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+static __attribute__ ((noinline))
+void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
FLOAT *c_nxt2line = c + 2 * ldc;
FLOAT *c_nxt3line = c + 3 * ldc;
+#ifdef ENABLE_PREFETCH
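+ /* Warm the region just below a for the backward (LN) solve phase;
+ the update loop itself streams forward and is prefetched in-loop. */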
+ __asm__ __volatile__(
+ "pref 0, -96(%[a]) \n\t"
+ "pref 0, -32(%[a]) \n\t"
+ "pref 0, -160(%[a]) \n\t"
+ "pref 0, -224(%[a]) \n\t"
+ "pref 0, -64(%[a]) \n\t"
+ "pref 0, -128(%[a]) \n\t"
+ "pref 0, -192(%[a]) \n\t"
+ "pref 0, -256(%[a]) \n\t"
+ "pref 0, -320(%[a]) \n\t"
+ "pref 0, -384(%[a]) \n\t"
+ "pref 0, -448(%[a]) \n\t"
+ "pref 0, -512(%[a]) \n\t"
+
+ :
+ : [a] "r"(a)
+ );
+#endif
+
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
FLOAT *pba = a, *pbb = b;
v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
- LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
- LD_DP2(pbb, 2, src_b0, src_b1);
+ LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pbb, 2, src_b0, src_b1);
- for (i = (bk - 1); i--;)
+ for (i = ((bk - 1) >> 1); i--;)
{
- pba += 8;
- pbb += 4;
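+ /* 2x-unrolled, software-pipelined body: compute with the slice loaded
+ before the loop (or on the previous trip) while fetching the next two. */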
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 128(%[pba]) \n\t"
+ "pref 0, 160(%[pba]) \n\t"
+ "pref 0, 192(%[pba]) \n\t"
+ "pref 0, 224(%[pba]) \n\t"
- LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17);
- LD_DP2(pbb, 2, src_b2, src_b3);
+ :
+ : [pba] "r"(pba)
+ );
+#endif
+
+ LD_DP4_INC(pba, 2, src_a8, src_a9, src_a16, src_a17);
+ LD_DP2_INC(pbb, 2, src_b2, src_b3);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
- src_a0 = src_a8;
- src_a1 = src_a9;
- src_a2 = src_a16;
- src_a3 = src_a17;
- src_b0 = src_b2;
- src_b1 = src_b3;
+ LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pbb, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
+ src_c0 -= src_a8 * src_b;
+ src_c1 -= src_a9 * src_b;
+ src_c2 -= src_a16 * src_b;
+ src_c3 -= src_a17 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
+ src_c4 -= src_a8 * src_b;
+ src_c5 -= src_a9 * src_b;
+ src_c6 -= src_a16 * src_b;
+ src_c7 -= src_a17 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
+ src_c8 -= src_a8 * src_b;
+ src_c9 -= src_a9 * src_b;
+ src_c10 -= src_a16 * src_b;
+ src_c11 -= src_a17 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
+ src_c12 -= src_a8 * src_b;
+ src_c13 -= src_a9 * src_b;
+ src_c14 -= src_a16 * src_b;
+ src_c15 -= src_a17 * src_b;
+ }
+
+ if ((bk - 1) & 1)
+ {
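+ /* bk - 1 is odd: consume the still-pending slice; the trailing loads
+ keep one slice preloaded for the epilogue below. */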
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pbb, 2, src_b0, src_b1);
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
bb = b + 4 * kk;
cc = c + (m - 1);
- dsolve_1x4_ln_msa(aa, bb, cc, ldc, k - kk);
+ dsolve_1x4_ln_msa(aa, bb, cc, ldc, (k - kk));
kk -= 1;
}
bb = b + 4 * kk;
cc = c + ((m & -2) - 2);
- dsolve_2x4_ln_msa(aa, bb, cc, ldc, k - kk);
+ dsolve_2x4_ln_msa(aa, bb, cc, ldc, (k - kk));
kk -= 2;
}
bb = b + 4 * kk;
cc = c + ((m & -4) - 4);
- dsolve_4x4_ln_msa(aa, bb, cc, ldc, k - kk);
+ dsolve_4x4_ln_msa(aa, bb, cc, ldc, (k - kk));
kk -= 4;
}
do
{
- dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, k - kk);
+ dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
aa -= 8 * k;
cc -= 8;
aa = a + ((m & -2) - 2) * k;
cc = c + ((m & -2) - 2);
- dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, k - kk);
+ dsolve_2x2_ln_msa(aa + kk * 2, b + kk * 2, cc, ldc, (k - kk));
kk -= 2;
}
aa = a + ((m & -4) - 4) * k;
cc = c + ((m & -4) - 4);
- dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, k - kk);
+ dsolve_4x2_ln_msa(aa + kk * 4, b + kk * 2, cc, ldc, (k - kk));
kk -= 4;
}
do
{
- dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, k - kk);
+ dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, (k - kk));
aa -= 8 * k;
cc -= 8;
aa = a + ((m & -2) - 2) * k + kk * 2;
cc = c + ((m & -2) - 2);
- dsolve_2x1_ln_msa(aa, b + kk, cc, k - kk);
+ dsolve_2x1_ln_msa(aa, b + kk, cc, (k - kk));
kk -= 2;
}
aa = a + ((m & -4) - 4) * k;
cc = c + ((m & -4) - 4);
- dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, k - kk);
+ dsolve_4x1_ln_msa(aa + 4 * kk, b + kk, cc, (k - kk));
kk -= 4;
}
do
{
- dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, k - kk);
+ dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, (k - kk));
aa -= 8 * k;
cc -= 8;
#include "common.h"
#include "macros_msa.h"
-static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+static __attribute__ ((noinline))
+void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
FLOAT *c_nxt2line = c + 2 * ldc;
FLOAT *c_nxt3line = c + 3 * ldc;
+#ifdef ENABLE_PREFETCH
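+ /* The LT solve phase consumes the block at the end of the packed A
+ panel, so step a forward temporarily and issue the hints there. */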
+ a += bk * 8;
+ __asm__ __volatile__(
+ "pref 0, (%[a]) \n\t"
+ "pref 0, 32(%[a]) \n\t"
+ "pref 0, 72(%[a]) \n\t"
+ "pref 0, 104(%[a]) \n\t"
+ "pref 0, 144(%[a]) \n\t"
+ "pref 0, 176(%[a]) \n\t"
+ "pref 0, 216(%[a]) \n\t"
+ "pref 0, 248(%[a]) \n\t"
+ "pref 0, 288(%[a]) \n\t"
+ "pref 0, 360(%[a]) \n\t"
+ "pref 0, 504(%[a]) \n\t"
+ "pref 0, 432(%[a]) \n\t"
+
+ :
+ : [a] "r"(a)
+ );
+ a -= bk * 8;
+#endif
+
LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3);
LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7);
LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11);
BLASLONG i;
v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
- LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
- LD_DP2(b, 2, src_b0, src_b1);
+ LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(b, 2, src_b0, src_b1);
- for (i = (bk - 1); i--;)
+ for (i = ((bk - 1) >> 1); i--;)
{
- a += 8;
- b += 4;
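+ /* Same pipelined 2x-unrolled update pattern as the LN kernel. */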
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 128(%[a]) \n\t"
+ "pref 0, 160(%[a]) \n\t"
+ "pref 0, 192(%[a]) \n\t"
+ "pref 0, 224(%[a]) \n\t"
- LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
- LD_DP2(b, 2, src_b2, src_b3);
+ :
+ : [a] "r"(a)
+ );
+#endif
+
+ LD_DP4_INC(a, 2, src_a4, src_a5, src_a6, src_a7);
+ LD_DP2_INC(b, 2, src_b2, src_b3);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
- src_a0 = src_a4;
- src_a1 = src_a5;
- src_a2 = src_a6;
- src_a3 = src_a7;
- src_b0 = src_b2;
- src_b1 = src_b3;
+ LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(b, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
+ src_c0 -= src_a4 * src_b;
+ src_c1 -= src_a5 * src_b;
+ src_c2 -= src_a6 * src_b;
+ src_c3 -= src_a7 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
+ src_c4 -= src_a4 * src_b;
+ src_c5 -= src_a5 * src_b;
+ src_c6 -= src_a6 * src_b;
+ src_c7 -= src_a7 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
+ src_c8 -= src_a4 * src_b;
+ src_c9 -= src_a5 * src_b;
+ src_c10 -= src_a6 * src_b;
+ src_c11 -= src_a7 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
+ src_c12 -= src_a4 * src_b;
+ src_c13 -= src_a5 * src_b;
+ src_c14 -= src_a6 * src_b;
+ src_c15 -= src_a7 * src_b;
+ }
+
+ if ((bk - 1) & 1)
+ {
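+ /* Odd leftover bk step; one slice stays preloaded for the epilogue. */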
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(b, 2, src_b0, src_b1);
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c13 -= src_a1 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
-
- a += 8;
- b += 4;
}
ILVRL_D2_DP(src_c4, src_c0, res_c0, res_c1);
#include "common.h"
#include "macros_msa.h"
-static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+static __attribute__ ((noinline))
+void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
v2f64 src_b;
- LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3);
- LD_DP2(b, 2, src_b0, src_b1);
+ LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(b, 2, src_b0, src_b1);
- for (i = (bk - 1); i--;)
+ for (i = ((bk - 1) >> 1); i--;)
{
- a += 8;
- b += 4;
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 128(%[a]) \n\t"
+ "pref 0, 160(%[a]) \n\t"
+ "pref 0, 192(%[a]) \n\t"
+ "pref 0, 224(%[a]) \n\t"
- LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7);
- LD_DP2(b, 2, src_b2, src_b3);
+ :
+ : [a] "r"(a)
+ );
+#endif
+
+ LD_DP4_INC(a, 2, src_a4, src_a5, src_a6, src_a7);
+ LD_DP2_INC(b, 2, src_b2, src_b3);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
- src_a0 = src_a4;
- src_a1 = src_a5;
- src_a2 = src_a6;
- src_a3 = src_a7;
- src_b0 = src_b2;
- src_b1 = src_b3;
+ LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(b, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
+ src_c0 -= src_a4 * src_b;
+ src_c1 -= src_a5 * src_b;
+ src_c2 -= src_a6 * src_b;
+ src_c3 -= src_a7 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
+ src_c4 -= src_a4 * src_b;
+ src_c5 -= src_a5 * src_b;
+ src_c6 -= src_a6 * src_b;
+ src_c7 -= src_a7 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
+ src_c8 -= src_a4 * src_b;
+ src_c9 -= src_a5 * src_b;
+ src_c10 -= src_a6 * src_b;
+ src_c11 -= src_a7 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
+ src_c12 -= src_a4 * src_b;
+ src_c13 -= src_a5 * src_b;
+ src_c14 -= src_a6 * src_b;
+ src_c15 -= src_a7 * src_b;
+ }
+
+ if ((bk - 1) & 1)
+ {
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ LD_DP4_INC(a, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(b, 2, src_b0, src_b1);
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c13 -= src_a1 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
-
- a += 8;
- b += 4;
}
src_b0 = LD_DP(b + 0);
#include "common.h"
#include "macros_msa.h"
-static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
+static __attribute__ ((noinline))
+void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v2f64 src_b, src_b0, src_b1, src_b2, src_b3;
v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
- LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3);
- LD_DP2(pbb, 2, src_b0, src_b1);
+ LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pbb, 2, src_b0, src_b1);
- for (i = (bk - 1); i--;)
+ for (i = ((bk - 1) >> 1); i--;)
{
- pba += 8;
- pbb += 4;
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 128(%[pba]) \n\t"
+ "pref 0, 160(%[pba]) \n\t"
+ "pref 0, 192(%[pba]) \n\t"
+ "pref 0, 224(%[pba]) \n\t"
+
+ :
+ : [pba] "r"(pba)
+ );
+#endif
+ LD_DP4_INC(pba, 2, src_a4, src_a5, src_a6, src_a7);
+ LD_DP2_INC(pbb, 2, src_b2, src_b3);
- LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7);
- LD_DP2(pbb, 2, src_b2, src_b3);
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c0 -= src_a0 * src_b;
+ src_c1 -= src_a1 * src_b;
+ src_c2 -= src_a2 * src_b;
+ src_c3 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
+ src_c4 -= src_a0 * src_b;
+ src_c5 -= src_a1 * src_b;
+ src_c6 -= src_a2 * src_b;
+ src_c7 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c8 -= src_a0 * src_b;
+ src_c9 -= src_a1 * src_b;
+ src_c10 -= src_a2 * src_b;
+ src_c11 -= src_a3 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
+ src_c12 -= src_a0 * src_b;
+ src_c13 -= src_a1 * src_b;
+ src_c14 -= src_a2 * src_b;
+ src_c15 -= src_a3 * src_b;
+
+ LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pbb, 2, src_b0, src_b1);
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b2, (v2i64) src_b2);
+ src_c0 -= src_a4 * src_b;
+ src_c1 -= src_a5 * src_b;
+ src_c2 -= src_a6 * src_b;
+ src_c3 -= src_a7 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b2, (v2i64) src_b2);
+ src_c4 -= src_a4 * src_b;
+ src_c5 -= src_a5 * src_b;
+ src_c6 -= src_a6 * src_b;
+ src_c7 -= src_a7 * src_b;
+
+ src_b = (v2f64) __msa_ilvr_d((v2i64) src_b3, (v2i64) src_b3);
+ src_c8 -= src_a4 * src_b;
+ src_c9 -= src_a5 * src_b;
+ src_c10 -= src_a6 * src_b;
+ src_c11 -= src_a7 * src_b;
+
+ src_b = (v2f64) __msa_ilvl_d((v2i64) src_b3, (v2i64) src_b3);
+ src_c12 -= src_a4 * src_b;
+ src_c13 -= src_a5 * src_b;
+ src_c14 -= src_a6 * src_b;
+ src_c15 -= src_a7 * src_b;
+ }
+
+ if ((bk - 1) & 1)
+ {
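+ /* Odd leftover bk step: consume the pending slice, then reload for the epilogue. */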
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
src_c0 -= src_a0 * src_b;
src_c1 -= src_a1 * src_b;
src_c14 -= src_a2 * src_b;
src_c15 -= src_a3 * src_b;
- src_a0 = src_a4;
- src_a1 = src_a5;
- src_a2 = src_a6;
- src_a3 = src_a7;
- src_b0 = src_b2;
- src_b1 = src_b3;
+ LD_DP4_INC(pba, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pbb, 2, src_b0, src_b1);
}
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
for (i = (m >> 3); i--;)
{
- dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, k - kk);
+ dsolve_8x1_rt_msa(aa + 8 * kk, bb, cc, (k - kk));
aa += 8 * k;
cc += 8;
{
if (m & 4)
{
- dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, k - kk);
+ dsolve_4x1_rt_msa(aa + 4 * kk, bb, cc, (k - kk));
aa += 4 * k;
cc += 4;
if (m & 2)
{
- dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, k - kk);
+ dsolve_2x1_rt_msa(aa + 2 * kk, bb, cc, (k - kk));
aa += 2 * k;
cc += 2;
if (m & 1)
{
- dsolve_1x1_rt_msa(aa + kk, bb, cc, k - kk);
+ dsolve_1x1_rt_msa(aa + kk, bb, cc, (k - kk));
aa += k;
cc += 1;
for (i = (m >> 3); i--;)
{
- dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, k - kk);
+ dsolve_8x2_rt_msa(aa + 8 * kk, bb, cc, ldc, (k - kk));
aa += 8 * k;
cc += 8;
{
if (m & 4)
{
- dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, k - kk);
+ dsolve_4x2_rt_msa(aa + 4 * kk, bb, cc, ldc, (k - kk));
aa += 4 * k;
cc += 4;
if (m & 2)
{
- dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, k - kk);
+ dsolve_2x2_rt_msa(aa + 2 * kk, bb, cc, ldc, (k - kk));
aa += 2 * k;
cc += 2;
if (m & 1)
{
- dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, k - kk);
+ dsolve_1x2_rt_msa(aa + kk, bb, cc, ldc, (k - kk));
aa += k;
cc += 1;
for (i = (m >> 3); i--;)
{
- dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, k - kk);
+ dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, (k - kk));
aa += 8 * k;
cc += 8;
{
if (m & 4)
{
- dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, k - kk);
+ dsolve_4x4_rt_msa(aa + kk * 4, bb, cc, ldc, (k - kk));
aa += 4 * k;
cc += 4;
if (m & 2)
{
- dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, k - kk);
+ dsolve_2x4_rt_msa(aa + kk * 2, bb, cc, ldc, (k - kk));
aa += 2 * k;
cc += 2;
if (m & 1)
{
- dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, k - kk);
+ dsolve_1x4_rt_msa(aa + kk, bb, cc, ldc, (k - kk));
aa += k;
cc += 1;
#include <msa.h>
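+/* Gates the inline "pref" cache-hint blocks in the MSA trsm kernels;
+   comment out to disable software prefetching. */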
+#define ENABLE_PREFETCH
+
#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_SP(...) LD_W(v4f32, __VA_ARGS__)
static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- BLASLONG k;
- FLOAT *aa = a, *bb = b;
- v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_a1;
v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
LD_SP2(c_nxt6line, 4, src_c12, src_c13);
LD_SP2(c_nxt7line, 4, src_c14, src_c15);
- for (k = 0; k < bk; k++)
+ if (bk > 0)
{
- LD_SP2(aa, 4, src_a0, src_a1);
-
- src_b = LD_SP(bb + 0);
- SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
- src_c0 -= src_a0 * src_b0;
- src_c1 -= src_a1 * src_b0;
- src_c2 -= src_a0 * src_b1;
- src_c3 -= src_a1 * src_b1;
- src_c4 -= src_a0 * src_b2;
- src_c5 -= src_a1 * src_b2;
- src_c6 -= src_a0 * src_b3;
- src_c7 -= src_a1 * src_b3;
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ v4f32 src_bb0, src_bb1, src_b0, src_b1, src_b2, src_b3, src_a1;
- src_b = LD_SP(bb + 4);
- SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
- src_c8 -= src_a0 * src_b0;
- src_c9 -= src_a1 * src_b0;
- src_c10 -= src_a0 * src_b1;
- src_c11 -= src_a1 * src_b1;
- src_c12 -= src_a0 * src_b2;
- src_c13 -= src_a1 * src_b2;
- src_c14 -= src_a0 * src_b3;
- src_c15 -= src_a1 * src_b3;
+ for (k = 0; k < (bk >> 1); k++)
+ {
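+ /* Two bk steps per trip; each trip consumes 64 bytes of A, so hint
+ the next trip's worth ahead. */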
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 64(%[aa]) \n\t"
+ "pref 0, 96(%[aa]) \n\t"
+
+ :
+ : [aa] "r" (aa)
+ );
+#endif
+
+ LD_SP2_INC(aa, 4, src_a0, src_a1);
+ LD_SP2_INC(bb, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+
+ LD_SP2_INC(aa, 4, src_a0, src_a1);
+ LD_SP2_INC(bb, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+ }
- aa += 8;
- bb += 8;
+ if (bk & 1)
+ {
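+ /* Single leftover bk step (aa/bb are dead afterwards, so the
+ non-incrementing loads are fine here). */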
+ LD_SP2(aa, 4, src_a0, src_a1);
+ LD_SP2(bb, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+ }
}
a -= 64;
static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- BLASLONG k;
- v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
LD_SP2(c_nxt6line, 4, src_c12, src_c13);
LD_SP2(c_nxt7line, 4, src_c14, src_c15);
- for (k = 0; k < bk; k++)
+ if (bk > 0)
{
- LD_SP2(a, 4, src_a0, src_a1);
+ BLASLONG k;
+ v4f32 src_b, src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1;
- src_b = LD_SP(b + 0);
- SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
- src_c0 -= src_a0 * src_b0;
- src_c1 -= src_a1 * src_b0;
- src_c2 -= src_a0 * src_b1;
- src_c3 -= src_a1 * src_b1;
- src_c4 -= src_a0 * src_b2;
- src_c5 -= src_a1 * src_b2;
- src_c6 -= src_a0 * src_b3;
- src_c7 -= src_a1 * src_b3;
-
- src_b = LD_SP(b + 4);
- SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
- src_c8 -= src_a0 * src_b0;
- src_c9 -= src_a1 * src_b0;
- src_c10 -= src_a0 * src_b1;
- src_c11 -= src_a1 * src_b1;
- src_c12 -= src_a0 * src_b2;
- src_c13 -= src_a1 * src_b2;
- src_c14 -= src_a0 * src_b3;
- src_c15 -= src_a1 * src_b3;
+ for (k = 0; k < (bk >> 1); k++)
+ {
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 64(%[a]) \n\t"
+ "pref 0, 96(%[a]) \n\t"
+
+ :
+ : [a] "r" (a)
+ );
+#endif
+
+ LD_SP2_INC(a, 4, src_a0, src_a1);
+ LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+
+ LD_SP2_INC(a, 4, src_a0, src_a1);
+ LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+ }
- a += 8;
- b += 8;
+ if (bk & 1)
+ {
+ LD_SP2_INC(a, 4, src_a0, src_a1);
+ LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+ }
}
TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- BLASLONG k;
- v4f32 src_a0, src_a1;
v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
LD_SP2(c_nxt6line, 4, src_c12, src_c13);
LD_SP2(c_nxt7line, 4, src_c14, src_c15);
- for (k = 0; k < bk; k++)
+ if (bk > 0)
{
- LD_SP2(a, 4, src_a0, src_a1);
+ BLASLONG k;
+ v4f32 src_a0, src_a1, src_bb0, src_bb1;
- src_b = LD_SP(b + 0);
- SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
- src_c0 -= src_a0 * src_b0;
- src_c1 -= src_a1 * src_b0;
- src_c2 -= src_a0 * src_b1;
- src_c3 -= src_a1 * src_b1;
- src_c4 -= src_a0 * src_b2;
- src_c5 -= src_a1 * src_b2;
- src_c6 -= src_a0 * src_b3;
- src_c7 -= src_a1 * src_b3;
+ for (k = 0; k < (bk >> 1); k++)
+ {
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 64(%[a]) \n\t"
+ "pref 0, 96(%[a]) \n\t"
+
+ :
+ : [a] "r" (a)
+ );
+#endif
+
+ LD_SP2_INC(a, 4, src_a0, src_a1);
+ LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+
+ LD_SP2_INC(a, 4, src_a0, src_a1);
+ LD_SP2_INC(b, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+ }
- src_b = LD_SP(b + 4);
- SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
- src_c8 -= src_a0 * src_b0;
- src_c9 -= src_a1 * src_b0;
- src_c10 -= src_a0 * src_b1;
- src_c11 -= src_a1 * src_b1;
- src_c12 -= src_a0 * src_b2;
- src_c13 -= src_a1 * src_b2;
- src_c14 -= src_a0 * src_b3;
- src_c15 -= src_a1 * src_b3;
+ if (bk & 1)
+ {
+ LD_SP2_INC(a, 4, src_a0, src_a1);
+ LD_SP2_INC(b, 4, src_bb0, src_bb1);
- a += 8;
- b += 8;
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+ }
}
src_b = LD_SP(b + 0);
static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
{
- BLASLONG k;
- FLOAT *aa = a, *bb = b;
- v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
LD_SP2(c_nxt6line, 4, src_c12, src_c13);
LD_SP2(c_nxt7line, 4, src_c14, src_c15);
- for (k = 0; k < bk; k++)
+ if (bk > 0)
{
- LD_SP2(aa, 4, src_a0, src_a1);
+ BLASLONG k;
+ FLOAT *aa = a, *bb = b;
+ v4f32 src_a0, src_a1, src_b1, src_b2, src_b3, src_bb0, src_bb1;
- src_b = LD_SP(bb + 0);
- SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
- src_c0 -= src_a0 * src_b0;
- src_c1 -= src_a1 * src_b0;
- src_c2 -= src_a0 * src_b1;
- src_c3 -= src_a1 * src_b1;
- src_c4 -= src_a0 * src_b2;
- src_c5 -= src_a1 * src_b2;
- src_c6 -= src_a0 * src_b3;
- src_c7 -= src_a1 * src_b3;
+ for (k = 0; k < (bk >> 1); k++)
+ {
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 64(%[aa]) \n\t"
+ "pref 0, 96(%[aa]) \n\t"
- src_b = LD_SP(bb + 4);
- SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
- src_c8 -= src_a0 * src_b0;
- src_c9 -= src_a1 * src_b0;
- src_c10 -= src_a0 * src_b1;
- src_c11 -= src_a1 * src_b1;
- src_c12 -= src_a0 * src_b2;
- src_c13 -= src_a1 * src_b2;
- src_c14 -= src_a0 * src_b3;
- src_c15 -= src_a1 * src_b3;
+ :
+ : [aa] "r" (aa)
+ );
+#endif
- aa += 8;
- bb += 8;
+ LD_SP2_INC(aa, 4, src_a0, src_a1);
+ LD_SP2_INC(bb, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+
+ LD_SP2_INC(aa, 4, src_a0, src_a1);
+ LD_SP2_INC(bb, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+ }
+
+ if (bk & 1)
+ {
+ LD_SP2(aa, 4, src_a0, src_a1);
+ LD_SP2(bb, 4, src_bb0, src_bb1);
+
+ SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
+ src_c0 -= src_a0 * src_b0;
+ src_c1 -= src_a1 * src_b0;
+ src_c2 -= src_a0 * src_b1;
+ src_c3 -= src_a1 * src_b1;
+ src_c4 -= src_a0 * src_b2;
+ src_c5 -= src_a1 * src_b2;
+ src_c6 -= src_a0 * src_b3;
+ src_c7 -= src_a1 * src_b3;
+
+ SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
+ src_c8 -= src_a0 * src_b0;
+ src_c9 -= src_a1 * src_b0;
+ src_c10 -= src_a0 * src_b1;
+ src_c11 -= src_a1 * src_b1;
+ src_c12 -= src_a0 * src_b2;
+ src_c13 -= src_a1 * src_b2;
+ src_c14 -= src_a0 * src_b3;
+ src_c15 -= src_a1 * src_b3;
+ }
}
b -= 64;