BLASLONG i, inc_x2;
FLOAT sumf = 0.0;
v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
- v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
- v4f32 zero_v = {0};
+ v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+ v4f32 sum_abs0 = {0, 0, 0, 0};
+ v4f32 sum_abs1 = {0, 0, 0, 0};
+ v4f32 sum_abs2 = {0, 0, 0, 0};
+ v4f32 sum_abs3 = {0, 0, 0, 0};
v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
if (n <= 0 || inc_x <= 0) return (sumf);
if (1 == inc_x)
{
- if (n > 15)
- {
- n -= 16;
-
- LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+#ifdef ENABLE_PREFETCH
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
- sum_abs0 = AND_VEC_W(src0);
- sum_abs1 = AND_VEC_W(src1);
- sum_abs2 = AND_VEC_W(src2);
- sum_abs3 = AND_VEC_W(src3);
- sum_abs0 += AND_VEC_W(src4);
- sum_abs1 += AND_VEC_W(src5);
- sum_abs2 += AND_VEC_W(src6);
- sum_abs3 += AND_VEC_W(src7);
- }
- else
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
{
- sum_abs0 = zero_v;
- sum_abs1 = zero_v;
- sum_abs2 = zero_v;
- sum_abs3 = zero_v;
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
}
+ pref_offset = pref_offset / sizeof(FLOAT);
+ x_pref = x + pref_offset + 128;
+#endif
- for (i = (n >> 4); i--;)
+ for (i = (n >> 5); i--;)
{
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 0(%[x_pref])\n\t"
+ "pref 0, 32(%[x_pref])\n\t"
+ "pref 0, 64(%[x_pref])\n\t"
+ "pref 0, 96(%[x_pref])\n\t"
+ "pref 0, 128(%[x_pref])\n\t"
+ "pref 0, 160(%[x_pref])\n\t"
+ "pref 0, 192(%[x_pref])\n\t"
+ "pref 0, 224(%[x_pref])\n\t"
+
+ : : [x_pref] "r" (x_pref)
+ );
+
+ x_pref += 64;
+#endif
+
LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15);
sum_abs0 += AND_VEC_W(src0);
sum_abs1 += AND_VEC_W(src1);
sum_abs1 += AND_VEC_W(src5);
sum_abs2 += AND_VEC_W(src6);
sum_abs3 += AND_VEC_W(src7);
+ sum_abs0 += AND_VEC_W(src8);
+ sum_abs1 += AND_VEC_W(src9);
+ sum_abs2 += AND_VEC_W(src10);
+ sum_abs3 += AND_VEC_W(src11);
+ sum_abs0 += AND_VEC_W(src12);
+ sum_abs1 += AND_VEC_W(src13);
+ sum_abs2 += AND_VEC_W(src14);
+ sum_abs3 += AND_VEC_W(src15);
}
- if (n & 15)
+ if (n & 31)
{
- if ((n & 8) && (n & 4) && (n & 2))
+ if (n & 16)
{
- LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
+ LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
sum_abs0 += AND_VEC_W(src0);
sum_abs1 += AND_VEC_W(src1);
sum_abs0 += AND_VEC_W(src4);
sum_abs1 += AND_VEC_W(src5);
sum_abs2 += AND_VEC_W(src6);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf = sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
+ sum_abs3 += AND_VEC_W(src7);
}
- else if ((n & 8) && (n & 4))
- {
- LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
-
- sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src1);
- sum_abs2 += AND_VEC_W(src2);
- sum_abs3 += AND_VEC_W(src3);
- sum_abs0 += AND_VEC_W(src4);
- sum_abs1 += AND_VEC_W(src5);
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf = sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
- else if ((n & 8) && (n & 2))
- {
- LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
-
- sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src1);
- sum_abs2 += AND_VEC_W(src2);
- sum_abs3 += AND_VEC_W(src3);
- sum_abs0 += AND_VEC_W(src4);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf = sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
- else if ((n & 4) && (n & 2))
- {
- LD_SP3_INC(x, 4, src0, src1, src2);
-
- sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src1);
- sum_abs2 += AND_VEC_W(src2);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf = sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
- else if (n & 8)
+ if (n & 8)
{
LD_SP4_INC(x, 4, src0, src1, src2, src3);
sum_abs1 += AND_VEC_W(src1);
sum_abs2 += AND_VEC_W(src2);
sum_abs3 += AND_VEC_W(src3);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf = sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
}
- else if (n & 4)
+
+ if (n & 4)
{
LD_SP2_INC(x, 4, src0, src1);
sum_abs0 += AND_VEC_W(src0);
sum_abs1 += AND_VEC_W(src1);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf = sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
}
- else if (n & 2)
+
+ if (n & 2)
{
src0 = LD_SP(x); x += 4;
sum_abs0 += AND_VEC_W(src0);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf = sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
- else
- {
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf = sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
}
if (n & 1)
{
- sumf += fabsf(*(x + 0));
+ sumf += fabsf(*x);
sumf += fabsf(*(x + 1));
}
}
- else
- {
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
- sumf = sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
+ sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
}
else
{
inc_x2 = 2 * inc_x;
- if (n > 8)
- {
- n -= 8;
-
- LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
-
- sum_abs0 = AND_VEC_W(src0);
- sum_abs1 = AND_VEC_W(src1);
- sum_abs2 = AND_VEC_W(src2);
- sum_abs3 = AND_VEC_W(src3);
- sum_abs0 += AND_VEC_W(src4);
- sum_abs1 += AND_VEC_W(src5);
- sum_abs2 += AND_VEC_W(src6);
- sum_abs3 += AND_VEC_W(src7);
- }
- else
- {
- sum_abs0 = zero_v;
- sum_abs1 = zero_v;
- sum_abs2 = zero_v;
- sum_abs3 = zero_v;
- }
-
- for (i = (n >> 3); i--;)
+ for (i = (n >> 4); i--;)
{
LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15);
sum_abs0 += AND_VEC_W(src0);
sum_abs1 += AND_VEC_W(src1);
sum_abs1 += AND_VEC_W(src5);
sum_abs2 += AND_VEC_W(src6);
sum_abs3 += AND_VEC_W(src7);
+ sum_abs0 += AND_VEC_W(src8);
+ sum_abs1 += AND_VEC_W(src9);
+ sum_abs2 += AND_VEC_W(src10);
+ sum_abs3 += AND_VEC_W(src11);
+ sum_abs0 += AND_VEC_W(src12);
+ sum_abs1 += AND_VEC_W(src13);
+ sum_abs2 += AND_VEC_W(src14);
+ sum_abs3 += AND_VEC_W(src15);
}
- if (n & 7)
+ if (n & 15)
{
- if ((n & 4) && (n & 2) && (n & 1))
+ if (n & 8)
{
- LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6);
+ LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
sum_abs0 += AND_VEC_W(src0);
sum_abs1 += AND_VEC_W(src1);
sum_abs0 += AND_VEC_W(src4);
sum_abs1 += AND_VEC_W(src5);
sum_abs2 += AND_VEC_W(src6);
+ sum_abs3 += AND_VEC_W(src7);
}
- else if ((n & 4) && (n & 2))
- {
- LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5);
- sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src1);
- sum_abs2 += AND_VEC_W(src2);
- sum_abs3 += AND_VEC_W(src3);
- sum_abs0 += AND_VEC_W(src4);
- sum_abs1 += AND_VEC_W(src5);
- }
- else if ((n & 4) && (n & 1))
- {
- LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4);
-
- sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src1);
- sum_abs2 += AND_VEC_W(src2);
- sum_abs3 += AND_VEC_W(src3);
- sum_abs0 += AND_VEC_W(src4);
- }
- else if ((n & 2) && (n & 1))
- {
- LD_SP3_INC(x, inc_x2, src0, src1, src2);
-
- sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src1);
- sum_abs2 += AND_VEC_W(src2);
- }
- else if (n & 4)
+ if (n & 4)
{
LD_SP4_INC(x, inc_x2, src0, src1, src2, src3);
sum_abs2 += AND_VEC_W(src2);
sum_abs3 += AND_VEC_W(src3);
}
- else if (n & 2)
+
+ if (n & 2)
{
LD_SP2_INC(x, inc_x2, src0, src1);
sum_abs0 += AND_VEC_W(src0);
sum_abs1 += AND_VEC_W(src1);
}
- else if (n & 1)
+
+ if (n & 1)
{
- src0 = LD_SP(x); x += inc_x2;
+ src0 = LD_SP(x);
sum_abs0 += AND_VEC_W(src0);
}
}
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+ sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
sumf = sum_abs0[0] + sum_abs0[1];
}
#include "macros_msa.h"
#if !defined(CONJ)
- #define OP2 +=
- #define OP3 -
- #define OP4 +
+ #define OP1 -=
+ #define OP2 +=
+ #define OP3 -
+ #define OP4 +
#else
- #define OP2 -=
- #define OP3 +
- #define OP4 -
+ #define OP1 +=
+ #define OP2 -=
+ #define OP3 +
+ #define OP4 -
#endif
-#define DOT16_KERNEL(OPR0, OPR1) \
- dot0 += (vx0r * vy0r); \
- dot0 OPR0## = (vx0i * vy0i); \
- dot1 OPR1## = (vx0i * vy0r); \
- dot1 += (vx0r * vy0i); \
- \
- dot0 += (vx1r * vy1r); \
- dot0 OPR0## = (vx1i * vy1i); \
- dot1 OPR1## = (vx1i * vy1r); \
- dot1 += (vx1r * vy1i); \
- \
- dot0 += (vx2r * vy2r); \
- dot0 OPR0## = (vx2i * vy2i); \
- dot1 OPR1## = (vx2i * vy2r); \
- dot1 += (vx2r * vy2i); \
- \
- dot0 += (vx3r * vy3r); \
- dot0 OPR0## = (vx3i * vy3i); \
- dot1 OPR1## = (vx3i * vy3r); \
- dot1 += (vx3r * vy3i);
-
-#define DOT12_KERNEL(OPR0, OPR1) \
- dot0 += (vx0r * vy0r); \
- dot0 OPR0## = (vx0i * vy0i); \
- dot1 OPR1## = (vx0i * vy0r); \
- dot1 += (vx0r * vy0i); \
- \
- dot0 += (vx1r * vy1r); \
- dot0 OPR0## = (vx1i * vy1i); \
- dot1 OPR1## = (vx1i * vy1r); \
- dot1 += (vx1r * vy1i); \
- \
- dot0 += (vx2r * vy2r); \
- dot0 OPR0## = (vx2i * vy2i); \
- dot1 OPR1## = (vx2i * vy2r); \
- dot1 += (vx2r * vy2i);
-
-#define DOT8_KERNEL(OPR0, OPR1) \
- dot0 += (vx0r * vy0r); \
- dot0 OPR0## = (vx0i * vy0i); \
- dot1 OPR1## = (vx0i * vy0r); \
- dot1 += (vx0r * vy0i); \
- \
- dot0 += (vx1r * vy1r); \
- dot0 OPR0## = (vx1i * vy1i); \
- dot1 OPR1## = (vx1i * vy1r); \
- dot1 += (vx1r * vy1i);
-
-#define DOT4_KERNEL(OPR0, OPR1) \
- dot0 += (vx0r * vy0r); \
- dot0 OPR0## = (vx0i * vy0i); \
- dot1 OPR1## = (vx0i * vy0r); \
- dot1 += (vx0r * vy0i);
-
-/* return float, x,y float */
-/* cdotc - CONJ */
-/* cdotu - !CONJ */
-#ifndef _MSC_VER
-#include <complex.h>
-FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-#else
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-#endif
{
BLASLONG i = 0;
FLOAT dot[2];
- BLASLONG inc_x2;
- BLASLONG inc_y2;
+ BLASLONG inc_x2, inc_y2;
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
- v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
- v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+ v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+ v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
v4f32 dot0 = {0, 0, 0, 0};
v4f32 dot1 = {0, 0, 0, 0};
- openblas_complex_float result;
+ v4f32 dot2 = {0, 0, 0, 0};
+ v4f32 dot3 = {0, 0, 0, 0};
+ v4f32 dot4 = {0, 0, 0, 0};
+ v4f32 dot5 = {0, 0, 0, 0};
+ v4f32 dot6 = {0, 0, 0, 0};
+ v4f32 dot7 = {0, 0, 0, 0};
+ OPENBLAS_COMPLEX_FLOAT result;
dot[0] = 0.0;
dot[1] = 0.0;
- __real__(result) = 0.0;
- __imag__(result) = 0.0;
+ CREAL(result) = 0.0;
+ CIMAG(result) = 0.0;
- if ( n < 1 ) return(result);
+ if (n < 1) return (result);
if ((1 == inc_x) && (1 == inc_y))
{
+#ifdef ENABLE_PREFETCH
+ FLOAT *x_pref, *y_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ }
+ pref_offset = pref_offset / sizeof(FLOAT);
+ x_pref = x + pref_offset + 64;
+
+ pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ }
+ pref_offset = pref_offset / sizeof(FLOAT);
+ y_pref = y + pref_offset + 64;
+#endif
+
for (i = (n >> 4); i--;)
{
- LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
- LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
-
- PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
- PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
- PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
- PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);
-
- PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
- PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
- PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
- PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);
-
- #if !defined(CONJ)
- DOT16_KERNEL(-, +);
- #else
- DOT16_KERNEL(+, -);
- #endif
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 0(%[x_pref])\n\t"
+ "pref 0, 32(%[x_pref])\n\t"
+ "pref 0, 64(%[x_pref])\n\t"
+ "pref 0, 96(%[x_pref])\n\t"
+ "pref 0, 0(%[y_pref])\n\t"
+ "pref 0, 32(%[y_pref])\n\t"
+ "pref 0, 64(%[y_pref])\n\t"
+ "pref 0, 96(%[y_pref])\n\t"
+
+ : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref)
+ );
+
+ x_pref += 32;
+ y_pref += 32;
+#endif
+
+ LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+ LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+ PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+ PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
+ PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);
+
+ PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+ PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
+ PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);
+
+ dot0 += (vx0r * vy0r);
+ dot0 OP1 (vx0i * vy0i);
+ dot1 OP2 (vx0i * vy0r);
+ dot1 += (vx0r * vy0i);
+
+ dot2 += (vx1r * vy1r);
+ dot2 OP1 (vx1i * vy1i);
+ dot3 OP2 (vx1i * vy1r);
+ dot3 += (vx1r * vy1i);
+
+ dot4 += (vx2r * vy2r);
+ dot4 OP1 (vx2i * vy2i);
+ dot5 OP2 (vx2i * vy2r);
+ dot5 += (vx2r * vy2i);
+
+ dot6 += (vx3r * vy3r);
+ dot6 OP1 (vx3i * vy3i);
+ dot7 OP2 (vx3i * vy3r);
+ dot7 += (vx3r * vy3i);
}
if (n & 15)
{
- if ((n & 8) && (n & 4))
+ if (n & 8)
{
- LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
- LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
- LD_SP2_INC(x, 4, vx4, vx5);
- LD_SP2_INC(y, 4, vy4, vy5);
-
- PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
- PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
- PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
-
- PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
- PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
- PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
-
- #if !defined(CONJ)
- DOT12_KERNEL(-, +);
- #else
- DOT12_KERNEL(+, -);
- #endif
+ LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+ LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+
+ PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+
+ PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+
+ dot0 += (vx0r * vy0r);
+ dot0 OP1 (vx0i * vy0i);
+ dot1 OP2 (vx0i * vy0r);
+ dot1 += (vx0r * vy0i);
+
+ dot2 += (vx1r * vy1r);
+ dot2 OP1 (vx1i * vy1i);
+ dot3 OP2 (vx1i * vy1r);
+ dot3 += (vx1r * vy1i);
}
- else if (n & 8)
+
+ if (n & 4)
{
- LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
- LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+ LD_SP2_INC(x, 4, vx0, vx1);
+ LD_SP2_INC(y, 4, vy0, vy1);
+ PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+
+ dot0 += (vx0r * vy0r);
+ dot0 OP1 (vx0i * vy0i);
+ dot1 OP2 (vx0i * vy0r);
+ dot1 += (vx0r * vy0i);
+ }
- PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
- PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+ if (n & 2)
+ {
+ LD_GP4_INC(x, 1, x0, x1, x2, x3);
+ LD_GP4_INC(y, 1, y0, y1, y2, y3);
- PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
- PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+ dot[0] += (x0 * y0 OP3 x1 * y1);
+ dot[1] OP2 (x1 * y0 OP4 x0 * y1);
- #if !defined(CONJ)
- DOT8_KERNEL(-, +);
- #else
- DOT8_KERNEL(+, -);
- #endif
+ dot[0] += (x2 * y2 OP3 x3 * y3);
+ dot[1] OP2 (x3 * y2 OP4 x2 * y3);
}
- else if (n & 4)
+
+ if (n & 1)
{
- LD_SP2_INC(x, 4, vx0, vx1);
- LD_SP2_INC(y, 4, vy0, vy1);
- PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
- PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
-
- #if !defined(CONJ)
- DOT4_KERNEL(-, +);
- #else
- DOT4_KERNEL(+, -);
- #endif
+ LD_GP2_INC(x, 1, x0, x1);
+ LD_GP2_INC(y, 1, y0, y1);
+
+ dot[0] += (x0 * y0 OP3 x1 * y1);
+ dot[1] OP2 (x1 * y0 OP4 x0 * y1);
}
+ }
- if ((n & 2) && (n & 1))
- {
- LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5);
- LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5);
+ dot0 += dot2 + dot4 + dot6;
+ dot1 += dot3 + dot5 + dot7;
- dot[0] += ( x0 * y0 OP3 x1 * y1 );
- dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+ dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
+ dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
+ }
+ else
+ {
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
- dot[0] += ( x2 * y2 OP3 x3 * y3 );
- dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+ for (i = (n >> 2); i--;)
+ {
+ x0 = *x;
+ x1 = *(x + 1);
+ x += inc_x2;
+ x2 = *x;
+ x3 = *(x + 1);
+ x += inc_x2;
+ x4 = *x;
+ x5 = *(x + 1);
+ x += inc_x2;
+ x6 = *x;
+ x7 = *(x + 1);
+ x += inc_x2;
+
+ y0 = *y;
+ y1 = *(y + 1);
+ y += inc_y2;
+ y2 = *y;
+ y3 = *(y + 1);
+ y += inc_y2;
+ y4 = *y;
+ y5 = *(y + 1);
+ y += inc_y2;
+ y6 = *y;
+ y7 = *(y + 1);
+ y += inc_y2;
+
+ dot[0] += (x0 * y0 OP3 x1 * y1);
+ dot[1] OP2 (x1 * y0 OP4 x0 * y1);
+
+ dot[0] += (x2 * y2 OP3 x3 * y3);
+ dot[1] OP2 (x3 * y2 OP4 x2 * y3);
+
+ dot[0] += (x4 * y4 OP3 x5 * y5);
+ dot[1] OP2 (x5 * y4 OP4 x4 * y5);
+
+ dot[0] += (x6 * y6 OP3 x7 * y7);
+ dot[1] OP2 (x7 * y6 OP4 x6 * y7);
+ }
- dot[0] += ( x4 * y4 OP3 x5 * y5 );
- dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
- }
- else if (n & 2)
- {
- LD_GP4_INC(x, 1, x0, x1, x2, x3);
- LD_GP4_INC(y, 1, y0, y1, y2, y3);
+ if (n & 2)
+ {
+ x0 = *x;
+ x1 = *(x + 1);
+ x += inc_x2;
+ x2 = *x;
+ x3 = *(x + 1);
+ x += inc_x2;
+
+ y0 = *y;
+ y1 = *(y + 1);
+ y += inc_y2;
+ y2 = *y;
+ y3 = *(y + 1);
+ y += inc_y2;
+
+ dot[0] += (x0 * y0 OP3 x1 * y1);
+ dot[1] OP2 (x1 * y0 OP4 x0 * y1);
+
+ dot[0] += (x2 * y2 OP3 x3 * y3);
+ dot[1] OP2 (x3 * y2 OP4 x2 * y3);
+ }
- dot[0] += ( x0 * y0 OP3 x1 * y1 );
- dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+ if (n & 1)
+ {
+ x0 = *x;
+ x1 = *(x + 1);
+ x += inc_x2;
- dot[0] += ( x2 * y2 OP3 x3 * y3 );
- dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
- }
- else if (n & 1)
- {
- LD_GP2_INC(x, 1, x0, x1);
- LD_GP2_INC(y, 1, y0, y1);
+ y0 = *y;
+ y1 = *(y + 1);
+ y += inc_y2;
- dot[0] += ( x0 * y0 OP3 x1 * y1 );
- dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
- }
+ dot[0] += (x0 * y0 OP3 x1 * y1);
+ dot[1] OP2 (x1 * y0 OP4 x0 * y1);
}
+ }
+
+ CREAL(result) = dot[0];
+ CIMAG(result) = dot[1];
- dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
- dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
- }
- else
- {
- inc_x2 = 2 * inc_x;
- inc_y2 = 2 * inc_y;
-
- for (i = (n >> 2); i--;)
- {
- x0 = *x;
- x1 = *(x + 1);
- x += inc_x2;
- x2 = *x;
- x3 = *(x + 1);
- x += inc_x2;
- x4 = *x;
- x5 = *(x + 1);
- x += inc_x2;
- x6 = *x;
- x7 = *(x + 1);
- x += inc_x2;
-
- y0 = *y;
- y1 = *(y + 1);
- y += inc_y2;
- y2 = *y;
- y3 = *(y + 1);
- y += inc_y2;
- y4 = *y;
- y5 = *(y + 1);
- y += inc_y2;
- y6 = *y;
- y7 = *(y + 1);
- y += inc_y2;
-
- dot[0] += ( x0 * y0 OP3 x1 * y1 );
- dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
-
- dot[0] += ( x2 * y2 OP3 x3 * y3 );
- dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
-
- dot[0] += ( x4 * y4 OP3 x5 * y5 );
- dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
-
- dot[0] += ( x6 * y6 OP3 x7 * y7 );
- dot[1] OP2 ( x7 * y6 OP4 x6 * y7 );
- }
-
- if ((n & 2) && (n & 1))
- {
- x0 = *x;
- x1 = *(x + 1);
- x += inc_x2;
- x2 = *x;
- x3 = *(x + 1);
- x += inc_x2;
- x4 = *x;
- x5 = *(x + 1);
- x += inc_x2;
-
- y0 = *y;
- y1 = *(y + 1);
- y += inc_y2;
- y2 = *y;
- y3 = *(y + 1);
- y += inc_y2;
- y4 = *y;
- y5 = *(y + 1);
- y += inc_y2;
-
- dot[0] += ( x0 * y0 OP3 x1 * y1 );
- dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
-
- dot[0] += ( x2 * y2 OP3 x3 * y3 );
- dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
-
- dot[0] += ( x4 * y4 OP3 x5 * y5 );
- dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
- }
- else if (n & 2)
- {
- x0 = *x;
- x1 = *(x + 1);
- x += inc_x2;
- x2 = *x;
- x3 = *(x + 1);
- x += inc_x2;
-
- y0 = *y;
- y1 = *(y + 1);
- y += inc_y2;
- y2 = *y;
- y3 = *(y + 1);
- y += inc_y2;
-
- dot[0] += ( x0 * y0 OP3 x1 * y1 );
- dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
-
- dot[0] += ( x2 * y2 OP3 x3 * y3 );
- dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
- }
- else if (n & 1)
- {
- x0 = *x;
- x1 = *(x + 1);
- x += inc_x2;
-
- y0 = *y;
- y1 = *(y + 1);
- y += inc_y2;
-
- dot[0] += ( x0 * y0 OP3 x1 * y1 );
- dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
- }
- }
-
- __real__(result) = dot[0];
- __imag__(result) = dot[1];
-
- return(result);
+ return (result);
}
BLASLONG i;
FLOAT sumf = 0.0;
v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
- v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
- v2f64 zero_v = {0};
+ v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+ v2f64 sum_abs0 = {0, 0};
+ v2f64 sum_abs1 = {0, 0};
+ v2f64 sum_abs2 = {0, 0};
+ v2f64 sum_abs3 = {0, 0};
v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
if (n <= 0 || inc_x <= 0) return (sumf);
if (1 == inc_x)
{
- if (n > 15)
- {
- n -= 16;
-
- LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+#ifdef ENABLE_PREFETCH
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
- sum_abs0 = AND_VEC_D(src0);
- sum_abs1 = AND_VEC_D(src1);
- sum_abs2 = AND_VEC_D(src2);
- sum_abs3 = AND_VEC_D(src3);
- sum_abs0 += AND_VEC_D(src4);
- sum_abs1 += AND_VEC_D(src5);
- sum_abs2 += AND_VEC_D(src6);
- sum_abs3 += AND_VEC_D(src7);
- }
- else
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
{
- sum_abs0 = zero_v;
- sum_abs1 = zero_v;
- sum_abs2 = zero_v;
- sum_abs3 = zero_v;
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
}
+ pref_offset = pref_offset / sizeof(FLOAT);
+ x_pref = x + pref_offset + 64;
+#endif
- for (i = (n >> 4); i--;)
+ for (i = (n >> 5); i--;)
{
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 0(%[x_pref])\n\t"
+ "pref 0, 32(%[x_pref])\n\t"
+ "pref 0, 64(%[x_pref])\n\t"
+ "pref 0, 96(%[x_pref])\n\t"
+ "pref 0, 128(%[x_pref])\n\t"
+ "pref 0, 160(%[x_pref])\n\t"
+ "pref 0, 192(%[x_pref])\n\t"
+ "pref 0, 224(%[x_pref])\n\t"
+
+ : : [x_pref] "r" (x_pref)
+ );
+
+ x_pref += 32;
+#endif
+
LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);
sum_abs0 += AND_VEC_D(src0);
sum_abs1 += AND_VEC_D(src1);
sum_abs1 += AND_VEC_D(src5);
sum_abs2 += AND_VEC_D(src6);
sum_abs3 += AND_VEC_D(src7);
+ sum_abs0 += AND_VEC_D(src8);
+ sum_abs1 += AND_VEC_D(src9);
+ sum_abs2 += AND_VEC_D(src10);
+ sum_abs3 += AND_VEC_D(src11);
+ sum_abs0 += AND_VEC_D(src12);
+ sum_abs1 += AND_VEC_D(src13);
+ sum_abs2 += AND_VEC_D(src14);
+ sum_abs3 += AND_VEC_D(src15);
}
- if (n & 15)
+ if (n & 31)
{
- if ((n & 8) && (n & 4) && (n & 2))
+ if (n & 16)
{
- LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6);
+ LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
sum_abs0 += AND_VEC_D(src0);
sum_abs1 += AND_VEC_D(src1);
sum_abs0 += AND_VEC_D(src4);
sum_abs1 += AND_VEC_D(src5);
sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
}
- else if ((n & 8) && (n & 4))
- {
- LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5);
-
- sum_abs0 += AND_VEC_D(src0);
- sum_abs1 += AND_VEC_D(src1);
- sum_abs2 += AND_VEC_D(src2);
- sum_abs3 += AND_VEC_D(src3);
- sum_abs0 += AND_VEC_D(src4);
- sum_abs1 += AND_VEC_D(src5);
- }
- else if ((n & 8) && (n & 2))
- {
- LD_DP5_INC(x, 2, src0, src1, src2, src3, src4);
-
- sum_abs0 += AND_VEC_D(src0);
- sum_abs1 += AND_VEC_D(src1);
- sum_abs2 += AND_VEC_D(src2);
- sum_abs3 += AND_VEC_D(src3);
- sum_abs0 += AND_VEC_D(src4);
- }
- else if ((n & 4) && (n & 2))
- {
- LD_DP3_INC(x, 2, src0, src1, src2);
- sum_abs0 += AND_VEC_D(src0);
- sum_abs1 += AND_VEC_D(src1);
- sum_abs2 += AND_VEC_D(src2);
- }
- else if (n & 8)
+ if (n & 8)
{
LD_DP4_INC(x, 2, src0, src1, src2, src3);
sum_abs2 += AND_VEC_D(src2);
sum_abs3 += AND_VEC_D(src3);
}
- else if (n & 4)
+
+ if (n & 4)
{
LD_DP2_INC(x, 2, src0, src1);
sum_abs0 += AND_VEC_D(src0);
sum_abs1 += AND_VEC_D(src1);
}
- else if (n & 2)
+
+ if (n & 2)
{
src0 = LD_DP(x); x += 2;
sum_abs0 += AND_VEC_D(src0);
}
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf = sum_abs0[0] + sum_abs0[1];
-
if (n & 1)
{
sumf += fabs(*x);
}
}
- else
- {
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
- sumf = sum_abs0[0] + sum_abs0[1];
- }
+ sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0] + sum_abs0[1];
}
else
{
- if (n > 8)
- {
- n -= 8;
-
- LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
-
- sum_abs0 = AND_VEC_D(src0);
- sum_abs1 = AND_VEC_D(src1);
- sum_abs2 = AND_VEC_D(src2);
- sum_abs3 = AND_VEC_D(src3);
- sum_abs0 += AND_VEC_D(src4);
- sum_abs1 += AND_VEC_D(src5);
- sum_abs2 += AND_VEC_D(src6);
- sum_abs3 += AND_VEC_D(src7);
- }
- else
- {
- sum_abs0 = zero_v;
- sum_abs1 = zero_v;
- sum_abs2 = zero_v;
- sum_abs3 = zero_v;
- }
-
- for (i = (n >> 3); i--;)
+ for (i = (n >> 4); i--;)
{
LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);
sum_abs0 += AND_VEC_D(src0);
sum_abs1 += AND_VEC_D(src1);
sum_abs1 += AND_VEC_D(src5);
sum_abs2 += AND_VEC_D(src6);
sum_abs3 += AND_VEC_D(src7);
+ sum_abs0 += AND_VEC_D(src8);
+ sum_abs1 += AND_VEC_D(src9);
+ sum_abs2 += AND_VEC_D(src10);
+ sum_abs3 += AND_VEC_D(src11);
+ sum_abs0 += AND_VEC_D(src12);
+ sum_abs1 += AND_VEC_D(src13);
+ sum_abs2 += AND_VEC_D(src14);
+ sum_abs3 += AND_VEC_D(src15);
}
- if (n & 7)
+ if (n & 15)
{
- if ((n & 4) && (n & 2) && (n & 1))
+ if (n & 8)
{
- LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6);
+ LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
sum_abs0 += AND_VEC_D(src0);
sum_abs1 += AND_VEC_D(src1);
sum_abs0 += AND_VEC_D(src4);
sum_abs1 += AND_VEC_D(src5);
sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
}
- else if ((n & 4) && (n & 2))
- {
- LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5);
- sum_abs0 += AND_VEC_D(src0);
- sum_abs1 += AND_VEC_D(src1);
- sum_abs2 += AND_VEC_D(src2);
- sum_abs3 += AND_VEC_D(src3);
- sum_abs0 += AND_VEC_D(src4);
- sum_abs1 += AND_VEC_D(src5);
- }
- else if ((n & 4) && (n & 1))
- {
- LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4);
-
- sum_abs0 += AND_VEC_D(src0);
- sum_abs1 += AND_VEC_D(src1);
- sum_abs2 += AND_VEC_D(src2);
- sum_abs3 += AND_VEC_D(src3);
- sum_abs0 += AND_VEC_D(src4);
- }
- else if ((n & 2) && (n & 1))
- {
- LD_DP3_INC(x, inc_x, src0, src1, src2);
-
- sum_abs0 += AND_VEC_D(src0);
- sum_abs1 += AND_VEC_D(src1);
- sum_abs2 += AND_VEC_D(src2);
- }
- else if (n & 4)
+ if (n & 4)
{
LD_DP4_INC(x, inc_x, src0, src1, src2, src3);
sum_abs2 += AND_VEC_D(src2);
sum_abs3 += AND_VEC_D(src3);
}
- else if (n & 2)
+
+ if (n & 2)
{
LD_DP2_INC(x, inc_x, src0, src1);
sum_abs0 += AND_VEC_D(src0);
sum_abs1 += AND_VEC_D(src1);
}
- else if (n & 1)
+
+ if (n & 1)
{
src0 = LD_DP(x);
}
}
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+ sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
sumf = sum_abs0[0];
}
#include "common.h"
#include "macros_msa.h"
-/* return float, x,y float */
-#if defined(DSDOT)
-double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-#endif
{
BLASLONG i = 0;
- double dot = 0.0;
+ FLOAT dot = 0.0;
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
v2f64 dot0 = {0, 0};
+ v2f64 dot1 = {0, 0};
+ v2f64 dot2 = {0, 0};
+ v2f64 dot3 = {0, 0};
- if (n < 0) return (dot);
+ if (n < 1) return (dot);
if ((1 == inc_x) && (1 == inc_y))
{
for (i = (n >> 4); i--;)
{
- LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
- LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+ LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+ LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 256(%[x])\n\t"
+ "pref 0, 288(%[x])\n\t"
+ "pref 0, 320(%[x])\n\t"
+ "pref 0, 352(%[x])\n\t"
+ "pref 0, 256(%[y])\n\t"
+ "pref 0, 288(%[y])\n\t"
+ "pref 0, 320(%[y])\n\t"
+ "pref 0, 352(%[y])\n\t"
+
+ : : [x] "r" (x), [y] "r" (y)
+ );
+#endif
dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
+ dot1 += (vy1 * vx1);
+ dot2 += (vy2 * vx2);
+ dot3 += (vy3 * vx3);
dot0 += (vy4 * vx4);
- dot0 += (vy5 * vx5);
- dot0 += (vy6 * vx6);
- dot0 += (vy7 * vx7);
+ dot1 += (vy5 * vx5);
+ dot2 += (vy6 * vx6);
+ dot3 += (vy7 * vx7);
}
if (n & 15)
{
- if ((n & 8) && (n & 4) && (n & 2))
- {
- LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
- LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
-
- dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
- dot0 += (vy4 * vx4);
- dot0 += (vy5 * vx5);
- dot0 += (vy6 * vx6);
- }
- else if ((n & 8) && (n & 4))
- {
- LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5);
- LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5);
-
- dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
- dot0 += (vy4 * vx4);
- dot0 += (vy5 * vx5);
- }
- else if ((n & 8) && (n & 2))
- {
- LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4);
- LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4);
-
- dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
- dot0 += (vy4 * vx4);
- }
- else if ((n & 4) && (n & 2))
- {
- LD_DP3_INC(x, 2, vx0, vx1, vx2);
- LD_DP3_INC(y, 2, vy0, vy1, vy2);
-
- dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- }
- else if (n & 8)
+ if (n & 8)
{
- LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3);
- LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3);
+ LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3);
+ LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3);
dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
+ dot1 += (vy1 * vx1);
+ dot2 += (vy2 * vx2);
+ dot3 += (vy3 * vx3);
}
- else if (n & 4)
+
+ if (n & 4)
{
- LD_DP2_INC(x, 2, vx0, vx1);
- LD_DP2_INC(y, 2, vy0, vy1);
+ LD_DP2_INC(x, 2, vx0, vx1);
+ LD_DP2_INC(y, 2, vy0, vy1);
dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
+ dot1 += (vy1 * vx1);
}
- else if (n & 2)
+
+ if (n & 2)
{
vx0 = LD_DP(x); x += 2;
vy0 = LD_DP(y); y += 2;
}
}
+ dot0 += dot1 + dot2 + dot3;
+
dot += dot0[0];
dot += dot0[1];
}
dot += (y3 * x3);
}
- if ((n & 2) && (n & 1))
- {
- LD_GP3_INC(x, inc_x, x0, x1, x2);
- LD_GP3_INC(y, inc_y, y0, y1, y2);
-
- dot += (y0 * x0);
- dot += (y1 * x1);
- dot += (y2 * x2);
- }
- else if (n & 2)
+ if (n & 2)
{
LD_GP2_INC(x, inc_x, x0, x1);
LD_GP2_INC(y, inc_y, y0, y1);
dot += (y0 * x0);
dot += (y1 * x1);
}
- else if (n & 1)
+
+ if (n & 1)
{
x0 = *x;
y0 = *y;
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i = 0;
- FLOAT data0, data1, data2, sumf = 0.0;
+ FLOAT data0, data1, sumf = 0.0;
v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
- v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
- v4f32 zero_v = {0};
+ v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+ v4f32 sum_abs0 = {0, 0, 0, 0};
+ v4f32 sum_abs1 = {0, 0, 0, 0};
+ v4f32 sum_abs2 = {0, 0, 0, 0};
+ v4f32 sum_abs3 = {0, 0, 0, 0};
+ v4f32 zero_v = {0, 0, 0, 0};
v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
if (n <= 0 || inc_x <= 0) return (sumf);
if (1 == inc_x)
{
- if (n > 31)
- {
- n -= 32;
-
- LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+#ifdef ENABLE_PREFETCH
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
- sum_abs0 = AND_VEC_W(src0);
- sum_abs1 = AND_VEC_W(src1);
- sum_abs2 = AND_VEC_W(src2);
- sum_abs3 = AND_VEC_W(src3);
- sum_abs0 += AND_VEC_W(src4);
- sum_abs1 += AND_VEC_W(src5);
- sum_abs2 += AND_VEC_W(src6);
- sum_abs3 += AND_VEC_W(src7);
- }
- else
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
{
- sum_abs0 = zero_v;
- sum_abs1 = zero_v;
- sum_abs2 = zero_v;
- sum_abs3 = zero_v;
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
}
+ pref_offset = pref_offset / sizeof(FLOAT);
+ x_pref = x + pref_offset + 128;
+#endif
- for (i = 0; i < (n >> 5); i++)
+ for (i = 0; i < (n >> 6); i++)
{
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 0(%[x_pref])\n\t"
+ "pref 0, 32(%[x_pref])\n\t"
+ "pref 0, 64(%[x_pref])\n\t"
+ "pref 0, 96(%[x_pref])\n\t"
+ "pref 0, 128(%[x_pref])\n\t"
+ "pref 0, 160(%[x_pref])\n\t"
+ "pref 0, 192(%[x_pref])\n\t"
+ "pref 0, 224(%[x_pref])\n\t"
+
+ : : [x_pref] "r" (x_pref)
+ );
+
+ x_pref += 64;
+#endif
+
LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15);
sum_abs0 += AND_VEC_W(src0);
sum_abs1 += AND_VEC_W(src1);
sum_abs1 += AND_VEC_W(src5);
sum_abs2 += AND_VEC_W(src6);
sum_abs3 += AND_VEC_W(src7);
+ sum_abs0 += AND_VEC_W(src8);
+ sum_abs1 += AND_VEC_W(src9);
+ sum_abs2 += AND_VEC_W(src10);
+ sum_abs3 += AND_VEC_W(src11);
+ sum_abs0 += AND_VEC_W(src12);
+ sum_abs1 += AND_VEC_W(src13);
+ sum_abs2 += AND_VEC_W(src14);
+ sum_abs3 += AND_VEC_W(src15);
}
- if (n & 31)
+ if (n & 63)
{
- if ((n & 16) && (n & 8) && (n & 4))
+ if (n & 32)
{
- LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
+ LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
sum_abs0 += AND_VEC_W(src0);
sum_abs1 += AND_VEC_W(src1);
sum_abs0 += AND_VEC_W(src4);
sum_abs1 += AND_VEC_W(src5);
sum_abs2 += AND_VEC_W(src6);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
- else if ((n & 16) && (n & 8))
- {
- LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
-
- sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src1);
- sum_abs2 += AND_VEC_W(src2);
- sum_abs3 += AND_VEC_W(src3);
- sum_abs0 += AND_VEC_W(src4);
- sum_abs1 += AND_VEC_W(src5);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
- else if ((n & 16) && (n & 4))
- {
- LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
-
- sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src1);
- sum_abs2 += AND_VEC_W(src2);
- sum_abs3 += AND_VEC_W(src3);
- sum_abs0 += AND_VEC_W(src4);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
+ sum_abs3 += AND_VEC_W(src7);
}
- else if ((n & 8) && (n & 4))
- {
- LD_SP3_INC(x, 4, src0, src1, src2);
-
- sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src1);
- sum_abs2 += AND_VEC_W(src2);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
- else if (n & 16)
+ if (n & 16)
{
LD_SP4_INC(x, 4, src0, src1, src2, src3);
sum_abs1 += AND_VEC_W(src1);
sum_abs2 += AND_VEC_W(src2);
sum_abs3 += AND_VEC_W(src3);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
}
- else if (n & 8)
+
+ if (n & 8)
{
LD_SP2_INC(x, 4, src0, src1);
sum_abs0 += AND_VEC_W(src0);
sum_abs1 += AND_VEC_W(src1);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
}
- else if (n & 4)
+
+ if (n & 4)
{
src0 = LD_SP(x); x += 4;
sum_abs0 += AND_VEC_W(src0);
-
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
- else
- {
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
}
if (n & 2)
{
- sumf += fabsf(*(x + 0));
+ sumf += fabsf(*x);
sumf += fabsf(*(x + 1));
x += 2;
}
if (n & 1)
{
- sumf += fabsf(*(x + 0));
+ sumf += fabsf(*x);
}
}
- else
- {
- sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
- }
+ sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
}
else
{
- if (n > 8)
+ for (i = (n >> 4); i--;)
{
- n -= 8;
-
src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
x += inc_x;
src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
x += inc_x;
- src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
- src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
+ src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x));
x += inc_x;
- src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
+ src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x));
x += inc_x;
- src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
+ src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x));
x += inc_x;
-
- sum_abs0 = AND_VEC_W(src0);
- sum_abs1 = AND_VEC_W(src4);
- }
- else
- {
- sum_abs0 = zero_v;
- sum_abs1 = zero_v;
- }
-
- for (i = (n >> 3); i--;)
- {
- src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ src2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
- src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+ src2 = (v4f32) __msa_insert_w((v4i32) src2, 1, *((int *) x));
x += inc_x;
- src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+ src2 = (v4f32) __msa_insert_w((v4i32) src2, 2, *((int *) x));
x += inc_x;
- src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+ src2 = (v4f32) __msa_insert_w((v4i32) src2, 3, *((int *) x));
x += inc_x;
- src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ src3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
- src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
+ src3 = (v4f32) __msa_insert_w((v4i32) src3, 1, *((int *) x));
x += inc_x;
- src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
+ src3 = (v4f32) __msa_insert_w((v4i32) src3, 2, *((int *) x));
x += inc_x;
- src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
+ src3 = (v4f32) __msa_insert_w((v4i32) src3, 3, *((int *) x));
x += inc_x;
sum_abs0 += AND_VEC_W(src0);
- sum_abs1 += AND_VEC_W(src4);
+ sum_abs1 += AND_VEC_W(src1);
+ sum_abs2 += AND_VEC_W(src2);
+ sum_abs3 += AND_VEC_W(src3);
}
- if (n & 4)
+ if (n & 15)
{
- src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
- x += inc_x;
- src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
- x += inc_x;
- src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
- x += inc_x;
- src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
- x += inc_x;
+ if (n & 8)
+ {
+ src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+ x += inc_x;
+ src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x));
+ x += inc_x;
+ src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x));
+ x += inc_x;
+ src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x));
+ x += inc_x;
- sum_abs0 += AND_VEC_W(src0);
- }
+ sum_abs0 += AND_VEC_W(src0);
+ sum_abs1 += AND_VEC_W(src1);
+ }
- sum_abs0 += sum_abs1;
+ if (n & 4)
+ {
+ src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+ x += inc_x;
+ src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+ x += inc_x;
- sumf += sum_abs0[0];
- sumf += sum_abs0[1];
- sumf += sum_abs0[2];
- sumf += sum_abs0[3];
+ sum_abs0 += AND_VEC_W(src0);
+ }
- if ((n & 2) && (n & 1))
- {
- data0 = fabsf(*x); x += inc_x;
- data1 = fabsf(*x); x += inc_x;
- data2 = fabsf(*x);
+ if (n & 2)
+ {
+ data0 = fabsf(*x); x += inc_x;
+ data1 = fabsf(*x); x += inc_x;
- sumf += data0;
- sumf += data1;
- sumf += data2;
- }
- else if (n & 2)
- {
- data0 = fabsf(*x); x += inc_x;
- data1 = fabsf(*x);
+ sumf += data0;
+ sumf += data1;
+ }
- sumf += data0;
- sumf += data1;
+ if (n & 1)
+ {
+ sumf += fabsf(*x);
+ }
}
- else if (n & 1)
- {
- data0 = fabsf(*x);
- sumf += data0;
- }
+ sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+
+ sumf += sum_abs0[0];
+ sumf += sum_abs0[1];
+ sumf += sum_abs0[2];
+ sumf += sum_abs0[3];
}
return (sumf);
#include "common.h"
#include "macros_msa.h"
-/* return float, x,y float */
#if defined(DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
{
BLASLONG i = 0;
double dot = 0.0;
- float x0, x1, x2, x3, y0, y1, y2, y3;
+ FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
v4f32 dot0 = {0, 0, 0, 0};
+ v4f32 dot1 = {0, 0, 0, 0};
+ v4f32 dot2 = {0, 0, 0, 0};
+ v4f32 dot3 = {0, 0, 0, 0};
- if (n < 0) return (dot);
+ if (n < 1) return (dot);
if ((1 == inc_x) && (1 == inc_y))
{
for (i = (n >> 5); i--;)
{
- LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
- LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+ LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+ LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 256(%[x])\n\t"
+ "pref 0, 288(%[x])\n\t"
+ "pref 0, 320(%[x])\n\t"
+ "pref 0, 352(%[x])\n\t"
+ "pref 0, 256(%[y])\n\t"
+ "pref 0, 288(%[y])\n\t"
+ "pref 0, 320(%[y])\n\t"
+ "pref 0, 352(%[y])\n\t"
+
+ : : [x] "r" (x), [y] "r" (y)
+ );
+#endif
dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
+ dot1 += (vy1 * vx1);
+ dot2 += (vy2 * vx2);
+ dot3 += (vy3 * vx3);
dot0 += (vy4 * vx4);
- dot0 += (vy5 * vx5);
- dot0 += (vy6 * vx6);
- dot0 += (vy7 * vx7);
+ dot1 += (vy5 * vx5);
+ dot2 += (vy6 * vx6);
+ dot3 += (vy7 * vx7);
}
if (n & 31)
{
- if ((n & 16) && (n & 8) && (n & 4))
+ if (n & 16)
{
- LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
- LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
+ LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+ LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
- dot0 += (vy4 * vx4);
- dot0 += (vy5 * vx5);
- dot0 += (vy6 * vx6);
+ dot1 += (vy1 * vx1);
+ dot2 += (vy2 * vx2);
+ dot3 += (vy3 * vx3);
}
- else if ((n & 16) && (n & 8))
- {
- LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5);
- LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5);
- dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
- dot0 += (vy4 * vx4);
- dot0 += (vy5 * vx5);
- }
- else if ((n & 16) && (n & 4))
+ if (n & 8)
{
- LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4);
- LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4);
+ LD_SP2_INC(x, 4, vx0, vx1);
+ LD_SP2_INC(y, 4, vy0, vy1);
dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
- dot0 += (vy4 * vx4);
+ dot1 += (vy1 * vx1);
}
- else if ((n & 8) && (n & 4))
- {
- LD_SP3_INC(x, 4, vx0, vx1, vx2);
- LD_SP3_INC(y, 4, vy0, vy1, vy2);
- dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- }
- else if (n & 16)
- {
- LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
- LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
-
- dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- dot0 += (vy2 * vx2);
- dot0 += (vy3 * vx3);
- }
- else if (n & 8)
- {
- LD_SP2_INC(x, 4, vx0, vx1);
- LD_SP2_INC(y, 4, vy0, vy1);
-
- dot0 += (vy0 * vx0);
- dot0 += (vy1 * vx1);
- }
- else if (n & 4)
+ if (n & 4)
{
vx0 = LD_SP(x); x += 4;
vy0 = LD_SP(y); y += 4;
dot0 += (vy0 * vx0);
}
- if ((n & 2) && (n & 1))
- {
- LD_GP3_INC(x, 1, x0, x1, x2);
- LD_GP3_INC(y, 1, y0, y1, y2);
-
- dot += (y0 * x0);
- dot += (y1 * x1);
- dot += (y2 * x2);
- }
- else if (n & 2)
+ if (n & 2)
{
LD_GP2_INC(x, 1, x0, x1);
LD_GP2_INC(y, 1, y0, y1);
dot += (y0 * x0);
dot += (y1 * x1);
}
- else if (n & 1)
+
+ if (n & 1)
{
x0 = *x;
y0 = *y;
}
}
+ dot0 += dot1 + dot2 + dot3;
+
dot += dot0[0];
dot += dot0[1];
dot += dot0[2];
dot += (y3 * x3);
}
- if ((n & 2) && (n & 1))
- {
- LD_GP3_INC(x, inc_x, x0, x1, x2);
- LD_GP3_INC(y, inc_y, y0, y1, y2);
-
- dot += (y0 * x0);
- dot += (y1 * x1);
- dot += (y2 * x2);
- }
- else if (n & 2)
+ if (n & 2)
{
LD_GP2_INC(x, inc_x, x0, x1);
LD_GP2_INC(y, inc_y, y0, y1);
dot += (y0 * x0);
dot += (y1 * x1);
}
- else if (n & 1)
+
+ if (n & 1)
{
x0 = *x;
y0 = *y;
#define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec))
-#define PROCESS_ZD(inc_val) \
- if (n > 8) \
- { \
- n -= 8; \
- \
- LD_DP8_INC(x, inc_val, src0, src1, src2, \
- src3, src4, src5, src6, src7); \
- \
- sum_abs0 = AND_VEC_D(src0); \
- sum_abs1 = AND_VEC_D(src1); \
- sum_abs2 = AND_VEC_D(src2); \
- sum_abs3 = AND_VEC_D(src3); \
- sum_abs0 += AND_VEC_D(src4); \
- sum_abs1 += AND_VEC_D(src5); \
- sum_abs2 += AND_VEC_D(src6); \
- sum_abs3 += AND_VEC_D(src7); \
- } \
- else \
- { \
- sum_abs0 = zero_v; \
- sum_abs1 = zero_v; \
- sum_abs2 = zero_v; \
- sum_abs3 = zero_v; \
- } \
- \
- for (i = (n >> 3); i--;) \
- { \
- LD_DP8_INC(x, inc_val, src0, src1, src2, \
- src3, src4, src5, src6, src7); \
- \
- sum_abs0 += AND_VEC_D(src0); \
- sum_abs1 += AND_VEC_D(src1); \
- sum_abs2 += AND_VEC_D(src2); \
- sum_abs3 += AND_VEC_D(src3); \
- sum_abs0 += AND_VEC_D(src4); \
- sum_abs1 += AND_VEC_D(src5); \
- sum_abs2 += AND_VEC_D(src6); \
- sum_abs3 += AND_VEC_D(src7); \
- } \
- \
- if (n & 7) \
- { \
- if ((n & 4) && (n & 2) && (n & 1)) \
- { \
- LD_DP7_INC(x, inc_val, src0, src1, src2, \
- src3, src4, src5, src6); \
- \
- sum_abs0 += AND_VEC_D(src0); \
- sum_abs1 += AND_VEC_D(src1); \
- sum_abs2 += AND_VEC_D(src2); \
- sum_abs3 += AND_VEC_D(src3); \
- sum_abs0 += AND_VEC_D(src4); \
- sum_abs1 += AND_VEC_D(src5); \
- sum_abs2 += AND_VEC_D(src6); \
- } \
- else if ((n & 4) && (n & 2)) \
- { \
- LD_DP6_INC(x, inc_val, src0, src1, src2, \
- src3, src4, src5); \
- \
- sum_abs0 += AND_VEC_D(src0); \
- sum_abs1 += AND_VEC_D(src1); \
- sum_abs2 += AND_VEC_D(src2); \
- sum_abs3 += AND_VEC_D(src3); \
- sum_abs0 += AND_VEC_D(src4); \
- sum_abs1 += AND_VEC_D(src5); \
- } \
- else if ((n & 4) && (n & 1)) \
- { \
- LD_DP5_INC(x, inc_val, src0, src1, src2, \
- src3, src4); \
- \
- sum_abs0 += AND_VEC_D(src0); \
- sum_abs1 += AND_VEC_D(src1); \
- sum_abs2 += AND_VEC_D(src2); \
- sum_abs3 += AND_VEC_D(src3); \
- sum_abs0 += AND_VEC_D(src4); \
- } \
- else if ((n & 2) && (n & 1)) \
- { \
- LD_DP3_INC(x, inc_val, src0, src1, src2); \
- \
- sum_abs0 += AND_VEC_D(src0); \
- sum_abs1 += AND_VEC_D(src1); \
- sum_abs2 += AND_VEC_D(src2); \
- } \
- else if (n & 4) \
- { \
- LD_DP4_INC(x, inc_val, src0, src1, src2, \
- src3); \
- \
- sum_abs0 += AND_VEC_D(src0); \
- sum_abs1 += AND_VEC_D(src1); \
- sum_abs2 += AND_VEC_D(src2); \
- sum_abs3 += AND_VEC_D(src3); \
- } \
- else if (n & 2) \
- { \
- LD_DP2_INC(x, inc_val, src0, src1); \
- \
- sum_abs0 += AND_VEC_D(src0); \
- sum_abs1 += AND_VEC_D(src1); \
- } \
- else if (n & 1) \
- { \
- src0 = LD_DP(x); \
- \
- sum_abs0 += AND_VEC_D(src0); \
- } \
- } \
- \
- sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; \
- sumf = sum_abs0[0] + sum_abs0[1];
-
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i;
FLOAT sumf = 0.0;
v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
- v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
- v2f64 zero_v = {0};
+ v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+ v2f64 sum_abs0 = {0, 0};
+ v2f64 sum_abs1 = {0, 0};
+ v2f64 sum_abs2 = {0, 0};
+ v2f64 sum_abs3 = {0, 0};
v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
if (n <= 0 || inc_x <= 0) return (sumf);
if (1 == inc_x)
{
- PROCESS_ZD(2);
+#ifdef ENABLE_PREFETCH
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ }
+ pref_offset = pref_offset / sizeof(FLOAT);
+ x_pref = x + pref_offset + 64;
+#endif
+
+ for (i = (n >> 4); i--;)
+ {
+#ifdef ENABLE_PREFETCH
+ __asm__ __volatile__(
+ "pref 0, 0(%[x_pref])\n\t"
+ "pref 0, 32(%[x_pref])\n\t"
+ "pref 0, 64(%[x_pref])\n\t"
+ "pref 0, 96(%[x_pref])\n\t"
+ "pref 0, 128(%[x_pref])\n\t"
+ "pref 0, 160(%[x_pref])\n\t"
+ "pref 0, 192(%[x_pref])\n\t"
+ "pref 0, 224(%[x_pref])\n\t"
+
+ : : [x_pref] "r" (x_pref)
+ );
+
+ x_pref += 32;
+#endif
+
+ LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
+ sum_abs0 += AND_VEC_D(src8);
+ sum_abs1 += AND_VEC_D(src9);
+ sum_abs2 += AND_VEC_D(src10);
+ sum_abs3 += AND_VEC_D(src11);
+ sum_abs0 += AND_VEC_D(src12);
+ sum_abs1 += AND_VEC_D(src13);
+ sum_abs2 += AND_VEC_D(src14);
+ sum_abs3 += AND_VEC_D(src15);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(x, 2, src0, src1, src2, src3);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(x, 2, src0, src1);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ }
+
+ if (n & 1)
+ {
+ src0 = LD_DP(x);
+
+ sum_abs0 += AND_VEC_D(src0);
+ }
+ }
+
+ sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+ sumf = sum_abs0[0] + sum_abs0[1];
}
else
{
inc_x *= 2;
- PROCESS_ZD(inc_x);
+
+ for (i = (n >> 4); i--;)
+ {
+ LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
+ sum_abs0 += AND_VEC_D(src8);
+ sum_abs1 += AND_VEC_D(src9);
+ sum_abs2 += AND_VEC_D(src10);
+ sum_abs3 += AND_VEC_D(src11);
+ sum_abs0 += AND_VEC_D(src12);
+ sum_abs1 += AND_VEC_D(src13);
+ sum_abs2 += AND_VEC_D(src14);
+ sum_abs3 += AND_VEC_D(src15);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ sum_abs0 += AND_VEC_D(src4);
+ sum_abs1 += AND_VEC_D(src5);
+ sum_abs2 += AND_VEC_D(src6);
+ sum_abs3 += AND_VEC_D(src7);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(x, inc_x, src0, src1, src2, src3);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ sum_abs2 += AND_VEC_D(src2);
+ sum_abs3 += AND_VEC_D(src3);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(x, inc_x, src0, src1);
+
+ sum_abs0 += AND_VEC_D(src0);
+ sum_abs1 += AND_VEC_D(src1);
+ }
+
+ if (n & 1)
+ {
+ src0 = LD_DP(x);
+
+ sum_abs0 += AND_VEC_D(src0);
+ }
+ }
+
+ sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+ sumf = sum_abs0[0] + sum_abs0[1];
}
return (sumf);
#include "macros_msa.h"
#if !defined(CONJ)
- #define OP2 +=
- #define OP3 -
- #define OP4 +
+ #define OP1 -=
+ #define OP2 +=
+ #define OP3 -
+ #define OP4 +
#else
- #define OP2 -=
- #define OP3 +
- #define OP4 -
+ #define OP1 +=
+ #define OP2 -=
+ #define OP3 +
+ #define OP4 -
#endif
-#define DOT16_KERNEL(OPR0, OPR1) \
- dot0 += (vx0r * vy0r); \
- dot0 OPR0## = (vx0i * vy0i); \
- dot1 OPR1## = (vx0i * vy0r); \
- dot1 += (vx0r * vy0i); \
- \
- dot0 += (vx1r * vy1r); \
- dot0 OPR0## = (vx1i * vy1i); \
- dot1 OPR1## = (vx1i * vy1r); \
- dot1 += (vx1r * vy1i); \
- \
- dot0 += (vx2r * vy2r); \
- dot0 OPR0## = (vx2i * vy2i); \
- dot1 OPR1## = (vx2i * vy2r); \
- dot1 += (vx2r * vy2i); \
- \
- dot0 += (vx3r * vy3r); \
- dot0 OPR0## = (vx3i * vy3i); \
- dot1 OPR1## = (vx3i * vy3r); \
- dot1 += (vx3r * vy3i);
-
-#define DOT12_KERNEL(OPR0, OPR1) \
- dot0 += (vx0r * vy0r); \
- dot0 OPR0## = (vx0i * vy0i); \
- dot1 OPR1## = (vx0i * vy0r); \
- dot1 += (vx0r * vy0i); \
- \
- dot0 += (vx1r * vy1r); \
- dot0 OPR0## = (vx1i * vy1i); \
- dot1 OPR1## = (vx1i * vy1r); \
- dot1 += (vx1r * vy1i); \
- \
- dot0 += (vx2r * vy2r); \
- dot0 OPR0## = (vx2i * vy2i); \
- dot1 OPR1## = (vx2i * vy2r); \
- dot1 += (vx2r * vy2i);
-
-#define DOT8_KERNEL(OPR0, OPR1) \
- dot0 += (vx0r * vy0r); \
- dot0 OPR0## = (vx0i * vy0i); \
- dot1 OPR1## = (vx0i * vy0r); \
- dot1 += (vx0r * vy0i); \
- \
- dot0 += (vx1r * vy1r); \
- dot0 OPR0## = (vx1i * vy1i); \
- dot1 OPR1## = (vx1i * vy1r); \
- dot1 += (vx1r * vy1i);
-
-#define DOT4_KERNEL(OPR0, OPR1) \
- dot0 += (vx0r * vy0r); \
- dot0 OPR0## = (vx0i * vy0i); \
- dot1 OPR1## = (vx0i * vy0r); \
- dot1 += (vx0r * vy0i);
-
-/* return double, x,y double */
-/* zdotc - CONJ */
-/* zdotu - !CONJ */
-
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i = 0;
FLOAT dot[2];
- BLASLONG inc_x2;
- BLASLONG inc_y2;
+ BLASLONG inc_x2, inc_y2;
v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
- v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
- v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+ v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+ v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
v2f64 dot0 = {0, 0};
v2f64 dot1 = {0, 0};
+ v2f64 dot2 = {0, 0};
+ v2f64 dot3 = {0, 0};
+ v2f64 dot4 = {0, 0};
+ v2f64 dot5 = {0, 0};
+ v2f64 dot6 = {0, 0};
+ v2f64 dot7 = {0, 0};
v2f64 zero = {0, 0};
- openblas_complex_double result;
+ OPENBLAS_COMPLEX_FLOAT result;
dot[0] = 0.0;
dot[1] = 0.0;
- __real__(result) = 0.0;
- __imag__(result) = 0.0;
+ CREAL(result) = 0.0;
+ CIMAG(result) = 0.0;
- if ( n < 1 ) return(result);
+ if (n < 1) return (result);
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
- for (i = (n >> 3); i--;)
- {
- LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
- LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
-
- PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
- PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
- PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
- PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
-
- PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
- PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
- PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
- PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
-
- #if !defined(CONJ)
- DOT16_KERNEL(-, +);
- #else
- DOT16_KERNEL(+, -);
- #endif
- }
-
- if (n & 7)
- {
- if ((n & 4) && (n & 2))
- {
- LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
- LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
- LD_DP2_INC(x, inc_x2, vx4, vx5);
- LD_DP2_INC(y, inc_y2, vy4, vy5);
-
- PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
- PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
- PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
-
- PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
- PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
- PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
-
- #if !defined(CONJ)
- DOT12_KERNEL(-, +);
- #else
- DOT12_KERNEL(+, -);
- #endif
- }
- else if (n & 4)
- {
- LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
- LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
-
- PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
- PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
-
- PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
- PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
-
- #if !defined(CONJ)
- DOT8_KERNEL(-, +);
- #else
- DOT8_KERNEL(+, -);
- #endif
- }
- else if (n & 2)
- {
- LD_DP2_INC(x, inc_x2, vx0, vx1);
- LD_DP2_INC(y, inc_y2, vy0, vy1);
- PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
- PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
-
- #if !defined(CONJ)
- DOT4_KERNEL(-, +);
- #else
- DOT4_KERNEL(+, -);
- #endif
- }
-
- if (n & 1)
- {
- vx0 = LD_DP(x);
- vy0 = LD_DP(y);
- PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i);
- PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i);
-
- #if !defined(CONJ)
- DOT4_KERNEL(-, +);
- #else
- DOT4_KERNEL(+, -);
- #endif
- }
- }
-
- dot[0] += (dot0[0] + dot0[1]);
- dot[1] += (dot1[0] + dot1[1]);
-
- __real__(result) = dot[0];
- __imag__(result) = dot[1];
-
- return(result);
+
+#ifdef ENABLE_PREFETCH
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ double *x_pref, *y_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ }
+ pref_offset = pref_offset / sizeof(double);
+ x_pref = x + pref_offset + 32;
+
+ pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ }
+ pref_offset = pref_offset / sizeof(double);
+ y_pref = y + pref_offset + 32;
+
+ for (i = (n >> 3); i--;)
+ {
+ __asm__ __volatile__(
+ "pref 0, 0(%[x_pref])\n\t"
+ "pref 0, 32(%[x_pref])\n\t"
+ "pref 0, 64(%[x_pref])\n\t"
+ "pref 0, 96(%[x_pref])\n\t"
+ "pref 0, 0(%[y_pref])\n\t"
+ "pref 0, 32(%[y_pref])\n\t"
+ "pref 0, 64(%[y_pref])\n\t"
+ "pref 0, 96(%[y_pref])\n\t"
+
+ : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref)
+ );
+
+ x_pref += 16;
+ y_pref += 16;
+
+ LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+ LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+ PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+ PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+ PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
+
+ PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+ PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+ PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
+
+ dot0 += (vx0r * vy0r);
+ dot0 OP1 (vx0i * vy0i);
+ dot1 OP2 (vx0i * vy0r);
+ dot1 += (vx0r * vy0i);
+
+ dot2 += (vx1r * vy1r);
+ dot2 OP1 (vx1i * vy1i);
+ dot3 OP2 (vx1i * vy1r);
+ dot3 += (vx1r * vy1i);
+
+ dot4 += (vx2r * vy2r);
+ dot4 OP1 (vx2i * vy2i);
+ dot5 OP2 (vx2i * vy2r);
+ dot5 += (vx2r * vy2i);
+
+ dot6 += (vx3r * vy3r);
+ dot6 OP1 (vx3i * vy3i);
+ dot7 OP2 (vx3i * vy3r);
+ dot7 += (vx3r * vy3i);
+ }
+ }
+ else
+#endif
+ for (i = (n >> 3); i--;)
+ {
+ LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+ LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+ PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+ PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+ PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
+
+ PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+ PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+ PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
+
+ dot0 += (vx0r * vy0r);
+ dot0 OP1 (vx0i * vy0i);
+ dot1 OP2 (vx0i * vy0r);
+ dot1 += (vx0r * vy0i);
+
+ dot2 += (vx1r * vy1r);
+ dot2 OP1 (vx1i * vy1i);
+ dot3 OP2 (vx1i * vy1r);
+ dot3 += (vx1r * vy1i);
+
+ dot4 += (vx2r * vy2r);
+ dot4 OP1 (vx2i * vy2i);
+ dot5 OP2 (vx2i * vy2r);
+ dot5 += (vx2r * vy2i);
+
+ dot6 += (vx3r * vy3r);
+ dot6 OP1 (vx3i * vy3i);
+ dot7 OP2 (vx3i * vy3r);
+ dot7 += (vx3r * vy3i);
+ }
+
+ if (n & 7)
+ {
+ if (n & 4)
+ {
+ LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
+ LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
+
+ PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+
+ PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+ PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+
+ dot0 += (vx0r * vy0r);
+ dot0 OP1 (vx0i * vy0i);
+ dot1 OP2 (vx0i * vy0r);
+ dot1 += (vx0r * vy0i);
+
+ dot2 += (vx1r * vy1r);
+ dot2 OP1 (vx1i * vy1i);
+ dot3 OP2 (vx1i * vy1r);
+ dot3 += (vx1r * vy1i);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(x, inc_x2, vx0, vx1);
+ LD_DP2_INC(y, inc_y2, vy0, vy1);
+ PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+
+ dot0 += (vx0r * vy0r);
+ dot0 OP1 (vx0i * vy0i);
+ dot1 OP2 (vx0i * vy0r);
+ dot1 += (vx0r * vy0i);
+ }
+
+ if (n & 1)
+ {
+ vx0 = LD_DP(x);
+ vy0 = LD_DP(y);
+ PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i);
+ PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i);
+
+ dot0 += (vx0r * vy0r);
+ dot0 OP1 (vx0i * vy0i);
+ dot1 OP2 (vx0i * vy0r);
+ dot1 += (vx0r * vy0i);
+ }
+ }
+
+ dot0 += dot2 + dot4 + dot6;
+ dot1 += dot3 + dot5 + dot7;
+
+ dot[0] += (dot0[0] + dot0[1]);
+ dot[1] += (dot1[0] + dot1[1]);
+
+ CREAL(result) = dot[0];
+ CIMAG(result) = dot[1];
+
+ return (result);
}