Add data prefetch in DOT and ASUM functions
author: kaustubh <kaustubh.raste@imgtec.com>
Tue, 22 Nov 2016 05:51:03 +0000 (11:21 +0530)
committer: kaustubh <kaustubh.raste@imgtec.com>
Tue, 22 Nov 2016 05:51:03 +0000 (11:21 +0530)
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
kernel/mips/casum_msa.c
kernel/mips/cdot_msa.c
kernel/mips/dasum_msa.c
kernel/mips/ddot_msa.c
kernel/mips/sasum_msa.c
kernel/mips/sdot_msa.c
kernel/mips/zasum_msa.c
kernel/mips/zdot_msa.c

index 454573d..5bb9483 100644 (file)
@@ -36,40 +36,51 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     BLASLONG i, inc_x2;
     FLOAT sumf = 0.0;
     v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
-    v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
-    v4f32 zero_v = {0};
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+    v4f32 sum_abs0 = {0, 0, 0, 0};
+    v4f32 sum_abs1 = {0, 0, 0, 0};
+    v4f32 sum_abs2 = {0, 0, 0, 0};
+    v4f32 sum_abs3 = {0, 0, 0, 0};
     v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
 
     if (n <= 0 || inc_x <= 0) return (sumf);
 
     if (1 == inc_x)
     {
-        if (n > 15)
-        {
-            n -= 16;
-
-            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+#ifdef ENABLE_PREFETCH
+        FLOAT *x_pref;
+        BLASLONG pref_offset;
 
-            sum_abs0 = AND_VEC_W(src0);
-            sum_abs1 = AND_VEC_W(src1);
-            sum_abs2 = AND_VEC_W(src2);
-            sum_abs3 = AND_VEC_W(src3);
-            sum_abs0 += AND_VEC_W(src4);
-            sum_abs1 += AND_VEC_W(src5);
-            sum_abs2 += AND_VEC_W(src6);
-            sum_abs3 += AND_VEC_W(src7);
-        }
-        else
+        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
         {
-            sum_abs0 = zero_v;
-            sum_abs1 = zero_v;
-            sum_abs2 = zero_v;
-            sum_abs3 = zero_v;
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
         }
+        pref_offset = pref_offset / sizeof(FLOAT);
+        x_pref = x + pref_offset + 128;
+#endif
 
-        for (i = (n >> 4); i--;)
+        for (i = (n >> 5); i--;)
         {
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref   0,     0(%[x_pref])\n\t"
+                "pref   0,    32(%[x_pref])\n\t"
+                "pref   0,    64(%[x_pref])\n\t"
+                "pref   0,    96(%[x_pref])\n\t"
+                "pref   0,   128(%[x_pref])\n\t"
+                "pref   0,   160(%[x_pref])\n\t"
+                "pref   0,   192(%[x_pref])\n\t"
+                "pref   0,   224(%[x_pref])\n\t"
+
+                : : [x_pref] "r" (x_pref)
+            );
+
+            x_pref += 64;
+#endif
+
             LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+            LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15);
 
             sum_abs0 += AND_VEC_W(src0);
             sum_abs1 += AND_VEC_W(src1);
@@ -79,13 +90,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             sum_abs1 += AND_VEC_W(src5);
             sum_abs2 += AND_VEC_W(src6);
             sum_abs3 += AND_VEC_W(src7);
+            sum_abs0 += AND_VEC_W(src8);
+            sum_abs1 += AND_VEC_W(src9);
+            sum_abs2 += AND_VEC_W(src10);
+            sum_abs3 += AND_VEC_W(src11);
+            sum_abs0 += AND_VEC_W(src12);
+            sum_abs1 += AND_VEC_W(src13);
+            sum_abs2 += AND_VEC_W(src14);
+            sum_abs3 += AND_VEC_W(src15);
         }
 
-        if (n & 15)
+        if (n & 31)
         {
-            if ((n & 8) && (n & 4) && (n & 2))
+            if (n & 16)
             {
-                LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
+                LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
 
                 sum_abs0 += AND_VEC_W(src0);
                 sum_abs1 += AND_VEC_W(src1);
@@ -94,65 +113,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs0 += AND_VEC_W(src4);
                 sum_abs1 += AND_VEC_W(src5);
                 sum_abs2 += AND_VEC_W(src6);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf = sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
+                sum_abs3 += AND_VEC_W(src7);
             }
-            else if ((n & 8) && (n & 4))
-            {
-                LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
-
-                sum_abs0 += AND_VEC_W(src0);
-                sum_abs1 += AND_VEC_W(src1);
-                sum_abs2 += AND_VEC_W(src2);
-                sum_abs3 += AND_VEC_W(src3);
-                sum_abs0 += AND_VEC_W(src4);
-                sum_abs1 += AND_VEC_W(src5);
 
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf = sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
-            }
-            else if ((n & 8) && (n & 2))
-            {
-                LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
-
-                sum_abs0 += AND_VEC_W(src0);
-                sum_abs1 += AND_VEC_W(src1);
-                sum_abs2 += AND_VEC_W(src2);
-                sum_abs3 += AND_VEC_W(src3);
-                sum_abs0 += AND_VEC_W(src4);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf = sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
-            }
-            else if ((n & 4) && (n & 2))
-            {
-                LD_SP3_INC(x, 4, src0, src1, src2);
-
-                sum_abs0 += AND_VEC_W(src0);
-                sum_abs1 += AND_VEC_W(src1);
-                sum_abs2 += AND_VEC_W(src2);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf = sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
-            }
-            else if (n & 8)
+            if (n & 8)
             {
                 LD_SP4_INC(x, 4, src0, src1, src2, src3);
 
@@ -160,97 +124,45 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs1 += AND_VEC_W(src1);
                 sum_abs2 += AND_VEC_W(src2);
                 sum_abs3 += AND_VEC_W(src3);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf = sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
             }
-            else if (n & 4)
+
+            if (n & 4)
             {
                 LD_SP2_INC(x, 4, src0, src1);
 
                 sum_abs0 += AND_VEC_W(src0);
                 sum_abs1 += AND_VEC_W(src1);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf = sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
             }
-            else if (n & 2)
+
+            if (n & 2)
             {
                 src0 = LD_SP(x); x += 4;
 
                 sum_abs0 += AND_VEC_W(src0);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf = sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
-            }
-            else
-            {
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf = sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
             }
 
             if (n & 1)
             {
-                sumf += fabsf(*(x + 0));
+                sumf += fabsf(*x);
                 sumf += fabsf(*(x + 1));
             }
         }
-        else
-        {
-            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
 
-            sumf = sum_abs0[0];
-            sumf += sum_abs0[1];
-            sumf += sum_abs0[2];
-            sumf += sum_abs0[3];
-        }
+        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+
+        sumf += sum_abs0[0];
+        sumf += sum_abs0[1];
+        sumf += sum_abs0[2];
+        sumf += sum_abs0[3];
     }
     else
     {
         inc_x2 = 2 * inc_x;
 
-        if (n > 8)
-        {
-            n -= 8;
-
-            LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
-
-            sum_abs0 = AND_VEC_W(src0);
-            sum_abs1 = AND_VEC_W(src1);
-            sum_abs2 = AND_VEC_W(src2);
-            sum_abs3 = AND_VEC_W(src3);
-            sum_abs0 += AND_VEC_W(src4);
-            sum_abs1 += AND_VEC_W(src5);
-            sum_abs2 += AND_VEC_W(src6);
-            sum_abs3 += AND_VEC_W(src7);
-        }
-        else
-        {
-            sum_abs0 = zero_v;
-            sum_abs1 = zero_v;
-            sum_abs2 = zero_v;
-            sum_abs3 = zero_v;
-        }
-
-        for (i = (n >> 3); i--;)
+        for (i = (n >> 4); i--;)
         {
             LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
+            LD_SP8_INC(x, inc_x2, src8, src9, src10, src11, src12, src13, src14, src15);
 
             sum_abs0 += AND_VEC_W(src0);
             sum_abs1 += AND_VEC_W(src1);
@@ -260,13 +172,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             sum_abs1 += AND_VEC_W(src5);
             sum_abs2 += AND_VEC_W(src6);
             sum_abs3 += AND_VEC_W(src7);
+            sum_abs0 += AND_VEC_W(src8);
+            sum_abs1 += AND_VEC_W(src9);
+            sum_abs2 += AND_VEC_W(src10);
+            sum_abs3 += AND_VEC_W(src11);
+            sum_abs0 += AND_VEC_W(src12);
+            sum_abs1 += AND_VEC_W(src13);
+            sum_abs2 += AND_VEC_W(src14);
+            sum_abs3 += AND_VEC_W(src15);
         }
 
-        if (n & 7)
+        if (n & 15)
         {
-            if ((n & 4) && (n & 2) && (n & 1))
+            if (n & 8)
             {
-                LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6);
+                LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
 
                 sum_abs0 += AND_VEC_W(src0);
                 sum_abs1 += AND_VEC_W(src1);
@@ -275,37 +195,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs0 += AND_VEC_W(src4);
                 sum_abs1 += AND_VEC_W(src5);
                 sum_abs2 += AND_VEC_W(src6);
+                sum_abs3 += AND_VEC_W(src7);
             }
-            else if ((n & 4) && (n & 2))
-            {
-                LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5);
 
-                sum_abs0 += AND_VEC_W(src0);
-                sum_abs1 += AND_VEC_W(src1);
-                sum_abs2 += AND_VEC_W(src2);
-                sum_abs3 += AND_VEC_W(src3);
-                sum_abs0 += AND_VEC_W(src4);
-                sum_abs1 += AND_VEC_W(src5);
-            }
-            else if ((n & 4) && (n & 1))
-            {
-                LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4);
-
-                sum_abs0 += AND_VEC_W(src0);
-                sum_abs1 += AND_VEC_W(src1);
-                sum_abs2 += AND_VEC_W(src2);
-                sum_abs3 += AND_VEC_W(src3);
-                sum_abs0 += AND_VEC_W(src4);
-            }
-            else if ((n & 2) && (n & 1))
-            {
-                LD_SP3_INC(x, inc_x2, src0, src1, src2);
-
-                sum_abs0 += AND_VEC_W(src0);
-                sum_abs1 += AND_VEC_W(src1);
-                sum_abs2 += AND_VEC_W(src2);
-            }
-            else if (n & 4)
+            if (n & 4)
             {
                 LD_SP4_INC(x, inc_x2, src0, src1, src2, src3);
 
@@ -314,22 +207,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs2 += AND_VEC_W(src2);
                 sum_abs3 += AND_VEC_W(src3);
             }
-            else if (n & 2)
+
+            if (n & 2)
             {
                 LD_SP2_INC(x, inc_x2, src0, src1);
 
                 sum_abs0 += AND_VEC_W(src0);
                 sum_abs1 += AND_VEC_W(src1);
             }
-            else if (n & 1)
+
+            if (n & 1)
             {
-                src0 = LD_SP(x); x += inc_x2;
+                src0 = LD_SP(x);
 
                 sum_abs0 += AND_VEC_W(src0);
             }
         }
 
-        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
 
         sumf = sum_abs0[0] + sum_abs0[1];
     }
index bf9f6b7..2079c9e 100644 (file)
@@ -29,333 +29,274 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "macros_msa.h"
 
 #if !defined(CONJ)
-       #define OP2             +=
-       #define OP3             -
-       #define OP4             +
+    #define OP1     -=
+    #define OP2     +=
+    #define OP3     -
+    #define OP4     +
 #else
-       #define OP2             -=
-       #define OP3             +
-       #define OP4             -
+    #define OP1     +=
+    #define OP2     -=
+    #define OP3     +
+    #define OP4     -
 #endif
 
-#define DOT16_KERNEL(OPR0, OPR1)  \
-       dot0 += (vx0r * vy0r);            \
-       dot0 OPR0## = (vx0i * vy0i);  \
-       dot1 OPR1## = (vx0i * vy0r);  \
-       dot1 += (vx0r * vy0i);        \
-                                                                 \
-       dot0 += (vx1r * vy1r);        \
-       dot0 OPR0## = (vx1i * vy1i);  \
-       dot1 OPR1## = (vx1i * vy1r);  \
-       dot1 += (vx1r * vy1i);        \
-                                                                 \
-       dot0 += (vx2r * vy2r);        \
-       dot0 OPR0## = (vx2i * vy2i);  \
-       dot1 OPR1## = (vx2i * vy2r);  \
-       dot1 += (vx2r * vy2i);        \
-                                                                 \
-       dot0 += (vx3r * vy3r);        \
-       dot0 OPR0## = (vx3i * vy3i);  \
-       dot1 OPR1## = (vx3i * vy3r);  \
-       dot1 += (vx3r * vy3i);
-
-#define DOT12_KERNEL(OPR0, OPR1)  \
-       dot0 += (vx0r * vy0r);            \
-       dot0 OPR0## = (vx0i * vy0i);  \
-       dot1 OPR1## = (vx0i * vy0r);  \
-       dot1 += (vx0r * vy0i);        \
-                                                                 \
-       dot0 += (vx1r * vy1r);        \
-       dot0 OPR0## = (vx1i * vy1i);  \
-       dot1 OPR1## = (vx1i * vy1r);  \
-       dot1 += (vx1r * vy1i);            \
-                                                                 \
-       dot0 += (vx2r * vy2r);        \
-       dot0 OPR0## = (vx2i * vy2i);  \
-       dot1 OPR1## = (vx2i * vy2r);  \
-       dot1 += (vx2r * vy2i);
-
-#define DOT8_KERNEL(OPR0, OPR1)   \
-       dot0 += (vx0r * vy0r);            \
-       dot0 OPR0## = (vx0i * vy0i);  \
-       dot1 OPR1## = (vx0i * vy0r);  \
-       dot1 += (vx0r * vy0i);        \
-                                                                 \
-       dot0 += (vx1r * vy1r);        \
-       dot0 OPR0## = (vx1i * vy1i);  \
-       dot1 OPR1## = (vx1i * vy1r);  \
-       dot1 += (vx1r * vy1i);
-
-#define DOT4_KERNEL(OPR0, OPR1)   \
-       dot0 += (vx0r * vy0r);            \
-       dot0 OPR0## = (vx0i * vy0i);  \
-       dot1 OPR1## = (vx0i * vy0r);  \
-       dot1 += (vx0r * vy0i);
-
-/* return float, x,y float */
-/* cdotc -  CONJ */
-/* cdotu - !CONJ */
-#ifndef _MSC_VER
-#include <complex.h>
-FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-#else
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-#endif
 {
     BLASLONG i = 0;
     FLOAT dot[2];
-    BLASLONG inc_x2;
-    BLASLONG inc_y2;
+    BLASLONG inc_x2, inc_y2;
     FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
     FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
     v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
     v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
-       v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
-       v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+    v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+    v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
     v4f32 dot0 = {0, 0, 0, 0};
     v4f32 dot1 = {0, 0, 0, 0};
-    openblas_complex_float result;
+    v4f32 dot2 = {0, 0, 0, 0};
+    v4f32 dot3 = {0, 0, 0, 0};
+    v4f32 dot4 = {0, 0, 0, 0};
+    v4f32 dot5 = {0, 0, 0, 0};
+    v4f32 dot6 = {0, 0, 0, 0};
+    v4f32 dot7 = {0, 0, 0, 0};
+    OPENBLAS_COMPLEX_FLOAT result;
 
     dot[0] = 0.0;
     dot[1] = 0.0;
 
-    __real__(result) = 0.0;
-    __imag__(result) = 0.0;
+    CREAL(result) = 0.0;
+    CIMAG(result) = 0.0;
 
-    if ( n < 1 ) return(result);
+    if (n < 1) return (result);
 
     if ((1 == inc_x) && (1 == inc_y))
     {
+#ifdef ENABLE_PREFETCH
+        FLOAT *x_pref, *y_pref;
+        BLASLONG pref_offset;
+
+        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+        }
+        pref_offset = pref_offset / sizeof(FLOAT);
+        x_pref = x + pref_offset + 64;
+
+        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+        }
+        pref_offset = pref_offset / sizeof(FLOAT);
+        y_pref = y + pref_offset + 64;
+#endif
+
         for (i = (n >> 4); i--;)
         {
-                       LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
-                       LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
-
-                       PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
-                       PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
-                       PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
-                       PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);
-
-                       PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
-                       PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
-                       PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
-                       PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);
-
-               #if !defined(CONJ)
-                       DOT16_KERNEL(-, +);
-               #else
-                       DOT16_KERNEL(+, -);
-               #endif
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref   0,   0(%[x_pref])\n\t"
+                "pref   0,  32(%[x_pref])\n\t"
+                "pref   0,  64(%[x_pref])\n\t"
+                "pref   0,  96(%[x_pref])\n\t"
+                "pref   0,   0(%[y_pref])\n\t"
+                "pref   0,  32(%[y_pref])\n\t"
+                "pref   0,  64(%[y_pref])\n\t"
+                "pref   0,  96(%[y_pref])\n\t"
+
+                : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref)
+            );
+
+            x_pref += 32;
+            y_pref += 32;
+#endif
+
+            LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+            LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+            PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+            PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
+            PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);
+
+            PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+            PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+            PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
+            PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);
+
+            dot0 += (vx0r * vy0r);
+            dot0 OP1 (vx0i * vy0i);
+            dot1 OP2 (vx0i * vy0r);
+            dot1 += (vx0r * vy0i);
+
+            dot2 += (vx1r * vy1r);
+            dot2 OP1 (vx1i * vy1i);
+            dot3 OP2 (vx1i * vy1r);
+            dot3 += (vx1r * vy1i);
+
+            dot4 += (vx2r * vy2r);
+            dot4 OP1 (vx2i * vy2i);
+            dot5 OP2 (vx2i * vy2r);
+            dot5 += (vx2r * vy2i);
+
+            dot6 += (vx3r * vy3r);
+            dot6 OP1 (vx3i * vy3i);
+            dot7 OP2 (vx3i * vy3r);
+            dot7 += (vx3r * vy3i);
         }
 
         if (n & 15)
         {
-            if ((n & 8) && (n & 4))
+            if (n & 8)
             {
-                               LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
-                               LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
-                               LD_SP2_INC(x, 4, vx4, vx5);
-                               LD_SP2_INC(y, 4, vy4, vy5);
-
-                               PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
-                               PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
-                               PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
-
-                               PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
-                               PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
-                               PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
-
-                       #if !defined(CONJ)
-                               DOT12_KERNEL(-, +);
-                       #else
-                               DOT12_KERNEL(+, -);
-                       #endif
+                LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+                LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+
+                PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+                PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+
+                PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+                PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+
+                dot0 += (vx0r * vy0r);
+                dot0 OP1 (vx0i * vy0i);
+                dot1 OP2 (vx0i * vy0r);
+                dot1 += (vx0r * vy0i);
+
+                dot2 += (vx1r * vy1r);
+                dot2 OP1 (vx1i * vy1i);
+                dot3 OP2 (vx1i * vy1r);
+                dot3 += (vx1r * vy1i);
             }
-            else if (n & 8)
+
+            if (n & 4)
             {
-                               LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
-                               LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+                LD_SP2_INC(x, 4, vx0, vx1);
+                LD_SP2_INC(y, 4, vy0, vy1);
+                PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+                PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+
+                dot0 += (vx0r * vy0r);
+                dot0 OP1 (vx0i * vy0i);
+                dot1 OP2 (vx0i * vy0r);
+                dot1 += (vx0r * vy0i);
+            }
 
-                               PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
-                               PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+            if (n & 2)
+            {
+                LD_GP4_INC(x, 1, x0, x1, x2, x3);
+                LD_GP4_INC(y, 1, y0, y1, y2, y3);
 
-                               PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
-                               PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+                dot[0] += (x0 * y0 OP3 x1 * y1);
+                dot[1] OP2 (x1 * y0 OP4 x0 * y1);
 
-                       #if !defined(CONJ)
-                               DOT8_KERNEL(-, +);
-                       #else
-                               DOT8_KERNEL(+, -);
-                       #endif
+                dot[0] += (x2 * y2 OP3 x3 * y3);
+                dot[1] OP2 (x3 * y2 OP4 x2 * y3);
             }
-                       else if (n & 4)
+
+            if (n & 1)
             {
-                               LD_SP2_INC(x, 4, vx0, vx1);
-                               LD_SP2_INC(y, 4, vy0, vy1);
-                               PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
-                               PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
-
-                       #if !defined(CONJ)
-                               DOT4_KERNEL(-, +);
-                       #else
-                               DOT4_KERNEL(+, -);
-                       #endif
+                LD_GP2_INC(x, 1, x0, x1);
+                LD_GP2_INC(y, 1, y0, y1);
+
+                dot[0] += (x0 * y0 OP3 x1 * y1);
+                dot[1] OP2 (x1 * y0 OP4 x0 * y1);
             }
+        }
 
-                       if ((n & 2) && (n & 1))
-                       {
-                LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5);
-                LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5);
+        dot0 += dot2 + dot4 + dot6;
+        dot1 += dot3 + dot5 + dot7;
 
-                               dot[0] += ( x0 * y0 OP3 x1 * y1 );
-                               dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+        dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
+        dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
+    }
+    else
+    {
+        inc_x2 = 2 * inc_x;
+        inc_y2 = 2 * inc_y;
 
-                               dot[0] += ( x2 * y2 OP3 x3 * y3 );
-                               dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+        for (i = (n >> 2); i--;)
+        {
+            x0 = *x;
+            x1 = *(x + 1);
+            x += inc_x2;
+            x2 = *x;
+            x3 = *(x + 1);
+            x += inc_x2;
+            x4 = *x;
+            x5 = *(x + 1);
+            x += inc_x2;
+            x6 = *x;
+            x7 = *(x + 1);
+            x += inc_x2;
+
+            y0 = *y;
+            y1 = *(y + 1);
+            y += inc_y2;
+            y2 = *y;
+            y3 = *(y + 1);
+            y += inc_y2;
+            y4 = *y;
+            y5 = *(y + 1);
+            y += inc_y2;
+            y6 = *y;
+            y7 = *(y + 1);
+            y += inc_y2;
+
+            dot[0] += (x0 * y0 OP3 x1 * y1);
+            dot[1] OP2 (x1 * y0 OP4 x0 * y1);
+
+            dot[0] += (x2 * y2 OP3 x3 * y3);
+            dot[1] OP2 (x3 * y2 OP4 x2 * y3);
+
+            dot[0] += (x4 * y4 OP3 x5 * y5);
+            dot[1] OP2 (x5 * y4 OP4 x4 * y5);
+
+            dot[0] += (x6 * y6 OP3 x7 * y7);
+            dot[1] OP2 (x7 * y6 OP4 x6 * y7);
+        }
 
-                               dot[0] += ( x4 * y4 OP3 x5 * y5 );
-                               dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
-                       }
-                       else if (n & 2)
-                       {
-                LD_GP4_INC(x, 1, x0, x1, x2, x3);
-                LD_GP4_INC(y, 1, y0, y1, y2, y3);
+        if (n & 2)
+        {
+            x0 = *x;
+            x1 = *(x + 1);
+            x += inc_x2;
+            x2 = *x;
+            x3 = *(x + 1);
+            x += inc_x2;
+
+            y0 = *y;
+            y1 = *(y + 1);
+            y += inc_y2;
+            y2 = *y;
+            y3 = *(y + 1);
+            y += inc_y2;
+
+            dot[0] += (x0 * y0 OP3 x1 * y1);
+            dot[1] OP2 (x1 * y0 OP4 x0 * y1);
+
+            dot[0] += (x2 * y2 OP3 x3 * y3);
+            dot[1] OP2 (x3 * y2 OP4 x2 * y3);
+        }
 
-                               dot[0] += ( x0 * y0 OP3 x1 * y1 );
-                               dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+        if (n & 1)
+        {
+            x0 = *x;
+            x1 = *(x + 1);
+            x += inc_x2;
 
-                               dot[0] += ( x2 * y2 OP3 x3 * y3 );
-                               dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
-                       }
-                       else if (n & 1)
-                       {
-                LD_GP2_INC(x, 1, x0, x1);
-                LD_GP2_INC(y, 1, y0, y1);
+            y0 = *y;
+            y1 = *(y + 1);
+            y += inc_y2;
 
-                               dot[0] += ( x0 * y0 OP3 x1 * y1 );
-                               dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
-                       }
+            dot[0] += (x0 * y0 OP3 x1 * y1);
+            dot[1] OP2 (x1 * y0 OP4 x0 * y1);
         }
+    }
+
+    CREAL(result) = dot[0];
+    CIMAG(result) = dot[1];
 
-               dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
-               dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
-       }
-       else
-       {
-               inc_x2 = 2 * inc_x;
-               inc_y2 = 2 * inc_y;
-
-               for (i = (n >> 2); i--;)
-               {
-                       x0 = *x;
-                       x1 = *(x + 1);
-                       x += inc_x2;
-                       x2 = *x;
-                       x3 = *(x + 1);
-                       x += inc_x2;
-                       x4 = *x;
-                       x5 = *(x + 1);
-                       x += inc_x2;
-                       x6 = *x;
-                       x7 = *(x + 1);
-                       x += inc_x2;
-
-                       y0 = *y;
-                       y1 = *(y + 1);
-                       y += inc_y2;
-                       y2 = *y;
-                       y3 = *(y + 1);
-                       y += inc_y2;
-                       y4 = *y;
-                       y5 = *(y + 1);
-                       y += inc_y2;
-                       y6 = *y;
-                       y7 = *(y + 1);
-                       y += inc_y2;
-
-                       dot[0] += ( x0 * y0 OP3 x1 * y1 );
-                       dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
-
-                       dot[0] += ( x2 * y2 OP3 x3 * y3 );
-                       dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
-
-                       dot[0] += ( x4 * y4 OP3 x5 * y5 );
-                       dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
-
-                       dot[0] += ( x6 * y6 OP3 x7 * y7 );
-                       dot[1] OP2 ( x7 * y6 OP4 x6 * y7 );
-               }
-
-               if ((n & 2) && (n & 1))
-               {
-                       x0 = *x;
-                       x1 = *(x + 1);
-                       x += inc_x2;
-                       x2 = *x;
-                       x3 = *(x + 1);
-                       x += inc_x2;
-                       x4 = *x;
-                       x5 = *(x + 1);
-                       x += inc_x2;
-
-                       y0 = *y;
-                       y1 = *(y + 1);
-                       y += inc_y2;
-                       y2 = *y;
-                       y3 = *(y + 1);
-                       y += inc_y2;
-                       y4 = *y;
-                       y5 = *(y + 1);
-                       y += inc_y2;
-
-                       dot[0] += ( x0 * y0 OP3 x1 * y1 );
-                       dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
-
-                       dot[0] += ( x2 * y2 OP3 x3 * y3 );
-                       dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
-
-                       dot[0] += ( x4 * y4 OP3 x5 * y5 );
-                       dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
-               }
-               else if (n & 2)
-               {
-                       x0 = *x;
-                       x1 = *(x + 1);
-                       x += inc_x2;
-                       x2 = *x;
-                       x3 = *(x + 1);
-                       x += inc_x2;
-
-                       y0 = *y;
-                       y1 = *(y + 1);
-                       y += inc_y2;
-                       y2 = *y;
-                       y3 = *(y + 1);
-                       y += inc_y2;
-
-                       dot[0] += ( x0 * y0 OP3 x1 * y1 );
-                       dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
-
-                       dot[0] += ( x2 * y2 OP3 x3 * y3 );
-                       dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
-               }
-               else if (n & 1)
-               {
-                       x0 = *x;
-                       x1 = *(x + 1);
-                       x += inc_x2;
-
-                       y0 = *y;
-                       y1 = *(y + 1);
-                       y += inc_y2;
-
-                       dot[0] += ( x0 * y0 OP3 x1 * y1 );
-                       dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
-               }
-       }
-
-    __real__(result) = dot[0];
-    __imag__(result) = dot[1];
-
-    return(result);
+    return (result);
 }
index a3641cd..1128d63 100644 (file)
@@ -36,40 +36,51 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     BLASLONG i;
     FLOAT sumf = 0.0;
     v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
-    v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
-    v2f64 zero_v = {0};
+    v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+    v2f64 sum_abs0 = {0, 0};
+    v2f64 sum_abs1 = {0, 0};
+    v2f64 sum_abs2 = {0, 0};
+    v2f64 sum_abs3 = {0, 0};
     v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
 
     if (n <= 0 || inc_x <= 0) return (sumf);
 
     if (1 == inc_x)
     {
-        if (n > 15)
-        {
-            n -= 16;
-
-            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+#ifdef ENABLE_PREFETCH
+        FLOAT *x_pref;
+        BLASLONG pref_offset;
 
-            sum_abs0 = AND_VEC_D(src0);
-            sum_abs1 = AND_VEC_D(src1);
-            sum_abs2 = AND_VEC_D(src2);
-            sum_abs3 = AND_VEC_D(src3);
-            sum_abs0 += AND_VEC_D(src4);
-            sum_abs1 += AND_VEC_D(src5);
-            sum_abs2 += AND_VEC_D(src6);
-            sum_abs3 += AND_VEC_D(src7);
-        }
-        else
+        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
         {
-            sum_abs0 = zero_v;
-            sum_abs1 = zero_v;
-            sum_abs2 = zero_v;
-            sum_abs3 = zero_v;
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
         }
+        pref_offset = pref_offset / sizeof(FLOAT);
+        x_pref = x + pref_offset + 64;
+#endif
 
-        for (i = (n >> 4); i--;)
+        for (i = (n >> 5); i--;)
         {
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref   0,     0(%[x_pref])\n\t"
+                "pref   0,    32(%[x_pref])\n\t"
+                "pref   0,    64(%[x_pref])\n\t"
+                "pref   0,    96(%[x_pref])\n\t"
+                "pref   0,   128(%[x_pref])\n\t"
+                "pref   0,   160(%[x_pref])\n\t"
+                "pref   0,   192(%[x_pref])\n\t"
+                "pref   0,   224(%[x_pref])\n\t"
+
+                : : [x_pref] "r" (x_pref)
+            );
+
+            x_pref += 32;
+#endif
+
             LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+            LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);
 
             sum_abs0 += AND_VEC_D(src0);
             sum_abs1 += AND_VEC_D(src1);
@@ -79,13 +90,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             sum_abs1 += AND_VEC_D(src5);
             sum_abs2 += AND_VEC_D(src6);
             sum_abs3 += AND_VEC_D(src7);
+            sum_abs0 += AND_VEC_D(src8);
+            sum_abs1 += AND_VEC_D(src9);
+            sum_abs2 += AND_VEC_D(src10);
+            sum_abs3 += AND_VEC_D(src11);
+            sum_abs0 += AND_VEC_D(src12);
+            sum_abs1 += AND_VEC_D(src13);
+            sum_abs2 += AND_VEC_D(src14);
+            sum_abs3 += AND_VEC_D(src15);
         }
 
-        if (n & 15)
+        if (n & 31)
         {
-            if ((n & 8) && (n & 4) && (n & 2))
+            if (n & 16)
             {
-                LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6);
+                LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
 
                 sum_abs0 += AND_VEC_D(src0);
                 sum_abs1 += AND_VEC_D(src1);
@@ -94,37 +113,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs0 += AND_VEC_D(src4);
                 sum_abs1 += AND_VEC_D(src5);
                 sum_abs2 += AND_VEC_D(src6);
+                sum_abs3 += AND_VEC_D(src7);
             }
-            else if ((n & 8) && (n & 4))
-            {
-                LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5);
-
-                sum_abs0 += AND_VEC_D(src0);
-                sum_abs1 += AND_VEC_D(src1);
-                sum_abs2 += AND_VEC_D(src2);
-                sum_abs3 += AND_VEC_D(src3);
-                sum_abs0 += AND_VEC_D(src4);
-                sum_abs1 += AND_VEC_D(src5);
-            }
-            else if ((n & 8) && (n & 2))
-            {
-                LD_DP5_INC(x, 2, src0, src1, src2, src3, src4);
-
-                sum_abs0 += AND_VEC_D(src0);
-                sum_abs1 += AND_VEC_D(src1);
-                sum_abs2 += AND_VEC_D(src2);
-                sum_abs3 += AND_VEC_D(src3);
-                sum_abs0 += AND_VEC_D(src4);
-            }
-            else if ((n & 4) && (n & 2))
-            {
-                LD_DP3_INC(x, 2, src0, src1, src2);
 
-                sum_abs0 += AND_VEC_D(src0);
-                sum_abs1 += AND_VEC_D(src1);
-                sum_abs2 += AND_VEC_D(src2);
-            }
-            else if (n & 8)
+            if (n & 8)
             {
                 LD_DP4_INC(x, 2, src0, src1, src2, src3);
 
@@ -133,64 +125,38 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs2 += AND_VEC_D(src2);
                 sum_abs3 += AND_VEC_D(src3);
             }
-            else if (n & 4)
+
+            if (n & 4)
             {
                 LD_DP2_INC(x, 2, src0, src1);
 
                 sum_abs0 += AND_VEC_D(src0);
                 sum_abs1 += AND_VEC_D(src1);
             }
-            else if (n & 2)
+
+            if (n & 2)
             {
                 src0 = LD_DP(x); x += 2;
 
                 sum_abs0 += AND_VEC_D(src0);
             }
 
-            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-            sumf = sum_abs0[0] + sum_abs0[1];
-
             if (n & 1)
             {
                 sumf += fabs(*x);
             }
         }
-        else
-        {
-            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
 
-            sumf = sum_abs0[0] + sum_abs0[1];
-        }
+        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+
+        sumf += sum_abs0[0] + sum_abs0[1];
     }
     else
     {
-        if (n > 8)
-        {
-            n -= 8;
-
-            LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
-
-            sum_abs0 = AND_VEC_D(src0);
-            sum_abs1 = AND_VEC_D(src1);
-            sum_abs2 = AND_VEC_D(src2);
-            sum_abs3 = AND_VEC_D(src3);
-            sum_abs0 += AND_VEC_D(src4);
-            sum_abs1 += AND_VEC_D(src5);
-            sum_abs2 += AND_VEC_D(src6);
-            sum_abs3 += AND_VEC_D(src7);
-        }
-        else
-        {
-            sum_abs0 = zero_v;
-            sum_abs1 = zero_v;
-            sum_abs2 = zero_v;
-            sum_abs3 = zero_v;
-        }
-
-        for (i = (n >> 3); i--;)
+        for (i = (n >> 4); i--;)
         {
             LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+            LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);
 
             sum_abs0 += AND_VEC_D(src0);
             sum_abs1 += AND_VEC_D(src1);
@@ -200,13 +166,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             sum_abs1 += AND_VEC_D(src5);
             sum_abs2 += AND_VEC_D(src6);
             sum_abs3 += AND_VEC_D(src7);
+            sum_abs0 += AND_VEC_D(src8);
+            sum_abs1 += AND_VEC_D(src9);
+            sum_abs2 += AND_VEC_D(src10);
+            sum_abs3 += AND_VEC_D(src11);
+            sum_abs0 += AND_VEC_D(src12);
+            sum_abs1 += AND_VEC_D(src13);
+            sum_abs2 += AND_VEC_D(src14);
+            sum_abs3 += AND_VEC_D(src15);
         }
 
-        if (n & 7)
+        if (n & 15)
         {
-            if ((n & 4) && (n & 2) && (n & 1))
+            if (n & 8)
             {
-                LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6);
+                LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
 
                 sum_abs0 += AND_VEC_D(src0);
                 sum_abs1 += AND_VEC_D(src1);
@@ -215,37 +189,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs0 += AND_VEC_D(src4);
                 sum_abs1 += AND_VEC_D(src5);
                 sum_abs2 += AND_VEC_D(src6);
+                sum_abs3 += AND_VEC_D(src7);
             }
-            else if ((n & 4) && (n & 2))
-            {
-                LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5);
 
-                sum_abs0 += AND_VEC_D(src0);
-                sum_abs1 += AND_VEC_D(src1);
-                sum_abs2 += AND_VEC_D(src2);
-                sum_abs3 += AND_VEC_D(src3);
-                sum_abs0 += AND_VEC_D(src4);
-                sum_abs1 += AND_VEC_D(src5);
-            }
-            else if ((n & 4) && (n & 1))
-            {
-                LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4);
-
-                sum_abs0 += AND_VEC_D(src0);
-                sum_abs1 += AND_VEC_D(src1);
-                sum_abs2 += AND_VEC_D(src2);
-                sum_abs3 += AND_VEC_D(src3);
-                sum_abs0 += AND_VEC_D(src4);
-            }
-            else if ((n & 2) && (n & 1))
-            {
-                LD_DP3_INC(x, inc_x, src0, src1, src2);
-
-                sum_abs0 += AND_VEC_D(src0);
-                sum_abs1 += AND_VEC_D(src1);
-                sum_abs2 += AND_VEC_D(src2);
-            }
-            else if (n & 4)
+            if (n & 4)
             {
                 LD_DP4_INC(x, inc_x, src0, src1, src2, src3);
 
@@ -254,14 +201,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs2 += AND_VEC_D(src2);
                 sum_abs3 += AND_VEC_D(src3);
             }
-            else if (n & 2)
+
+            if (n & 2)
             {
                 LD_DP2_INC(x, inc_x, src0, src1);
 
                 sum_abs0 += AND_VEC_D(src0);
                 sum_abs1 += AND_VEC_D(src1);
             }
-            else if (n & 1)
+
+            if (n & 1)
             {
                 src0 = LD_DP(x);
 
@@ -269,7 +218,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             }
         }
 
-        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
 
         sumf = sum_abs0[0];
     }
index b56e101..b92f313 100644 (file)
@@ -28,105 +28,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include "macros_msa.h"
 
-/* return float, x,y float */
-#if defined(DSDOT)
-double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-#else
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-#endif
 {
     BLASLONG i = 0;
-    double dot = 0.0;
+    FLOAT dot = 0.0;
     FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
     v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
     v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
     v2f64 dot0 = {0, 0};
+    v2f64 dot1 = {0, 0};
+    v2f64 dot2 = {0, 0};
+    v2f64 dot3 = {0, 0};
 
-    if (n < 0) return (dot);
+    if (n < 1) return (dot);
 
     if ((1 == inc_x) && (1 == inc_y))
     {
         for (i = (n >> 4); i--;)
         {
-                       LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
-                       LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+            LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+            LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref   0,  256(%[x])\n\t"
+                "pref   0,  288(%[x])\n\t"
+                "pref   0,  320(%[x])\n\t"
+                "pref   0,  352(%[x])\n\t"
+                "pref   0,  256(%[y])\n\t"
+                "pref   0,  288(%[y])\n\t"
+                "pref   0,  320(%[y])\n\t"
+                "pref   0,  352(%[y])\n\t"
+
+                : : [x] "r" (x), [y] "r" (y)
+            );
+#endif
 
             dot0 += (vy0 * vx0);
-            dot0 += (vy1 * vx1);
-            dot0 += (vy2 * vx2);
-            dot0 += (vy3 * vx3);
+            dot1 += (vy1 * vx1);
+            dot2 += (vy2 * vx2);
+            dot3 += (vy3 * vx3);
             dot0 += (vy4 * vx4);
-            dot0 += (vy5 * vx5);
-            dot0 += (vy6 * vx6);
-            dot0 += (vy7 * vx7);
+            dot1 += (vy5 * vx5);
+            dot2 += (vy6 * vx6);
+            dot3 += (vy7 * vx7);
         }
 
         if (n & 15)
         {
-            if ((n & 8) && (n & 4) && (n & 2))
-                       {
-                               LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
-                               LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
-
-                dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-                dot0 += (vy3 * vx3);
-                dot0 += (vy4 * vx4);
-                dot0 += (vy5 * vx5);
-                dot0 += (vy6 * vx6);
-                       }
-            else if ((n & 8) && (n & 4))
-                       {
-                               LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5);
-                               LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5);
-
-                dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-                dot0 += (vy3 * vx3);
-                dot0 += (vy4 * vx4);
-                dot0 += (vy5 * vx5);
-                       }
-            else if ((n & 8) && (n & 2))
-                       {
-                               LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4);
-                               LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4);
-
-                dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-                dot0 += (vy3 * vx3);
-                dot0 += (vy4 * vx4);
-                       }
-            else if ((n & 4) && (n & 2))
-                       {
-                               LD_DP3_INC(x, 2, vx0, vx1, vx2);
-                               LD_DP3_INC(y, 2, vy0, vy1, vy2);
-
-                dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-                       }
-            else if (n & 8)
+            if (n & 8)
             {
-                               LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3);
-                               LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3);
+                LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3);
+                LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3);
 
                 dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-                dot0 += (vy3 * vx3);
+                dot1 += (vy1 * vx1);
+                dot2 += (vy2 * vx2);
+                dot3 += (vy3 * vx3);
             }
-            else if (n & 4)
+
+            if (n & 4)
             {
-                               LD_DP2_INC(x, 2, vx0, vx1);
-                               LD_DP2_INC(y, 2, vy0, vy1);
+                LD_DP2_INC(x, 2, vx0, vx1);
+                LD_DP2_INC(y, 2, vy0, vy1);
 
                 dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
+                dot1 += (vy1 * vx1);
             }
-            else if (n & 2)
+
+            if (n & 2)
             {
                 vx0 = LD_DP(x); x += 2;
                 vy0 = LD_DP(y); y += 2;
@@ -143,6 +113,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             }
         }
 
+        dot0 += dot1 + dot2 + dot3;
+
         dot += dot0[0];
         dot += dot0[1];
     }
@@ -159,16 +131,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             dot += (y3 * x3);
         }
 
-        if ((n & 2) && (n & 1))
-        {
-            LD_GP3_INC(x, inc_x, x0, x1, x2);
-            LD_GP3_INC(y, inc_y, y0, y1, y2);
-
-            dot += (y0 * x0);
-            dot += (y1 * x1);
-            dot += (y2 * x2);
-        }
-        else if (n & 2)
+        if (n & 2)
         {
             LD_GP2_INC(x, inc_x, x0, x1);
             LD_GP2_INC(y, inc_y, y0, y1);
@@ -176,7 +139,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             dot += (y0 * x0);
             dot += (y1 * x1);
         }
-        else if (n & 1)
+
+        if (n & 1)
         {
             x0 = *x;
             y0 = *y;
index e968f83..e15332f 100644 (file)
@@ -34,42 +34,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i = 0;
-    FLOAT data0, data1, data2, sumf = 0.0;
+    FLOAT data0, data1, sumf = 0.0;
     v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
-    v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
-    v4f32 zero_v = {0};
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+    v4f32 sum_abs0 = {0, 0, 0, 0};
+    v4f32 sum_abs1 = {0, 0, 0, 0};
+    v4f32 sum_abs2 = {0, 0, 0, 0};
+    v4f32 sum_abs3 = {0, 0, 0, 0};
+    v4f32 zero_v = {0, 0, 0, 0};
     v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
 
     if (n <= 0 || inc_x <= 0) return (sumf);
 
     if (1 == inc_x)
     {
-        if (n > 31)
-        {
-            n -= 32;
-
-            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+#ifdef ENABLE_PREFETCH
+        FLOAT *x_pref;
+        BLASLONG pref_offset;
 
-            sum_abs0 = AND_VEC_W(src0);
-            sum_abs1 = AND_VEC_W(src1);
-            sum_abs2 = AND_VEC_W(src2);
-            sum_abs3 = AND_VEC_W(src3);
-            sum_abs0 += AND_VEC_W(src4);
-            sum_abs1 += AND_VEC_W(src5);
-            sum_abs2 += AND_VEC_W(src6);
-            sum_abs3 += AND_VEC_W(src7);
-        }
-        else
+        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
         {
-            sum_abs0 = zero_v;
-            sum_abs1 = zero_v;
-            sum_abs2 = zero_v;
-            sum_abs3 = zero_v;
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
         }
+        pref_offset = pref_offset / sizeof(FLOAT);
+        x_pref = x + pref_offset + 128;
+#endif
 
-        for (i = 0; i < (n >> 5); i++)
+        for (i = 0; i < (n >> 6); i++)
         {
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref   0,     0(%[x_pref])\n\t"
+                "pref   0,    32(%[x_pref])\n\t"
+                "pref   0,    64(%[x_pref])\n\t"
+                "pref   0,    96(%[x_pref])\n\t"
+                "pref   0,   128(%[x_pref])\n\t"
+                "pref   0,   160(%[x_pref])\n\t"
+                "pref   0,   192(%[x_pref])\n\t"
+                "pref   0,   224(%[x_pref])\n\t"
+
+                : : [x_pref] "r" (x_pref)
+            );
+
+            x_pref += 64;
+#endif
+
             LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+            LD_SP8_INC(x, 4, src8, src9, src10, src11, src12, src13, src14, src15);
 
             sum_abs0 += AND_VEC_W(src0);
             sum_abs1 += AND_VEC_W(src1);
@@ -79,13 +91,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             sum_abs1 += AND_VEC_W(src5);
             sum_abs2 += AND_VEC_W(src6);
             sum_abs3 += AND_VEC_W(src7);
+            sum_abs0 += AND_VEC_W(src8);
+            sum_abs1 += AND_VEC_W(src9);
+            sum_abs2 += AND_VEC_W(src10);
+            sum_abs3 += AND_VEC_W(src11);
+            sum_abs0 += AND_VEC_W(src12);
+            sum_abs1 += AND_VEC_W(src13);
+            sum_abs2 += AND_VEC_W(src14);
+            sum_abs3 += AND_VEC_W(src15);
         }
 
-        if (n & 31)
+        if (n & 63)
         {
-            if ((n & 16) && (n & 8) && (n & 4))
+            if (n & 32)
             {
-                LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
+                LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
 
                 sum_abs0 += AND_VEC_W(src0);
                 sum_abs1 += AND_VEC_W(src1);
@@ -94,65 +114,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs0 += AND_VEC_W(src4);
                 sum_abs1 += AND_VEC_W(src5);
                 sum_abs2 += AND_VEC_W(src6);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf += sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
-            }
-            else if ((n & 16) && (n & 8))
-            {
-                LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
-
-                sum_abs0 += AND_VEC_W(src0);
-                sum_abs1 += AND_VEC_W(src1);
-                sum_abs2 += AND_VEC_W(src2);
-                sum_abs3 += AND_VEC_W(src3);
-                sum_abs0 += AND_VEC_W(src4);
-                sum_abs1 += AND_VEC_W(src5);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf += sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
-            }
-            else if ((n & 16) && (n & 4))
-            {
-                LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
-
-                sum_abs0 += AND_VEC_W(src0);
-                sum_abs1 += AND_VEC_W(src1);
-                sum_abs2 += AND_VEC_W(src2);
-                sum_abs3 += AND_VEC_W(src3);
-                sum_abs0 += AND_VEC_W(src4);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf += sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
+                sum_abs3 += AND_VEC_W(src7);
             }
-            else if ((n & 8) && (n & 4))
-            {
-                LD_SP3_INC(x, 4, src0, src1, src2);
-
-                sum_abs0 += AND_VEC_W(src0);
-                sum_abs1 += AND_VEC_W(src1);
-                sum_abs2 += AND_VEC_W(src2);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
 
-                sumf += sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
-            }
-            else if (n & 16)
+            if (n & 16)
             {
                 LD_SP4_INC(x, 4, src0, src1, src2, src3);
 
@@ -160,79 +125,47 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 sum_abs1 += AND_VEC_W(src1);
                 sum_abs2 += AND_VEC_W(src2);
                 sum_abs3 += AND_VEC_W(src3);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf += sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
             }
-            else if (n & 8)
+
+            if (n & 8)
             {
                 LD_SP2_INC(x, 4, src0, src1);
 
                 sum_abs0 += AND_VEC_W(src0);
                 sum_abs1 += AND_VEC_W(src1);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf += sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
             }
-            else if (n & 4)
+
+            if (n & 4)
             {
                 src0 = LD_SP(x); x += 4;
 
                 sum_abs0 += AND_VEC_W(src0);
-
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf += sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
-            }
-            else
-            {
-                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
-
-                sumf += sum_abs0[0];
-                sumf += sum_abs0[1];
-                sumf += sum_abs0[2];
-                sumf += sum_abs0[3];
             }
 
             if (n & 2)
             {
-                sumf += fabsf(*(x + 0));
+                sumf += fabsf(*x);
                 sumf += fabsf(*(x + 1));
                 x += 2;
             }
 
             if (n & 1)
             {
-                sumf += fabsf(*(x + 0));
+                sumf += fabsf(*x);
             }
         }
-        else
-        {
-            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
 
-            sumf += sum_abs0[0];
-            sumf += sum_abs0[1];
-            sumf += sum_abs0[2];
-            sumf += sum_abs0[3];
-        }
+        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+
+        sumf += sum_abs0[0];
+        sumf += sum_abs0[1];
+        sumf += sum_abs0[2];
+        sumf += sum_abs0[3];
     }
     else
     {
-        if (n > 8)
+        for (i = (n >> 4); i--;)
         {
-            n -= 8;
-
             src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
             x += inc_x;
             src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
@@ -241,92 +174,97 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             x += inc_x;
             src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
             x += inc_x;
-            src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+            src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
             x += inc_x;
-            src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
+            src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x));
             x += inc_x;
-            src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
+            src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x));
             x += inc_x;
-            src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
+            src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x));
             x += inc_x;
-
-            sum_abs0 = AND_VEC_W(src0);
-            sum_abs1 = AND_VEC_W(src4);
-        }
-        else
-        {
-            sum_abs0 = zero_v;
-            sum_abs1 = zero_v;
-        }
-
-        for (i = (n >> 3); i--;)
-        {
-            src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+            src2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
             x += inc_x;
-            src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+            src2 = (v4f32) __msa_insert_w((v4i32) src2, 1, *((int *) x));
             x += inc_x;
-            src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+            src2 = (v4f32) __msa_insert_w((v4i32) src2, 2, *((int *) x));
             x += inc_x;
-            src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+            src2 = (v4f32) __msa_insert_w((v4i32) src2, 3, *((int *) x));
             x += inc_x;
-            src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+            src3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
             x += inc_x;
-            src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
+            src3 = (v4f32) __msa_insert_w((v4i32) src3, 1, *((int *) x));
             x += inc_x;
-            src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
+            src3 = (v4f32) __msa_insert_w((v4i32) src3, 2, *((int *) x));
             x += inc_x;
-            src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
+            src3 = (v4f32) __msa_insert_w((v4i32) src3, 3, *((int *) x));
             x += inc_x;
 
             sum_abs0 += AND_VEC_W(src0);
-            sum_abs1 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src1);
+            sum_abs2 += AND_VEC_W(src2);
+            sum_abs3 += AND_VEC_W(src3);
         }
 
-        if (n & 4)
+        if (n & 15)
         {
-            src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
-            x += inc_x;
-            src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
-            x += inc_x;
-            src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
-            x += inc_x;
-            src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
-            x += inc_x;
+            if (n & 8)
+            {
+                src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+                x += inc_x;
+                src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+                x += inc_x;
+                src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+                x += inc_x;
+                src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+                x += inc_x;
+                src1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+                x += inc_x;
+                src1 = (v4f32) __msa_insert_w((v4i32) src1, 1, *((int *) x));
+                x += inc_x;
+                src1 = (v4f32) __msa_insert_w((v4i32) src1, 2, *((int *) x));
+                x += inc_x;
+                src1 = (v4f32) __msa_insert_w((v4i32) src1, 3, *((int *) x));
+                x += inc_x;
 
-            sum_abs0 += AND_VEC_W(src0);
-        }
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+            }
 
-        sum_abs0 += sum_abs1;
+            if (n & 4)
+            {
+                src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+                x += inc_x;
+                src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+                x += inc_x;
+                src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+                x += inc_x;
+                src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+                x += inc_x;
 
-        sumf += sum_abs0[0];
-        sumf += sum_abs0[1];
-        sumf += sum_abs0[2];
-        sumf += sum_abs0[3];
+                sum_abs0 += AND_VEC_W(src0);
+            }
 
-        if ((n & 2) && (n & 1))
-        {
-            data0 = fabsf(*x); x += inc_x;
-            data1 = fabsf(*x); x += inc_x;
-            data2 = fabsf(*x);
+            if (n & 2)
+            {
+                data0 = fabsf(*x); x += inc_x;
+                data1 = fabsf(*x); x += inc_x;
 
-            sumf += data0;
-            sumf += data1;
-            sumf += data2;
-        }
-        else if (n & 2)
-        {
-            data0 = fabsf(*x); x += inc_x;
-            data1 = fabsf(*x);
+                sumf += data0;
+                sumf += data1;
+            }
 
-            sumf += data0;
-            sumf += data1;
+            if (n & 1)
+            {
+                sumf += fabsf(*x);
+            }
         }
-        else if (n & 1)
-        {
-            data0 = fabsf(*x);
 
-            sumf += data0;
-        }
+        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+
+        sumf += sum_abs0[0];
+        sumf += sum_abs0[1];
+        sumf += sum_abs0[2];
+        sumf += sum_abs0[3];
     }
 
     return (sumf);
index 1997ec5..f281db3 100644 (file)
@@ -28,7 +28,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include "macros_msa.h"
 
-/* return float, x,y float */
 #if defined(DSDOT)
 double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 #else
@@ -37,96 +36,71 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
     BLASLONG i = 0;
     double dot = 0.0;
-    float x0, x1, x2, x3, y0, y1, y2, y3;
+    FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
     v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
     v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
     v4f32 dot0 = {0, 0, 0, 0};
+    v4f32 dot1 = {0, 0, 0, 0};
+    v4f32 dot2 = {0, 0, 0, 0};
+    v4f32 dot3 = {0, 0, 0, 0};
 
-    if (n < 0) return (dot);
+    if (n < 1) return (dot);
 
     if ((1 == inc_x) && (1 == inc_y))
     {
         for (i = (n >> 5); i--;)
         {
-                       LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
-                       LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+            LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+            LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref   0,  256(%[x])\n\t"
+                "pref   0,  288(%[x])\n\t"
+                "pref   0,  320(%[x])\n\t"
+                "pref   0,  352(%[x])\n\t"
+                "pref   0,  256(%[y])\n\t"
+                "pref   0,  288(%[y])\n\t"
+                "pref   0,  320(%[y])\n\t"
+                "pref   0,  352(%[y])\n\t"
+
+                : : [x] "r" (x), [y] "r" (y)
+            );
+#endif
 
             dot0 += (vy0 * vx0);
-            dot0 += (vy1 * vx1);
-            dot0 += (vy2 * vx2);
-            dot0 += (vy3 * vx3);
+            dot1 += (vy1 * vx1);
+            dot2 += (vy2 * vx2);
+            dot3 += (vy3 * vx3);
             dot0 += (vy4 * vx4);
-            dot0 += (vy5 * vx5);
-            dot0 += (vy6 * vx6);
-            dot0 += (vy7 * vx7);
+            dot1 += (vy5 * vx5);
+            dot2 += (vy6 * vx6);
+            dot3 += (vy7 * vx7);
         }
 
         if (n & 31)
         {
-            if ((n & 16) && (n & 8) && (n & 4))
+            if (n & 16)
             {
-                LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
-                LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
+                LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+                LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
 
                 dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-                dot0 += (vy3 * vx3);
-                dot0 += (vy4 * vx4);
-                dot0 += (vy5 * vx5);
-                dot0 += (vy6 * vx6);
+                dot1 += (vy1 * vx1);
+                dot2 += (vy2 * vx2);
+                dot3 += (vy3 * vx3);
             }
-            else if ((n & 16) && (n & 8))
-            {
-                LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5);
-                LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5);
 
-                dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-                dot0 += (vy3 * vx3);
-                dot0 += (vy4 * vx4);
-                dot0 += (vy5 * vx5);
-            }
-            else if ((n & 16) && (n & 4))
+            if (n & 8)
             {
-                LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4);
-                LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4);
+                LD_SP2_INC(x, 4, vx0, vx1);
+                LD_SP2_INC(y, 4, vy0, vy1);
 
                 dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-                dot0 += (vy3 * vx3);
-                dot0 += (vy4 * vx4);
+                dot1 += (vy1 * vx1);
             }
-            else if ((n & 8) && (n & 4))
-            {
-                LD_SP3_INC(x, 4, vx0, vx1, vx2);
-                LD_SP3_INC(y, 4, vy0, vy1, vy2);
 
-                dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-            }
-            else if (n & 16)
-            {
-                               LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
-                               LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
-
-                dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-                dot0 += (vy2 * vx2);
-                dot0 += (vy3 * vx3);
-            }
-            else if (n & 8)
-            {
-                               LD_SP2_INC(x, 4, vx0, vx1);
-                               LD_SP2_INC(y, 4, vy0, vy1);
-
-                dot0 += (vy0 * vx0);
-                dot0 += (vy1 * vx1);
-            }
-            else if (n & 4)
+            if (n & 4)
             {
                 vx0 = LD_SP(x); x += 4;
                 vy0 = LD_SP(y); y += 4;
@@ -134,16 +108,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 dot0 += (vy0 * vx0);
             }
 
-            if ((n & 2) && (n & 1))
-            {
-                LD_GP3_INC(x, 1, x0, x1, x2);
-                LD_GP3_INC(y, 1, y0, y1, y2);
-
-                dot += (y0 * x0);
-                dot += (y1 * x1);
-                dot += (y2 * x2);
-            }
-            else if (n & 2)
+            if (n & 2)
             {
                 LD_GP2_INC(x, 1, x0, x1);
                 LD_GP2_INC(y, 1, y0, y1);
@@ -151,7 +116,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 dot += (y0 * x0);
                 dot += (y1 * x1);
             }
-            else if (n & 1)
+
+            if (n & 1)
             {
                 x0 = *x;
                 y0 = *y;
@@ -160,6 +126,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             }
         }
 
+        dot0 += dot1 + dot2 + dot3;
+
         dot += dot0[0];
         dot += dot0[1];
         dot += dot0[2];
@@ -178,16 +146,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             dot += (y3 * x3);
         }
 
-        if ((n & 2) && (n & 1))
-        {
-            LD_GP3_INC(x, inc_x, x0, x1, x2);
-            LD_GP3_INC(y, inc_y, y0, y1, y2);
-
-            dot += (y0 * x0);
-            dot += (y1 * x1);
-            dot += (y2 * x2);
-        }
-        else if (n & 2)
+        if (n & 2)
         {
             LD_GP2_INC(x, inc_x, x0, x1);
             LD_GP2_INC(y, inc_y, y0, y1);
@@ -195,7 +154,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             dot += (y0 * x0);
             dot += (y1 * x1);
         }
-        else if (n & 1)
+
+        if (n & 1)
         {
             x0 = *x;
             y0 = *y;
index c84d48e..8c4f8d1 100644 (file)
@@ -31,139 +31,191 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define AND_VEC_D(in)   ((v2f64) ((v2i64) in & and_vec))
 
-#define PROCESS_ZD(inc_val)                           \
-    if (n > 8)                                        \
-    {                                                 \
-        n -= 8;                                       \
-                                                      \
-        LD_DP8_INC(x, inc_val, src0, src1, src2,      \
-                   src3, src4, src5, src6, src7);     \
-                                                      \
-        sum_abs0 = AND_VEC_D(src0);                   \
-        sum_abs1 = AND_VEC_D(src1);                   \
-        sum_abs2 = AND_VEC_D(src2);                   \
-        sum_abs3 = AND_VEC_D(src3);                   \
-        sum_abs0 += AND_VEC_D(src4);                  \
-        sum_abs1 += AND_VEC_D(src5);                  \
-        sum_abs2 += AND_VEC_D(src6);                  \
-        sum_abs3 += AND_VEC_D(src7);                  \
-    }                                                 \
-    else                                              \
-    {                                                 \
-        sum_abs0 = zero_v;                            \
-        sum_abs1 = zero_v;                            \
-        sum_abs2 = zero_v;                            \
-        sum_abs3 = zero_v;                            \
-    }                                                 \
-                                                      \
-    for (i = (n >> 3); i--;)                          \
-    {                                                 \
-        LD_DP8_INC(x, inc_val, src0, src1, src2,      \
-                   src3, src4, src5, src6, src7);     \
-                                                      \
-        sum_abs0 += AND_VEC_D(src0);                  \
-        sum_abs1 += AND_VEC_D(src1);                  \
-        sum_abs2 += AND_VEC_D(src2);                  \
-        sum_abs3 += AND_VEC_D(src3);                  \
-        sum_abs0 += AND_VEC_D(src4);                  \
-        sum_abs1 += AND_VEC_D(src5);                  \
-        sum_abs2 += AND_VEC_D(src6);                  \
-        sum_abs3 += AND_VEC_D(src7);                  \
-    }                                                 \
-                                                      \
-    if (n & 7)                                        \
-    {                                                 \
-        if ((n & 4) && (n & 2) && (n & 1))            \
-        {                                             \
-            LD_DP7_INC(x, inc_val, src0, src1, src2,  \
-                       src3, src4, src5, src6);       \
-                                                      \
-            sum_abs0 += AND_VEC_D(src0);              \
-            sum_abs1 += AND_VEC_D(src1);              \
-            sum_abs2 += AND_VEC_D(src2);              \
-            sum_abs3 += AND_VEC_D(src3);              \
-            sum_abs0 += AND_VEC_D(src4);              \
-            sum_abs1 += AND_VEC_D(src5);              \
-            sum_abs2 += AND_VEC_D(src6);              \
-        }                                             \
-        else if ((n & 4) && (n & 2))                  \
-        {                                             \
-            LD_DP6_INC(x, inc_val, src0, src1, src2,  \
-                       src3, src4, src5);             \
-                                                      \
-            sum_abs0 += AND_VEC_D(src0);              \
-            sum_abs1 += AND_VEC_D(src1);              \
-            sum_abs2 += AND_VEC_D(src2);              \
-            sum_abs3 += AND_VEC_D(src3);              \
-            sum_abs0 += AND_VEC_D(src4);              \
-            sum_abs1 += AND_VEC_D(src5);              \
-        }                                             \
-        else if ((n & 4) && (n & 1))                  \
-        {                                             \
-            LD_DP5_INC(x, inc_val, src0, src1, src2,  \
-                       src3, src4);                   \
-                                                      \
-            sum_abs0 += AND_VEC_D(src0);              \
-            sum_abs1 += AND_VEC_D(src1);              \
-            sum_abs2 += AND_VEC_D(src2);              \
-            sum_abs3 += AND_VEC_D(src3);              \
-            sum_abs0 += AND_VEC_D(src4);              \
-        }                                             \
-        else if ((n & 2) && (n & 1))                  \
-        {                                             \
-            LD_DP3_INC(x, inc_val, src0, src1, src2); \
-                                                      \
-            sum_abs0 += AND_VEC_D(src0);              \
-            sum_abs1 += AND_VEC_D(src1);              \
-            sum_abs2 += AND_VEC_D(src2);              \
-        }                                             \
-        else if (n & 4)                               \
-        {                                             \
-            LD_DP4_INC(x, inc_val, src0, src1, src2,  \
-                       src3);                         \
-                                                      \
-            sum_abs0 += AND_VEC_D(src0);              \
-            sum_abs1 += AND_VEC_D(src1);              \
-            sum_abs2 += AND_VEC_D(src2);              \
-            sum_abs3 += AND_VEC_D(src3);              \
-        }                                             \
-        else if (n & 2)                               \
-        {                                             \
-            LD_DP2_INC(x, inc_val, src0, src1);       \
-                                                      \
-            sum_abs0 += AND_VEC_D(src0);              \
-            sum_abs1 += AND_VEC_D(src1);              \
-        }                                             \
-        else if (n & 1)                               \
-        {                                             \
-            src0 = LD_DP(x);                          \
-                                                      \
-            sum_abs0 += AND_VEC_D(src0);              \
-        }                                             \
-    }                                                 \
-                                                      \
-    sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;       \
-    sumf = sum_abs0[0] + sum_abs0[1];
-
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
     BLASLONG i;
     FLOAT sumf = 0.0;
     v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
-    v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
-    v2f64 zero_v = {0};
+    v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+    v2f64 sum_abs0 = {0, 0};
+    v2f64 sum_abs1 = {0, 0};
+    v2f64 sum_abs2 = {0, 0};
+    v2f64 sum_abs3 = {0, 0};
     v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
 
     if (n <= 0 || inc_x <= 0) return (sumf);
 
     if (1 == inc_x)
     {
-        PROCESS_ZD(2);
+#ifdef ENABLE_PREFETCH
+        FLOAT *x_pref;
+        BLASLONG pref_offset;
+
+        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+        }
+        pref_offset = pref_offset / sizeof(FLOAT);
+        x_pref = x + pref_offset + 64;
+#endif
+
+        for (i = (n >> 4); i--;)
+        {
+#ifdef ENABLE_PREFETCH
+            __asm__ __volatile__(
+                "pref   0,     0(%[x_pref])\n\t"
+                "pref   0,    32(%[x_pref])\n\t"
+                "pref   0,    64(%[x_pref])\n\t"
+                "pref   0,    96(%[x_pref])\n\t"
+                "pref   0,   128(%[x_pref])\n\t"
+                "pref   0,   160(%[x_pref])\n\t"
+                "pref   0,   192(%[x_pref])\n\t"
+                "pref   0,   224(%[x_pref])\n\t"
+
+                : : [x_pref] "r" (x_pref)
+            );
+
+            x_pref += 32;
+#endif
+
+            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+            LD_DP8_INC(x, 2, src8, src9, src10, src11, src12, src13, src14, src15);
+
+            sum_abs0 += AND_VEC_D(src0);
+            sum_abs1 += AND_VEC_D(src1);
+            sum_abs2 += AND_VEC_D(src2);
+            sum_abs3 += AND_VEC_D(src3);
+            sum_abs0 += AND_VEC_D(src4);
+            sum_abs1 += AND_VEC_D(src5);
+            sum_abs2 += AND_VEC_D(src6);
+            sum_abs3 += AND_VEC_D(src7);
+            sum_abs0 += AND_VEC_D(src8);
+            sum_abs1 += AND_VEC_D(src9);
+            sum_abs2 += AND_VEC_D(src10);
+            sum_abs3 += AND_VEC_D(src11);
+            sum_abs0 += AND_VEC_D(src12);
+            sum_abs1 += AND_VEC_D(src13);
+            sum_abs2 += AND_VEC_D(src14);
+            sum_abs3 += AND_VEC_D(src15);
+        }
+
+        if (n & 15)
+        {
+            if (n & 8)
+            {
+                LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+                sum_abs0 += AND_VEC_D(src4);
+                sum_abs1 += AND_VEC_D(src5);
+                sum_abs2 += AND_VEC_D(src6);
+                sum_abs3 += AND_VEC_D(src7);
+            }
+
+            if (n & 4)
+            {
+                LD_DP4_INC(x, 2, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+            }
+
+            if (n & 2)
+            {
+                LD_DP2_INC(x, 2, src0, src1);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+            }
+
+            if (n & 1)
+            {
+                src0 = LD_DP(x);
+
+                sum_abs0 += AND_VEC_D(src0);
+            }
+        }
+
+        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+        sumf = sum_abs0[0] + sum_abs0[1];
     }
     else
     {
         inc_x *= 2;
-        PROCESS_ZD(inc_x);
+
+        for (i = (n >> 4); i--;)
+        {
+            LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+            LD_DP8_INC(x, inc_x, src8, src9, src10, src11, src12, src13, src14, src15);
+
+            sum_abs0 += AND_VEC_D(src0);
+            sum_abs1 += AND_VEC_D(src1);
+            sum_abs2 += AND_VEC_D(src2);
+            sum_abs3 += AND_VEC_D(src3);
+            sum_abs0 += AND_VEC_D(src4);
+            sum_abs1 += AND_VEC_D(src5);
+            sum_abs2 += AND_VEC_D(src6);
+            sum_abs3 += AND_VEC_D(src7);
+            sum_abs0 += AND_VEC_D(src8);
+            sum_abs1 += AND_VEC_D(src9);
+            sum_abs2 += AND_VEC_D(src10);
+            sum_abs3 += AND_VEC_D(src11);
+            sum_abs0 += AND_VEC_D(src12);
+            sum_abs1 += AND_VEC_D(src13);
+            sum_abs2 += AND_VEC_D(src14);
+            sum_abs3 += AND_VEC_D(src15);
+        }
+
+        if (n & 15)
+        {
+            if (n & 8)
+            {
+                LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+                sum_abs0 += AND_VEC_D(src4);
+                sum_abs1 += AND_VEC_D(src5);
+                sum_abs2 += AND_VEC_D(src6);
+                sum_abs3 += AND_VEC_D(src7);
+            }
+
+            if (n & 4)
+            {
+                LD_DP4_INC(x, inc_x, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+            }
+
+            if (n & 2)
+            {
+                LD_DP2_INC(x, inc_x, src0, src1);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+            }
+
+            if (n & 1)
+            {
+                src0 = LD_DP(x);
+
+                sum_abs0 += AND_VEC_D(src0);
+            }
+        }
+
+        sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;
+        sumf = sum_abs0[0] + sum_abs0[1];
     }
 
     return (sumf);
index 482c0cf..f3c1847 100644 (file)
@@ -29,195 +29,220 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "macros_msa.h"
 
 #if !defined(CONJ)
-       #define OP2             +=
-       #define OP3             -
-       #define OP4             +
+    #define OP1     -=
+    #define OP2     +=
+    #define OP3     -
+    #define OP4     +
 #else
-       #define OP2             -=
-       #define OP3             +
-       #define OP4             -
+    #define OP1     +=
+    #define OP2     -=
+    #define OP3     +
+    #define OP4     -
 #endif
 
-#define DOT16_KERNEL(OPR0, OPR1)  \
-       dot0 += (vx0r * vy0r);            \
-       dot0 OPR0## = (vx0i * vy0i);  \
-       dot1 OPR1## = (vx0i * vy0r);  \
-       dot1 += (vx0r * vy0i);        \
-                                                                 \
-       dot0 += (vx1r * vy1r);        \
-       dot0 OPR0## = (vx1i * vy1i);  \
-       dot1 OPR1## = (vx1i * vy1r);  \
-       dot1 += (vx1r * vy1i);        \
-                                                                 \
-       dot0 += (vx2r * vy2r);        \
-       dot0 OPR0## = (vx2i * vy2i);  \
-       dot1 OPR1## = (vx2i * vy2r);  \
-       dot1 += (vx2r * vy2i);        \
-                                                                 \
-       dot0 += (vx3r * vy3r);        \
-       dot0 OPR0## = (vx3i * vy3i);  \
-       dot1 OPR1## = (vx3i * vy3r);  \
-       dot1 += (vx3r * vy3i);
-
-#define DOT12_KERNEL(OPR0, OPR1)  \
-       dot0 += (vx0r * vy0r);            \
-       dot0 OPR0## = (vx0i * vy0i);  \
-       dot1 OPR1## = (vx0i * vy0r);  \
-       dot1 += (vx0r * vy0i);        \
-                                                                 \
-       dot0 += (vx1r * vy1r);        \
-       dot0 OPR0## = (vx1i * vy1i);  \
-       dot1 OPR1## = (vx1i * vy1r);  \
-       dot1 += (vx1r * vy1i);            \
-                                                                 \
-       dot0 += (vx2r * vy2r);        \
-       dot0 OPR0## = (vx2i * vy2i);  \
-       dot1 OPR1## = (vx2i * vy2r);  \
-       dot1 += (vx2r * vy2i);
-
-#define DOT8_KERNEL(OPR0, OPR1)   \
-       dot0 += (vx0r * vy0r);            \
-       dot0 OPR0## = (vx0i * vy0i);  \
-       dot1 OPR1## = (vx0i * vy0r);  \
-       dot1 += (vx0r * vy0i);        \
-                                                                 \
-       dot0 += (vx1r * vy1r);        \
-       dot0 OPR0## = (vx1i * vy1i);  \
-       dot1 OPR1## = (vx1i * vy1r);  \
-       dot1 += (vx1r * vy1i);
-
-#define DOT4_KERNEL(OPR0, OPR1)   \
-       dot0 += (vx0r * vy0r);            \
-       dot0 OPR0## = (vx0i * vy0i);  \
-       dot1 OPR1## = (vx0i * vy0r);  \
-       dot1 += (vx0r * vy0i);
-
-/* return double, x,y double */
-/* zdotc -  CONJ */
-/* zdotu - !CONJ */
-
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
     BLASLONG i = 0;
     FLOAT dot[2];
-    BLASLONG inc_x2;
-    BLASLONG inc_y2;
+    BLASLONG inc_x2, inc_y2;
     v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
     v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
-       v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
-       v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+    v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+    v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
     v2f64 dot0 = {0, 0};
     v2f64 dot1 = {0, 0};
+    v2f64 dot2 = {0, 0};
+    v2f64 dot3 = {0, 0};
+    v2f64 dot4 = {0, 0};
+    v2f64 dot5 = {0, 0};
+    v2f64 dot6 = {0, 0};
+    v2f64 dot7 = {0, 0};
     v2f64 zero = {0, 0};
-    openblas_complex_double result;
+    OPENBLAS_COMPLEX_FLOAT result;
 
     dot[0] = 0.0;
     dot[1] = 0.0;
 
-    __real__(result) = 0.0;
-    __imag__(result) = 0.0;
+    CREAL(result) = 0.0;
+    CIMAG(result) = 0.0;
 
-    if ( n < 1 ) return(result);
+    if (n < 1) return (result);
 
     inc_x2 = 2 * inc_x;
     inc_y2 = 2 * inc_y;
 
-       for (i = (n >> 3); i--;)
-       {
-               LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
-               LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
-
-               PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
-               PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
-               PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
-               PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
-
-               PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
-               PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
-               PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
-               PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
-
-       #if !defined(CONJ)
-               DOT16_KERNEL(-, +);
-       #else
-               DOT16_KERNEL(+, -);
-       #endif
-       }
-
-       if (n & 7)
-       {
-               if ((n & 4) && (n & 2))
-               {
-                       LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
-                       LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
-                       LD_DP2_INC(x, inc_x2, vx4, vx5);
-                       LD_DP2_INC(y, inc_y2, vy4, vy5);
-
-                       PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
-                       PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
-                       PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
-
-                       PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
-                       PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
-                       PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
-
-               #if !defined(CONJ)
-                       DOT12_KERNEL(-, +);
-               #else
-                       DOT12_KERNEL(+, -);
-               #endif
-               }
-               else if (n & 4)
-               {
-                       LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
-                       LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
-
-                       PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
-                       PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
-
-                       PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
-                       PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
-
-               #if !defined(CONJ)
-                       DOT8_KERNEL(-, +);
-               #else
-                       DOT8_KERNEL(+, -);
-               #endif
-               }
-               else if (n & 2)
-               {
-                       LD_DP2_INC(x, inc_x2, vx0, vx1);
-                       LD_DP2_INC(y, inc_y2, vy0, vy1);
-                       PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
-                       PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
-
-               #if !defined(CONJ)
-                       DOT4_KERNEL(-, +);
-               #else
-                       DOT4_KERNEL(+, -);
-               #endif
-               }
-
-               if (n & 1)
-               {
-                       vx0 = LD_DP(x);
-                       vy0 = LD_DP(y);
-                       PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i);
-                       PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i);
-
-               #if !defined(CONJ)
-                       DOT4_KERNEL(-, +);
-               #else
-                       DOT4_KERNEL(+, -);
-               #endif
-               }
-       }
-
-       dot[0] += (dot0[0] + dot0[1]);
-       dot[1] += (dot1[0] + dot1[1]);
-
-    __real__(result) = dot[0];
-    __imag__(result) = dot[1];
-
-    return(result);
+
+#ifdef ENABLE_PREFETCH
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        double *x_pref, *y_pref;
+        BLASLONG pref_offset;
+
+        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+        }
+        pref_offset = pref_offset / sizeof(double);
+        x_pref = x + pref_offset + 32;
+
+        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+        }
+        pref_offset = pref_offset / sizeof(double);
+        y_pref = y + pref_offset + 32;
+
+        for (i = (n >> 3); i--;)
+        {
+            __asm__ __volatile__(
+                "pref   0,   0(%[x_pref])\n\t"
+                "pref   0,  32(%[x_pref])\n\t"
+                "pref   0,  64(%[x_pref])\n\t"
+                "pref   0,  96(%[x_pref])\n\t"
+                "pref   0,   0(%[y_pref])\n\t"
+                "pref   0,  32(%[y_pref])\n\t"
+                "pref   0,  64(%[y_pref])\n\t"
+                "pref   0,  96(%[y_pref])\n\t"
+
+                : : [x_pref] "r" (x_pref), [y_pref] "r" (y_pref)
+            );
+
+            x_pref += 16;
+            y_pref += 16;
+
+            LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+            LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+            PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+            PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
+
+            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+            PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+            PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+            PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
+
+            dot0 += (vx0r * vy0r);
+            dot0 OP1 (vx0i * vy0i);
+            dot1 OP2 (vx0i * vy0r);
+            dot1 += (vx0r * vy0i);
+
+            dot2 += (vx1r * vy1r);
+            dot2 OP1 (vx1i * vy1i);
+            dot3 OP2 (vx1i * vy1r);
+            dot3 += (vx1r * vy1i);
+
+            dot4 += (vx2r * vy2r);
+            dot4 OP1 (vx2i * vy2i);
+            dot5 OP2 (vx2i * vy2r);
+            dot5 += (vx2r * vy2i);
+
+            dot6 += (vx3r * vy3r);
+            dot6 OP1 (vx3i * vy3i);
+            dot7 OP2 (vx3i * vy3r);
+            dot7 += (vx3r * vy3i);
+        }
+    }
+    else
+#endif
+    for (i = (n >> 3); i--;)
+    {
+        LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+        LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+        PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+        PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+        PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+        PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
+
+        PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+        PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+        PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+        PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
+
+        dot0 += (vx0r * vy0r);
+        dot0 OP1 (vx0i * vy0i);
+        dot1 OP2 (vx0i * vy0r);
+        dot1 += (vx0r * vy0i);
+
+        dot2 += (vx1r * vy1r);
+        dot2 OP1 (vx1i * vy1i);
+        dot3 OP2 (vx1i * vy1r);
+        dot3 += (vx1r * vy1i);
+
+        dot4 += (vx2r * vy2r);
+        dot4 OP1 (vx2i * vy2i);
+        dot5 OP2 (vx2i * vy2r);
+        dot5 += (vx2r * vy2i);
+
+        dot6 += (vx3r * vy3r);
+        dot6 OP1 (vx3i * vy3i);
+        dot7 OP2 (vx3i * vy3r);
+        dot7 += (vx3r * vy3i);
+    }
+
+    if (n & 7)
+    {
+        if (n & 4)
+        {
+            LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
+            LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
+
+            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+
+            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+            PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+
+            dot0 += (vx0r * vy0r);
+            dot0 OP1 (vx0i * vy0i);
+            dot1 OP2 (vx0i * vy0r);
+            dot1 += (vx0r * vy0i);
+
+            dot2 += (vx1r * vy1r);
+            dot2 OP1 (vx1i * vy1i);
+            dot3 OP2 (vx1i * vy1r);
+            dot3 += (vx1r * vy1i);
+        }
+
+        if (n & 2)
+        {
+            LD_DP2_INC(x, inc_x2, vx0, vx1);
+            LD_DP2_INC(y, inc_y2, vy0, vy1);
+            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+
+            dot0 += (vx0r * vy0r);
+            dot0 OP1 (vx0i * vy0i);
+            dot1 OP2 (vx0i * vy0r);
+            dot1 += (vx0r * vy0i);
+        }
+
+        if (n & 1)
+        {
+            vx0 = LD_DP(x);
+            vy0 = LD_DP(y);
+            PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i);
+
+            dot0 += (vx0r * vy0r);
+            dot0 OP1 (vx0i * vy0i);
+            dot1 OP2 (vx0i * vy0r);
+            dot1 += (vx0r * vy0i);
+        }
+    }
+
+    dot0 += dot2 + dot4 + dot6;
+    dot1 += dot3 + dot5 + dot7;
+
+    dot[0] += (dot0[0] + dot0[1]);
+    dot[1] += (dot1[0] + dot1[1]);
+
+    CREAL(result) = dot[0];
+    CIMAG(result) = dot[1];
+
+    return (result);
 }