Added MSA optimizations for the GEMV_N, GEMV_T, ASUM and DOT functions
author    Shivraj Patil <shivraj.patil@imgtec.com>
          Fri, 15 Jul 2016 13:08:25 +0000 (18:38 +0530)
committer Shivraj Patil <shivraj.patil@imgtec.com>
          Fri, 15 Jul 2016 13:08:25 +0000 (18:38 +0530)
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
23 files changed:
Makefile.system
TargetList.txt
cpuid_mips64.c
getarch.c
kernel/mips/KERNEL.P5600
kernel/mips/casum_msa.c [new file with mode: 0644]
kernel/mips/cdot_msa.c [new file with mode: 0644]
kernel/mips/cgemv_n_msa.c [new file with mode: 0644]
kernel/mips/cgemv_t_msa.c [new file with mode: 0644]
kernel/mips/dasum_msa.c [new file with mode: 0644]
kernel/mips/ddot_msa.c [new file with mode: 0644]
kernel/mips/dgemv_n_msa.c [new file with mode: 0644]
kernel/mips/dgemv_t_msa.c [new file with mode: 0644]
kernel/mips/sasum_msa.c [new file with mode: 0644]
kernel/mips/sdot_msa.c [new file with mode: 0644]
kernel/mips/sgemv_n_msa.c [new file with mode: 0644]
kernel/mips/sgemv_t_msa.c [new file with mode: 0644]
kernel/mips/zasum_msa.c [new file with mode: 0644]
kernel/mips/zdot_msa.c [new file with mode: 0644]
kernel/mips/zgemv_n_msa.c [new file with mode: 0644]
kernel/mips/zgemv_t_msa.c [new file with mode: 0644]
kernel/mips64/KERNEL.P6600 [new file with mode: 0644]
param.h

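Note: the sasum/dasum/casum/zasum kernels added below take absolute values by clearing the IEEE-754 sign bit in each vector lane (the AND_VEC_W macro ANDs every element with 0x7FFFFFFF) rather than calling fabsf() per element. A minimal scalar sketch of that trick, using a hypothetical helper name and independent of the MSA intrinsics:

    #include <stdint.h>
    #include <string.h>

    /* abs_by_mask is a hypothetical helper, not part of this patch:
       clearing bit 31 of an IEEE-754 single gives its absolute value,
       which is what AND_VEC_W does on four lanes at once. */
    static inline float abs_by_mask(float v)
    {
        uint32_t bits;
        memcpy(&bits, &v, sizeof(bits));   /* bit-exact copy, no aliasing issues */
        bits &= 0x7FFFFFFFu;               /* drop the sign bit */
        memcpy(&v, &bits, sizeof(v));
        return v;
    }
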
diff --git a/Makefile.system b/Makefile.system
index bbcdb82..1c48a25 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -529,7 +529,7 @@ CCOMMON_OPT += -mmsa
 FCOMMON_OPT += -mmsa
 endif
 
-ifeq ($(CORE), I6400)
+ifneq ($(filter $(CORE), I6400 P6600),)
 CCOMMON_OPT += -mmsa
 FCOMMON_OPT += -mmsa
 endif
diff --git a/TargetList.txt b/TargetList.txt
index 248f643..52a60b4 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -61,6 +61,7 @@ SICORTEX
 LOONGSON3A
 LOONGSON3B
 I6400
+P6600
 
 5.IA64 CPU:
 ITANIUM2
diff --git a/cpuid_mips64.c b/cpuid_mips64.c
index 13f1517..ac1554c 100644
--- a/cpuid_mips64.c
+++ b/cpuid_mips64.c
@@ -75,13 +75,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_LOONGSON3A  2
 #define CPU_LOONGSON3B  3
 #define CPU_I6400       4
+#define CPU_P6600       5
 
 static char *cpuname[] = {
   "UNKOWN",
   "SICORTEX",
   "LOONGSON3A",
   "LOONGSON3B",
-  "I6400"
+  "I6400",
+  "P6600"
 };
 
 int detect(void){
@@ -161,6 +163,8 @@ void get_subarchitecture(void){
     printf("LOONGSON3B");
   }else if(detect()==CPU_I6400){
     printf("I6400");
+  }else if(detect()==CPU_P6600){
+    printf("P6600");
   }else{
     printf("SICORTEX");
   }
@@ -198,6 +202,15 @@ void get_cpuconfig(void){
     printf("#define DTB_DEFAULT_ENTRIES 64\n");
     printf("#define DTB_SIZE 4096\n");
     printf("#define L2_ASSOCIATIVE 8\n");
+  }else if(detect()==CPU_P6600){
+    printf("#define P6600\n");
+    printf("#define L1_DATA_SIZE 65536\n");
+    printf("#define L1_DATA_LINESIZE 32\n");
+    printf("#define L2_SIZE 1048576\n");
+    printf("#define L2_LINESIZE 32\n");
+    printf("#define DTB_DEFAULT_ENTRIES 64\n");
+    printf("#define DTB_SIZE 4096\n");
+    printf("#define L2_ASSOCIATIVE 8\n");
   }else{
     printf("#define SICORTEX\n");
     printf("#define L1_DATA_SIZE 32768\n");
@@ -217,6 +230,8 @@ void get_libname(void){
     printf("loongson3b\n");
   }else if(detect()==CPU_I6400) {
     printf("i6400\n");
+  }else if(detect()==CPU_P6600) {
+    printf("p6600\n");
   }else{
     printf("mips64\n");
   }
diff --git a/getarch.c b/getarch.c
index 2f5d18a..f8069e5 100644
--- a/getarch.c
+++ b/getarch.c
@@ -132,6 +132,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* #define FORCE_LOONGSON3A    */
 /* #define FORCE_LOONGSON3B    */
 /* #define FORCE_I6400         */
+/* #define FORCE_P6600         */
 /* #define FORCE_P5600         */
 /* #define FORCE_ITANIUM2      */
 /* #define FORCE_SPARC         */
@@ -715,6 +716,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #endif
 
+#ifdef FORCE_P6600
+#define FORCE
+#define ARCHITECTURE    "MIPS"
+#define SUBARCHITECTURE "P6600"
+#define SUBDIRNAME      "mips64"
+#define ARCHCONFIG   "-DP6600 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME   "p6600"
+#define CORENAME  "P6600"
+#else
+#endif
+
 #ifdef FORCE_P5600
 #define FORCE
 #define ARCHITECTURE    "MIPS"
diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600
index 5d8bcb9..7bf90c9 100644
--- a/kernel/mips/KERNEL.P5600
+++ b/kernel/mips/KERNEL.P5600
@@ -30,10 +30,10 @@ IDMAXKERNEL  = ../mips/imax.c
 ISMINKERNEL  = ../mips/imin.c
 IDMINKERNEL  = ../mips/imin.c
 
-SASUMKERNEL  = ../mips/asum.c
-DASUMKERNEL  = ../mips/asum.c
-CASUMKERNEL  = ../mips/zasum.c
-ZASUMKERNEL  = ../mips/zasum.c
+SASUMKERNEL  = ../mips/sasum_msa.c
+DASUMKERNEL  = ../mips/dasum_msa.c
+CASUMKERNEL  = ../mips/casum_msa.c
+ZASUMKERNEL  = ../mips/zasum_msa.c
 
 SAXPYKERNEL  = ../mips/axpy.c
 DAXPYKERNEL  = ../mips/axpy.c
@@ -45,10 +45,10 @@ DCOPYKERNEL  = ../mips/copy.c
 CCOPYKERNEL  = ../mips/zcopy.c
 ZCOPYKERNEL  = ../mips/zcopy.c
 
-SDOTKERNEL   = ../mips/dot.c
-DDOTKERNEL   = ../mips/dot.c
-CDOTKERNEL   = ../mips/zdot.c
-ZDOTKERNEL   = ../mips/zdot.c
+SDOTKERNEL   = ../mips/sdot_msa.c
+DDOTKERNEL   = ../mips/ddot_msa.c
+CDOTKERNEL   = ../mips/cdot_msa.c
+ZDOTKERNEL   = ../mips/zdot_msa.c
 
 SNRM2KERNEL  = ../mips/nrm2.c
 DNRM2KERNEL  = ../mips/nrm2.c
@@ -70,15 +70,15 @@ DSWAPKERNEL  = ../mips/swap.c
 CSWAPKERNEL  = ../mips/zswap.c
 ZSWAPKERNEL  = ../mips/zswap.c
 
-SGEMVNKERNEL = ../mips/gemv_n.c
-DGEMVNKERNEL = ../mips/gemv_n.c
-CGEMVNKERNEL = ../mips/zgemv_n.c
-ZGEMVNKERNEL = ../mips/zgemv_n.c
+SGEMVNKERNEL = ../mips/sgemv_n_msa.c
+DGEMVNKERNEL = ../mips/dgemv_n_msa.c
+CGEMVNKERNEL = ../mips/cgemv_n_msa.c
+ZGEMVNKERNEL = ../mips/zgemv_n_msa.c
 
-SGEMVTKERNEL = ../mips/gemv_t.c
-DGEMVTKERNEL = ../mips/gemv_t.c
-CGEMVTKERNEL = ../mips/zgemv_t.c
-ZGEMVTKERNEL = ../mips/zgemv_t.c
+SGEMVTKERNEL = ../mips/sgemv_t_msa.c
+DGEMVTKERNEL = ../mips/dgemv_t_msa.c
+CGEMVTKERNEL = ../mips/cgemv_t_msa.c
+ZGEMVTKERNEL = ../mips/zgemv_t_msa.c
 
 SGEMMKERNEL    = ../mips/sgemm_kernel_8x8_msa.c
 SGEMMONCOPY    = ../mips/sgemm_ncopy_8_msa.c
diff --git a/kernel/mips/casum_msa.c b/kernel/mips/casum_msa.c
new file mode 100644
index 0000000..454573d
--- /dev/null
+++ b/kernel/mips/casum_msa.c
@@ -0,0 +1,338 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include "macros_msa.h"
+
+#define AND_VEC_W(in)   ((v4f32) ((v4i32) in & and_vec))
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i, inc_x2;
+    FLOAT sumf = 0.0;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
+    v4f32 zero_v = {0};
+    v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+
+    if (n <= 0 || inc_x <= 0) return (sumf);
+
+    if (1 == inc_x)
+    {
+        if (n > 15)
+        {
+            n -= 16;
+
+            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 = AND_VEC_W(src0);
+            sum_abs1 = AND_VEC_W(src1);
+            sum_abs2 = AND_VEC_W(src2);
+            sum_abs3 = AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+        else
+        {
+            sum_abs0 = zero_v;
+            sum_abs1 = zero_v;
+            sum_abs2 = zero_v;
+            sum_abs3 = zero_v;
+        }
+
+        for (i = (n >> 4); i--;)
+        {
+            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 += AND_VEC_W(src0);
+            sum_abs1 += AND_VEC_W(src1);
+            sum_abs2 += AND_VEC_W(src2);
+            sum_abs3 += AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+
+        if (n & 15)
+        {
+            if ((n & 8) && (n & 4) && (n & 2))
+            {
+                LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+                sum_abs2 += AND_VEC_W(src6);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if ((n & 8) && (n & 4))
+            {
+                LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if ((n & 8) && (n & 2))
+            {
+                LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if ((n & 4) && (n & 2))
+            {
+                LD_SP3_INC(x, 4, src0, src1, src2);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if (n & 8)
+            {
+                LD_SP4_INC(x, 4, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if (n & 4)
+            {
+                LD_SP2_INC(x, 4, src0, src1);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if (n & 2)
+            {
+                src0 = LD_SP(x); x += 4;
+
+                sum_abs0 += AND_VEC_W(src0);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else
+            {
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf = sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+
+            if (n & 1)
+            {
+                sumf += fabsf(*(x + 0));
+                sumf += fabsf(*(x + 1));
+            }
+        }
+        else
+        {
+            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+            sumf = sum_abs0[0];
+            sumf += sum_abs0[1];
+            sumf += sum_abs0[2];
+            sumf += sum_abs0[3];
+        }
+    }
+    else
+    {
+        inc_x2 = 2 * inc_x;
+
+        if (n > 8)
+        {
+            n -= 8;
+
+            LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 = AND_VEC_W(src0);
+            sum_abs1 = AND_VEC_W(src1);
+            sum_abs2 = AND_VEC_W(src2);
+            sum_abs3 = AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+        else
+        {
+            sum_abs0 = zero_v;
+            sum_abs1 = zero_v;
+            sum_abs2 = zero_v;
+            sum_abs3 = zero_v;
+        }
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 += AND_VEC_W(src0);
+            sum_abs1 += AND_VEC_W(src1);
+            sum_abs2 += AND_VEC_W(src2);
+            sum_abs3 += AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+
+        if (n & 7)
+        {
+            if ((n & 4) && (n & 2) && (n & 1))
+            {
+                LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+                sum_abs2 += AND_VEC_W(src6);
+            }
+            else if ((n & 4) && (n & 2))
+            {
+                LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+            }
+            else if ((n & 4) && (n & 1))
+            {
+                LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+            }
+            else if ((n & 2) && (n & 1))
+            {
+                LD_SP3_INC(x, inc_x2, src0, src1, src2);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+            }
+            else if (n & 4)
+            {
+                LD_SP4_INC(x, inc_x2, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+            }
+            else if (n & 2)
+            {
+                LD_SP2_INC(x, inc_x2, src0, src1);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+            }
+            else if (n & 1)
+            {
+                src0 = LD_SP(x); x += inc_x2;
+
+                sum_abs0 += AND_VEC_W(src0);
+            }
+        }
+
+        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+        sumf = sum_abs0[0] + sum_abs0[1];
+    }
+
+    return (sumf);
+}
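For reference, the vector code above implements the complex ASUM reduction: the sum of |Re(x[i])| + |Im(x[i])| over n elements read with stride inc_x. A plain scalar sketch of the same result (illustrative name, unoptimized, assuming the same argument convention):

    #include <math.h>

    /* casum_ref is an illustrative scalar reference, not part of this patch. */
    static float casum_ref(long n, const float *x, long inc_x)
    {
        float sum = 0.0f;
        for (long i = 0; i < n; i++, x += 2 * inc_x)
            sum += fabsf(x[0]) + fabsf(x[1]);   /* |real| + |imaginary| */
        return sum;
    }
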
diff --git a/kernel/mips/cdot_msa.c b/kernel/mips/cdot_msa.c
new file mode 100644
index 0000000..bf9f6b7
--- /dev/null
+++ b/kernel/mips/cdot_msa.c
@@ -0,0 +1,361 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#if !defined(CONJ)
+    #define OP2  +=
+    #define OP3  -
+    #define OP4  +
+#else
+    #define OP2  -=
+    #define OP3  +
+    #define OP4  -
+#endif
+
+#define DOT16_KERNEL(OPR0, OPR1)     \
+    dot0 += (vx0r * vy0r);           \
+    dot0 OPR0## = (vx0i * vy0i);     \
+    dot1 OPR1## = (vx0i * vy0r);     \
+    dot1 += (vx0r * vy0i);           \
+                                     \
+    dot0 += (vx1r * vy1r);           \
+    dot0 OPR0## = (vx1i * vy1i);     \
+    dot1 OPR1## = (vx1i * vy1r);     \
+    dot1 += (vx1r * vy1i);           \
+                                     \
+    dot0 += (vx2r * vy2r);           \
+    dot0 OPR0## = (vx2i * vy2i);     \
+    dot1 OPR1## = (vx2i * vy2r);     \
+    dot1 += (vx2r * vy2i);           \
+                                     \
+    dot0 += (vx3r * vy3r);           \
+    dot0 OPR0## = (vx3i * vy3i);     \
+    dot1 OPR1## = (vx3i * vy3r);     \
+    dot1 += (vx3r * vy3i);
+
+#define DOT12_KERNEL(OPR0, OPR1)     \
+    dot0 += (vx0r * vy0r);           \
+    dot0 OPR0## = (vx0i * vy0i);     \
+    dot1 OPR1## = (vx0i * vy0r);     \
+    dot1 += (vx0r * vy0i);           \
+                                     \
+    dot0 += (vx1r * vy1r);           \
+    dot0 OPR0## = (vx1i * vy1i);     \
+    dot1 OPR1## = (vx1i * vy1r);     \
+    dot1 += (vx1r * vy1i);           \
+                                     \
+    dot0 += (vx2r * vy2r);           \
+    dot0 OPR0## = (vx2i * vy2i);     \
+    dot1 OPR1## = (vx2i * vy2r);     \
+    dot1 += (vx2r * vy2i);
+
+#define DOT8_KERNEL(OPR0, OPR1)      \
+    dot0 += (vx0r * vy0r);           \
+    dot0 OPR0## = (vx0i * vy0i);     \
+    dot1 OPR1## = (vx0i * vy0r);     \
+    dot1 += (vx0r * vy0i);           \
+                                     \
+    dot0 += (vx1r * vy1r);           \
+    dot0 OPR0## = (vx1i * vy1i);     \
+    dot1 OPR1## = (vx1i * vy1r);     \
+    dot1 += (vx1r * vy1i);
+
+#define DOT4_KERNEL(OPR0, OPR1)      \
+    dot0 += (vx0r * vy0r);           \
+    dot0 OPR0## = (vx0i * vy0i);     \
+    dot1 OPR1## = (vx0i * vy0r);     \
+    dot1 += (vx0r * vy0i);
+
+/* return float, x,y float */
+/* cdotc -  CONJ */
+/* cdotu - !CONJ */
+#ifndef _MSC_VER
+#include <complex.h>
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+    BLASLONG i = 0;
+    FLOAT dot[2];
+    BLASLONG inc_x2;
+    BLASLONG inc_y2;
+    FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
+    FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
+    v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
+    v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
+    v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+    v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+    v4f32 dot0 = {0, 0, 0, 0};
+    v4f32 dot1 = {0, 0, 0, 0};
+    openblas_complex_float result;
+
+    dot[0] = 0.0;
+    dot[1] = 0.0;
+
+    __real__(result) = 0.0;
+    __imag__(result) = 0.0;
+
+    if ( n < 1 ) return(result);
+
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        for (i = (n >> 4); i--;)
+        {
+                       LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+                       LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+                       PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+                       PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+                       PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
+                       PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);
+
+                       PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+                       PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+                       PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
+                       PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);
+
+               #if !defined(CONJ)
+                       DOT16_KERNEL(-, +);
+               #else
+                       DOT16_KERNEL(+, -);
+               #endif
+        }
+
+        if (n & 15)
+        {
+            if ((n & 8) && (n & 4))
+            {
+                               LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+                               LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+                               LD_SP2_INC(x, 4, vx4, vx5);
+                               LD_SP2_INC(y, 4, vy4, vy5);
+
+                               PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+                               PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+                               PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
+
+                               PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+                               PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+                               PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
+
+                       #if !defined(CONJ)
+                               DOT12_KERNEL(-, +);
+                       #else
+                               DOT12_KERNEL(+, -);
+                       #endif
+            }
+            else if (n & 8)
+            {
+                               LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+                               LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+
+                               PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+                               PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
+
+                               PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+                               PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
+
+                       #if !defined(CONJ)
+                               DOT8_KERNEL(-, +);
+                       #else
+                               DOT8_KERNEL(+, -);
+                       #endif
+            }
+                       else if (n & 4)
+            {
+                               LD_SP2_INC(x, 4, vx0, vx1);
+                               LD_SP2_INC(y, 4, vy0, vy1);
+                               PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
+                               PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
+
+                       #if !defined(CONJ)
+                               DOT4_KERNEL(-, +);
+                       #else
+                               DOT4_KERNEL(+, -);
+                       #endif
+            }
+
+                       if ((n & 2) && (n & 1))
+                       {
+                LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5);
+                LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5);
+
+                               dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                               dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+                               dot[0] += ( x2 * y2 OP3 x3 * y3 );
+                               dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+
+                               dot[0] += ( x4 * y4 OP3 x5 * y5 );
+                               dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
+                       }
+                       else if (n & 2)
+                       {
+                LD_GP4_INC(x, 1, x0, x1, x2, x3);
+                LD_GP4_INC(y, 1, y0, y1, y2, y3);
+
+                               dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                               dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+                               dot[0] += ( x2 * y2 OP3 x3 * y3 );
+                               dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+                       }
+                       else if (n & 1)
+                       {
+                LD_GP2_INC(x, 1, x0, x1);
+                LD_GP2_INC(y, 1, y0, y1);
+
+                               dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                               dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+                       }
+        }
+
+               dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
+               dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
+       }
+       else
+       {
+               inc_x2 = 2 * inc_x;
+               inc_y2 = 2 * inc_y;
+
+               for (i = (n >> 2); i--;)
+               {
+                       x0 = *x;
+                       x1 = *(x + 1);
+                       x += inc_x2;
+                       x2 = *x;
+                       x3 = *(x + 1);
+                       x += inc_x2;
+                       x4 = *x;
+                       x5 = *(x + 1);
+                       x += inc_x2;
+                       x6 = *x;
+                       x7 = *(x + 1);
+                       x += inc_x2;
+
+                       y0 = *y;
+                       y1 = *(y + 1);
+                       y += inc_y2;
+                       y2 = *y;
+                       y3 = *(y + 1);
+                       y += inc_y2;
+                       y4 = *y;
+                       y5 = *(y + 1);
+                       y += inc_y2;
+                       y6 = *y;
+                       y7 = *(y + 1);
+                       y += inc_y2;
+
+                       dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                       dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+                       dot[0] += ( x2 * y2 OP3 x3 * y3 );
+                       dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+
+                       dot[0] += ( x4 * y4 OP3 x5 * y5 );
+                       dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
+
+                       dot[0] += ( x6 * y6 OP3 x7 * y7 );
+                       dot[1] OP2 ( x7 * y6 OP4 x6 * y7 );
+               }
+
+               if ((n & 2) && (n & 1))
+               {
+                       x0 = *x;
+                       x1 = *(x + 1);
+                       x += inc_x2;
+                       x2 = *x;
+                       x3 = *(x + 1);
+                       x += inc_x2;
+                       x4 = *x;
+                       x5 = *(x + 1);
+                       x += inc_x2;
+
+                       y0 = *y;
+                       y1 = *(y + 1);
+                       y += inc_y2;
+                       y2 = *y;
+                       y3 = *(y + 1);
+                       y += inc_y2;
+                       y4 = *y;
+                       y5 = *(y + 1);
+                       y += inc_y2;
+
+                       dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                       dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+                       dot[0] += ( x2 * y2 OP3 x3 * y3 );
+                       dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+
+                       dot[0] += ( x4 * y4 OP3 x5 * y5 );
+                       dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
+               }
+               else if (n & 2)
+               {
+                       x0 = *x;
+                       x1 = *(x + 1);
+                       x += inc_x2;
+                       x2 = *x;
+                       x3 = *(x + 1);
+                       x += inc_x2;
+
+                       y0 = *y;
+                       y1 = *(y + 1);
+                       y += inc_y2;
+                       y2 = *y;
+                       y3 = *(y + 1);
+                       y += inc_y2;
+
+                       dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                       dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+
+                       dot[0] += ( x2 * y2 OP3 x3 * y3 );
+                       dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
+               }
+               else if (n & 1)
+               {
+                       x0 = *x;
+                       x1 = *(x + 1);
+                       x += inc_x2;
+
+                       y0 = *y;
+                       y1 = *(y + 1);
+                       y += inc_y2;
+
+                       dot[0] += ( x0 * y0 OP3 x1 * y1 );
+                       dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
+               }
+       }
+
+    __real__(result) = dot[0];
+    __imag__(result) = dot[1];
+
+    return(result);
+}
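For reference, the OP2/OP3/OP4 selection above switches the accumulation between the unconjugated product (cdotu, !CONJ) and the conjugated product (cdotc, CONJ). A scalar sketch of both cases (illustrative name, not the vectorized path):

    /* cdot_ref is an illustrative scalar reference, not part of this patch.
       conj == 0: dot = sum x[i] * y[i]         (cdotu)
       conj != 0: dot = sum conj(x[i]) * y[i]   (cdotc) */
    static void cdot_ref(long n, const float *x, long inc_x,
                         const float *y, long inc_y, int conj, float dot[2])
    {
        dot[0] = 0.0f;
        dot[1] = 0.0f;
        for (long i = 0; i < n; i++, x += 2 * inc_x, y += 2 * inc_y)
        {
            if (!conj) {
                dot[0] += x[0] * y[0] - x[1] * y[1];
                dot[1] += x[1] * y[0] + x[0] * y[1];
            } else {
                dot[0] += x[0] * y[0] + x[1] * y[1];
                dot[1] += x[0] * y[1] - x[1] * y[0];
            }
        }
    }
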
diff --git a/kernel/mips/cgemv_n_msa.c b/kernel/mips/cgemv_n_msa.c
new file mode 100644
index 0000000..f1879ba
--- /dev/null
+++ b/kernel/mips/cgemv_n_msa.c
@@ -0,0 +1,611 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#undef OP0
+#undef OP1
+#undef OP2
+#undef OP3
+#undef OP4
+
+#if !defined(XCONJ)
+    #define OP3  -=
+    #define OP4  +=
+#else
+    #define OP3  +=
+    #define OP4  -=
+#endif
+
+#if !defined(CONJ)
+    #if !defined(XCONJ)
+        #define OP0  -=
+        #define OP1  +=
+        #define OP2  +=
+    #else
+        #define OP0  +=
+        #define OP1  +=
+        #define OP2  -=
+    #endif
+#else
+    #if !defined(XCONJ)
+        #define OP0  +=
+        #define OP1  -=
+        #define OP2  -=
+    #else
+        #define OP0  -=
+        #define OP1  -=
+        #define OP2  +=
+    #endif
+#endif
+
+#define CGEMV_N_8x4()                        \
+    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);      \
+    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);      \
+    LD_SP4(pa2 + k, 4, t8, t9, t10, t11);    \
+    LD_SP4(pa3 + k, 4, t12, t13, t14, t15);  \
+                                             \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);     \
+    PCKEVOD_W2_SP(t3, t2, src1r, src1i);     \
+    PCKEVOD_W2_SP(t5, t4, src2r, src2i);     \
+    PCKEVOD_W2_SP(t7, t6, src3r, src3i);     \
+    PCKEVOD_W2_SP(t9, t8, src4r, src4i);     \
+    PCKEVOD_W2_SP(t11, t10, src5r, src5i);   \
+    PCKEVOD_W2_SP(t13, t12, src6r, src6i);   \
+    PCKEVOD_W2_SP(t15, t14, src7r, src7i);   \
+                                             \
+    y0r += tp0r * src0r;                     \
+    y1r += tp0r * src1r;                     \
+    y0r += tp1r * src2r;                     \
+    y1r += tp1r * src3r;                     \
+    y0r += tp2r * src4r;                     \
+    y1r += tp2r * src5r;                     \
+    y0r += tp3r * src6r;                     \
+    y1r += tp3r * src7r;                     \
+                                             \
+    y0r OP0 tp0i * src0i;                    \
+    y1r OP0 tp0i * src1i;                    \
+    y0r OP0 tp1i * src2i;                    \
+    y1r OP0 tp1i * src3i;                    \
+    y0r OP0 tp2i * src4i;                    \
+    y1r OP0 tp2i * src5i;                    \
+    y0r OP0 tp3i * src6i;                    \
+    y1r OP0 tp3i * src7i;                    \
+                                             \
+    y0i OP1 tp0r * src0i;                    \
+    y1i OP1 tp0r * src1i;                    \
+    y0i OP1 tp1r * src2i;                    \
+    y1i OP1 tp1r * src3i;                    \
+    y0i OP1 tp2r * src4i;                    \
+    y1i OP1 tp2r * src5i;                    \
+    y0i OP1 tp3r * src6i;                    \
+    y1i OP1 tp3r * src7i;                    \
+                                             \
+    y0i OP2 tp0i * src0r;                    \
+    y1i OP2 tp0i * src1r;                    \
+    y0i OP2 tp1i * src2r;                    \
+    y1i OP2 tp1i * src3r;                    \
+    y0i OP2 tp2i * src4r;                    \
+    y1i OP2 tp2i * src5r;                    \
+    y0i OP2 tp3i * src6r;                    \
+    y1i OP2 tp3i * src7r;                    \
+
+#define CGEMV_N_4x4()                       \
+    LD_SP2(pa0 + k, 4, t0, t1);             \
+    LD_SP2(pa1 + k, 4, t4, t5);             \
+    LD_SP2(pa2 + k, 4, t8, t9);             \
+    LD_SP2(pa3 + k, 4, t12, t13);           \
+                                            \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);    \
+    PCKEVOD_W2_SP(t5, t4, src2r, src2i);    \
+    PCKEVOD_W2_SP(t9, t8, src4r, src4i);    \
+    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
+                                            \
+    y0r += tp0r * src0r;                    \
+    y0r += tp1r * src2r;                    \
+    y0r += tp2r * src4r;                    \
+    y0r += tp3r * src6r;                    \
+                                            \
+    y0r OP0 tp0i * src0i;                   \
+    y0r OP0 tp1i * src2i;                   \
+    y0r OP0 tp2i * src4i;                   \
+    y0r OP0 tp3i * src6i;                   \
+                                            \
+    y0i OP1 tp0r * src0i;                   \
+    y0i OP1 tp1r * src2i;                   \
+    y0i OP1 tp2r * src4i;                   \
+    y0i OP1 tp3r * src6i;                   \
+                                            \
+    y0i OP2 tp0i * src0r;                   \
+    y0i OP2 tp1i * src2r;                   \
+    y0i OP2 tp2i * src4r;                   \
+    y0i OP2 tp3i * src6r;                   \
+
+#define CGEMV_N_1x4()               \
+    res0 = y[0 * inc_y2];           \
+    res1 = y[0 * inc_y2 + 1];       \
+                                    \
+    res0  += temp0_r * pa0[k];      \
+    res0 OP0 temp0_i * pa0[k + 1];  \
+    res0  += temp1_r * pa1[k];      \
+    res0 OP0 temp1_i * pa1[k + 1];  \
+    res0  += temp2_r * pa2[k];      \
+    res0 OP0 temp2_i * pa2[k + 1];  \
+    res0  += temp3_r * pa3[k];      \
+    res0 OP0 temp3_i * pa3[k + 1];  \
+                                    \
+    res1 OP1 temp0_r * pa0[k + 1];  \
+    res1 OP2 temp0_i * pa0[k];      \
+    res1 OP1 temp1_r * pa1[k + 1];  \
+    res1 OP2 temp1_i * pa1[k];      \
+    res1 OP1 temp2_r * pa2[k + 1];  \
+    res1 OP2 temp2_i * pa2[k];      \
+    res1 OP1 temp3_r * pa3[k + 1];  \
+    res1 OP2 temp3_i * pa3[k];      \
+                                    \
+    y[0 * inc_y2]     = res0;       \
+    y[0 * inc_y2 + 1] = res1;       \
+
+#define CGEMV_N_8x2()                     \
+    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);   \
+    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);   \
+                                          \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
+    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
+    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
+    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
+                                          \
+    y0r += tp0r * src0r;                  \
+    y1r += tp0r * src1r;                  \
+    y0r += tp1r * src2r;                  \
+    y1r += tp1r * src3r;                  \
+                                          \
+    y0r OP0 tp0i * src0i;                 \
+    y1r OP0 tp0i * src1i;                 \
+    y0r OP0 tp1i * src2i;                 \
+    y1r OP0 tp1i * src3i;                 \
+                                          \
+    y0i OP1 tp0r * src0i;                 \
+    y1i OP1 tp0r * src1i;                 \
+    y0i OP1 tp1r * src2i;                 \
+    y1i OP1 tp1r * src3i;                 \
+                                          \
+    y0i OP2 tp0i * src0r;                 \
+    y1i OP2 tp0i * src1r;                 \
+    y0i OP2 tp1i * src2r;                 \
+    y1i OP2 tp1i * src3r;                 \
+
+#define CGEMV_N_4x2()                     \
+    LD_SP2(pa0 + k, 4, t0, t1);           \
+    LD_SP2(pa1 + k, 4, t4, t5);           \
+                                          \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
+    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
+                                          \
+    y0r += tp0r * src0r;                  \
+    y0r += tp1r * src2r;                  \
+                                          \
+    y0r OP0 tp0i * src0i;                 \
+    y0r OP0 tp1i * src2i;                 \
+                                          \
+    y0i OP1 tp0r * src0i;                 \
+    y0i OP1 tp1r * src2i;                 \
+                                          \
+    y0i OP2 tp0i * src0r;                 \
+    y0i OP2 tp1i * src2r;                 \
+
+#define CGEMV_N_1x2()               \
+    res0 = y[0 * inc_y2];           \
+    res1 = y[0 * inc_y2 + 1];       \
+                                    \
+    res0  += temp0_r * pa0[k];      \
+    res0 OP0 temp0_i * pa0[k + 1];  \
+    res0  += temp1_r * pa1[k];      \
+    res0 OP0 temp1_i * pa1[k + 1];  \
+                                    \
+    res1 OP1 temp0_r * pa0[k + 1];  \
+    res1 OP2 temp0_i * pa0[k];      \
+    res1 OP1 temp1_r * pa1[k + 1];  \
+    res1 OP2 temp1_i * pa1[k];      \
+                                    \
+    y[0 * inc_y2]     = res0;       \
+    y[0 * inc_y2 + 1] = res1;       \
+
+#define CGEMV_N_1x1()              \
+    res0 = y[0 * inc_y2];          \
+    res1 = y[0 * inc_y2 + 1];      \
+                                   \
+    res0  += temp_r * pa0[k];      \
+    res0 OP0 temp_i * pa0[k + 1];  \
+                                   \
+    res1 OP1 temp_r * pa0[k + 1];  \
+    res1 OP2 temp_i * pa0[k];      \
+                                   \
+    y[0 * inc_y2]     = res0;      \
+    y[0 * inc_y2 + 1] = res1;      \
+
+#define CLOAD_X4_SCALE_VECTOR()                  \
+    LD_SP2(x, 4, x0, x1);                        \
+                                                 \
+    PCKEVOD_W2_SP(x1, x0, x0r, x0i);             \
+                                                 \
+    tp4r   = alphar * x0r;                       \
+    tp4r OP3 alphai * x0i;                       \
+    tp4i   = alphar * x0i;                       \
+    tp4i OP4 alphai * x0r;                       \
+                                                 \
+    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);  \
+    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);  \
+
+#define CLOAD_X4_SCALE_GP()                                                          \
+    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));      \
+    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  1, *((int *) (x + 1 * inc_x2)));      \
+    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  2, *((int *) (x + 2 * inc_x2)));      \
+    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  3, *((int *) (x + 3 * inc_x2)));      \
+    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
+    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  1, *((int *) (x + 1 * inc_x2 + 1)));  \
+    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  2, *((int *) (x + 2 * inc_x2 + 1)));  \
+    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  3, *((int *) (x + 3 * inc_x2 + 1)));  \
+                                                                                     \
+    tp4r   = alphar * x0r;                                                           \
+    tp4r OP3 alphai * x0i;                                                           \
+    tp4i   = alphar * x0i;                                                           \
+    tp4i OP4 alphai * x0r;                                                           \
+                                                                                     \
+    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);                                      \
+    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);                                      \
+
+#define CLOAD_X2_SCALE_GP()                        \
+    temp0_r   = alpha_r * x[0 * inc_x2];           \
+    temp0_r OP3 alpha_i * x[0 * inc_x2 + 1];       \
+    temp0_i   = alpha_r * x[0 * inc_x2 + 1];       \
+    temp0_i OP4 alpha_i * x[0 * inc_x2];           \
+                                                   \
+    temp1_r   = alpha_r * x[1 * inc_x2];           \
+    temp1_r OP3 alpha_i * x[1 * inc_x2 + 1];       \
+    temp1_i   = alpha_r * x[1 * inc_x2 + 1];       \
+    temp1_i OP4 alpha_i * x[1 * inc_x2];           \
+                                                   \
+    tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r);  \
+    tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i);  \
+    tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r);  \
+    tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i);  \
+
+#define CLOAD_X1_SCALE_GP()                  \
+    temp_r   = alpha_r * x[0 * inc_x2];      \
+    temp_r OP3 alpha_i * x[0 * inc_x2 + 1];  \
+    temp_i   = alpha_r * x[0 * inc_x2 + 1];  \
+    temp_i OP4 alpha_i * x[0 * inc_x2];      \
+
+#define CLOAD_Y8_VECTOR()             \
+    LD_SP4(y, 4, y0, y1, y2, y3);     \
+    PCKEVOD_W2_SP(y1, y0, y0r, y0i);  \
+    PCKEVOD_W2_SP(y3, y2, y1r, y1i);  \
+
+#define CLOAD_Y4_VECTOR()             \
+    LD_SP2(y, 4, y0, y1);             \
+    PCKEVOD_W2_SP(y1, y0, y0r, y0i);  \
+
+#define CSTORE_Y8_VECTOR()          \
+    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
+    ILVRL_W2_SP(y1i, y1r, y2, y3);  \
+    ST_SP4(y0, y1, y2, y3, y, 4);   \
+
+#define CSTORE_Y4_VECTOR()          \
+    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
+    ST_SP2(y0, y1, y, 4);           \
+
+#define CLOAD_Y8_GP()                                                               \
+    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2)));      \
+    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  1, *((int *)(y + 1 * inc_y2)));      \
+    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  2, *((int *)(y + 2 * inc_y2)));      \
+    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  3, *((int *)(y + 3 * inc_y2)));      \
+    y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2)));      \
+    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  1, *((int *)(y + 5 * inc_y2)));      \
+    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  2, *((int *)(y + 6 * inc_y2)));      \
+    y1r = (v4f32) __msa_insert_w((v4i32) y1r,  3, *((int *)(y + 7 * inc_y2)));      \
+    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1)));  \
+    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  1, *((int *)(y + 1 * inc_y2 + 1)));  \
+    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  2, *((int *)(y + 2 * inc_y2 + 1)));  \
+    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  3, *((int *)(y + 3 * inc_y2 + 1)));  \
+    y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1)));  \
+    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  1, *((int *)(y + 5 * inc_y2 + 1)));  \
+    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  2, *((int *)(y + 6 * inc_y2 + 1)));  \
+    y1i = (v4f32) __msa_insert_w((v4i32) y1i,  3, *((int *)(y + 7 * inc_y2 + 1)));  \
+
+#define CLOAD_Y4_GP()                                                                \
+    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y +  0 * inc_y2)));      \
+    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  1, *((int *)(y +  1 * inc_y2)));      \
+    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  2, *((int *)(y +  2 * inc_y2)));      \
+    y0r = (v4f32) __msa_insert_w((v4i32) y0r,  3, *((int *)(y +  3 * inc_y2)));      \
+    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y +  0 * inc_y2 + 1)));  \
+    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  1, *((int *)(y +  1 * inc_y2 + 1)));  \
+    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  2, *((int *)(y +  2 * inc_y2 + 1)));  \
+    y0i = (v4f32) __msa_insert_w((v4i32) y0i,  3, *((int *)(y +  3 * inc_y2 + 1)));  \
+
+#define CSTORE_Y8_GP()                                                \
+    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);      \
+    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);      \
+    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);      \
+    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);      \
+    *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0);      \
+    *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1);      \
+    *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2);      \
+    *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3);      \
+    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
+    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
+    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
+    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);  \
+    *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0);  \
+    *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1);  \
+    *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2);  \
+    *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3);  \
+
+#define CSTORE_Y4_GP()                                                \
+    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);      \
+    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);      \
+    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);      \
+    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);      \
+    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
+    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
+    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
+    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);  \
+
+#define CGEMV_N_MSA()                \
+    for (j = (n >> 2); j--;)         \
+    {                                \
+        CLOAD_X4_SCALE();            \
+                                     \
+        k = 0;                       \
+        y = y_org;                   \
+                                     \
+        for (i = (m >> 3); i--;)     \
+        {                            \
+            CLOAD_Y8()               \
+            CGEMV_N_8x4();           \
+            CSTORE_Y8();             \
+                                     \
+            k += 2 * 8;              \
+            y += inc_y2 * 8;         \
+        }                            \
+                                     \
+        if (m & 4)                   \
+        {                            \
+            CLOAD_Y4();              \
+            CGEMV_N_4x4();           \
+            CSTORE_Y4();             \
+                                     \
+            k += 2 * 4;              \
+            y += inc_y2 * 4;         \
+        }                            \
+                                     \
+        if (m & 3)                   \
+        {                            \
+            temp0_r = tp4r[0];       \
+            temp1_r = tp4r[1];       \
+            temp2_r = tp4r[2];       \
+            temp3_r = tp4r[3];       \
+                                     \
+            temp0_i = tp4i[0];       \
+            temp1_i = tp4i[1];       \
+            temp2_i = tp4i[2];       \
+            temp3_i = tp4i[3];       \
+                                     \
+            for (i = (m & 3); i--;)  \
+            {                        \
+                CGEMV_N_1x4();       \
+                                     \
+                k += 2;              \
+                y += inc_y2;         \
+            }                        \
+        }                            \
+                                     \
+        pa0 += 4 * lda2;             \
+        pa1 += 4 * lda2;             \
+        pa2 += 4 * lda2;             \
+        pa3 += 4 * lda2;             \
+                                     \
+        x += 4 * inc_x2;             \
+    }                                \
+                                     \
+    if (n & 2)                       \
+    {                                \
+        CLOAD_X2_SCALE();            \
+                                     \
+        k = 0;                       \
+        y = y_org;                   \
+                                     \
+        for (i = (m >> 3); i--;)     \
+        {                            \
+            CLOAD_Y8();              \
+            CGEMV_N_8x2();           \
+            CSTORE_Y8();             \
+                                     \
+            k += 2 * 8;              \
+            y += inc_y2 * 8;         \
+        }                            \
+                                     \
+        if (m & 4)                   \
+        {                            \
+            CLOAD_Y4();              \
+            CGEMV_N_4x2();           \
+            CSTORE_Y4();             \
+                                     \
+            k += 2 * 4;              \
+            y += inc_y2 * 4;         \
+        }                            \
+                                     \
+        for (i = (m & 3); i--;)      \
+        {                            \
+             CGEMV_N_1x2();          \
+                                     \
+             k += 2;                 \
+             y += inc_y2;            \
+        }                            \
+                                     \
+        pa0 += 2 * lda2;             \
+        pa1 += 2 * lda2;             \
+                                     \
+        x += 2 * inc_x2;             \
+    }                                \
+                                     \
+    if (n & 1)                       \
+    {                                \
+        CLOAD_X1_SCALE();            \
+                                     \
+        k = 0;                       \
+        y = y_org;                   \
+                                     \
+        for (i = m; i--;)            \
+        {                            \
+            CGEMV_N_1x1();           \
+                                     \
+            k += 2;                  \
+            y += inc_y2;             \
+        }                            \
+                                     \
+        pa0 += lda2;                 \
+        x += inc_x2;                 \
+    }                                \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+          FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
+          BLASLONG inc_y2, FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    FLOAT *y_org = y;
+    FLOAT *pa0, *pa1, *pa2, *pa3;
+    FLOAT temp_r, temp_i, res0, res1, temp0_r;
+    FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i;
+    v4f32 alphar, alphai;
+    v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i;
+    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
+    v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
+    v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i;
+
+    lda2 = 2 * lda2;
+    inc_x2 = 2 * inc_x2;
+    inc_y2 = 2 * inc_y2;
+
+    pa0 = A;
+    pa1 = A + lda2;
+    pa2 = A + 2 * lda2;
+    pa3 = A + 3 * lda2;
+
+    alphar = COPY_FLOAT_TO_VECTOR(alpha_r);
+    alphai = COPY_FLOAT_TO_VECTOR(alpha_i);
+
+    if ((2 == inc_x2) && (2 == inc_y2))
+    {
+        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_VECTOR
+        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
+        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
+        #define CLOAD_Y8        CLOAD_Y8_VECTOR
+        #define CLOAD_Y4        CLOAD_Y4_VECTOR
+        #define CSTORE_Y8       CSTORE_Y8_VECTOR
+        #define CSTORE_Y4       CSTORE_Y4_VECTOR
+
+        CGEMV_N_MSA();
+
+        #undef CLOAD_X4_SCALE
+        #undef CLOAD_X2_SCALE
+        #undef CLOAD_X1_SCALE
+        #undef CLOAD_Y8
+        #undef CLOAD_Y4
+        #undef CSTORE_Y8
+        #undef CSTORE_Y4
+    }
+    else if (2 == inc_x2)
+    {
+        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_VECTOR
+        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
+        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
+        #define CLOAD_Y8         CLOAD_Y8_GP
+        #define CLOAD_Y4         CLOAD_Y4_GP
+        #define CSTORE_Y8        CSTORE_Y8_GP
+        #define CSTORE_Y4        CSTORE_Y4_GP
+
+        CGEMV_N_MSA();
+
+        #undef CLOAD_X4_SCALE
+        #undef CLOAD_X2_SCALE
+        #undef CLOAD_X1_SCALE
+        #undef CLOAD_Y8
+        #undef CLOAD_Y4
+        #undef CSTORE_Y8
+        #undef CSTORE_Y4
+    }
+    else if (2 == inc_y2)
+    {
+        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_GP
+        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
+        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
+        #define CLOAD_Y8        CLOAD_Y8_VECTOR
+        #define CLOAD_Y4        CLOAD_Y4_VECTOR
+        #define CSTORE_Y8       CSTORE_Y8_VECTOR
+        #define CSTORE_Y4       CSTORE_Y4_VECTOR
+
+        CGEMV_N_MSA();
+
+        #undef CLOAD_X4_SCALE
+        #undef CLOAD_X2_SCALE
+        #undef CLOAD_X1_SCALE
+        #undef CLOAD_Y8
+        #undef CLOAD_Y4
+        #undef CSTORE_Y8
+        #undef CSTORE_Y4
+    }
+    else
+    {
+        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_GP
+        #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
+        #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP
+        #define CLOAD_Y8        CLOAD_Y8_GP
+        #define CLOAD_Y4        CLOAD_Y4_GP
+        #define CSTORE_Y8       CSTORE_Y8_GP
+        #define CSTORE_Y4       CSTORE_Y4_GP
+
+        CGEMV_N_MSA();
+
+        #undef CLOAD_X4_SCALE
+        #undef CLOAD_X2_SCALE
+        #undef CLOAD_X1_SCALE
+        #undef CLOAD_Y8
+        #undef CLOAD_Y4
+        #undef CSTORE_Y8
+        #undef CSTORE_Y4
+    }
+    return(0);
+}
+
+#undef OP0
+#undef OP1
+#undef OP2
+#undef OP3
+#undef OP4
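
[Editorial note, not part of the patch] CGEMV_N_MSA above walks A four, two, then one column at a time, and the four #define/#undef blocks in CNAME only select vector or general-purpose-register load/store variants depending on whether x and y have unit stride (inc_x2 == 2, inc_y2 == 2). A minimal scalar sketch of the computation being vectorized, y += A * (alpha * x) on interleaved single-precision complex data, is given below; cgemv_n_ref is an illustrative name and the CONJ/XCONJ sign handling done by OP0..OP4 is omitted.

    /* Scalar reference for the non-transposed complex GEMV (illustrative only). */
    static void cgemv_n_ref(int m, int n, float alpha_r, float alpha_i,
                            const float *A, int lda, const float *x, int inc_x,
                            float *y, int inc_y)
    {
        for (int j = 0; j < n; j++)
        {
            /* temp = alpha * x[j], computed once per column (CLOAD_Xn_SCALE_*) */
            float xr = x[2 * j * inc_x], xi = x[2 * j * inc_x + 1];
            float tr = alpha_r * xr - alpha_i * xi;
            float ti = alpha_r * xi + alpha_i * xr;
            const float *a = A + 2 * j * lda;

            for (int i = 0; i < m; i++)
            {
                /* y[i] += temp * A[i][j]; the kernel does 8, 4 or 1 rows per step */
                y[2 * i * inc_y]     += tr * a[2 * i]     - ti * a[2 * i + 1];
                y[2 * i * inc_y + 1] += tr * a[2 * i + 1] + ti * a[2 * i];
            }
        }
    }
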
diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c
new file mode 100644 (file)
index 0000000..b9620bf
--- /dev/null
@@ -0,0 +1,583 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#undef OP0
+#undef OP1
+#undef OP2
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+    #define OP0  -=
+    #define OP1  +=
+    #define OP2  +=
+#else
+    #define OP0  +=
+    #define OP1  +=
+    #define OP2  -=
+#endif
+
+#define CGEMV_T_8x4()                        \
+    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);      \
+    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);      \
+    LD_SP4(pa2 + k, 4, t8, t9, t10, t11);    \
+    LD_SP4(pa3 + k, 4, t12, t13, t14, t15);  \
+                                             \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);     \
+    PCKEVOD_W2_SP(t3, t2, src1r, src1i);     \
+    PCKEVOD_W2_SP(t5, t4, src2r, src2i);     \
+    PCKEVOD_W2_SP(t7, t6, src3r, src3i);     \
+    PCKEVOD_W2_SP(t9, t8, src4r, src4i);     \
+    PCKEVOD_W2_SP(t11, t10, src5r, src5i);   \
+    PCKEVOD_W2_SP(t13, t12, src6r, src6i);   \
+    PCKEVOD_W2_SP(t15, t14, src7r, src7i);   \
+                                             \
+    tp0r += src0r * x0r;                     \
+    tp0r += src1r * x1r;                     \
+    tp0r OP0 src0i * x0i;                    \
+    tp0r OP0 src1i * x1i;                    \
+                                             \
+    tp1r += src2r * x0r;                     \
+    tp1r += src3r * x1r;                     \
+    tp1r OP0 src2i * x0i;                    \
+    tp1r OP0 src3i * x1i;                    \
+                                             \
+    tp2r += src4r * x0r;                     \
+    tp2r += src5r * x1r;                     \
+    tp2r OP0 src4i * x0i;                    \
+    tp2r OP0 src5i * x1i;                    \
+                                             \
+    tp3r += src6r * x0r;                     \
+    tp3r += src7r * x1r;                     \
+    tp3r OP0 src6i * x0i;                    \
+    tp3r OP0 src7i * x1i;                    \
+                                             \
+    tp0i OP1 src0r * x0i;                    \
+    tp0i OP1 src1r * x1i;                    \
+    tp0i OP2 src0i * x0r;                    \
+    tp0i OP2 src1i * x1r;                    \
+                                             \
+    tp1i OP1 src2r * x0i;                    \
+    tp1i OP1 src3r * x1i;                    \
+    tp1i OP2 src2i * x0r;                    \
+    tp1i OP2 src3i * x1r;                    \
+                                             \
+    tp2i OP1 src4r * x0i;                    \
+    tp2i OP1 src5r * x1i;                    \
+    tp2i OP2 src4i * x0r;                    \
+    tp2i OP2 src5i * x1r;                    \
+                                             \
+    tp3i OP1 src6r * x0i;                    \
+    tp3i OP1 src7r * x1i;                    \
+    tp3i OP2 src6i * x0r;                    \
+    tp3i OP2 src7i * x1r;                    \
+
+#define CGEMV_T_8x2()                     \
+    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);   \
+    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);   \
+                                          \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
+    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
+    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
+    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
+                                          \
+    tp0r += src0r * x0r;                  \
+    tp0r += src1r * x1r;                  \
+    tp0r OP0 src0i * x0i;                 \
+    tp0r OP0 src1i * x1i;                 \
+                                          \
+    tp1r += src2r * x0r;                  \
+    tp1r += src3r * x1r;                  \
+    tp1r OP0 src2i * x0i;                 \
+    tp1r OP0 src3i * x1i;                 \
+                                          \
+    tp0i OP1 src0r * x0i;                 \
+    tp0i OP1 src1r * x1i;                 \
+    tp0i OP2 src0i * x0r;                 \
+    tp0i OP2 src1i * x1r;                 \
+                                          \
+    tp1i OP1 src2r * x0i;                 \
+    tp1i OP1 src3r * x1i;                 \
+    tp1i OP2 src2i * x0r;                 \
+    tp1i OP2 src3i * x1r;                 \
+
+#define CGEMV_T_8x1()                     \
+    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);   \
+                                          \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
+    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
+                                          \
+    tp0r += src0r * x0r;                  \
+    tp0r += src1r * x1r;                  \
+    tp0r OP0 src0i * x0i;                 \
+    tp0r OP0 src1i * x1i;                 \
+                                          \
+    tp0i OP1 src0r * x0i;                 \
+    tp0i OP1 src1r * x1i;                 \
+    tp0i OP2 src0i * x0r;                 \
+    tp0i OP2 src1i * x1r;                 \
+
+#define CGEMV_T_4x4()                       \
+    LD_SP2(pa0 + k, 4, t0, t1);             \
+    LD_SP2(pa1 + k, 4, t4, t5);             \
+    LD_SP2(pa2 + k, 4, t8, t9);             \
+    LD_SP2(pa3 + k, 4, t12, t13);           \
+                                            \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);    \
+    PCKEVOD_W2_SP(t5, t4, src2r, src2i);    \
+    PCKEVOD_W2_SP(t9, t8, src4r, src4i);    \
+    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
+                                            \
+    tp0r += src0r * x0r;                    \
+    tp0r OP0 src0i * x0i;                   \
+                                            \
+    tp1r += src2r * x0r;                    \
+    tp1r OP0 src2i * x0i;                   \
+                                            \
+    tp2r += src4r * x0r;                    \
+    tp2r OP0 src4i * x0i;                   \
+                                            \
+    tp3r += src6r * x0r;                    \
+    tp3r OP0 src6i * x0i;                   \
+                                            \
+    tp0i OP1 src0r * x0i;                   \
+    tp0i OP2 src0i * x0r;                   \
+                                            \
+    tp1i OP1 src2r * x0i;                   \
+    tp1i OP2 src2i * x0r;                   \
+                                            \
+    tp2i OP1 src4r * x0i;                   \
+    tp2i OP2 src4i * x0r;                   \
+                                            \
+    tp3i OP1 src6r * x0i;                   \
+    tp3i OP2 src6i * x0r;                   \
+
+#define CGEMV_T_4x2()                     \
+    LD_SP2(pa0 + k, 4, t0, t1);           \
+    LD_SP2(pa1 + k, 4, t4, t5);           \
+                                          \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
+    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
+                                          \
+    tp0r += src0r * x0r;                  \
+    tp0r OP0 src0i * x0i;                 \
+                                          \
+    tp1r += src2r * x0r;                  \
+    tp1r OP0 src2i * x0i;                 \
+                                          \
+    tp0i OP1 src0r * x0i;                 \
+    tp0i OP2 src0i * x0r;                 \
+                                          \
+    tp1i OP1 src2r * x0i;                 \
+    tp1i OP2 src2i * x0r;                 \
+
+#define CGEMV_T_4x1()                     \
+    LD_SP2(pa0 + k, 4, t0, t1);           \
+                                          \
+    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
+                                          \
+    tp0r += src0r * x0r;                  \
+    tp0r OP0 src0i * x0i;                 \
+                                          \
+    tp0i OP1 src0r * x0i;                 \
+    tp0i OP2 src0i * x0r;                 \
+
+#define CGEMV_T_1x4()                           \
+    temp0r  += pa0[k + 0] * x[0 * inc_x2];      \
+    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];  \
+    temp1r  += pa1[k + 0] * x[0 * inc_x2];      \
+    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];  \
+    temp2r  += pa2[k + 0] * x[0 * inc_x2];      \
+    temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1];  \
+    temp3r  += pa3[k + 0] * x[0 * inc_x2];      \
+    temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1];  \
+                                                \
+    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];  \
+    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];      \
+    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];  \
+    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];      \
+    temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1];  \
+    temp2i OP2 pa2[k + 1] * x[0 * inc_x2];      \
+    temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1];  \
+    temp3i OP2 pa3[k + 1] * x[0 * inc_x2];      \
+
+#define CGEMV_T_1x2()                           \
+    temp0r  += pa0[k + 0] * x[0 * inc_x2];      \
+    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];  \
+    temp1r  += pa1[k + 0] * x[0 * inc_x2];      \
+    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];  \
+                                                \
+    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];  \
+    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];      \
+    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];  \
+    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];      \
+
+#define CGEMV_T_1x1()                           \
+    temp0r  += pa0[k + 0] * x[0 * inc_x2];      \
+    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];  \
+                                                \
+    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];  \
+    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];      \
+
+#define CSCALE_STORE_Y4_GP()    \
+    res0r = y[0 * inc_y2];      \
+    res1r = y[1 * inc_y2];      \
+    res2r = y[2 * inc_y2];      \
+    res3r = y[3 * inc_y2];      \
+                                \
+    res0i = y[0 * inc_y2 + 1];  \
+    res1i = y[1 * inc_y2 + 1];  \
+    res2i = y[2 * inc_y2 + 1];  \
+    res3i = y[3 * inc_y2 + 1];  \
+                                \
+    res0r  += alphar * temp0r;  \
+    res0r OP0 alphai * temp0i;  \
+    res1r  += alphar * temp1r;  \
+    res1r OP0 alphai * temp1i;  \
+    res2r  += alphar * temp2r;  \
+    res2r OP0 alphai * temp2i;  \
+    res3r  += alphar * temp3r;  \
+    res3r OP0 alphai * temp3i;  \
+                                \
+    res0i OP1 alphar * temp0i;  \
+    res0i OP2 alphai * temp0r;  \
+    res1i OP1 alphar * temp1i;  \
+    res1i OP2 alphai * temp1r;  \
+    res2i OP1 alphar * temp2i;  \
+    res2i OP2 alphai * temp2r;  \
+    res3i OP1 alphar * temp3i;  \
+    res3i OP2 alphai * temp3r;  \
+                                \
+    y[0 * inc_y2] = res0r;      \
+    y[1 * inc_y2] = res1r;      \
+    y[2 * inc_y2] = res2r;      \
+    y[3 * inc_y2] = res3r;      \
+                                \
+    y[0 * inc_y2 + 1] = res0i;  \
+    y[1 * inc_y2 + 1] = res1i;  \
+    y[2 * inc_y2 + 1] = res2i;  \
+    y[3 * inc_y2 + 1] = res3i;  \
+
+#define CSCALE_STORE_Y2_GP()    \
+    res0r = y[0 * inc_y2];      \
+    res1r = y[1 * inc_y2];      \
+                                \
+    res0i = y[0 * inc_y2 + 1];  \
+    res1i = y[1 * inc_y2 + 1];  \
+                                \
+    res0r  += alphar * temp0r;  \
+    res0r OP0 alphai * temp0i;  \
+    res1r  += alphar * temp1r;  \
+    res1r OP0 alphai * temp1i;  \
+                                \
+    res0i OP1 alphar * temp0i;  \
+    res0i OP2 alphai * temp0r;  \
+    res1i OP1 alphar * temp1i;  \
+    res1i OP2 alphai * temp1r;  \
+                                \
+    y[0 * inc_y2] = res0r;      \
+    y[1 * inc_y2] = res1r;      \
+                                \
+    y[0 * inc_y2 + 1] = res0i;  \
+    y[1 * inc_y2 + 1] = res1i;  \
+
+
+#define CSCALE_STORE_Y1_GP()    \
+    res0r = y[0 * inc_y2];      \
+    res0i = y[0 * inc_y2 + 1];  \
+                                \
+    res0r  += alphar * temp0r;  \
+    res0r OP0 alphai * temp0i;  \
+                                \
+    res0i OP1 alphar * temp0i;  \
+    res0i OP2 alphai * temp0r;  \
+                                \
+    y[0 * inc_y2] = res0r;      \
+    y[0 * inc_y2 + 1] = res0i;  \
+
+#define CLOAD_X8_VECTOR()             \
+    LD_SP4(x, 4, x0, x1, x2, x3);     \
+    PCKEVOD_W2_SP(x1, x0, x0r, x0i);  \
+    PCKEVOD_W2_SP(x3, x2, x1r, x1i);  \
+
+#define CLOAD_X4_VECTOR()             \
+    LD_SP2(x, 4, x0, x1);             \
+    PCKEVOD_W2_SP(x1, x0, x0r, x0i);  \
+
+#define CLOAD_X8_GP()                                                                \
+    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));      \
+    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  1, *((int *) (x + 1 * inc_x2)));      \
+    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  2, *((int *) (x + 2 * inc_x2)));      \
+    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  3, *((int *) (x + 3 * inc_x2)));      \
+    x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2)));      \
+    x1r = (v4f32) __msa_insert_w((v4i32) x1r,  1, *((int *) (x + 5 * inc_x2)));      \
+    x1r = (v4f32) __msa_insert_w((v4i32) x1r,  2, *((int *) (x + 6 * inc_x2)));      \
+    x1r = (v4f32) __msa_insert_w((v4i32) x1r,  3, *((int *) (x + 7 * inc_x2)));      \
+    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
+    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  1, *((int *) (x + 1 * inc_x2 + 1)));  \
+    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  2, *((int *) (x + 2 * inc_x2 + 1)));  \
+    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  3, *((int *) (x + 3 * inc_x2 + 1)));  \
+    x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1)));  \
+    x1i = (v4f32) __msa_insert_w((v4i32) x1i,  1, *((int *) (x + 5 * inc_x2 + 1)));  \
+    x1i = (v4f32) __msa_insert_w((v4i32) x1i,  2, *((int *) (x + 6 * inc_x2 + 1)));  \
+    x1i = (v4f32) __msa_insert_w((v4i32) x1i,  3, *((int *) (x + 7 * inc_x2 + 1)));  \
+
+#define CLOAD_X4_GP()                                                                \
+    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));      \
+    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  1, *((int *) (x + 1 * inc_x2)));      \
+    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  2, *((int *) (x + 2 * inc_x2)));      \
+    x0r = (v4f32) __msa_insert_w((v4i32) x0r,  3, *((int *) (x + 3 * inc_x2)));      \
+    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
+    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  1, *((int *) (x + 1 * inc_x2 + 1)));  \
+    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  2, *((int *) (x + 2 * inc_x2 + 1)));  \
+    x0i = (v4f32) __msa_insert_w((v4i32) x0i,  3, *((int *) (x + 3 * inc_x2 + 1)));  \
+
+#define CGEMV_T_MSA()                                \
+    for (j = (n >> 2); j--;)                         \
+    {                                                \
+        tp0r = tp1r = tp2r = tp3r = zero;            \
+        tp0i = tp1i = tp2i = tp3i = zero;            \
+                                                     \
+        k = 0;                                       \
+        x = srcx_org;                                \
+                                                     \
+        for (i = (m >> 3); i--;)                     \
+        {                                            \
+            CLOAD_X8();                              \
+            CGEMV_T_8x4();                           \
+                                                     \
+            k += 2 * 8;                              \
+            x += inc_x2 * 8;                         \
+        }                                            \
+                                                     \
+        if (m & 4)                                   \
+        {                                            \
+            CLOAD_X4();                              \
+                                                     \
+            CGEMV_T_4x4();                           \
+                                                     \
+            k += 2 * 4;                              \
+            x += inc_x2 * 4;                         \
+        }                                            \
+                                                     \
+        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r,   \
+                           tp0r, tp1r, tp2r, tp3r);  \
+        TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i,   \
+                           tp0i, tp1i, tp2i, tp3i);  \
+                                                     \
+        tp0r += tp1r;                                \
+        tp0r += tp2r;                                \
+        tp0r += tp3r;                                \
+        tp0i += tp1i;                                \
+        tp0i += tp2i;                                \
+        tp0i += tp3i;                                \
+                                                     \
+        temp0r = tp0r[0];                            \
+        temp1r = tp0r[1];                            \
+        temp2r = tp0r[2];                            \
+        temp3r = tp0r[3];                            \
+        temp0i = tp0i[0];                            \
+        temp1i = tp0i[1];                            \
+        temp2i = tp0i[2];                            \
+        temp3i = tp0i[3];                            \
+                                                     \
+        for (i = (m & 3); i--;)                      \
+        {                                            \
+            CGEMV_T_1x4();                           \
+                                                     \
+            k += 2;                                  \
+            x += inc_x2;                             \
+        }                                            \
+                                                     \
+        CSCALE_STORE_Y4_GP();                        \
+                                                     \
+        pa0 += 4 * lda2;                             \
+        pa1 += 4 * lda2;                             \
+        pa2 += 4 * lda2;                             \
+        pa3 += 4 * lda2;                             \
+        y += 4 * inc_y2;                             \
+    }                                                \
+                                                     \
+    if (n & 2)                                       \
+    {                                                \
+        tp0r = tp1r = zero;                          \
+        tp0i = tp1i = zero;                          \
+                                                     \
+        k = 0;                                       \
+        x = srcx_org;                                \
+                                                     \
+        for (i = (m >> 3); i--;)                     \
+        {                                            \
+            CLOAD_X8();                              \
+                                                     \
+            CGEMV_T_8x2();                           \
+                                                     \
+            k += 2 * 8;                              \
+            x += inc_x2 * 8;                         \
+        }                                            \
+                                                     \
+        if (m & 4)                                   \
+        {                                            \
+            CLOAD_X4();                              \
+                                                     \
+            CGEMV_T_4x2();                           \
+                                                     \
+            k += 2 * 4;                              \
+            x += inc_x2 * 4;                         \
+        }                                            \
+                                                     \
+        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i,   \
+                           tp0r, tp1r, tp0i, tp1i);  \
+                                                     \
+        tp0r += tp1r;                                \
+        tp0r += tp0i;                                \
+        tp0r += tp1i;                                \
+                                                     \
+        temp0r = tp0r[0];                            \
+        temp1r = tp0r[1];                            \
+        temp0i = tp0r[2];                            \
+        temp1i = tp0r[3];                            \
+                                                     \
+        for (i = (m & 3); i--;)                      \
+        {                                            \
+            CGEMV_T_1x2();                           \
+                                                     \
+            k += 2;                                  \
+            x += inc_x2;                             \
+        }                                            \
+                                                     \
+        CSCALE_STORE_Y2_GP();                        \
+                                                     \
+        pa0 += 2 * lda2;                             \
+        pa1 += 2 * lda2;                             \
+        y += 2 * inc_y2;                             \
+    }                                                \
+                                                     \
+    if (n & 1)                                       \
+    {                                                \
+        tp0r = zero;                                 \
+        tp0i = zero;                                 \
+                                                     \
+        k = 0;                                       \
+        x = srcx_org;                                \
+                                                     \
+        for (i = (m >> 3); i--;)                     \
+        {                                            \
+            CLOAD_X8();                              \
+                                                     \
+            CGEMV_T_8x1();                           \
+                                                     \
+            k += 2 * 8;                              \
+            x += inc_x2 * 8;                         \
+        }                                            \
+                                                     \
+        if (m & 4)                                   \
+        {                                            \
+            CLOAD_X4();                              \
+                                                     \
+            CGEMV_T_4x1();                           \
+                                                     \
+            k += 2 * 4;                              \
+            x += inc_x2 * 4;                         \
+        }                                            \
+                                                     \
+        ILVRL_W2_SP(tp0i, tp0r, t0, t1);             \
+                                                     \
+        t0 += t1;                                    \
+                                                     \
+        temp0r = t0[0] + t0[2];                      \
+        temp0i = t0[1] + t0[3];                      \
+                                                     \
+        for (i = (m & 3); i--;)                      \
+        {                                            \
+            CGEMV_T_1x1();                           \
+                                                     \
+            k += 2;                                  \
+            x += inc_x2;                             \
+        }                                            \
+                                                     \
+        CSCALE_STORE_Y1_GP();                        \
+                                                     \
+        pa0 += lda2;                                 \
+        y += inc_y2;                                 \
+    }                                                \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
+          FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+          BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    FLOAT *pa0, *pa1, *pa2, *pa3;
+    FLOAT *srcx_org = x;
+    FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i;
+    FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i;
+    BLASLONG inc_x2, inc_y2, lda2;
+    v4f32 zero = {0};
+    v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
+    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
+    v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
+    v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;
+
+    lda2 = 2 * lda;
+
+    pa0 = A;
+    pa1 = A + lda2;
+    pa2 = A + 2 * lda2;
+    pa3 = A + 3 * lda2;
+
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+
+    if (2 == inc_x2)
+    {
+        #define CLOAD_X8  CLOAD_X8_VECTOR
+        #define CLOAD_X4  CLOAD_X4_VECTOR
+
+        CGEMV_T_MSA();
+
+        #undef CLOAD_X8
+        #undef CLOAD_X4
+    }
+    else
+    {
+        #define CLOAD_X8  CLOAD_X8_GP
+        #define CLOAD_X4  CLOAD_X4_GP
+
+        CGEMV_T_MSA();
+
+        #undef CLOAD_X8
+        #undef CLOAD_X4
+    }
+
+    return(0);
+}
+
+#undef OP0
+#undef OP1
+#undef OP2
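
[Editorial note, not part of the patch] The OP0/OP1/OP2 selection at the top of this file encodes, at compile time, the signs of the real/imaginary cross terms for the plain, CONJ and XCONJ builds, so the dot-product inner loops stay branch-free; CSCALE_STORE_* applies the matching signs when scaling by alpha. A runtime-flag sketch of the underlying idea, with illustrative names only:

    /* One column of a transposed complex GEMV as a conjugating dot product.
     * The kernel fixes these signs via OP0/OP1/OP2 instead of testing flags. */
    static void cdot_col_ref(int m, const float *a, const float *x, int inc_x,
                             int conj_a, int conj_x, float *sum_r, float *sum_i)
    {
        float sr = 0.0f, si = 0.0f;

        for (int k = 0; k < m; k++)
        {
            float ar = a[2 * k];
            float ai = conj_a ? -a[2 * k + 1] : a[2 * k + 1];
            float xr = x[2 * k * inc_x];
            float xi = conj_x ? -x[2 * k * inc_x + 1] : x[2 * k * inc_x + 1];

            sr += ar * xr - ai * xi;   /* real part; OP0 folds this sign in  */
            si += ar * xi + ai * xr;   /* imag part; OP1/OP2 fold signs in   */
        }

        *sum_r = sr;
        *sum_i = si;
    }
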
diff --git a/kernel/mips/dasum_msa.c b/kernel/mips/dasum_msa.c
new file mode 100644 (file)
index 0000000..a3641cd
--- /dev/null
@@ -0,0 +1,278 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include "macros_msa.h"
+
+#define AND_VEC_D(in)   ((v2f64) ((v2i64) in & and_vec))
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i;
+    FLOAT sumf = 0.0;
+    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+    v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
+    v2f64 zero_v = {0};
+    v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
+
+    if (n <= 0 || inc_x <= 0) return (sumf);
+
+    if (1 == inc_x)
+    {
+        if (n > 15)
+        {
+            n -= 16;
+
+            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 = AND_VEC_D(src0);
+            sum_abs1 = AND_VEC_D(src1);
+            sum_abs2 = AND_VEC_D(src2);
+            sum_abs3 = AND_VEC_D(src3);
+            sum_abs0 += AND_VEC_D(src4);
+            sum_abs1 += AND_VEC_D(src5);
+            sum_abs2 += AND_VEC_D(src6);
+            sum_abs3 += AND_VEC_D(src7);
+        }
+        else
+        {
+            sum_abs0 = zero_v;
+            sum_abs1 = zero_v;
+            sum_abs2 = zero_v;
+            sum_abs3 = zero_v;
+        }
+
+        for (i = (n >> 4); i--;)
+        {
+            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 += AND_VEC_D(src0);
+            sum_abs1 += AND_VEC_D(src1);
+            sum_abs2 += AND_VEC_D(src2);
+            sum_abs3 += AND_VEC_D(src3);
+            sum_abs0 += AND_VEC_D(src4);
+            sum_abs1 += AND_VEC_D(src5);
+            sum_abs2 += AND_VEC_D(src6);
+            sum_abs3 += AND_VEC_D(src7);
+        }
+
+        if (n & 15)
+        {
+            if ((n & 8) && (n & 4) && (n & 2))
+            {
+                LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+                sum_abs0 += AND_VEC_D(src4);
+                sum_abs1 += AND_VEC_D(src5);
+                sum_abs2 += AND_VEC_D(src6);
+            }
+            else if ((n & 8) && (n & 4))
+            {
+                LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+                sum_abs0 += AND_VEC_D(src4);
+                sum_abs1 += AND_VEC_D(src5);
+            }
+            else if ((n & 8) && (n & 2))
+            {
+                LD_DP5_INC(x, 2, src0, src1, src2, src3, src4);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+                sum_abs0 += AND_VEC_D(src4);
+            }
+            else if ((n & 4) && (n & 2))
+            {
+                LD_DP3_INC(x, 2, src0, src1, src2);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+            }
+            else if (n & 8)
+            {
+                LD_DP4_INC(x, 2, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+            }
+            else if (n & 4)
+            {
+                LD_DP2_INC(x, 2, src0, src1);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+            }
+            else if (n & 2)
+            {
+                src0 = LD_DP(x); x += 2;
+
+                sum_abs0 += AND_VEC_D(src0);
+            }
+
+            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+            sumf = sum_abs0[0] + sum_abs0[1];
+
+            if (n & 1)
+            {
+                sumf += fabs(*x);
+            }
+        }
+        else
+        {
+            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+            sumf = sum_abs0[0] + sum_abs0[1];
+        }
+    }
+    else
+    {
+        if (n > 8)
+        {
+            n -= 8;
+
+            LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 = AND_VEC_D(src0);
+            sum_abs1 = AND_VEC_D(src1);
+            sum_abs2 = AND_VEC_D(src2);
+            sum_abs3 = AND_VEC_D(src3);
+            sum_abs0 += AND_VEC_D(src4);
+            sum_abs1 += AND_VEC_D(src5);
+            sum_abs2 += AND_VEC_D(src6);
+            sum_abs3 += AND_VEC_D(src7);
+        }
+        else
+        {
+            sum_abs0 = zero_v;
+            sum_abs1 = zero_v;
+            sum_abs2 = zero_v;
+            sum_abs3 = zero_v;
+        }
+
+        for (i = (n >> 3); i--;)
+        {
+            LD_DP8_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 += AND_VEC_D(src0);
+            sum_abs1 += AND_VEC_D(src1);
+            sum_abs2 += AND_VEC_D(src2);
+            sum_abs3 += AND_VEC_D(src3);
+            sum_abs0 += AND_VEC_D(src4);
+            sum_abs1 += AND_VEC_D(src5);
+            sum_abs2 += AND_VEC_D(src6);
+            sum_abs3 += AND_VEC_D(src7);
+        }
+
+        if (n & 7)
+        {
+            if ((n & 4) && (n & 2) && (n & 1))
+            {
+                LD_DP7_INC(x, inc_x, src0, src1, src2, src3, src4, src5, src6);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+                sum_abs0 += AND_VEC_D(src4);
+                sum_abs1 += AND_VEC_D(src5);
+                sum_abs2 += AND_VEC_D(src6);
+            }
+            else if ((n & 4) && (n & 2))
+            {
+                LD_DP6_INC(x, inc_x, src0, src1, src2, src3, src4, src5);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+                sum_abs0 += AND_VEC_D(src4);
+                sum_abs1 += AND_VEC_D(src5);
+            }
+            else if ((n & 4) && (n & 1))
+            {
+                LD_DP5_INC(x, inc_x, src0, src1, src2, src3, src4);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+                sum_abs0 += AND_VEC_D(src4);
+            }
+            else if ((n & 2) && (n & 1))
+            {
+                LD_DP3_INC(x, inc_x, src0, src1, src2);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+            }
+            else if (n & 4)
+            {
+                LD_DP4_INC(x, inc_x, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+                sum_abs2 += AND_VEC_D(src2);
+                sum_abs3 += AND_VEC_D(src3);
+            }
+            else if (n & 2)
+            {
+                LD_DP2_INC(x, inc_x, src0, src1);
+
+                sum_abs0 += AND_VEC_D(src0);
+                sum_abs1 += AND_VEC_D(src1);
+            }
+            else if (n & 1)
+            {
+                src0 = LD_DP(x);
+
+                sum_abs0 += AND_VEC_D(src0);
+            }
+        }
+
+        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+        sumf = sum_abs0[0];
+    }
+
+    return (sumf);
+}
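
[Editorial note, not part of the patch] AND_VEC_D above takes absolute values by clearing the IEEE-754 sign bit of both double lanes with the 0x7FFFFFFFFFFFFFFF mask, so the accumulation loops need no branches or fabs() calls. The scalar equivalent of that trick, shown only for illustration:

    #include <stdint.h>
    #include <string.h>

    /* Scalar equivalent of AND_VEC_D: clearing bit 63 of an IEEE-754 double
     * yields its absolute value (illustrative sketch). */
    static double dabs_bits(double v)
    {
        uint64_t u;
        memcpy(&u, &v, sizeof u);        /* reinterpret the double as raw bits */
        u &= 0x7FFFFFFFFFFFFFFFULL;      /* clear the sign bit                 */
        memcpy(&v, &u, sizeof v);
        return v;                        /* same result as fabs(v)             */
    }
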
diff --git a/kernel/mips/ddot_msa.c b/kernel/mips/ddot_msa.c
new file mode 100644 (file)
index 0000000..b56e101
--- /dev/null
@@ -0,0 +1,189 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* return double; x, y are double */
+#if defined(DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+    BLASLONG i = 0;
+    double dot = 0.0;
+    FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
+    v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
+    v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
+    v2f64 dot0 = {0, 0};
+
+    if (n < 0) return (dot);
+
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        for (i = (n >> 4); i--;)
+        {
+            LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+            LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+            dot0 += (vy0 * vx0);
+            dot0 += (vy1 * vx1);
+            dot0 += (vy2 * vx2);
+            dot0 += (vy3 * vx3);
+            dot0 += (vy4 * vx4);
+            dot0 += (vy5 * vx5);
+            dot0 += (vy6 * vx6);
+            dot0 += (vy7 * vx7);
+        }
+
+        if (n & 15)
+        {
+            if ((n & 8) && (n & 4) && (n & 2))
+            {
+                LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
+                LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+                dot0 += (vy3 * vx3);
+                dot0 += (vy4 * vx4);
+                dot0 += (vy5 * vx5);
+                dot0 += (vy6 * vx6);
+            }
+            else if ((n & 8) && (n & 4))
+            {
+                LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5);
+                LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+                dot0 += (vy3 * vx3);
+                dot0 += (vy4 * vx4);
+                dot0 += (vy5 * vx5);
+            }
+            else if ((n & 8) && (n & 2))
+            {
+                LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4);
+                LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+                dot0 += (vy3 * vx3);
+                dot0 += (vy4 * vx4);
+            }
+            else if ((n & 4) && (n & 2))
+            {
+                LD_DP3_INC(x, 2, vx0, vx1, vx2);
+                LD_DP3_INC(y, 2, vy0, vy1, vy2);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+            }
+            else if (n & 8)
+            {
+                LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3);
+                LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+                dot0 += (vy3 * vx3);
+            }
+            else if (n & 4)
+            {
+                LD_DP2_INC(x, 2, vx0, vx1);
+                LD_DP2_INC(y, 2, vy0, vy1);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+            }
+            else if (n & 2)
+            {
+                vx0 = LD_DP(x); x += 2;
+                vy0 = LD_DP(y); y += 2;
+
+                dot0 += (vy0 * vx0);
+            }
+
+            if (n & 1)
+            {
+                x0 = *x;
+                y0 = *y;
+
+                dot += (y0 * x0);
+            }
+        }
+
+        dot += dot0[0];
+        dot += dot0[1];
+    }
+    else
+    {
+        for (i = (n >> 2); i--;)
+        {
+            LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
+            LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
+
+            dot += (y0 * x0);
+            dot += (y1 * x1);
+            dot += (y2 * x2);
+            dot += (y3 * x3);
+        }
+
+        if ((n & 2) && (n & 1))
+        {
+            LD_GP3_INC(x, inc_x, x0, x1, x2);
+            LD_GP3_INC(y, inc_y, y0, y1, y2);
+
+            dot += (y0 * x0);
+            dot += (y1 * x1);
+            dot += (y2 * x2);
+        }
+        else if (n & 2)
+        {
+            LD_GP2_INC(x, inc_x, x0, x1);
+            LD_GP2_INC(y, inc_y, y0, y1);
+
+            dot += (y0 * x0);
+            dot += (y1 * x1);
+        }
+        else if (n & 1)
+        {
+            x0 = *x;
+            y0 = *y;
+
+            dot += (y0 * x0);
+        }
+    }
+
+    return (dot);
+}
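
[Editorial note, not part of the patch] The unit-stride path above consumes 16 doubles per iteration (eight v2f64 loads from each of x and y); the remainder n & 15 is then drained by testing the bits of n, so its even part is covered by one combined LD_DPk_INC call and only an odd trailing element falls back to scalar code. A plain-C sketch of that overall shape (illustrative only; the MSA kernel's accumulation order differs):

    /* Reference dot product with the same unroll-by-16 body plus tail handling. */
    static double ddot_ref(long n, const double *x, const double *y)
    {
        double dot = 0.0;
        long   i   = 0;

        for (; i + 16 <= n; i += 16)      /* main body: n >> 4 iterations       */
            for (long k = 0; k < 16; k++)
                dot += y[i + k] * x[i + k];

        for (; i < n; i++)                /* remainder: the n & 15 elements     */
            dot += y[i] * x[i];

        return dot;
    }
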
diff --git a/kernel/mips/dgemv_n_msa.c b/kernel/mips/dgemv_n_msa.c
new file mode 100644 (file)
index 0000000..09bb063
--- /dev/null
@@ -0,0 +1,577 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define DGEMV_N_8x8()                        \
+{                                            \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
+    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
+    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
+    LD_DP4(pa4 + k, 2, t16, t17, t18, t19);  \
+    LD_DP4(pa5 + k, 2, t20, t21, t22, t23);  \
+    LD_DP4(pa6 + k, 2, t24, t25, t26, t27);  \
+    LD_DP4(pa7 + k, 2, t28, t29, t30, t31);  \
+                                             \
+    y0 += tp0 * t0;                          \
+    y1 += tp0 * t1;                          \
+    y2 += tp0 * t2;                          \
+    y3 += tp0 * t3;                          \
+                                             \
+    y0 += tp1 * t4;                          \
+    y1 += tp1 * t5;                          \
+    y2 += tp1 * t6;                          \
+    y3 += tp1 * t7;                          \
+                                             \
+    y0 += tp2 * t8;                          \
+    y1 += tp2 * t9;                          \
+    y2 += tp2 * t10;                         \
+    y3 += tp2 * t11;                         \
+                                             \
+    y0 += tp3 * t12;                         \
+    y1 += tp3 * t13;                         \
+    y2 += tp3 * t14;                         \
+    y3 += tp3 * t15;                         \
+                                             \
+    y0 += tp4 * t16;                         \
+    y1 += tp4 * t17;                         \
+    y2 += tp4 * t18;                         \
+    y3 += tp4 * t19;                         \
+                                             \
+    y0 += tp5 * t20;                         \
+    y1 += tp5 * t21;                         \
+    y2 += tp5 * t22;                         \
+    y3 += tp5 * t23;                         \
+                                             \
+    y0 += tp6 * t24;                         \
+    y1 += tp6 * t25;                         \
+    y2 += tp6 * t26;                         \
+    y3 += tp6 * t27;                         \
+                                             \
+    y0 += tp7 * t28;                         \
+    y1 += tp7 * t29;                         \
+    y2 += tp7 * t30;                         \
+    y3 += tp7 * t31;                         \
+}
+
+#define DGEMV_N_4x8()              \
+{                                  \
+    LD_DP2(pa0 + k, 2, t0, t1);    \
+    LD_DP2(pa1 + k, 2, t4, t5);    \
+    LD_DP2(pa2 + k, 2, t8, t9);    \
+    LD_DP2(pa3 + k, 2, t12, t13);  \
+    LD_DP2(pa4 + k, 2, t16, t17);  \
+    LD_DP2(pa5 + k, 2, t20, t21);  \
+    LD_DP2(pa6 + k, 2, t24, t25);  \
+    LD_DP2(pa7 + k, 2, t28, t29);  \
+                                   \
+    y0 += tp0 * t0;                \
+    y1 += tp0 * t1;                \
+                                   \
+    y0 += tp1 * t4;                \
+    y1 += tp1 * t5;                \
+                                   \
+    y0 += tp2 * t8;                \
+    y1 += tp2 * t9;                \
+                                   \
+    y0 += tp3 * t12;               \
+    y1 += tp3 * t13;               \
+                                   \
+    y0 += tp4 * t16;               \
+    y1 += tp4 * t17;               \
+                                   \
+    y0 += tp5 * t20;               \
+    y1 += tp5 * t21;               \
+                                   \
+    y0 += tp6 * t24;               \
+    y1 += tp6 * t25;               \
+                                   \
+    y0 += tp7 * t28;               \
+    y1 += tp7 * t29;               \
+}
+
+#define DGEMV_N_8x4()                        \
+{                                            \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
+    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
+    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
+                                             \
+    y0 += tp0 * t0;                          \
+    y1 += tp0 * t1;                          \
+    y2 += tp0 * t2;                          \
+    y3 += tp0 * t3;                          \
+                                             \
+    y0 += tp1 * t4;                          \
+    y1 += tp1 * t5;                          \
+    y2 += tp1 * t6;                          \
+    y3 += tp1 * t7;                          \
+                                             \
+    y0 += tp2 * t8;                          \
+    y1 += tp2 * t9;                          \
+    y2 += tp2 * t10;                         \
+    y3 += tp2 * t11;                         \
+                                             \
+    y0 += tp3 * t12;                         \
+    y1 += tp3 * t13;                         \
+    y2 += tp3 * t14;                         \
+    y3 += tp3 * t15;                         \
+}
+
+#define DGEMV_N_4x4()              \
+{                                  \
+    LD_DP2(pa0 + k, 2, t0, t1);    \
+    LD_DP2(pa1 + k, 2, t4, t5);    \
+    LD_DP2(pa2 + k, 2, t8, t9);    \
+    LD_DP2(pa3 + k, 2, t12, t13);  \
+                                   \
+    y0 += tp0 * t0;                \
+    y1 += tp0 * t1;                \
+                                   \
+    y0 += tp1 * t4;                \
+    y1 += tp1 * t5;                \
+                                   \
+    y0 += tp2 * t8;                \
+    y1 += tp2 * t9;                \
+                                   \
+    y0 += tp3 * t12;               \
+    y1 += tp3 * t13;               \
+}
+
+#define DGEMV_N_8x2()                    \
+{                                        \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);  \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);  \
+                                         \
+    y0 += tp0 * t0;                      \
+    y1 += tp0 * t1;                      \
+    y2 += tp0 * t2;                      \
+    y3 += tp0 * t3;                      \
+                                         \
+    y0 += tp1 * t4;                      \
+    y1 += tp1 * t5;                      \
+    y2 += tp1 * t6;                      \
+    y3 += tp1 * t7;                      \
+}
+
+#define DGEMV_N_4x2()            \
+{                                \
+    LD_DP2(pa0 + k, 2, t0, t1);  \
+    LD_DP2(pa1 + k, 2, t4, t5);  \
+                                 \
+    y0 += tp0 * t0;              \
+    y1 += tp0 * t1;              \
+                                 \
+    y0 += tp1 * t4;              \
+    y1 += tp1 * t5;              \
+}
+
+#define DLOAD_X8_SCALE_GP()             \
+   temp0 = alpha * x[0 * inc_x];        \
+   temp1 = alpha * x[1 * inc_x];        \
+   temp2 = alpha * x[2 * inc_x];        \
+   temp3 = alpha * x[3 * inc_x];        \
+   temp4 = alpha * x[4 * inc_x];        \
+   temp5 = alpha * x[5 * inc_x];        \
+   temp6 = alpha * x[6 * inc_x];        \
+   temp7 = alpha * x[7 * inc_x];        \
+                                        \
+   tp0 = COPY_DOUBLE_TO_VECTOR(temp0);  \
+   tp1 = COPY_DOUBLE_TO_VECTOR(temp1);  \
+   tp2 = COPY_DOUBLE_TO_VECTOR(temp2);  \
+   tp3 = COPY_DOUBLE_TO_VECTOR(temp3);  \
+   tp4 = COPY_DOUBLE_TO_VECTOR(temp4);  \
+   tp5 = COPY_DOUBLE_TO_VECTOR(temp5);  \
+   tp6 = COPY_DOUBLE_TO_VECTOR(temp6);  \
+   tp7 = COPY_DOUBLE_TO_VECTOR(temp7);  \
+
+#define  DLOAD_X4_SCALE_GP()             \
+    temp0 = alpha * x[0 * inc_x];        \
+    temp1 = alpha * x[1 * inc_x];        \
+    temp2 = alpha * x[2 * inc_x];        \
+    temp3 = alpha * x[3 * inc_x];        \
+                                         \
+    tp0 = COPY_DOUBLE_TO_VECTOR(temp0);  \
+    tp1 = COPY_DOUBLE_TO_VECTOR(temp1);  \
+    tp2 = COPY_DOUBLE_TO_VECTOR(temp2);  \
+    tp3 = COPY_DOUBLE_TO_VECTOR(temp3);  \
+
+#define DLOAD_X8_SCALE_VECTOR()    \
+    LD_DP4(x, 2, x0, x1, x2, x3);  \
+                                   \
+    x0 = x0 * v_alpha;             \
+    x1 = x1 * v_alpha;             \
+    x2 = x2 * v_alpha;             \
+    x3 = x3 * v_alpha;             \
+                                   \
+    SPLATI_D2_DP(x0, tp0, tp1);    \
+    SPLATI_D2_DP(x1, tp2, tp3);    \
+    SPLATI_D2_DP(x2, tp4, tp5);    \
+    SPLATI_D2_DP(x3, tp6, tp7);    \
+
+#define DLOAD_X4_SCALE_VECTOR()  \
+    LD_DP2(x, 2, x0, x1);        \
+                                 \
+    x0 = x0 * v_alpha;           \
+    x1 = x1 * v_alpha;           \
+                                 \
+    SPLATI_D2_DP(x0, tp0, tp1);  \
+    SPLATI_D2_DP(x1, tp2, tp3);  \
+
+#define DLOAD_Y8_GP()                                                              \
+    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y)));  \
+    y0 = (v2f64) __msa_insert_d((v2i64) y0,  1, *((long long *)(y + 1 * inc_y)));  \
+    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y)));  \
+    y1 = (v2f64) __msa_insert_d((v2i64) y1,  1, *((long long *)(y + 3 * inc_y)));  \
+    y2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 4 * inc_y)));  \
+    y2 = (v2f64) __msa_insert_d((v2i64) y2,  1, *((long long *)(y + 5 * inc_y)));  \
+    y3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 6 * inc_y)));  \
+    y3 = (v2f64) __msa_insert_d((v2i64) y3,  1, *((long long *)(y + 7 * inc_y)));  \
+
+#define DLOAD_Y4_GP()                                                              \
+    y0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 0 * inc_y)));  \
+    y0 = (v2f64) __msa_insert_d((v2i64) y0,  1, *((long long *)(y + 1 * inc_y)));  \
+    y1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(y + 2 * inc_y)));  \
+    y1 = (v2f64) __msa_insert_d((v2i64) y1,  1, *((long long *)(y + 3 * inc_y)));  \
+
+#define DLOAD_Y8_VECTOR()  LD_DP4(y, 2, y0, y1, y2, y3);
+#define DLOAD_Y4_VECTOR()  LD_DP2(y, 2, y0, y1);
+
+#define DSTORE_Y8_GP()                                                \
+    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0);  \
+    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1);  \
+    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0);  \
+    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1);  \
+    *((long long *)(y + 4 * inc_y)) = __msa_copy_s_d((v2i64) y2, 0);  \
+    *((long long *)(y + 5 * inc_y)) = __msa_copy_s_d((v2i64) y2, 1);  \
+    *((long long *)(y + 6 * inc_y)) = __msa_copy_s_d((v2i64) y3, 0);  \
+    *((long long *)(y + 7 * inc_y)) = __msa_copy_s_d((v2i64) y3, 1);  \
+
+#define DSTORE_Y4_GP()                                                \
+    *((long long *)(y + 0 * inc_y)) = __msa_copy_s_d((v2i64) y0, 0);  \
+    *((long long *)(y + 1 * inc_y)) = __msa_copy_s_d((v2i64) y0, 1);  \
+    *((long long *)(y + 2 * inc_y)) = __msa_copy_s_d((v2i64) y1, 0);  \
+    *((long long *)(y + 3 * inc_y)) = __msa_copy_s_d((v2i64) y1, 1);  \
+
+#define DSTORE_Y8_VECTOR()  ST_DP4(y0, y1, y2, y3, y, 2);
+#define DSTORE_Y4_VECTOR()  ST_DP2(y0, y1, y, 2);
+
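+/* y += alpha * A * x: columns of A are processed in blocks of 8, 4, 2 and 1;
+   within each column block the rows are handled 8 and 4 at a time, followed by
+   a scalar tail. */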
+#define DGEMV_N_MSA()                        \
+    for (j = (n >> 3); j--;)                 \
+    {                                        \
+        DLOAD_X8_SCALE();                    \
+                                             \
+        k = 0;                               \
+        y = y_org;                           \
+                                             \
+        for (i = (m >> 3); i--;)             \
+        {                                    \
+            DLOAD_Y8();                      \
+            DGEMV_N_8x8();                   \
+            DSTORE_Y8();                     \
+                                             \
+            y += 8 * inc_y;                  \
+            k += 8;                          \
+        }                                    \
+                                             \
+        if (m & 4)                           \
+        {                                    \
+            DLOAD_Y4();                      \
+            DGEMV_N_4x8();                   \
+            DSTORE_Y4();                     \
+                                             \
+            y += 4 * inc_y;                  \
+            k += 4;                          \
+        }                                    \
+                                             \
+        if (m & 3)                           \
+        {                                    \
+            temp0 = alpha * x[0 * inc_x];    \
+            temp1 = alpha * x[1 * inc_x];    \
+            temp2 = alpha * x[2 * inc_x];    \
+            temp3 = alpha * x[3 * inc_x];    \
+            temp4 = alpha * x[4 * inc_x];    \
+            temp5 = alpha * x[5 * inc_x];    \
+            temp6 = alpha * x[6 * inc_x];    \
+            temp7 = alpha * x[7 * inc_x];    \
+                                             \
+            for (i = (m & 3); i--;)          \
+            {                                \
+                temp = y[0];                 \
+                temp += temp0 * pa0[k];      \
+                temp += temp1 * pa1[k];      \
+                temp += temp2 * pa2[k];      \
+                temp += temp3 * pa3[k];      \
+                temp += temp4 * pa4[k];      \
+                temp += temp5 * pa5[k];      \
+                temp += temp6 * pa6[k];      \
+                temp += temp7 * pa7[k];      \
+                y[0] = temp;                 \
+                                             \
+                y += inc_y;                  \
+                k++;                         \
+            }                                \
+        }                                    \
+        pa0 += 8 * lda;                      \
+        pa1 += 8 * lda;                      \
+        pa2 += 8 * lda;                      \
+        pa3 += 8 * lda;                      \
+        pa4 += 8 * lda;                      \
+        pa5 += 8 * lda;                      \
+        pa6 += 8 * lda;                      \
+        pa7 += 8 * lda;                      \
+                                             \
+        x += 8 * inc_x;                      \
+    }                                        \
+                                             \
+    if (n & 4)                               \
+    {                                        \
+        DLOAD_X4_SCALE();                    \
+                                             \
+        k = 0;                               \
+        y = y_org;                           \
+                                             \
+        for (i = (m >> 3); i--;)             \
+        {                                    \
+            DLOAD_Y8();                      \
+            DGEMV_N_8x4();                   \
+            DSTORE_Y8();                     \
+                                             \
+            y += 8 * inc_y;                  \
+            k += 8;                          \
+        }                                    \
+                                             \
+        if (m & 4)                           \
+        {                                    \
+            DLOAD_Y4();                      \
+            DGEMV_N_4x4();                   \
+            DSTORE_Y4();                     \
+                                             \
+            y += 4 * inc_y;                  \
+            k += 4;                          \
+        }                                    \
+                                             \
+        if (m & 3)                           \
+        {                                    \
+            temp0 = alpha * x[0 * inc_x];    \
+            temp1 = alpha * x[1 * inc_x];    \
+            temp2 = alpha * x[2 * inc_x];    \
+            temp3 = alpha * x[3 * inc_x];    \
+                                             \
+            for (i = (m & 3); i--;)          \
+            {                                \
+                temp = y[0];                 \
+                temp += temp0 * pa0[k];      \
+                temp += temp1 * pa1[k];      \
+                temp += temp2 * pa2[k];      \
+                temp += temp3 * pa3[k];      \
+                y[0] = temp;                 \
+                                             \
+                y += inc_y;                  \
+                k++;                         \
+            }                                \
+        }                                    \
+                                             \
+        pa0 += 4 * lda;                      \
+        pa1 += 4 * lda;                      \
+        pa2 += 4 * lda;                      \
+        pa3 += 4 * lda;                      \
+                                             \
+        x += 4 * inc_x;                      \
+    }                                        \
+                                             \
+    if (n & 2)                               \
+    {                                        \
+        temp0 = alpha * x[0 * inc_x];        \
+        temp1 = alpha * x[1 * inc_x];        \
+                                             \
+        tp0 = COPY_DOUBLE_TO_VECTOR(temp0);  \
+        tp1 = COPY_DOUBLE_TO_VECTOR(temp1);  \
+                                             \
+        k = 0;                               \
+        y = y_org;                           \
+                                             \
+        for (i = (m >> 3); i--;)             \
+        {                                    \
+            DLOAD_Y8();                      \
+            DGEMV_N_8x2();                   \
+            DSTORE_Y8();                     \
+                                             \
+            y += 8 * inc_y;                  \
+            k += 8;                          \
+        }                                    \
+                                             \
+        if (m & 4)                           \
+        {                                    \
+            DLOAD_Y4();                      \
+            DGEMV_N_4x2();                   \
+            DSTORE_Y4();                     \
+                                             \
+            y += 4 * inc_y;                  \
+            k += 4;                          \
+        }                                    \
+                                             \
+        if (m & 3)                           \
+        {                                    \
+            temp0 = alpha * x[0 * inc_x];    \
+            temp1 = alpha * x[1 * inc_x];    \
+                                             \
+            for (i = (m & 3); i--;)          \
+            {                                \
+                temp = y[0];                 \
+                temp += temp0 * pa0[k];      \
+                temp += temp1 * pa1[k];      \
+                y[0] = temp;                 \
+                                             \
+                y += inc_y;                  \
+                k++;                         \
+            }                                \
+        }                                    \
+                                             \
+        pa0 += 2 * lda;                      \
+        pa1 += 2 * lda;                      \
+                                             \
+        x += 2 * inc_x;                      \
+    }                                        \
+                                             \
+    if (n & 1)                               \
+    {                                        \
+        temp = alpha * x[0];                 \
+                                             \
+        k = 0;                               \
+        y = y_org;                           \
+                                             \
+        for (i = m; i--;)                    \
+        {                                    \
+           y[0] += temp * pa0[k];            \
+           y += inc_y;                       \
+           k++;                              \
+        }                                    \
+    }                                        \
+
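+/* The x-scaling and y load/store helpers are bound to their GP or VECTOR
+   variants according to inc_x and inc_y before the common DGEMV_N_MSA body
+   is expanded. */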
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
+          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    FLOAT *y_org = y;
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
+    FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v2f64 v_alpha;
+    v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
+    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
+    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+
+    v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);
+
+    pa0 = A;
+    pa1 = A + lda;
+    pa2 = A + 2 * lda;
+    pa3 = A + 3 * lda;
+    pa4 = A + 4 * lda;
+    pa5 = A + 5 * lda;
+    pa6 = A + 6 * lda;
+    pa7 = A + 7 * lda;
+
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        #define DLOAD_X8_SCALE   DLOAD_X8_SCALE_VECTOR
+        #define DLOAD_X4_SCALE   DLOAD_X4_SCALE_VECTOR
+        #define DLOAD_Y8   DLOAD_Y8_VECTOR
+        #define DLOAD_Y4   DLOAD_Y4_VECTOR
+        #define DSTORE_Y8  DSTORE_Y8_VECTOR
+        #define DSTORE_Y4  DSTORE_Y4_VECTOR
+
+        DGEMV_N_MSA();
+
+        #undef DLOAD_X8_SCALE
+        #undef DLOAD_X4_SCALE
+        #undef DLOAD_Y8
+        #undef DLOAD_Y4
+        #undef DSTORE_Y8
+        #undef DSTORE_Y4
+    }
+    else if (1 == inc_y)
+    {
+        #define DLOAD_X8_SCALE   DLOAD_X8_SCALE_GP
+        #define DLOAD_X4_SCALE   DLOAD_X4_SCALE_GP
+        #define DLOAD_Y8   DLOAD_Y8_VECTOR
+        #define DLOAD_Y4   DLOAD_Y4_VECTOR
+        #define DSTORE_Y8  DSTORE_Y8_VECTOR
+        #define DSTORE_Y4  DSTORE_Y4_VECTOR
+
+        DGEMV_N_MSA();
+
+        #undef DLOAD_X8_SCALE
+        #undef DLOAD_X4_SCALE
+        #undef DLOAD_Y8
+        #undef DLOAD_Y4
+        #undef DSTORE_Y8
+        #undef DSTORE_Y4
+    }
+    else if (1 == inc_x)
+    {
+        #define DLOAD_X8_SCALE   DLOAD_X8_SCALE_VECTOR
+        #define DLOAD_X4_SCALE   DLOAD_X4_SCALE_VECTOR
+        #define DLOAD_Y8   DLOAD_Y8_GP
+        #define DLOAD_Y4   DLOAD_Y4_GP
+        #define DSTORE_Y8  DSTORE_Y8_GP
+        #define DSTORE_Y4  DSTORE_Y4_GP
+
+        DGEMV_N_MSA();
+
+        #undef DLOAD_X8_SCALE
+        #undef DLOAD_X4_SCALE
+        #undef DLOAD_Y8
+        #undef DLOAD_Y4
+        #undef DSTORE_Y8
+        #undef DSTORE_Y4
+    }
+    else
+    {
+        #define DLOAD_X8_SCALE   DLOAD_X8_SCALE_GP
+        #define DLOAD_X4_SCALE   DLOAD_X4_SCALE_GP
+        #define DLOAD_Y8   DLOAD_Y8_GP
+        #define DLOAD_Y4   DLOAD_Y4_GP
+        #define DSTORE_Y8  DSTORE_Y8_GP
+        #define DSTORE_Y4  DSTORE_Y4_GP
+
+        DGEMV_N_MSA();
+
+        #undef DLOAD_X8_SCALE
+        #undef DLOAD_X4_SCALE
+        #undef DLOAD_Y8
+        #undef DLOAD_Y4
+        #undef DSTORE_Y8
+        #undef DSTORE_Y4
+    }
+
+    return(0);
+}
diff --git a/kernel/mips/dgemv_t_msa.c b/kernel/mips/dgemv_t_msa.c
new file mode 100644 (file)
index 0000000..f74cb2e
--- /dev/null
@@ -0,0 +1,589 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
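+/* Transposed kernel blocks: DGEMV_T_<c>x<r> accumulates the products of r rows
+   from c columns of A with x into one vector accumulator per column
+   (tp0, tp1, ...). */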
+#define DGEMV_T_8x8()                        \
+{                                            \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
+    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
+    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
+    LD_DP4(pa4 + k, 2, t16, t17, t18, t19);  \
+    LD_DP4(pa5 + k, 2, t20, t21, t22, t23);  \
+    LD_DP4(pa6 + k, 2, t24, t25, t26, t27);  \
+    LD_DP4(pa7 + k, 2, t28, t29, t30, t31);  \
+                                             \
+    tp0 += x0 * t0;                          \
+    tp0 += x1 * t1;                          \
+    tp0 += x2 * t2;                          \
+    tp0 += x3 * t3;                          \
+                                             \
+    tp1 += x0 * t4;                          \
+    tp1 += x1 * t5;                          \
+    tp1 += x2 * t6;                          \
+    tp1 += x3 * t7;                          \
+                                             \
+    tp2 += x0 * t8;                          \
+    tp2 += x1 * t9;                          \
+    tp2 += x2 * t10;                         \
+    tp2 += x3 * t11;                         \
+                                             \
+    tp3 += x0 * t12;                         \
+    tp3 += x1 * t13;                         \
+    tp3 += x2 * t14;                         \
+    tp3 += x3 * t15;                         \
+                                             \
+    tp4 += x0 * t16;                         \
+    tp4 += x1 * t17;                         \
+    tp4 += x2 * t18;                         \
+    tp4 += x3 * t19;                         \
+                                             \
+    tp5 += x0 * t20;                         \
+    tp5 += x1 * t21;                         \
+    tp5 += x2 * t22;                         \
+    tp5 += x3 * t23;                         \
+                                             \
+    tp6 += x0 * t24;                         \
+    tp6 += x1 * t25;                         \
+    tp6 += x2 * t26;                         \
+    tp6 += x3 * t27;                         \
+                                             \
+    tp7 += x0 * t28;                         \
+    tp7 += x1 * t29;                         \
+    tp7 += x2 * t30;                         \
+    tp7 += x3 * t31;                         \
+}
+
+#define DGEMV_T_8x4()              \
+{                                  \
+    LD_DP2(pa0 + k, 2, t0, t1);    \
+    LD_DP2(pa1 + k, 2, t4, t5);    \
+    LD_DP2(pa2 + k, 2, t8, t9);    \
+    LD_DP2(pa3 + k, 2, t12, t13);  \
+    LD_DP2(pa4 + k, 2, t16, t17);  \
+    LD_DP2(pa5 + k, 2, t20, t21);  \
+    LD_DP2(pa6 + k, 2, t24, t25);  \
+    LD_DP2(pa7 + k, 2, t28, t29);  \
+                                   \
+    tp0 += x0 * t0;                \
+    tp0 += x1 * t1;                \
+                                   \
+    tp1 += x0 * t4;                \
+    tp1 += x1 * t5;                \
+                                   \
+    tp2 += x0 * t8;                \
+    tp2 += x1 * t9;                \
+                                   \
+    tp3 += x0 * t12;               \
+    tp3 += x1 * t13;               \
+                                   \
+    tp4 += x0 * t16;               \
+    tp4 += x1 * t17;               \
+                                   \
+    tp5 += x0 * t20;               \
+    tp5 += x1 * t21;               \
+                                   \
+    tp6 += x0 * t24;               \
+    tp6 += x1 * t25;               \
+                                   \
+    tp7 += x0 * t28;               \
+    tp7 += x1 * t29;               \
+}
+
+#define DGEMV_T_8x2()      \
+{                          \
+    t0  = LD_DP(pa0 + k);  \
+    t4  = LD_DP(pa1 + k);  \
+    t8  = LD_DP(pa2 + k);  \
+    t12 = LD_DP(pa3 + k);  \
+    t16 = LD_DP(pa4 + k);  \
+    t20 = LD_DP(pa5 + k);  \
+    t24 = LD_DP(pa6 + k);  \
+    t28 = LD_DP(pa7 + k);  \
+                           \
+    tp0 += x0 * t0;        \
+    tp1 += x0 * t4;        \
+    tp2 += x0 * t8;        \
+    tp3 += x0 * t12;       \
+    tp4 += x0 * t16;       \
+    tp5 += x0 * t20;       \
+    tp6 += x0 * t24;       \
+    tp7 += x0 * t28;       \
+}
+
+#define DGEMV_T_4x8()                        \
+{                                            \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
+    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
+    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
+                                             \
+    tp0 += x0 * t0;                          \
+    tp0 += x1 * t1;                          \
+    tp0 += x2 * t2;                          \
+    tp0 += x3 * t3;                          \
+                                             \
+    tp1 += x0 * t4;                          \
+    tp1 += x1 * t5;                          \
+    tp1 += x2 * t6;                          \
+    tp1 += x3 * t7;                          \
+                                             \
+    tp2 += x0 * t8;                          \
+    tp2 += x1 * t9;                          \
+    tp2 += x2 * t10;                         \
+    tp2 += x3 * t11;                         \
+                                             \
+    tp3 += x0 * t12;                         \
+    tp3 += x1 * t13;                         \
+    tp3 += x2 * t14;                         \
+    tp3 += x3 * t15;                         \
+}
+
+#define DGEMV_T_4x4()              \
+{                                  \
+    LD_DP2(pa0 + k, 2, t0, t1);    \
+    LD_DP2(pa1 + k, 2, t4, t5);    \
+    LD_DP2(pa2 + k, 2, t8, t9);    \
+    LD_DP2(pa3 + k, 2, t12, t13);  \
+                                   \
+    tp0 += x0 * t0;                \
+    tp0 += x1 * t1;                \
+                                   \
+    tp1 += x0 * t4;                \
+    tp1 += x1 * t5;                \
+                                   \
+    tp2 += x0 * t8;                \
+    tp2 += x1 * t9;                \
+                                   \
+    tp3 += x0 * t12;               \
+    tp3 += x1 * t13;               \
+}
+
+#define DGEMV_T_4x2()      \
+{                          \
+    t0  = LD_DP(pa0 + k);  \
+    t4  = LD_DP(pa1 + k);  \
+    t8  = LD_DP(pa2 + k);  \
+    t12 = LD_DP(pa3 + k);  \
+                           \
+    tp0 += x0 * t0;        \
+    tp1 += x0 * t4;        \
+    tp2 += x0 * t8;        \
+    tp3 += x0 * t12;       \
+}
+
+#define DGEMV_T_2x8()                    \
+{                                        \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);  \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);  \
+                                         \
+    tp0 += x0 * t0;                      \
+    tp0 += x1 * t1;                      \
+    tp0 += x2 * t2;                      \
+    tp0 += x3 * t3;                      \
+                                         \
+    tp1 += x0 * t4;                      \
+    tp1 += x1 * t5;                      \
+    tp1 += x2 * t6;                      \
+    tp1 += x3 * t7;                      \
+}
+
+#define DGEMV_T_2x4()            \
+{                                \
+    LD_DP2(pa0 + k, 2, t0, t1);  \
+    LD_DP2(pa1 + k, 2, t4, t5);  \
+                                 \
+    tp0 += x0 * t0;              \
+    tp0 += x1 * t1;              \
+                                 \
+    tp1 += x0 * t4;              \
+    tp1 += x1 * t5;              \
+}
+
+#define DGEMV_T_2x2()     \
+{                         \
+    t0 = LD_DP(pa0 + k);  \
+    t4 = LD_DP(pa1 + k);  \
+                          \
+    tp0 += x0 * t0;       \
+    tp1 += x0 * t4;       \
+}
+
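+/* x loaders: the GP variants gather strided elements with per-element inserts,
+   the VECTOR variants load x contiguously when inc_x == 1. */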
+#define DLOAD_X8_GP()                                                              \
+    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
+    x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x)));  \
+    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x)));  \
+    x1 = (v2f64) __msa_insert_d((v2i64) x1,  1, *((long long *)(x + 3 * inc_x)));  \
+    x2 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 4 * inc_x)));  \
+    x2 = (v2f64) __msa_insert_d((v2i64) x2,  1, *((long long *)(x + 5 * inc_x)));  \
+    x3 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 6 * inc_x)));  \
+    x3 = (v2f64) __msa_insert_d((v2i64) x3,  1, *((long long *)(x + 7 * inc_x)));  \
+
+#define DLOAD_X4_GP()                                                              \
+    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
+    x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x)));  \
+    x1 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 2 * inc_x)));  \
+    x1 = (v2f64) __msa_insert_d((v2i64) x1,  1, *((long long *)(x + 3 * inc_x)));  \
+
+#define DLOAD_X2_GP()                                                              \
+    x0 = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + 0 * inc_x)));  \
+    x0 = (v2f64) __msa_insert_d((v2i64) x0,  1, *((long long *)(x + 1 * inc_x)));  \
+
+#define DLOAD_X8_VECTOR()  LD_DP4(x, 2, x0, x1, x2, x3);
+#define DLOAD_X4_VECTOR()  LD_DP2(x, 2, x0, x1);
+#define DLOAD_X2_VECTOR()  x0 = LD_DP(x);
+
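+/* y += alpha * A^T * x: columns of A are processed in blocks of 8, 4, 2 and 1;
+   within each block the rows are consumed 8, 4 and 2 at a time with a scalar
+   tail, and the vector accumulators are reduced horizontally before y is
+   updated. */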
+#define DGEMV_T_MSA()                   \
+    for (j = (n >> 3); j--;)            \
+    {                                   \
+        tp0 = zero;                     \
+        tp1 = zero;                     \
+        tp2 = zero;                     \
+        tp3 = zero;                     \
+        tp4 = zero;                     \
+        tp5 = zero;                     \
+        tp6 = zero;                     \
+        tp7 = zero;                     \
+                                        \
+        k = 0;                          \
+        x = srcx_org;                   \
+                                        \
+        for (i = (m >> 3); i--;)        \
+        {                               \
+            DLOAD_X8();                 \
+            DGEMV_T_8x8();              \
+                                        \
+            x += 8 * inc_x;             \
+            k += 8;                     \
+        }                               \
+                                        \
+        if (m & 4)                      \
+        {                               \
+            DLOAD_X4();                 \
+            DGEMV_T_8x4();              \
+                                        \
+            x += 4 * inc_x;             \
+            k += 4;                     \
+        }                               \
+                                        \
+        if (m & 2)                      \
+        {                               \
+            DLOAD_X2();                 \
+            DGEMV_T_8x2();              \
+                                        \
+            x += 2 * inc_x;             \
+            k += 2;                     \
+        }                               \
+                                        \
+        ILVRL_D2_DP(tp1, tp0, t0, t4);  \
+        ILVRL_D2_DP(tp3, tp2, t1, t5);  \
+        ILVRL_D2_DP(tp5, tp4, t2, t6);  \
+        ILVRL_D2_DP(tp7, tp6, t3, t7);  \
+        ADD2(t0, t4, t1, t5, t0, t1);   \
+        ADD2(t2, t6, t3, t7, t2, t3);   \
+                                        \
+        temp0 = t0[0];                  \
+        temp1 = t0[1];                  \
+        temp2 = t1[0];                  \
+        temp3 = t1[1];                  \
+        temp4 = t2[0];                  \
+        temp5 = t2[1];                  \
+        temp6 = t3[0];                  \
+        temp7 = t3[1];                  \
+                                        \
+        if (m & 1)                      \
+        {                               \
+            temp0 += pa0[k] * x[0];     \
+            temp1 += pa1[k] * x[0];     \
+            temp2 += pa2[k] * x[0];     \
+            temp3 += pa3[k] * x[0];     \
+            temp4 += pa4[k] * x[0];     \
+            temp5 += pa5[k] * x[0];     \
+            temp6 += pa6[k] * x[0];     \
+            temp7 += pa7[k] * x[0];     \
+                                        \
+            x += inc_x;                 \
+            k++;                        \
+        }                               \
+                                        \
+        res0 = y[0 * inc_y];            \
+        res1 = y[1 * inc_y];            \
+        res2 = y[2 * inc_y];            \
+        res3 = y[3 * inc_y];            \
+        res4 = y[4 * inc_y];            \
+        res5 = y[5 * inc_y];            \
+        res6 = y[6 * inc_y];            \
+        res7 = y[7 * inc_y];            \
+                                        \
+        res0 += alpha * temp0;          \
+        res1 += alpha * temp1;          \
+        res2 += alpha * temp2;          \
+        res3 += alpha * temp3;          \
+        res4 += alpha * temp4;          \
+        res5 += alpha * temp5;          \
+        res6 += alpha * temp6;          \
+        res7 += alpha * temp7;          \
+                                        \
+        y[0 * inc_y] = res0;            \
+        y[1 * inc_y] = res1;            \
+        y[2 * inc_y] = res2;            \
+        y[3 * inc_y] = res3;            \
+        y[4 * inc_y] = res4;            \
+        y[5 * inc_y] = res5;            \
+        y[6 * inc_y] = res6;            \
+        y[7 * inc_y] = res7;            \
+                                        \
+        y += 8 * inc_y;                 \
+                                        \
+        pa0 += 8 * lda;                 \
+        pa1 += 8 * lda;                 \
+        pa2 += 8 * lda;                 \
+        pa3 += 8 * lda;                 \
+        pa4 += 8 * lda;                 \
+        pa5 += 8 * lda;                 \
+        pa6 += 8 * lda;                 \
+        pa7 += 8 * lda;                 \
+    }                                   \
+                                        \
+    if (n & 4)                          \
+    {                                   \
+        tp0 = zero;                     \
+        tp1 = zero;                     \
+        tp2 = zero;                     \
+        tp3 = zero;                     \
+                                        \
+        k = 0;                          \
+        x = srcx_org;                   \
+                                        \
+        for (i = (m >> 3); i--;)        \
+        {                               \
+            DLOAD_X8();                 \
+            DGEMV_T_4x8();              \
+                                        \
+            x += 8 * inc_x;             \
+            k += 8;                     \
+        }                               \
+                                        \
+        if (m & 4)                      \
+        {                               \
+            DLOAD_X4();                 \
+            DGEMV_T_4x4();              \
+                                        \
+            x += 4 * inc_x;             \
+            k += 4;                     \
+        }                               \
+                                        \
+        if (m & 2)                      \
+        {                               \
+            DLOAD_X2();                 \
+            DGEMV_T_4x2();              \
+                                        \
+            x += 2 * inc_x;             \
+            k += 2;                     \
+        }                               \
+                                        \
+        ILVRL_D2_DP(tp1, tp0, t0, t4);  \
+        ILVRL_D2_DP(tp3, tp2, t1, t5);  \
+        ADD2(t0, t4, t1, t5, t0, t1);   \
+                                        \
+        temp0 = t0[0];                  \
+        temp1 = t0[1];                  \
+        temp2 = t1[0];                  \
+        temp3 = t1[1];                  \
+                                        \
+        if (m & 1)                      \
+        {                               \
+            temp0 += pa0[k] * x[0];     \
+            temp1 += pa1[k] * x[0];     \
+            temp2 += pa2[k] * x[0];     \
+            temp3 += pa3[k] * x[0];     \
+                                        \
+            x += inc_x;                 \
+            k++;                        \
+        }                               \
+                                        \
+        res0 = y[0 * inc_y];            \
+        res1 = y[1 * inc_y];            \
+        res2 = y[2 * inc_y];            \
+        res3 = y[3 * inc_y];            \
+                                        \
+        res0 += alpha * temp0;          \
+        res1 += alpha * temp1;          \
+        res2 += alpha * temp2;          \
+        res3 += alpha * temp3;          \
+                                        \
+        y[0 * inc_y] = res0;            \
+        y[1 * inc_y] = res1;            \
+        y[2 * inc_y] = res2;            \
+        y[3 * inc_y] = res3;            \
+                                        \
+        y += 4 * inc_y;                 \
+                                        \
+        pa0 += 4 * lda;                 \
+        pa1 += 4 * lda;                 \
+        pa2 += 4 * lda;                 \
+        pa3 += 4 * lda;                 \
+    }                                   \
+                                        \
+    if (n & 2)                          \
+    {                                   \
+        tp0 = zero;                     \
+        tp1 = zero;                     \
+                                        \
+        k = 0;                          \
+        x = srcx_org;                   \
+                                        \
+        for (i = (m >> 3); i--;)        \
+        {                               \
+            DLOAD_X8();                 \
+            DGEMV_T_2x8();              \
+                                        \
+            x += 8 * inc_x;             \
+            k += 8;                     \
+        }                               \
+                                        \
+        if (m & 4)                      \
+        {                               \
+            DLOAD_X4();                 \
+            DGEMV_T_2x4();              \
+                                        \
+            x += 4 * inc_x;             \
+            k += 4;                     \
+        }                               \
+                                        \
+        if (m & 2)                      \
+        {                               \
+            DLOAD_X2();                 \
+            DGEMV_T_2x2();              \
+                                        \
+            x += 2 * inc_x;             \
+            k += 2;                     \
+        }                               \
+                                        \
+        ILVRL_D2_DP(tp1, tp0, t0, t4);  \
+                                        \
+        t0 += t4;                       \
+                                        \
+        temp0 = t0[0];                  \
+        temp1 = t0[1];                  \
+                                        \
+        if (m & 1)                      \
+        {                               \
+            temp0 += pa0[k] * x[0];     \
+            temp1 += pa1[k] * x[0];     \
+            x += inc_x;                 \
+            k++;                        \
+        }                               \
+                                        \
+        res0 = y[0 * inc_y];            \
+        res1 = y[1 * inc_y];            \
+                                        \
+        res0 += alpha * temp0;          \
+        res1 += alpha * temp1;          \
+                                        \
+        y[0 * inc_y] = res0;            \
+        y[1 * inc_y] = res1;            \
+                                        \
+        y += 2 * inc_y;                 \
+                                        \
+        pa0 += 2 * lda;                 \
+        pa1 += 2 * lda;                 \
+    }                                   \
+                                        \
+    if (n & 1)                          \
+    {                                   \
+        temp0 = 0.0;                    \
+                                        \
+        k = 0;                          \
+        x = srcx_org;                   \
+                                        \
+        for (i = m; i--;)               \
+        {                               \
+            temp0 += pa0[k] * x[0];     \
+            x += inc_x;                 \
+            k++;                        \
+        }                               \
+                                        \
+        y[0] += alpha * temp0;          \
+        y += inc_y;                     \
+        pa0 += lda;                     \
+    }
+
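+/* Only the x loaders differ between the unit-stride and strided paths;
+   y is always updated element-wise through inc_y. */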
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
+          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    FLOAT *srcx_org = x;
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
+    FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
+    v2f64 x0, x1, x2, x3;
+    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
+    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+    v2f64 zero = {0};
+
+    pa0 = A + 0 * lda;
+    pa1 = A + 1 * lda;
+    pa2 = A + 2 * lda;
+    pa3 = A + 3 * lda;
+    pa4 = A + 4 * lda;
+    pa5 = A + 5 * lda;
+    pa6 = A + 6 * lda;
+    pa7 = A + 7 * lda;
+
+    if (1 == inc_x)
+    {
+        #define DLOAD_X8  DLOAD_X8_VECTOR
+        #define DLOAD_X4  DLOAD_X4_VECTOR
+        #define DLOAD_X2  DLOAD_X2_VECTOR
+
+        DGEMV_T_MSA();
+
+        #undef DLOAD_X8
+        #undef DLOAD_X4
+        #undef DLOAD_X2
+    }
+    else
+    {
+        #define DLOAD_X8  DLOAD_X8_GP
+        #define DLOAD_X4  DLOAD_X4_GP
+        #define DLOAD_X2  DLOAD_X2_GP
+
+        DGEMV_T_MSA();
+
+        #undef DLOAD_X8
+        #undef DLOAD_X4
+        #undef DLOAD_X2
+    }
+
+    return(0);
+}
diff --git a/kernel/mips/sasum_msa.c b/kernel/mips/sasum_msa.c
new file mode 100644 (file)
index 0000000..e968f83
--- /dev/null
@@ -0,0 +1,333 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include "macros_msa.h"
+
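+/* Absolute value of four packed floats: clear the sign bits with a bitwise AND. */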
+#define AND_VEC_W(in)   ((v4f32) ((v4i32) in & and_vec))
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;
+    FLOAT data0, data1, data2, sumf = 0.0;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
+    v4f32 zero_v = {0};
+    v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+
+    if (n <= 0 || inc_x <= 0) return (sumf);
+
+    if (1 == inc_x)
+    {
+        if (n > 31)
+        {
+            n -= 32;
+
+            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 = AND_VEC_W(src0);
+            sum_abs1 = AND_VEC_W(src1);
+            sum_abs2 = AND_VEC_W(src2);
+            sum_abs3 = AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+        else
+        {
+            sum_abs0 = zero_v;
+            sum_abs1 = zero_v;
+            sum_abs2 = zero_v;
+            sum_abs3 = zero_v;
+        }
+
+        for (i = 0; i < (n >> 5); i++)
+        {
+            LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
+
+            sum_abs0 += AND_VEC_W(src0);
+            sum_abs1 += AND_VEC_W(src1);
+            sum_abs2 += AND_VEC_W(src2);
+            sum_abs3 += AND_VEC_W(src3);
+            sum_abs0 += AND_VEC_W(src4);
+            sum_abs1 += AND_VEC_W(src5);
+            sum_abs2 += AND_VEC_W(src6);
+            sum_abs3 += AND_VEC_W(src7);
+        }
+
+        if (n & 31)
+        {
+            if ((n & 16) && (n & 8) && (n & 4))
+            {
+                LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+                sum_abs2 += AND_VEC_W(src6);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf += sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if ((n & 16) && (n & 8))
+            {
+                LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+                sum_abs1 += AND_VEC_W(src5);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf += sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if ((n & 16) && (n & 4))
+            {
+                LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+                sum_abs0 += AND_VEC_W(src4);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf += sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if ((n & 8) && (n & 4))
+            {
+                LD_SP3_INC(x, 4, src0, src1, src2);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf += sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if (n & 16)
+            {
+                LD_SP4_INC(x, 4, src0, src1, src2, src3);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+                sum_abs2 += AND_VEC_W(src2);
+                sum_abs3 += AND_VEC_W(src3);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf += sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if (n & 8)
+            {
+                LD_SP2_INC(x, 4, src0, src1);
+
+                sum_abs0 += AND_VEC_W(src0);
+                sum_abs1 += AND_VEC_W(src1);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf += sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else if (n & 4)
+            {
+                src0 = LD_SP(x); x += 4;
+
+                sum_abs0 += AND_VEC_W(src0);
+
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf += sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+            else
+            {
+                sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+                sumf += sum_abs0[0];
+                sumf += sum_abs0[1];
+                sumf += sum_abs0[2];
+                sumf += sum_abs0[3];
+            }
+
+            if (n & 2)
+            {
+                sumf += fabsf(*(x + 0));
+                sumf += fabsf(*(x + 1));
+                x += 2;
+            }
+
+            if (n & 1)
+            {
+                sumf += fabsf(*(x + 0));
+            }
+        }
+        else
+        {
+            sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
+
+            sumf += sum_abs0[0];
+            sumf += sum_abs0[1];
+            sumf += sum_abs0[2];
+            sumf += sum_abs0[3];
+        }
+    }
+    else
+    {
+        if (n > 8)
+        {
+            n -= 8;
+
+            src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+            x += inc_x;
+            src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+            x += inc_x;
+            src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+            x += inc_x;
+            src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+            x += inc_x;
+            src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+            x += inc_x;
+            src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
+            x += inc_x;
+            src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
+            x += inc_x;
+            src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
+            x += inc_x;
+
+            sum_abs0 = AND_VEC_W(src0);
+            sum_abs1 = AND_VEC_W(src4);
+        }
+        else
+        {
+            sum_abs0 = zero_v;
+            sum_abs1 = zero_v;
+        }
+
+        for (i = (n >> 3); i--;)
+        {
+            src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+            x += inc_x;
+            src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+            x += inc_x;
+            src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+            x += inc_x;
+            src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+            x += inc_x;
+            src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+            x += inc_x;
+            src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
+            x += inc_x;
+            src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
+            x += inc_x;
+            src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
+            x += inc_x;
+
+            sum_abs0 += AND_VEC_W(src0);
+            sum_abs1 += AND_VEC_W(src4);
+        }
+
+        if (n & 4)
+        {
+            src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+            x += inc_x;
+            src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
+            x += inc_x;
+            src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
+            x += inc_x;
+            src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
+            x += inc_x;
+
+            sum_abs0 += AND_VEC_W(src0);
+        }
+
+        sum_abs0 += sum_abs1;
+
+        sumf += sum_abs0[0];
+        sumf += sum_abs0[1];
+        sumf += sum_abs0[2];
+        sumf += sum_abs0[3];
+
+        if ((n & 2) && (n & 1))
+        {
+            data0 = fabsf(*x); x += inc_x;
+            data1 = fabsf(*x); x += inc_x;
+            data2 = fabsf(*x);
+
+            sumf += data0;
+            sumf += data1;
+            sumf += data2;
+        }
+        else if (n & 2)
+        {
+            data0 = fabsf(*x); x += inc_x;
+            data1 = fabsf(*x);
+
+            sumf += data0;
+            sumf += data1;
+        }
+        else if (n & 1)
+        {
+            data0 = fabsf(*x);
+
+            sumf += data0;
+        }
+    }
+
+    return (sumf);
+}
diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c
new file mode 100644 (file)
index 0000000..1997ec5
--- /dev/null
@@ -0,0 +1,208 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* x and y are single precision; the result is returned as double for DSDOT builds and as float otherwise. */
+#if defined(DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+    BLASLONG i = 0;
+    double dot = 0.0;
+    float x0, x1, x2, x3, y0, y1, y2, y3;
+    v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
+    v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
+    v4f32 dot0 = {0, 0, 0, 0};
+
+    if (n < 0) return (dot);
+
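+    /* Unit strides: multiply eight 4-float vectors per iteration into a single
+       vector accumulator, then handle the remaining (n & 31) elements with
+       progressively smaller blocks. */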
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        for (i = (n >> 5); i--;)
+        {
+            LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+            LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+            dot0 += (vy0 * vx0);
+            dot0 += (vy1 * vx1);
+            dot0 += (vy2 * vx2);
+            dot0 += (vy3 * vx3);
+            dot0 += (vy4 * vx4);
+            dot0 += (vy5 * vx5);
+            dot0 += (vy6 * vx6);
+            dot0 += (vy7 * vx7);
+        }
+
+        if (n & 31)
+        {
+            if ((n & 16) && (n & 8) && (n & 4))
+            {
+                LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
+                LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+                dot0 += (vy3 * vx3);
+                dot0 += (vy4 * vx4);
+                dot0 += (vy5 * vx5);
+                dot0 += (vy6 * vx6);
+            }
+            else if ((n & 16) && (n & 8))
+            {
+                LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5);
+                LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+                dot0 += (vy3 * vx3);
+                dot0 += (vy4 * vx4);
+                dot0 += (vy5 * vx5);
+            }
+            else if ((n & 16) && (n & 4))
+            {
+                LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4);
+                LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+                dot0 += (vy3 * vx3);
+                dot0 += (vy4 * vx4);
+            }
+            else if ((n & 8) && (n & 4))
+            {
+                LD_SP3_INC(x, 4, vx0, vx1, vx2);
+                LD_SP3_INC(y, 4, vy0, vy1, vy2);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+            }
+            else if (n & 16)
+            {
+                LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
+                LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+                dot0 += (vy2 * vx2);
+                dot0 += (vy3 * vx3);
+            }
+            else if (n & 8)
+            {
+                LD_SP2_INC(x, 4, vx0, vx1);
+                LD_SP2_INC(y, 4, vy0, vy1);
+
+                dot0 += (vy0 * vx0);
+                dot0 += (vy1 * vx1);
+            }
+            else if (n & 4)
+            {
+                vx0 = LD_SP(x); x += 4;
+                vy0 = LD_SP(y); y += 4;
+
+                dot0 += (vy0 * vx0);
+            }
+
+            if ((n & 2) && (n & 1))
+            {
+                LD_GP3_INC(x, 1, x0, x1, x2);
+                LD_GP3_INC(y, 1, y0, y1, y2);
+
+                dot += (y0 * x0);
+                dot += (y1 * x1);
+                dot += (y2 * x2);
+            }
+            else if (n & 2)
+            {
+                LD_GP2_INC(x, 1, x0, x1);
+                LD_GP2_INC(y, 1, y0, y1);
+
+                dot += (y0 * x0);
+                dot += (y1 * x1);
+            }
+            else if (n & 1)
+            {
+                x0 = *x;
+                y0 = *y;
+
+                dot += (y0 * x0);
+            }
+        }
+
+        dot += dot0[0];
+        dot += dot0[1];
+        dot += dot0[2];
+        dot += dot0[3];
+    }
+    else
+    {
+        for (i = (n >> 2); i--;)
+        {
+            LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
+            LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
+
+            dot += (y0 * x0);
+            dot += (y1 * x1);
+            dot += (y2 * x2);
+            dot += (y3 * x3);
+        }
+
+        if ((n & 2) && (n & 1))
+        {
+            LD_GP3_INC(x, inc_x, x0, x1, x2);
+            LD_GP3_INC(y, inc_y, y0, y1, y2);
+
+            dot += (y0 * x0);
+            dot += (y1 * x1);
+            dot += (y2 * x2);
+        }
+        else if (n & 2)
+        {
+            LD_GP2_INC(x, inc_x, x0, x1);
+            LD_GP2_INC(y, inc_y, y0, y1);
+
+            dot += (y0 * x0);
+            dot += (y1 * x1);
+        }
+        else if (n & 1)
+        {
+            x0 = *x;
+            y0 = *y;
+
+            dot += (y0 * x0);
+        }
+    }
+
+    return (dot);
+}
diff --git a/kernel/mips/sgemv_n_msa.c b/kernel/mips/sgemv_n_msa.c
new file mode 100644 (file)
index 0000000..ae6e655
--- /dev/null
@@ -0,0 +1,515 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define SGEMV_N_8x8()              \
+{                                  \
+    LD_SP2(pa0 + k, 4, t0, t1);    \
+    LD_SP2(pa1 + k, 4, t2, t3);    \
+    LD_SP2(pa2 + k, 4, t4, t5);    \
+    LD_SP2(pa3 + k, 4, t6, t7);    \
+    LD_SP2(pa4 + k, 4, t8, t9);    \
+    LD_SP2(pa5 + k, 4, t10, t11);  \
+    LD_SP2(pa6 + k, 4, t12, t13);  \
+    LD_SP2(pa7 + k, 4, t14, t15);  \
+                                   \
+    y0 += tp0 * t0;                \
+    y1 += tp0 * t1;                \
+                                   \
+    y0 += tp1 * t2;                \
+    y1 += tp1 * t3;                \
+                                   \
+    y0 += tp2 * t4;                \
+    y1 += tp2 * t5;                \
+                                   \
+    y0 += tp3 * t6;                \
+    y1 += tp3 * t7;                \
+                                   \
+    y0 += tp4 * t8;                \
+    y1 += tp4 * t9;                \
+                                   \
+    y0 += tp5 * t10;               \
+    y1 += tp5 * t11;               \
+                                   \
+    y0 += tp6 * t12;               \
+    y1 += tp6 * t13;               \
+                                   \
+    y0 += tp7 * t14;               \
+    y1 += tp7 * t15;               \
+}
+
+#define SGEMV_N_4x8()      \
+{                          \
+    t0  = LD_SP(pa0 + k);  \
+    t2  = LD_SP(pa1 + k);  \
+    t4  = LD_SP(pa2 + k);  \
+    t6  = LD_SP(pa3 + k);  \
+    t8  = LD_SP(pa4 + k);  \
+    t10 = LD_SP(pa5 + k);  \
+    t12 = LD_SP(pa6 + k);  \
+    t14 = LD_SP(pa7 + k);  \
+                           \
+    y0 += tp0 * t0;        \
+    y0 += tp1 * t2;        \
+    y0 += tp2 * t4;        \
+    y0 += tp3 * t6;        \
+    y0 += tp4 * t8;        \
+    y0 += tp5 * t10;       \
+    y0 += tp6 * t12;       \
+    y0 += tp7 * t14;       \
+}
+
+#define SGEMV_N_8x4()            \
+{                                \
+    LD_SP2(pa0 + k, 4, t0, t1);  \
+    LD_SP2(pa1 + k, 4, t2, t3);  \
+    LD_SP2(pa2 + k, 4, t4, t5);  \
+    LD_SP2(pa3 + k, 4, t6, t7);  \
+                                 \
+    y0 += tp0 * t0;              \
+    y1 += tp0 * t1;              \
+                                 \
+    y0 += tp1 * t2;              \
+    y1 += tp1 * t3;              \
+                                 \
+    y0 += tp2 * t4;              \
+    y1 += tp2 * t5;              \
+                                 \
+    y0 += tp3 * t6;              \
+    y1 += tp3 * t7;              \
+}
+
+#define SGEMV_N_4x4()      \
+{                          \
+    t0  = LD_SP(pa0 + k);  \
+    t2  = LD_SP(pa1 + k);  \
+    t4  = LD_SP(pa2 + k);  \
+    t6  = LD_SP(pa3 + k);  \
+                           \
+    y0 += tp0 * t0;        \
+    y0 += tp1 * t2;        \
+    y0 += tp2 * t4;        \
+    y0 += tp3 * t6;        \
+}
+
+#define SGEMV_N_8x2()            \
+{                                \
+    LD_SP2(pa0 + k, 4, t0, t1);  \
+    LD_SP2(pa1 + k, 4, t2, t3);  \
+                                 \
+    y0 += tp0 * t0;              \
+    y1 += tp0 * t1;              \
+                                 \
+    y0 += tp1 * t2;              \
+    y1 += tp1 * t3;              \
+}
+
+#define SGEMV_N_4x2()      \
+{                          \
+    t0  = LD_SP(pa0 + k);  \
+    t2  = LD_SP(pa1 + k);  \
+                           \
+    y0 += tp0 * t0;        \
+    y0 += tp1 * t2;        \
+}
+
+#define SLOAD_X8_SCALE_GP()             \
+    temp0 = alpha * x[0 * inc_x];       \
+    temp1 = alpha * x[1 * inc_x];       \
+    temp2 = alpha * x[2 * inc_x];       \
+    temp3 = alpha * x[3 * inc_x];       \
+    temp4 = alpha * x[4 * inc_x];       \
+    temp5 = alpha * x[5 * inc_x];       \
+    temp6 = alpha * x[6 * inc_x];       \
+    temp7 = alpha * x[7 * inc_x];       \
+                                        \
+    tp0 = COPY_FLOAT_TO_VECTOR(temp0);  \
+    tp1 = COPY_FLOAT_TO_VECTOR(temp1);  \
+    tp2 = COPY_FLOAT_TO_VECTOR(temp2);  \
+    tp3 = COPY_FLOAT_TO_VECTOR(temp3);  \
+    tp4 = COPY_FLOAT_TO_VECTOR(temp4);  \
+    tp5 = COPY_FLOAT_TO_VECTOR(temp5);  \
+    tp6 = COPY_FLOAT_TO_VECTOR(temp6);  \
+    tp7 = COPY_FLOAT_TO_VECTOR(temp7);  \
+
+#define SLOAD_X4_SCALE_GP()             \
+    temp0 = alpha * x[0 * inc_x];       \
+    temp1 = alpha * x[1 * inc_x];       \
+    temp2 = alpha * x[2 * inc_x];       \
+    temp3 = alpha * x[3 * inc_x];       \
+                                        \
+    tp0 = COPY_FLOAT_TO_VECTOR(temp0);  \
+    tp1 = COPY_FLOAT_TO_VECTOR(temp1);  \
+    tp2 = COPY_FLOAT_TO_VECTOR(temp2);  \
+    tp3 = COPY_FLOAT_TO_VECTOR(temp3);  \
+
+#define SLOAD_X8_SCALE_VECTOR()            \
+    LD_SP2(x, 4, x0, x1);                  \
+                                           \
+    x0 = x0 * v_alpha;                     \
+    x1 = x1 * v_alpha;                     \
+                                           \
+    SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3);  \
+    SPLATI_W4_SP(x1, tp4, tp5, tp6, tp7);  \
+
+#define SLOAD_X4_SCALE_VECTOR()            \
+    x0 = LD_SP(x);                         \
+    x0 = x0 * v_alpha;                     \
+    SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3);  \
+
+#define SLOAD_Y8_GP()                                                        \
+    y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y)));  \
+    y0 = (v4f32) __msa_insert_w((v4i32) y0,  1, *((int *)(y + 1 * inc_y)));  \
+    y0 = (v4f32) __msa_insert_w((v4i32) y0,  2, *((int *)(y + 2 * inc_y)));  \
+    y0 = (v4f32) __msa_insert_w((v4i32) y0,  3, *((int *)(y + 3 * inc_y)));  \
+    y1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 4 * inc_y)));  \
+    y1 = (v4f32) __msa_insert_w((v4i32) y1,  1, *((int *)(y + 5 * inc_y)));  \
+    y1 = (v4f32) __msa_insert_w((v4i32) y1,  2, *((int *)(y + 6 * inc_y)));  \
+    y1 = (v4f32) __msa_insert_w((v4i32) y1,  3, *((int *)(y + 7 * inc_y)));  \
+
+#define SLOAD_Y4_GP()                                                        \
+    y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y)));  \
+    y0 = (v4f32) __msa_insert_w((v4i32) y0,  1, *((int *)(y + 1 * inc_y)));  \
+    y0 = (v4f32) __msa_insert_w((v4i32) y0,  2, *((int *)(y + 2 * inc_y)));  \
+    y0 = (v4f32) __msa_insert_w((v4i32) y0,  3, *((int *)(y + 3 * inc_y)));  \
+
+#define SLOAD_Y8_VECTOR()  LD_SP2(y, 4, y0, y1);
+#define SLOAD_Y4_VECTOR()  y0 = LD_SP(y);
+
+#define SSTORE_Y8_GP()                                          \
+    *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0);  \
+    *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1);  \
+    *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2);  \
+    *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3);  \
+    *((int *)(y + 4 * inc_y)) = __msa_copy_s_w((v4i32) y1, 0);  \
+    *((int *)(y + 5 * inc_y)) = __msa_copy_s_w((v4i32) y1, 1);  \
+    *((int *)(y + 6 * inc_y)) = __msa_copy_s_w((v4i32) y1, 2);  \
+    *((int *)(y + 7 * inc_y)) = __msa_copy_s_w((v4i32) y1, 3);  \
+
+#define SSTORE_Y4_GP()                                          \
+    *((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0);  \
+    *((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1);  \
+    *((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2);  \
+    *((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3);  \
+
+#define SSTORE_Y8_VECTOR()  ST_SP2(y0, y1, y, 4);
+#define SSTORE_Y4_VECTOR()  ST_SP(y0, y);
+
+#define SGEMV_N_MSA()                       \
+    for (j = (n >> 3); j--;)                \
+    {                                       \
+        SLOAD_X8_SCALE();                   \
+                                            \
+        k = 0;                              \
+        y = y_org;                          \
+                                            \
+        for (i = (m >> 3); i--;)            \
+        {                                   \
+            SLOAD_Y8();                     \
+            SGEMV_N_8x8();                  \
+            SSTORE_Y8();                    \
+                                            \
+            y += 8 * inc_y;                 \
+            k += 8;                         \
+        }                                   \
+                                            \
+        if (m & 4)                          \
+        {                                   \
+            SLOAD_Y4();                     \
+            SGEMV_N_4x8();                  \
+            SSTORE_Y4();                    \
+                                            \
+            y += 4 * inc_y;                 \
+            k += 4;                         \
+        }                                   \
+                                            \
+        if (m & 3)                          \
+        {                                   \
+            temp0 = alpha * x[0 * inc_x];   \
+            temp1 = alpha * x[1 * inc_x];   \
+            temp2 = alpha * x[2 * inc_x];   \
+            temp3 = alpha * x[3 * inc_x];   \
+            temp4 = alpha * x[4 * inc_x];   \
+            temp5 = alpha * x[5 * inc_x];   \
+            temp6 = alpha * x[6 * inc_x];   \
+            temp7 = alpha * x[7 * inc_x];   \
+                                            \
+            for (i = (m & 3); i--;)         \
+            {                               \
+                temp = y[0];                \
+                temp += temp0 * pa0[k];     \
+                temp += temp1 * pa1[k];     \
+                temp += temp2 * pa2[k];     \
+                temp += temp3 * pa3[k];     \
+                temp += temp4 * pa4[k];     \
+                temp += temp5 * pa5[k];     \
+                temp += temp6 * pa6[k];     \
+                temp += temp7 * pa7[k];     \
+                y[0] = temp;                \
+                                            \
+                y += inc_y;                 \
+                k++;                        \
+            }                               \
+        }                                   \
+        pa0 += 8 * lda;                     \
+        pa1 += 8 * lda;                     \
+        pa2 += 8 * lda;                     \
+        pa3 += 8 * lda;                     \
+        pa4 += 8 * lda;                     \
+        pa5 += 8 * lda;                     \
+        pa6 += 8 * lda;                     \
+        pa7 += 8 * lda;                     \
+                                            \
+        x += 8 * inc_x;                     \
+    }                                       \
+                                            \
+    if (n & 4)                              \
+    {                                       \
+        SLOAD_X4_SCALE();                   \
+                                            \
+        k = 0;                              \
+        y = y_org;                          \
+                                            \
+        for (i = (m >> 3); i--;)            \
+        {                                   \
+            SLOAD_Y8();                     \
+            SGEMV_N_8x4();                  \
+            SSTORE_Y8();                    \
+                                            \
+            y += 8 * inc_y;                 \
+            k += 8;                         \
+        }                                   \
+                                            \
+        if (m & 4)                          \
+        {                                   \
+            SLOAD_Y4();                     \
+            SGEMV_N_4x4();                  \
+            SSTORE_Y4();                    \
+                                            \
+            y += 4 * inc_y;                 \
+            k += 4;                         \
+        }                                   \
+                                            \
+        if (m & 3)                          \
+        {                                   \
+            temp0 = alpha * x[0 * inc_x];   \
+            temp1 = alpha * x[1 * inc_x];   \
+            temp2 = alpha * x[2 * inc_x];   \
+            temp3 = alpha * x[3 * inc_x];   \
+                                            \
+            for (i = (m & 3); i--;)         \
+            {                               \
+                temp = y[0];                \
+                temp += temp0 * pa0[k];     \
+                temp += temp1 * pa1[k];     \
+                temp += temp2 * pa2[k];     \
+                temp += temp3 * pa3[k];     \
+                y[0] = temp;                \
+                                            \
+                y += inc_y;                 \
+                k++;                        \
+            }                               \
+        }                                   \
+                                            \
+        pa0 += 4 * lda;                     \
+        pa1 += 4 * lda;                     \
+        pa2 += 4 * lda;                     \
+        pa3 += 4 * lda;                     \
+                                            \
+        x += 4 * inc_x;                     \
+    }                                       \
+                                            \
+    if (n & 2)                              \
+    {                                       \
+        temp0 = alpha * x[0 * inc_x];       \
+        temp1 = alpha * x[1 * inc_x];       \
+                                            \
+        tp0 = COPY_FLOAT_TO_VECTOR(temp0);  \
+        tp1 = COPY_FLOAT_TO_VECTOR(temp1);  \
+                                            \
+        k = 0;                              \
+        y = y_org;                          \
+                                            \
+        for (i = (m >> 3); i--;)            \
+        {                                   \
+            SLOAD_Y8();                     \
+            SGEMV_N_8x2();                  \
+            SSTORE_Y8();                    \
+                                            \
+            y += 8 * inc_y;                 \
+            k += 8;                         \
+        }                                   \
+                                            \
+        if (m & 4)                          \
+        {                                   \
+            SLOAD_Y4();                     \
+            SGEMV_N_4x2();                  \
+            SSTORE_Y4();                    \
+                                            \
+            y += 4 * inc_y;                 \
+            k += 4;                         \
+        }                                   \
+                                            \
+        if (m & 3)                          \
+        {                                   \
+            temp0 = alpha * x[0 * inc_x];   \
+            temp1 = alpha * x[1 * inc_x];   \
+                                            \
+            for (i = (m & 3); i--;)         \
+            {                               \
+                temp = y[0];                \
+                temp += temp0 * pa0[k];     \
+                temp += temp1 * pa1[k];     \
+                y[0] = temp;                \
+                                            \
+                y += inc_y;                 \
+                k++;                        \
+            }                               \
+        }                                   \
+                                            \
+        pa0 += 2 * lda;                     \
+        pa1 += 2 * lda;                     \
+                                            \
+        x += 2 * inc_x;                     \
+    }                                       \
+                                            \
+    if (n & 1)                              \
+    {                                       \
+        temp = alpha * x[0];                \
+                                            \
+        k = 0;                              \
+        y = y_org;                          \
+                                            \
+        for (i = m; i--;)                   \
+        {                                   \
+           y[0] += temp * pa0[k];           \
+                                            \
+           y += inc_y;                      \
+           k++;                             \
+        }                                   \
+    }                                       \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
+          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    FLOAT *y_org = y;
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
+    FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    v4f32 v_alpha, x0, x1, y0, y1;
+    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+
+    v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
+
+    pa0 = A;
+    pa1 = A + lda;
+    pa2 = A + 2 * lda;
+    pa3 = A + 3 * lda;
+    pa4 = A + 4 * lda;
+    pa5 = A + 5 * lda;
+    pa6 = A + 6 * lda;
+    pa7 = A + 7 * lda;
+
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_VECTOR
+        #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_VECTOR
+        #define SLOAD_Y8   SLOAD_Y8_VECTOR
+        #define SLOAD_Y4   SLOAD_Y4_VECTOR
+        #define SSTORE_Y8  SSTORE_Y8_VECTOR
+        #define SSTORE_Y4  SSTORE_Y4_VECTOR
+
+        SGEMV_N_MSA();
+
+        #undef SLOAD_X8_SCALE
+        #undef SLOAD_X4_SCALE
+        #undef SLOAD_Y8
+        #undef SLOAD_Y4
+        #undef SSTORE_Y8
+        #undef SSTORE_Y4
+    }
+    else if (1 == inc_y)
+    {
+        #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_GP
+        #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_GP
+        #define SLOAD_Y8   SLOAD_Y8_VECTOR
+        #define SLOAD_Y4   SLOAD_Y4_VECTOR
+        #define SSTORE_Y8  SSTORE_Y8_VECTOR
+        #define SSTORE_Y4  SSTORE_Y4_VECTOR
+
+        SGEMV_N_MSA();
+
+        #undef SLOAD_X8_SCALE
+        #undef SLOAD_X4_SCALE
+        #undef SLOAD_Y8
+        #undef SLOAD_Y4
+        #undef SSTORE_Y8
+        #undef SSTORE_Y4
+    }
+    else if (1 == inc_x)
+    {
+        #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_VECTOR
+        #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_VECTOR
+        #define SLOAD_Y8   SLOAD_Y8_GP
+        #define SLOAD_Y4   SLOAD_Y4_GP
+        #define SSTORE_Y8  SSTORE_Y8_GP
+        #define SSTORE_Y4  SSTORE_Y4_GP
+
+        SGEMV_N_MSA();
+
+        #undef SLOAD_X8_SCALE
+        #undef SLOAD_X4_SCALE
+        #undef SLOAD_Y8
+        #undef SLOAD_Y4
+        #undef SSTORE_Y8
+        #undef SSTORE_Y4
+    }
+    else
+    {
+        #define SLOAD_X8_SCALE   SLOAD_X8_SCALE_GP
+        #define SLOAD_X4_SCALE   SLOAD_X4_SCALE_GP
+        #define SLOAD_Y8   SLOAD_Y8_GP
+        #define SLOAD_Y4   SLOAD_Y4_GP
+        #define SSTORE_Y8  SSTORE_Y8_GP
+        #define SSTORE_Y4  SSTORE_Y4_GP
+
+        SGEMV_N_MSA();
+
+        #undef SLOAD_X8_SCALE
+        #undef SLOAD_X4_SCALE
+        #undef SLOAD_Y8
+        #undef SLOAD_Y4
+        #undef SSTORE_Y8
+        #undef SSTORE_Y4
+    }
+
+    return(0);
+}
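
sgemv_n_msa.c computes y := y + alpha * A * x for column-major A. The SGEMV_N_MSA macro walks the matrix in panels of eight columns (pa0..pa7, advanced by 8 * lda), broadcasts the eight alpha-scaled x values into tp0..tp7 once per panel, and then updates y eight or four rows at a time with a scalar remainder loop for m & 3; the four #define blocks in CNAME only bind the load/store macros to vector or element-by-element accesses depending on inc_x and inc_y. A minimal scalar sketch of what those panels add up to (sgemv_n_ref is an illustrative name; float/long stand in for FLOAT/BLASLONG):

    /* Minimal scalar reference for sgemv_n: y := y + alpha * A * x with A
     * stored column-major and lda the column stride, as in the kernel above.
     * float/long stand in for FLOAT/BLASLONG; the name is illustrative. */
    static void sgemv_n_ref(long m, long n, float alpha, const float *A,
                            long lda, const float *x, long inc_x,
                            float *y, long inc_y)
    {
        long i, j;

        for (j = 0; j < n; j++)              /* one column of A per x element */
        {
            float temp      = alpha * x[j * inc_x];
            const float *pa = A + j * lda;
            float *py       = y;

            for (i = 0; i < m; i++)          /* axpy of that column into y */
            {
                *py += temp * pa[i];
                py  += inc_y;
            }
        }
    }

The MSA version gains its throughput by loading eight matrix elements per LD_SP2 pair and reusing each broadcast x value across the whole row block.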
diff --git a/kernel/mips/sgemv_t_msa.c b/kernel/mips/sgemv_t_msa.c
new file mode 100644 (file)
index 0000000..1c7f299
--- /dev/null
@@ -0,0 +1,463 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define SGEMV_T_8x8()              \
+{                                  \
+    LD_SP2(pa0 + k, 4, t0, t1);    \
+    LD_SP2(pa1 + k, 4, t2, t3);    \
+    LD_SP2(pa2 + k, 4, t4, t5);    \
+    LD_SP2(pa3 + k, 4, t6, t7);    \
+    LD_SP2(pa4 + k, 4, t8, t9);    \
+    LD_SP2(pa5 + k, 4, t10, t11);  \
+    LD_SP2(pa6 + k, 4, t12, t13);  \
+    LD_SP2(pa7 + k, 4, t14, t15);  \
+                                   \
+    tp0 += x0 * t0;                \
+    tp0 += x1 * t1;                \
+                                   \
+    tp1 += x0 * t2;                \
+    tp1 += x1 * t3;                \
+                                   \
+    tp2 += x0 * t4;                \
+    tp2 += x1 * t5;                \
+                                   \
+    tp3 += x0 * t6;                \
+    tp3 += x1 * t7;                \
+                                   \
+    tp4 += x0 * t8;                \
+    tp4 += x1 * t9;                \
+                                   \
+    tp5 += x0 * t10;               \
+    tp5 += x1 * t11;               \
+                                   \
+    tp6 += x0 * t12;               \
+    tp6 += x1 * t13;               \
+                                   \
+    tp7 += x0 * t14;               \
+    tp7 += x1 * t15;               \
+}
+
+#define SGEMV_T_8x4()      \
+{                          \
+    t0  = LD_SP(pa0 + k);  \
+    t2  = LD_SP(pa1 + k);  \
+    t4  = LD_SP(pa2 + k);  \
+    t6  = LD_SP(pa3 + k);  \
+    t8  = LD_SP(pa4 + k);  \
+    t10 = LD_SP(pa5 + k);  \
+    t12 = LD_SP(pa6 + k);  \
+    t14 = LD_SP(pa7 + k);  \
+                           \
+    tp0 += x0 * t0;        \
+    tp1 += x0 * t2;        \
+    tp2 += x0 * t4;        \
+    tp3 += x0 * t6;        \
+    tp4 += x0 * t8;        \
+    tp5 += x0 * t10;       \
+    tp6 += x0 * t12;       \
+    tp7 += x0 * t14;       \
+}
+
+#define SGEMV_T_4x8()            \
+{                                \
+    LD_SP2(pa0 + k, 4, t0, t1);  \
+    LD_SP2(pa1 + k, 4, t2, t3);  \
+    LD_SP2(pa2 + k, 4, t4, t5);  \
+    LD_SP2(pa3 + k, 4, t6, t7);  \
+                                 \
+    tp0 += x0 * t0;              \
+    tp0 += x1 * t1;              \
+                                 \
+    tp1 += x0 * t2;              \
+    tp1 += x1 * t3;              \
+                                 \
+    tp2 += x0 * t4;              \
+    tp2 += x1 * t5;              \
+                                 \
+    tp3 += x0 * t6;              \
+    tp3 += x1 * t7;              \
+}
+
+#define SGEMV_T_4x4()     \
+{                         \
+    t0 = LD_SP(pa0 + k);  \
+    t2 = LD_SP(pa1 + k);  \
+    t4 = LD_SP(pa2 + k);  \
+    t6 = LD_SP(pa3 + k);  \
+                          \
+    tp0 += x0 * t0;       \
+    tp1 += x0 * t2;       \
+    tp2 += x0 * t4;       \
+    tp3 += x0 * t6;       \
+}
+
+#define SGEMV_T_2x8()            \
+{                                \
+    LD_SP2(pa0 + k, 4, t0, t1);  \
+    LD_SP2(pa1 + k, 4, t2, t3);  \
+                                 \
+    tp0 += x0 * t0;              \
+    tp0 += x1 * t1;              \
+                                 \
+    tp1 += x0 * t2;              \
+    tp1 += x1 * t3;              \
+}
+
+#define SGEMV_T_2x4()     \
+{                         \
+    t0 = LD_SP(pa0 + k);  \
+    t2 = LD_SP(pa1 + k);  \
+                          \
+    tp0 += x0 * t0;       \
+    tp1 += x0 * t2;       \
+}
+
+#define SLOAD_X8_GP()                                                        \
+    x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x)));  \
+    x0 = (v4f32) __msa_insert_w((v4i32) x0,  1, *((int *)(x + 1 * inc_x)));  \
+    x0 = (v4f32) __msa_insert_w((v4i32) x0,  2, *((int *)(x + 2 * inc_x)));  \
+    x0 = (v4f32) __msa_insert_w((v4i32) x0,  3, *((int *)(x + 3 * inc_x)));  \
+    x1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 4 * inc_x)));  \
+    x1 = (v4f32) __msa_insert_w((v4i32) x1,  1, *((int *)(x + 5 * inc_x)));  \
+    x1 = (v4f32) __msa_insert_w((v4i32) x1,  2, *((int *)(x + 6 * inc_x)));  \
+    x1 = (v4f32) __msa_insert_w((v4i32) x1,  3, *((int *)(x + 7 * inc_x)));  \
+
+#define SLOAD_X4_GP()                                                        \
+    x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x)));  \
+    x0 = (v4f32) __msa_insert_w((v4i32) x0,  1, *((int *)(x + 1 * inc_x)));  \
+    x0 = (v4f32) __msa_insert_w((v4i32) x0,  2, *((int *)(x + 2 * inc_x)));  \
+    x0 = (v4f32) __msa_insert_w((v4i32) x0,  3, *((int *)(x + 3 * inc_x)));  \
+
+#define SLOAD_X8_VECTOR()  LD_SP2(x, 4, x0, x1);
+#define SLOAD_X4_VECTOR()  x0 = LD_SP(x);
+
+#define SGEMV_T_MSA()                            \
+    for (j = (n >> 3); j--;)                     \
+    {                                            \
+        tp0 = zero;                              \
+        tp1 = zero;                              \
+        tp2 = zero;                              \
+        tp3 = zero;                              \
+        tp4 = zero;                              \
+        tp5 = zero;                              \
+        tp6 = zero;                              \
+        tp7 = zero;                              \
+                                                 \
+        k = 0;                                   \
+        x = srcx_org;                            \
+                                                 \
+        for (i = (m >> 3); i--;)                 \
+        {                                        \
+            SLOAD_X8();                          \
+            SGEMV_T_8x8();                       \
+                                                 \
+            x += 8 * inc_x;                      \
+            k += 8;                              \
+        }                                        \
+                                                 \
+        if (m & 4)                               \
+        {                                        \
+            SLOAD_X4();                          \
+            SGEMV_T_8x4();                       \
+                                                 \
+            x += 4 * inc_x;                      \
+            k += 4;                              \
+        }                                        \
+                                                 \
+        TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3,   \
+                           tp0, tp1, tp2, tp3);  \
+        TRANSPOSE4x4_SP_SP(tp4, tp5, tp6, tp7,   \
+                           tp4, tp5, tp6, tp7);  \
+        tp0 += tp1;                              \
+        tp0 += tp2;                              \
+        tp0 += tp3;                              \
+        tp4 += tp5;                              \
+        tp4 += tp6;                              \
+        tp4 += tp7;                              \
+                                                 \
+        temp0 = tp0[0];                          \
+        temp1 = tp0[1];                          \
+        temp2 = tp0[2];                          \
+        temp3 = tp0[3];                          \
+        temp4 = tp4[0];                          \
+        temp5 = tp4[1];                          \
+        temp6 = tp4[2];                          \
+        temp7 = tp4[3];                          \
+                                                 \
+        for (i = (m & 3); i--;)                  \
+        {                                        \
+            temp0 += pa0[k] * x[0];              \
+            temp1 += pa1[k] * x[0];              \
+            temp2 += pa2[k] * x[0];              \
+            temp3 += pa3[k] * x[0];              \
+            temp4 += pa4[k] * x[0];              \
+            temp5 += pa5[k] * x[0];              \
+            temp6 += pa6[k] * x[0];              \
+            temp7 += pa7[k] * x[0];              \
+                                                 \
+            x += inc_x;                          \
+            k++;                                 \
+        }                                        \
+                                                 \
+        res0 = y[0 * inc_y];                     \
+        res1 = y[1 * inc_y];                     \
+        res2 = y[2 * inc_y];                     \
+        res3 = y[3 * inc_y];                     \
+        res4 = y[4 * inc_y];                     \
+        res5 = y[5 * inc_y];                     \
+        res6 = y[6 * inc_y];                     \
+        res7 = y[7 * inc_y];                     \
+                                                 \
+        res0 += alpha * temp0;                   \
+        res1 += alpha * temp1;                   \
+        res2 += alpha * temp2;                   \
+        res3 += alpha * temp3;                   \
+        res4 += alpha * temp4;                   \
+        res5 += alpha * temp5;                   \
+        res6 += alpha * temp6;                   \
+        res7 += alpha * temp7;                   \
+                                                 \
+        y[0 * inc_y] = res0;                     \
+        y[1 * inc_y] = res1;                     \
+        y[2 * inc_y] = res2;                     \
+        y[3 * inc_y] = res3;                     \
+        y[4 * inc_y] = res4;                     \
+        y[5 * inc_y] = res5;                     \
+        y[6 * inc_y] = res6;                     \
+        y[7 * inc_y] = res7;                     \
+                                                 \
+        y += 8 * inc_y;                          \
+                                                 \
+        pa0 += 8 * lda;                          \
+        pa1 += 8 * lda;                          \
+        pa2 += 8 * lda;                          \
+        pa3 += 8 * lda;                          \
+        pa4 += 8 * lda;                          \
+        pa5 += 8 * lda;                          \
+        pa6 += 8 * lda;                          \
+        pa7 += 8 * lda;                          \
+    }                                            \
+                                                 \
+    if (n & 4)                                   \
+    {                                            \
+        tp0 = zero;                              \
+        tp1 = zero;                              \
+        tp2 = zero;                              \
+        tp3 = zero;                              \
+                                                 \
+        k = 0;                                   \
+        x = srcx_org;                            \
+                                                 \
+        for (i = (m >> 3); i--;)                 \
+        {                                        \
+            SLOAD_X8();                          \
+            SGEMV_T_4x8();                       \
+                                                 \
+            x += 8 * inc_x;                      \
+            k += 8;                              \
+        }                                        \
+                                                 \
+        if (m & 4)                               \
+        {                                        \
+            SLOAD_X4();                          \
+            SGEMV_T_4x4();                       \
+                                                 \
+            x += 4 * inc_x;                      \
+            k += 4;                              \
+        }                                        \
+                                                 \
+        TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3,   \
+                           tp0, tp1, tp2, tp3);  \
+        tp0 += tp1;                              \
+        tp0 += tp2;                              \
+        tp0 += tp3;                              \
+                                                 \
+        temp0 = tp0[0];                          \
+        temp1 = tp0[1];                          \
+        temp2 = tp0[2];                          \
+        temp3 = tp0[3];                          \
+                                                 \
+        for (i = (m & 3); i--;)                  \
+        {                                        \
+            temp0 += pa0[k] * x[0];              \
+            temp1 += pa1[k] * x[0];              \
+            temp2 += pa2[k] * x[0];              \
+            temp3 += pa3[k] * x[0];              \
+                                                 \
+            x += inc_x;                          \
+            k++;                                 \
+        }                                        \
+                                                 \
+        res0 = y[0 * inc_y];                     \
+        res1 = y[1 * inc_y];                     \
+        res2 = y[2 * inc_y];                     \
+        res3 = y[3 * inc_y];                     \
+                                                 \
+        res0 += alpha * temp0;                   \
+        res1 += alpha * temp1;                   \
+        res2 += alpha * temp2;                   \
+        res3 += alpha * temp3;                   \
+                                                 \
+        y[0 * inc_y] = res0;                     \
+        y[1 * inc_y] = res1;                     \
+        y[2 * inc_y] = res2;                     \
+        y[3 * inc_y] = res3;                     \
+                                                 \
+        y += 4 * inc_y;                          \
+                                                 \
+        pa0 += 4 * lda;                          \
+        pa1 += 4 * lda;                          \
+        pa2 += 4 * lda;                          \
+        pa3 += 4 * lda;                          \
+    }                                            \
+                                                 \
+    if (n & 2)                                   \
+    {                                            \
+        tp0 = zero;                              \
+        tp1 = zero;                              \
+                                                 \
+        k = 0;                                   \
+        x = srcx_org;                            \
+                                                 \
+        for (i = (m >> 3); i--;)                 \
+        {                                        \
+            SLOAD_X8();                          \
+            SGEMV_T_2x8();                       \
+                                                 \
+            x += 8 * inc_x;                      \
+            k += 8;                              \
+        }                                        \
+                                                 \
+        if (m & 4)                               \
+        {                                        \
+            SLOAD_X4();                          \
+            SGEMV_T_2x4();                       \
+                                                 \
+            x += 4 * inc_x;                      \
+            k += 4;                              \
+        }                                        \
+                                                 \
+        ILVRL_W2_SP(tp1, tp0, tp2, tp3);         \
+                                                 \
+        tp2 += tp3;                              \
+                                                 \
+        temp0 = tp2[0] + tp2[2];                 \
+        temp1 = tp2[1] + tp2[3];                 \
+                                                 \
+        for (i = (m & 3); i--;)                  \
+        {                                        \
+            temp0 += pa0[k] * x[0];              \
+            temp1 += pa1[k] * x[0];              \
+                                                 \
+            x += inc_x;                          \
+            k++;                                 \
+        }                                        \
+                                                 \
+        res0 = y[0 * inc_y];                     \
+        res1 = y[1 * inc_y];                     \
+                                                 \
+        res0 += alpha * temp0;                   \
+        res1 += alpha * temp1;                   \
+                                                 \
+        y[0 * inc_y] = res0;                     \
+        y[1 * inc_y] = res1;                     \
+                                                 \
+        y += 2 * inc_y;                          \
+                                                 \
+        pa0 += 2 * lda;                          \
+        pa1 += 2 * lda;                          \
+    }                                            \
+                                                 \
+    if (n & 1)                                   \
+    {                                            \
+        temp0 = 0.0;                             \
+                                                 \
+        k = 0;                                   \
+        x = srcx_org;                            \
+                                                 \
+        for (i = m; i--;)                        \
+        {                                        \
+            temp0 += pa0[k] * x[0];              \
+                                                 \
+            x += inc_x;                          \
+            k++;                                 \
+        }                                        \
+                                                 \
+        y[0] += alpha * temp0;                   \
+        y += inc_y;                              \
+        pa0 += lda;                              \
+    }
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
+          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
+          FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    FLOAT *srcx_org = x;
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
+    FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
+    v4f32 x0, x1;
+    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
+    v4f32 zero = {0};
+
+    pa0 = A + 0 * lda;
+    pa1 = A + 1 * lda;
+    pa2 = A + 2 * lda;
+    pa3 = A + 3 * lda;
+    pa4 = A + 4 * lda;
+    pa5 = A + 5 * lda;
+    pa6 = A + 6 * lda;
+    pa7 = A + 7 * lda;
+
+    if (1 == inc_x)
+    {
+        #define SLOAD_X8  SLOAD_X8_VECTOR
+        #define SLOAD_X4  SLOAD_X4_VECTOR
+
+        SGEMV_T_MSA();
+
+        #undef SLOAD_X8
+        #undef SLOAD_X4
+    }
+    else
+    {
+        #define SLOAD_X8  SLOAD_X8_GP
+        #define SLOAD_X4  SLOAD_X4_GP
+
+        SGEMV_T_MSA();
+
+        #undef SLOAD_X8
+        #undef SLOAD_X4
+    }
+
+    return(0);
+}
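
The transposed kernel computes y := y + alpha * A^T * x, i.e. one dot product of a column of A with x per element of y. The accumulators tp0..tp7 each hold four partial sums for one column; TRANSPOSE4x4_SP_SP (and ILVRL_W2_SP in the two-column case) regroups them so that a few vector adds finish the horizontal reduction before alpha is applied and the result is added into y. A scalar sketch of the same computation (sgemv_t_ref is an illustrative name; float/long stand in for FLOAT/BLASLONG):

    /* Minimal scalar reference for sgemv_t: y := y + alpha * A^T * x, i.e.
     * each y element is the dot product of one column of A with x.  This is
     * what the tp0..tp7 accumulators above compute four lanes at a time.
     * float/long stand in for FLOAT/BLASLONG; the name is illustrative. */
    static void sgemv_t_ref(long m, long n, float alpha, const float *A,
                            long lda, const float *x, long inc_x,
                            float *y, long inc_y)
    {
        long i, j;

        for (j = 0; j < n; j++)
        {
            float temp      = 0.0f;
            const float *pa = A + j * lda;
            const float *px = x;

            for (i = 0; i < m; i++)
            {
                temp += pa[i] * px[0];
                px   += inc_x;
            }

            y[j * inc_y] += alpha * temp;
        }
    }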
diff --git a/kernel/mips/zasum_msa.c b/kernel/mips/zasum_msa.c
new file mode 100644 (file)
index 0000000..c84d48e
--- /dev/null
@@ -0,0 +1,170 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include <math.h>
+#include "macros_msa.h"
+
+#define AND_VEC_D(in)   ((v2f64) ((v2i64) in & and_vec))
+
+#define PROCESS_ZD(inc_val)                           \
+    if (n > 8)                                        \
+    {                                                 \
+        n -= 8;                                       \
+                                                      \
+        LD_DP8_INC(x, inc_val, src0, src1, src2,      \
+                   src3, src4, src5, src6, src7);     \
+                                                      \
+        sum_abs0 = AND_VEC_D(src0);                   \
+        sum_abs1 = AND_VEC_D(src1);                   \
+        sum_abs2 = AND_VEC_D(src2);                   \
+        sum_abs3 = AND_VEC_D(src3);                   \
+        sum_abs0 += AND_VEC_D(src4);                  \
+        sum_abs1 += AND_VEC_D(src5);                  \
+        sum_abs2 += AND_VEC_D(src6);                  \
+        sum_abs3 += AND_VEC_D(src7);                  \
+    }                                                 \
+    else                                              \
+    {                                                 \
+        sum_abs0 = zero_v;                            \
+        sum_abs1 = zero_v;                            \
+        sum_abs2 = zero_v;                            \
+        sum_abs3 = zero_v;                            \
+    }                                                 \
+                                                      \
+    for (i = (n >> 3); i--;)                          \
+    {                                                 \
+        LD_DP8_INC(x, inc_val, src0, src1, src2,      \
+                   src3, src4, src5, src6, src7);     \
+                                                      \
+        sum_abs0 += AND_VEC_D(src0);                  \
+        sum_abs1 += AND_VEC_D(src1);                  \
+        sum_abs2 += AND_VEC_D(src2);                  \
+        sum_abs3 += AND_VEC_D(src3);                  \
+        sum_abs0 += AND_VEC_D(src4);                  \
+        sum_abs1 += AND_VEC_D(src5);                  \
+        sum_abs2 += AND_VEC_D(src6);                  \
+        sum_abs3 += AND_VEC_D(src7);                  \
+    }                                                 \
+                                                      \
+    if (n & 7)                                        \
+    {                                                 \
+        if ((n & 4) && (n & 2) && (n & 1))            \
+        {                                             \
+            LD_DP7_INC(x, inc_val, src0, src1, src2,  \
+                       src3, src4, src5, src6);       \
+                                                      \
+            sum_abs0 += AND_VEC_D(src0);              \
+            sum_abs1 += AND_VEC_D(src1);              \
+            sum_abs2 += AND_VEC_D(src2);              \
+            sum_abs3 += AND_VEC_D(src3);              \
+            sum_abs0 += AND_VEC_D(src4);              \
+            sum_abs1 += AND_VEC_D(src5);              \
+            sum_abs2 += AND_VEC_D(src6);              \
+        }                                             \
+        else if ((n & 4) && (n & 2))                  \
+        {                                             \
+            LD_DP6_INC(x, inc_val, src0, src1, src2,  \
+                       src3, src4, src5);             \
+                                                      \
+            sum_abs0 += AND_VEC_D(src0);              \
+            sum_abs1 += AND_VEC_D(src1);              \
+            sum_abs2 += AND_VEC_D(src2);              \
+            sum_abs3 += AND_VEC_D(src3);              \
+            sum_abs0 += AND_VEC_D(src4);              \
+            sum_abs1 += AND_VEC_D(src5);              \
+        }                                             \
+        else if ((n & 4) && (n & 1))                  \
+        {                                             \
+            LD_DP5_INC(x, inc_val, src0, src1, src2,  \
+                       src3, src4);                   \
+                                                      \
+            sum_abs0 += AND_VEC_D(src0);              \
+            sum_abs1 += AND_VEC_D(src1);              \
+            sum_abs2 += AND_VEC_D(src2);              \
+            sum_abs3 += AND_VEC_D(src3);              \
+            sum_abs0 += AND_VEC_D(src4);              \
+        }                                             \
+        else if ((n & 2) && (n & 1))                  \
+        {                                             \
+            LD_DP3_INC(x, inc_val, src0, src1, src2); \
+                                                      \
+            sum_abs0 += AND_VEC_D(src0);              \
+            sum_abs1 += AND_VEC_D(src1);              \
+            sum_abs2 += AND_VEC_D(src2);              \
+        }                                             \
+        else if (n & 4)                               \
+        {                                             \
+            LD_DP4_INC(x, inc_val, src0, src1, src2,  \
+                       src3);                         \
+                                                      \
+            sum_abs0 += AND_VEC_D(src0);              \
+            sum_abs1 += AND_VEC_D(src1);              \
+            sum_abs2 += AND_VEC_D(src2);              \
+            sum_abs3 += AND_VEC_D(src3);              \
+        }                                             \
+        else if (n & 2)                               \
+        {                                             \
+            LD_DP2_INC(x, inc_val, src0, src1);       \
+                                                      \
+            sum_abs0 += AND_VEC_D(src0);              \
+            sum_abs1 += AND_VEC_D(src1);              \
+        }                                             \
+        else if (n & 1)                               \
+        {                                             \
+            src0 = LD_DP(x);                          \
+                                                      \
+            sum_abs0 += AND_VEC_D(src0);              \
+        }                                             \
+    }                                                 \
+                                                      \
+    sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3;       \
+    sumf = sum_abs0[0] + sum_abs0[1];
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i;
+    FLOAT sumf = 0.0;
+    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+    v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
+    v2f64 zero_v = {0};
+    v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
+
+    if (n <= 0 || inc_x <= 0) return (sumf);
+
+    if (1 == inc_x)
+    {
+        PROCESS_ZD(2);
+    }
+    else
+    {
+        inc_x *= 2;
+        PROCESS_ZD(inc_x);
+    }
+
+    return (sumf);
+}
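
AND_VEC_D takes the absolute value of two doubles at once by clearing the IEEE-754 sign bit with a bitwise AND against 0x7FFFFFFFFFFFFFFF. Because each complex element is stored as two consecutive doubles, one masked vector covers both its real and imaginary parts, and zasum is simply the running sum of those masked lanes. A scalar sketch of the same idea, using memcpy for the type pun instead of a vector cast (fabs_by_mask and zasum_ref are illustrative names; double/long stand in for FLOAT/BLASLONG):

    #include <stdint.h>
    #include <string.h>

    /* Scalar equivalent of AND_VEC_D: clear the IEEE-754 sign bit to obtain
     * |v|.  The MSA kernel applies this to two doubles per vector. */
    static double fabs_by_mask(double v)
    {
        uint64_t bits;

        memcpy(&bits, &v, sizeof(bits));
        bits &= 0x7FFFFFFFFFFFFFFFULL;       /* drop the sign bit */
        memcpy(&v, &bits, sizeof(v));

        return v;
    }

    /* Minimal scalar reference for zasum: sum of |re| + |im| over n complex
     * elements.  Names are illustrative, not part of the patch. */
    static double zasum_ref(long n, const double *x, long inc_x)
    {
        double sum = 0.0;
        long i;

        for (i = 0; i < n; i++)
        {
            sum += fabs_by_mask(x[0]) + fabs_by_mask(x[1]);
            x += 2 * inc_x;                  /* complex stride, as in PROCESS_ZD */
        }

        return sum;
    }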
diff --git a/kernel/mips/zdot_msa.c b/kernel/mips/zdot_msa.c
new file mode 100644 (file)
index 0000000..b945093
--- /dev/null
@@ -0,0 +1,227 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#if !defined(CONJ)
+    #define OP2  +=
+    #define OP3  -
+    #define OP4  +
+#else
+    #define OP2  -=
+    #define OP3  +
+    #define OP4  -
+#endif
+
+#define DOT16_KERNEL(OPR0, OPR1)      \
+    dot0 += (vx0r * vy0r);            \
+    dot0 OPR0## = (vx0i * vy0i);      \
+    dot1 OPR1## = (vx0i * vy0r);      \
+    dot1 += (vx0r * vy0i);            \
+                                      \
+    dot0 += (vx1r * vy1r);            \
+    dot0 OPR0## = (vx1i * vy1i);      \
+    dot1 OPR1## = (vx1i * vy1r);      \
+    dot1 += (vx1r * vy1i);            \
+                                      \
+    dot0 += (vx2r * vy2r);            \
+    dot0 OPR0## = (vx2i * vy2i);      \
+    dot1 OPR1## = (vx2i * vy2r);      \
+    dot1 += (vx2r * vy2i);            \
+                                      \
+    dot0 += (vx3r * vy3r);            \
+    dot0 OPR0## = (vx3i * vy3i);      \
+    dot1 OPR1## = (vx3i * vy3r);      \
+    dot1 += (vx3r * vy3i);
+
+#define DOT12_KERNEL(OPR0, OPR1)      \
+    dot0 += (vx0r * vy0r);            \
+    dot0 OPR0## = (vx0i * vy0i);      \
+    dot1 OPR1## = (vx0i * vy0r);      \
+    dot1 += (vx0r * vy0i);            \
+                                      \
+    dot0 += (vx1r * vy1r);            \
+    dot0 OPR0## = (vx1i * vy1i);      \
+    dot1 OPR1## = (vx1i * vy1r);      \
+    dot1 += (vx1r * vy1i);            \
+                                      \
+    dot0 += (vx2r * vy2r);            \
+    dot0 OPR0## = (vx2i * vy2i);      \
+    dot1 OPR1## = (vx2i * vy2r);      \
+    dot1 += (vx2r * vy2i);
+
+#define DOT8_KERNEL(OPR0, OPR1)       \
+    dot0 += (vx0r * vy0r);            \
+    dot0 OPR0## = (vx0i * vy0i);      \
+    dot1 OPR1## = (vx0i * vy0r);      \
+    dot1 += (vx0r * vy0i);            \
+                                      \
+    dot0 += (vx1r * vy1r);            \
+    dot0 OPR0## = (vx1i * vy1i);      \
+    dot1 OPR1## = (vx1i * vy1r);      \
+    dot1 += (vx1r * vy1i);
+
+#define DOT4_KERNEL(OPR0, OPR1)       \
+    dot0 += (vx0r * vy0r);            \
+    dot0 OPR0## = (vx0i * vy0i);      \
+    dot1 OPR1## = (vx0i * vy0r);      \
+    dot1 += (vx0r * vy0i);
+
+/* return double, x,y double */
+/* zdotc -  CONJ */
+/* zdotu - !CONJ */
+#ifndef _MSC_VER
+#include <complex.h>
+FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+    BLASLONG i = 0;
+    FLOAT dot[2];
+    BLASLONG inc_x2;
+    BLASLONG inc_y2;
+    v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
+    v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
+    v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
+    v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
+    v2f64 dot0 = {0, 0};
+    v2f64 dot1 = {0, 0};
+    v2f64 zero = {0, 0};
+    openblas_complex_double result;
+
+    dot[0] = 0.0;
+    dot[1] = 0.0;
+
+    __real__(result) = 0.0;
+    __imag__(result) = 0.0;
+
+    if ( n < 1 ) return(result);
+
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+
+    for (i = (n >> 3); i--;)
+    {
+        LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
+        LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
+
+        PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+        PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+        PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+        PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
+
+        PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+        PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+        PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+        PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
+
+    #if !defined(CONJ)
+        DOT16_KERNEL(-, +);
+    #else
+        DOT16_KERNEL(+, -);
+    #endif
+    }
+
+    if (n & 7)
+    {
+        if ((n & 4) && (n & 2))
+        {
+            LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
+            LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
+            LD_DP2_INC(x, inc_x2, vx4, vx5);
+            LD_DP2_INC(y, inc_y2, vy4, vy5);
+
+            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+            PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
+
+            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+            PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+            PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
+
+        #if !defined(CONJ)
+            DOT12_KERNEL(-, +);
+        #else
+            DOT12_KERNEL(+, -);
+        #endif
+        }
+        else if (n & 4)
+        {
+            LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
+            LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
+
+            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
+
+            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+            PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
+
+        #if !defined(CONJ)
+            DOT8_KERNEL(-, +);
+        #else
+            DOT8_KERNEL(+, -);
+        #endif
+        }
+        else if (n & 2)
+        {
+            LD_DP2_INC(x, inc_x2, vx0, vx1);
+            LD_DP2_INC(y, inc_y2, vy0, vy1);
+            PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
+
+        #if !defined(CONJ)
+            DOT4_KERNEL(-, +);
+        #else
+            DOT4_KERNEL(+, -);
+        #endif
+        }
+
+        if (n & 1)
+        {
+            vx0 = LD_DP(x);
+            vy0 = LD_DP(y);
+            PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i);
+            PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i);
+
+        #if !defined(CONJ)
+            DOT4_KERNEL(-, +);
+        #else
+            DOT4_KERNEL(+, -);
+        #endif
+        }
+    }
+
+    dot[0] += (dot0[0] + dot0[1]);
+    dot[1] += (dot1[0] + dot1[1]);
+
+    __real__(result) = dot[0];
+    __imag__(result) = dot[1];
+
+    return(result);
+}
diff --git a/kernel/mips/zgemv_n_msa.c b/kernel/mips/zgemv_n_msa.c
new file mode 100644 (file)
index 0000000..aadc610
--- /dev/null
@@ -0,0 +1,667 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#undef OP0
+#undef OP1
+#undef OP2
+#undef OP3
+#undef OP4
+
+#if !defined(XCONJ)
+    #define OP3  -=
+    #define OP4  +=
+#else
+    #define OP3  +=
+    #define OP4  -=
+#endif
+
+#if !defined(CONJ)
+    #if !defined(XCONJ)
+        #define OP0  -=
+        #define OP1  +=
+        #define OP2  +=
+    #else
+        #define OP0  +=
+        #define OP1  +=
+        #define OP2  -=
+    #endif
+#else
+    #if !defined(XCONJ)
+        #define OP0  +=
+        #define OP1  -=
+        #define OP2  -=
+    #else
+        #define OP0  -=
+        #define OP1  -=
+        #define OP2  +=
+    #endif
+#endif
+
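+/* OP0-OP4 fold the CONJ/XCONJ sign choices into the arithmetic: OP3/OP4 set
+   the signs used when x is scaled by alpha, and OP0/OP1/OP2 set the signs of
+   the imaginary-part products when the scaled x is multiplied with the matrix
+   elements.  With neither CONJ nor XCONJ defined this reduces to the plain
+   complex update y += (alpha * x) * a.  PCKEVOD_D2_DP de-interleaves each
+   pair of loaded vectors into separate real and imaginary vectors. */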
+#define ZGEMV_N_4x4()                        \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
+    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
+    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
+                                             \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
+    PCKEVOD_D2_DP(t3, t2, src1r, src1i);     \
+    PCKEVOD_D2_DP(t5, t4, src2r, src2i);     \
+    PCKEVOD_D2_DP(t7, t6, src3r, src3i);     \
+    PCKEVOD_D2_DP(t9, t8, src4r, src4i);     \
+    PCKEVOD_D2_DP(t11, t10, src5r, src5i);   \
+    PCKEVOD_D2_DP(t13, t12, src6r, src6i);   \
+    PCKEVOD_D2_DP(t15, t14, src7r, src7i);   \
+                                             \
+    y0r += tp0r * src0r;                     \
+    y1r += tp0r * src1r;                     \
+    y0r += tp1r * src2r;                     \
+    y1r += tp1r * src3r;                     \
+    y0r += tp2r * src4r;                     \
+    y1r += tp2r * src5r;                     \
+    y0r += tp3r * src6r;                     \
+    y1r += tp3r * src7r;                     \
+                                             \
+    y0r OP0 tp0i * src0i;                    \
+    y1r OP0 tp0i * src1i;                    \
+    y0r OP0 tp1i * src2i;                    \
+    y1r OP0 tp1i * src3i;                    \
+    y0r OP0 tp2i * src4i;                    \
+    y1r OP0 tp2i * src5i;                    \
+    y0r OP0 tp3i * src6i;                    \
+    y1r OP0 tp3i * src7i;                    \
+                                             \
+    y0i OP1 tp0r * src0i;                    \
+    y1i OP1 tp0r * src1i;                    \
+    y0i OP1 tp1r * src2i;                    \
+    y1i OP1 tp1r * src3i;                    \
+    y0i OP1 tp2r * src4i;                    \
+    y1i OP1 tp2r * src5i;                    \
+    y0i OP1 tp3r * src6i;                    \
+    y1i OP1 tp3r * src7i;                    \
+                                             \
+    y0i OP2 tp0i * src0r;                    \
+    y1i OP2 tp0i * src1r;                    \
+    y0i OP2 tp1i * src2r;                    \
+    y1i OP2 tp1i * src3r;                    \
+    y0i OP2 tp2i * src4r;                    \
+    y1i OP2 tp2i * src5r;                    \
+    y0i OP2 tp3i * src6r;                    \
+    y1i OP2 tp3i * src7r;                    \
+
+#define ZGEMV_N_2x4()                       \
+    LD_DP2(pa0 + k, 2, t0, t1);             \
+    LD_DP2(pa1 + k, 2, t4, t5);             \
+    LD_DP2(pa2 + k, 2, t8, t9);             \
+    LD_DP2(pa3 + k, 2, t12, t13);           \
+                                            \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);    \
+    PCKEVOD_D2_DP(t5, t4, src2r, src2i);    \
+    PCKEVOD_D2_DP(t9, t8, src4r, src4i);    \
+    PCKEVOD_D2_DP(t13, t12, src6r, src6i);  \
+                                            \
+    y0r += tp0r * src0r;                    \
+    y0r += tp1r * src2r;                    \
+    y0r += tp2r * src4r;                    \
+    y0r += tp3r * src6r;                    \
+                                            \
+    y0r OP0 tp0i * src0i;                   \
+    y0r OP0 tp1i * src2i;                   \
+    y0r OP0 tp2i * src4i;                   \
+    y0r OP0 tp3i * src6i;                   \
+                                            \
+    y0i OP1 tp0r * src0i;                   \
+    y0i OP1 tp1r * src2i;                   \
+    y0i OP1 tp2r * src4i;                   \
+    y0i OP1 tp3r * src6i;                   \
+                                            \
+    y0i OP2 tp0i * src0r;                   \
+    y0i OP2 tp1i * src2r;                   \
+    y0i OP2 tp2i * src4r;                   \
+    y0i OP2 tp3i * src6r;                   \
+
+#define ZGEMV_N_1x4()               \
+    res0 = y[0 * inc_y2];           \
+    res1 = y[0 * inc_y2 + 1];       \
+                                    \
+    res0  += temp0_r * pa0[k];      \
+    res0 OP0 temp0_i * pa0[k + 1];  \
+    res0  += temp1_r * pa1[k];      \
+    res0 OP0 temp1_i * pa1[k + 1];  \
+    res0  += temp2_r * pa2[k];      \
+    res0 OP0 temp2_i * pa2[k + 1];  \
+    res0  += temp3_r * pa3[k];      \
+    res0 OP0 temp3_i * pa3[k + 1];  \
+                                    \
+    res1 OP1 temp0_r * pa0[k + 1];  \
+    res1 OP2 temp0_i * pa0[k];      \
+    res1 OP1 temp1_r * pa1[k + 1];  \
+    res1 OP2 temp1_i * pa1[k];      \
+    res1 OP1 temp2_r * pa2[k + 1];  \
+    res1 OP2 temp2_i * pa2[k];      \
+    res1 OP1 temp3_r * pa3[k + 1];  \
+    res1 OP2 temp3_i * pa3[k];      \
+                                    \
+    y[0 * inc_y2]     = res0;       \
+    y[0 * inc_y2 + 1] = res1;       \
+
+#define ZGEMV_N_4x2()                     \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);   \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);   \
+                                          \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
+    PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
+    PCKEVOD_D2_DP(t5, t4, src2r, src2i);  \
+    PCKEVOD_D2_DP(t7, t6, src3r, src3i);  \
+                                          \
+    y0r += tp0r * src0r;                  \
+    y1r += tp0r * src1r;                  \
+    y0r += tp1r * src2r;                  \
+    y1r += tp1r * src3r;                  \
+                                          \
+    y0r OP0 tp0i * src0i;                 \
+    y1r OP0 tp0i * src1i;                 \
+    y0r OP0 tp1i * src2i;                 \
+    y1r OP0 tp1i * src3i;                 \
+                                          \
+    y0i OP1 tp0r * src0i;                 \
+    y1i OP1 tp0r * src1i;                 \
+    y0i OP1 tp1r * src2i;                 \
+    y1i OP1 tp1r * src3i;                 \
+                                          \
+    y0i OP2 tp0i * src0r;                 \
+    y1i OP2 tp0i * src1r;                 \
+    y0i OP2 tp1i * src2r;                 \
+    y1i OP2 tp1i * src3r;                 \
+
+#define ZGEMV_N_2x2()                     \
+    LD_DP2(pa0 + k, 2, t0, t1);           \
+    LD_DP2(pa1 + k, 2, t4, t5);           \
+                                          \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
+    PCKEVOD_D2_DP(t5, t4, src2r, src2i);  \
+                                          \
+    y0r += tp0r * src0r;                  \
+    y0r += tp1r * src2r;                  \
+                                          \
+    y0r OP0 tp0i * src0i;                 \
+    y0r OP0 tp1i * src2i;                 \
+                                          \
+    y0i OP1 tp0r * src0i;                 \
+    y0i OP1 tp1r * src2i;                 \
+                                          \
+    y0i OP2 tp0i * src0r;                 \
+    y0i OP2 tp1i * src2r;                 \
+
+#define ZGEMV_N_1x2()               \
+    res0 = y[0 * inc_y2];           \
+    res1 = y[0 * inc_y2 + 1];       \
+                                    \
+    res0  += temp0_r * pa0[k];      \
+    res0 OP0 temp0_i * pa0[k + 1];  \
+    res0  += temp1_r * pa1[k];      \
+    res0 OP0 temp1_i * pa1[k + 1];  \
+                                    \
+    res1 OP1 temp0_r * pa0[k + 1];  \
+    res1 OP2 temp0_i * pa0[k];      \
+    res1 OP1 temp1_r * pa1[k + 1];  \
+    res1 OP2 temp1_i * pa1[k];      \
+                                    \
+    y[0 * inc_y2]     = res0;       \
+    y[0 * inc_y2 + 1] = res1;       \
+
+#define ZGEMV_N_4x1()                     \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);   \
+                                          \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
+    PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
+                                          \
+    y0r += tp0r * src0r;                  \
+    y1r += tp0r * src1r;                  \
+                                          \
+    y0r OP0 tp0i * src0i;                 \
+    y1r OP0 tp0i * src1i;                 \
+                                          \
+    y0i OP1 tp0r * src0i;                 \
+    y1i OP1 tp0r * src1i;                 \
+                                          \
+    y0i OP2 tp0i * src0r;                 \
+    y1i OP2 tp0i * src1r;                 \
+
+#define ZGEMV_N_2x1()                     \
+    LD_DP2(pa0 + k, 2, t0, t1);           \
+                                          \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
+                                          \
+    y0r += tp0r * src0r;                  \
+    y0r OP0 tp0i * src0i;                 \
+    y0i OP1 tp0r * src0i;                 \
+    y0i OP2 tp0i * src0r;                 \
+
+#define ZGEMV_N_1x1()               \
+    res0 = y[0 * inc_y2];           \
+    res1 = y[0 * inc_y2 + 1];       \
+                                    \
+    res0  += temp0_r * pa0[k];      \
+    res0 OP0 temp0_i * pa0[k + 1];  \
+                                    \
+    res1 OP1 temp0_r * pa0[k + 1];  \
+    res1 OP2 temp0_i * pa0[k];      \
+                                    \
+    y[0 * inc_y2]     = res0;       \
+    y[0 * inc_y2 + 1] = res1;       \
+
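+/* ZLOAD_Xn_SCALE loads n complex elements of x, scales them by alpha (using
+   the OP3/OP4 signs) and broadcasts each scaled element across a vector
+   (tp0r/tp0i .. tp3r/tp3i) so that a single x element can multiply a whole
+   column vector of A in the ZGEMV_N kernels. */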
+#define ZLOAD_X4_SCALE_VECTOR()       \
+    LD_DP4(x, 2, x0, x1, x2, x3);     \
+                                      \
+    PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
+    PCKEVOD_D2_DP(x3, x2, x1r, x1i);  \
+                                      \
+    tp4r   = alphar * x0r;            \
+    tp4r OP3 alphai * x0i;            \
+    tp4i   = alphar * x0i;            \
+    tp4i OP4 alphai * x0r;            \
+                                      \
+    tp5r   = alphar * x1r;            \
+    tp5r OP3 alphai * x1i;            \
+    tp5i   = alphar * x1i;            \
+    tp5i OP4 alphai * x1r;            \
+                                      \
+    SPLATI_D2_DP(tp4r, tp0r, tp1r);   \
+    SPLATI_D2_DP(tp5r, tp2r, tp3r);   \
+    SPLATI_D2_DP(tp4i, tp0i, tp1i);   \
+    SPLATI_D2_DP(tp5i, tp2i, tp3i);   \
+
+#define ZLOAD_X2_SCALE_VECTOR()       \
+    LD_DP2(x, 2, x0, x1);             \
+                                      \
+    PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
+                                      \
+    tp4r   = alphar * x0r;            \
+    tp4r OP3 alphai * x0i;            \
+    tp4i   = alphar * x0i;            \
+    tp4i OP4 alphai * x0r;            \
+                                      \
+    SPLATI_D2_DP(tp4r, tp0r, tp1r);   \
+    SPLATI_D2_DP(tp4i, tp0i, tp1i);   \
+
+#define ZLOAD_X4_SCALE_GP()                                                               \
+    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2)));      \
+    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *)(x + 1 * inc_x2)));      \
+    x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 2 * inc_x2)));      \
+    x1r = (v2f64) __msa_insert_d((v2i64) x1r,  1, *((long long *)(x + 3 * inc_x2)));      \
+    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2 + 1)));  \
+    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *)(x + 1 * inc_x2 + 1)));  \
+    x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 2 * inc_x2 + 1)));  \
+    x1i = (v2f64) __msa_insert_d((v2i64) x1i,  1, *((long long *)(x + 3 * inc_x2 + 1)));  \
+                                                                                          \
+    tp4r   = alphar * x0r;                                                                \
+    tp4r OP3 alphai * x0i;                                                                \
+    tp4i   = alphar * x0i;                                                                \
+    tp4i OP4 alphai * x0r;                                                                \
+                                                                                          \
+    tp5r   = alphar * x1r;                                                                \
+    tp5r OP3 alphai * x1i;                                                                \
+    tp5i   = alphar * x1i;                                                                \
+    tp5i OP4 alphai * x1r;                                                                \
+                                                                                          \
+    SPLATI_D2_DP(tp4r, tp0r, tp1r);                                                       \
+    SPLATI_D2_DP(tp5r, tp2r, tp3r);                                                       \
+    SPLATI_D2_DP(tp4i, tp0i, tp1i);                                                       \
+    SPLATI_D2_DP(tp5i, tp2i, tp3i);                                                       \
+
+#define ZLOAD_X2_SCALE_GP()                                                               \
+    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2)));      \
+    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *)(x + 1 * inc_x2)));      \
+    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(x + 0 * inc_x2 + 1)));  \
+    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *)(x + 1 * inc_x2 + 1)));  \
+                                                                                          \
+    tp4r   = alphar * x0r;                                                                \
+    tp4r OP3 alphai * x0i;                                                                \
+    tp4i   = alphar * x0i;                                                                \
+    tp4i OP4 alphai * x0r;                                                                \
+                                                                                          \
+    SPLATI_D2_DP(tp4r, tp0r, tp1r);                                                       \
+    SPLATI_D2_DP(tp4i, tp0i, tp1i);                                                       \
+
+#define ZLOAD_X1_SCALE_GP()                         \
+    temp0_r   = alpha_r * x[0 * inc_x2];            \
+    temp0_r OP3 alpha_i * x[0 * inc_x2 + 1];        \
+    temp0_i   = alpha_r * x[0 * inc_x2 + 1];        \
+    temp0_i OP4 alpha_i * x[0 * inc_x2];            \
+                                                    \
+    tp0r = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_r);  \
+    tp0i = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_i);  \
+
+#define ZLOAD_Y4_VECTOR()             \
+    LD_DP4(y, 2, y0, y1, y2, y3);     \
+    PCKEVOD_D2_DP(y1, y0, y0r, y0i);  \
+    PCKEVOD_D2_DP(y3, y2, y1r, y1i);  \
+
+#define ZLOAD_Y2_VECTOR()             \
+    LD_DP2(y, 2, y0, y1);             \
+    PCKEVOD_D2_DP(y1, y0, y0r, y0i);  \
+
+#define ZSTORE_Y4_VECTOR()          \
+    ILVRL_D2_DP(y0i, y0r, y0, y1);  \
+    ILVRL_D2_DP(y1i, y1r, y2, y3);  \
+    ST_DP4(y0, y1, y2, y3, y, 2);   \
+
+#define ZSTORE_Y2_VECTOR()          \
+    ILVRL_D2_DP(y0i, y0r, y0, y1);  \
+    ST_DP2(y0, y1, y, 2);           \
+
+#define ZLOAD_Y4_GP()                                                                      \
+    y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  0 * inc_y2)));      \
+    y0r = (v2f64) __msa_insert_d((v2i64) y0r,  1, *((long long *)(y +  1 * inc_y2)));      \
+    y1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  2 * inc_y2)));      \
+    y1r = (v2f64) __msa_insert_d((v2i64) y1r,  1, *((long long *)(y +  3 * inc_y2)));      \
+    y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  0 * inc_y2 + 1)));  \
+    y0i = (v2f64) __msa_insert_d((v2i64) y0i,  1, *((long long *)(y +  1 * inc_y2 + 1)));  \
+    y1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  2 * inc_y2 + 1)));  \
+    y1i = (v2f64) __msa_insert_d((v2i64) y1i,  1, *((long long *)(y +  3 * inc_y2 + 1)));  \
+
+#define ZLOAD_Y2_GP()                                                                      \
+    y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  0 * inc_y2)));      \
+    y0r = (v2f64) __msa_insert_d((v2i64) y0r,  1, *((long long *)(y +  1 * inc_y2)));      \
+    y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y +  0 * inc_y2 + 1)));  \
+    y0i = (v2f64) __msa_insert_d((v2i64) y0i,  1, *((long long *)(y +  1 * inc_y2 + 1)));  \
+
+#define ZSTORE_Y4_GP()                                                      \
+    *((long long *)(y + 0 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 0);      \
+    *((long long *)(y + 1 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 1);      \
+    *((long long *)(y + 2 * inc_y2)) = __msa_copy_s_d((v2i64) y1r, 0);      \
+    *((long long *)(y + 3 * inc_y2)) = __msa_copy_s_d((v2i64) y1r, 1);      \
+    *((long long *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0);  \
+    *((long long *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1);  \
+    *((long long *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 0);  \
+    *((long long *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 1);  \
+
+#define ZSTORE_Y2_GP()                                                      \
+    *((long long *)(y + 0 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 0);      \
+    *((long long *)(y + 1 * inc_y2)) = __msa_copy_s_d((v2i64) y0r, 1);      \
+    *((long long *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0);  \
+    *((long long *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1);  \
+
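+/* Driver: columns of A are processed four at a time (with n & 2 and n & 1
+   tails), and within each column block y is updated four complex elements at
+   a time (with m & 2 and m & 1 tails).  k is the running row offset counted
+   in doubles, hence the increment of 2 per complex element. */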
+#define ZGEMV_N_MSA()             \
+    for (j = (n >> 2); j--;)      \
+    {                             \
+        ZLOAD_X4_SCALE()          \
+                                  \
+        k = 0;                    \
+        y = y_org;                \
+                                  \
+        for (i = (m >> 2); i--;)  \
+        {                         \
+            ZLOAD_Y4()            \
+            ZGEMV_N_4x4()         \
+            ZSTORE_Y4()           \
+                                  \
+            k += 2 * 4;           \
+            y += inc_y2 * 4;      \
+        }                         \
+                                  \
+        if (m & 2)                \
+        {                         \
+            ZLOAD_Y2()            \
+            ZGEMV_N_2x4()         \
+            ZSTORE_Y2()           \
+                                  \
+            k += 2 * 2;           \
+            y += inc_y2 * 2;      \
+        }                         \
+                                  \
+        if (m & 1)                \
+        {                         \
+            temp0_r = tp4r[0];    \
+            temp1_r = tp4r[1];    \
+            temp2_r = tp5r[0];    \
+            temp3_r = tp5r[1];    \
+                                  \
+            temp0_i = tp4i[0];    \
+            temp1_i = tp4i[1];    \
+            temp2_i = tp5i[0];    \
+            temp3_i = tp5i[1];    \
+                                  \
+            ZGEMV_N_1x4()         \
+            k += 2;               \
+            y += inc_y2;          \
+        }                         \
+                                  \
+        pa0 += 4 * lda2;          \
+        pa1 += 4 * lda2;          \
+        pa2 += 4 * lda2;          \
+        pa3 += 4 * lda2;          \
+                                  \
+        x += 4 * inc_x2;          \
+    }                             \
+                                  \
+    if (n & 2)                    \
+    {                             \
+        ZLOAD_X2_SCALE()          \
+                                  \
+        k = 0;                    \
+        y = y_org;                \
+                                  \
+        for (i = (m >> 2); i--;)  \
+        {                         \
+            ZLOAD_Y4()            \
+            ZGEMV_N_4x2()         \
+            ZSTORE_Y4()           \
+                                  \
+            k += 2 * 4;           \
+            y += inc_y2 * 4;      \
+        }                         \
+                                  \
+        if (m & 2)                \
+        {                         \
+            ZLOAD_Y2()            \
+            ZGEMV_N_2x2()         \
+            ZSTORE_Y2()           \
+                                  \
+            k += 2 * 2;           \
+            y += inc_y2 * 2;      \
+        }                         \
+                                  \
+        if (m & 1)                \
+        {                         \
+            temp0_r = tp4r[0];    \
+            temp1_r = tp4r[1];    \
+                                  \
+            temp0_i = tp4i[0];    \
+            temp1_i = tp4i[1];    \
+                                  \
+            ZGEMV_N_1x2()         \
+                                  \
+            k += 2;               \
+            y += inc_y2;          \
+        }                         \
+                                  \
+        pa0 += 2 * lda2;          \
+        pa1 += 2 * lda2;          \
+                                  \
+        x += 2 * inc_x2;          \
+    }                             \
+                                  \
+    if (n & 1)                    \
+    {                             \
+        ZLOAD_X1_SCALE()          \
+                                  \
+        k = 0;                    \
+        y = y_org;                \
+                                  \
+        for (i = (m >> 2); i--;)  \
+        {                         \
+            ZLOAD_Y4()            \
+            ZGEMV_N_4x1()         \
+            ZSTORE_Y4()           \
+                                  \
+            k += 2 * 4;           \
+            y += inc_y2 * 4;      \
+        }                         \
+                                  \
+        if (m & 2)                \
+        {                         \
+            ZLOAD_Y2()            \
+            ZGEMV_N_2x1()         \
+            ZSTORE_Y2()           \
+                                  \
+            k += 2 * 2;           \
+            y += inc_y2 * 2;      \
+        }                         \
+                                  \
+        if (m & 1)                \
+        {                         \
+            ZGEMV_N_1x1()         \
+                                  \
+            k += 2;               \
+            y += inc_y2;          \
+        }                         \
+                                  \
+        pa0 += lda2;              \
+        x += inc_x2;              \
+    }                             \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+          FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
+          BLASLONG inc_y2, FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    FLOAT *y_org = y;
+    FLOAT *pa0, *pa1, *pa2, *pa3;
+    FLOAT temp0_r, temp1_r, temp2_r, temp3_r, temp0_i, temp1_i, temp2_i;
+    FLOAT temp3_i, res0, res1;
+    v2f64 alphar, alphai;
+    v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
+    v2f64 x0r, x1r, x0i, x1i, y0r, y1r, y0i, y1i;
+    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
+    v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
+    v2f64 tp0r, tp1r, tp2r, tp3r, tp4r, tp5r, tp0i, tp1i, tp2i, tp3i, tp4i, tp5i;
+
+    lda2   = 2 * lda2;
+    inc_x2 = 2 * inc_x2;
+    inc_y2 = 2 * inc_y2;
+
+    pa0 = A;
+    pa1 = A + lda2;
+    pa2 = A + 2 * lda2;
+    pa3 = A + 3 * lda2;
+
+    alphar = COPY_DOUBLE_TO_VECTOR(alpha_r);
+    alphai = COPY_DOUBLE_TO_VECTOR(alpha_i);
+
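+    /* When x and y have unit complex stride (inc_x2 == 2, inc_y2 == 2 after
+       scaling) the ..._VECTOR macros use full MSA vector loads and stores;
+       otherwise the ..._GP variants move the data one 64-bit value at a time
+       through general-purpose registers. */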
+    if ((2 == inc_x2) && (2 == inc_y2))
+    {
+        #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_VECTOR
+        #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_VECTOR
+        #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
+        #define ZLOAD_Y4        ZLOAD_Y4_VECTOR
+        #define ZLOAD_Y2        ZLOAD_Y2_VECTOR
+        #define ZSTORE_Y4       ZSTORE_Y4_VECTOR
+        #define ZSTORE_Y2       ZSTORE_Y2_VECTOR
+
+        ZGEMV_N_MSA();
+
+        #undef ZLOAD_X4_SCALE
+        #undef ZLOAD_X2_SCALE
+        #undef ZLOAD_X1_SCALE
+        #undef ZLOAD_Y4
+        #undef ZLOAD_Y2
+        #undef ZSTORE_Y4
+        #undef ZSTORE_Y2
+    }
+    else if (2 == inc_x2)
+    {
+        #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_VECTOR
+        #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_VECTOR
+        #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
+        #define ZLOAD_Y4        ZLOAD_Y4_GP
+        #define ZLOAD_Y2        ZLOAD_Y2_GP
+        #define ZSTORE_Y4       ZSTORE_Y4_GP
+        #define ZSTORE_Y2       ZSTORE_Y2_GP
+
+        ZGEMV_N_MSA();
+
+        #undef ZLOAD_X4_SCALE
+        #undef ZLOAD_X2_SCALE
+        #undef ZLOAD_X1_SCALE
+        #undef ZLOAD_Y4
+        #undef ZLOAD_Y2
+        #undef ZSTORE_Y4
+        #undef ZSTORE_Y2
+    }
+    else if (2 == inc_y2)
+    {
+        #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_GP
+        #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_GP
+        #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
+        #define ZLOAD_Y4        ZLOAD_Y4_VECTOR
+        #define ZLOAD_Y2        ZLOAD_Y2_VECTOR
+        #define ZSTORE_Y4       ZSTORE_Y4_VECTOR
+        #define ZSTORE_Y2       ZSTORE_Y2_VECTOR
+
+        ZGEMV_N_MSA();
+
+        #undef ZLOAD_X4_SCALE
+        #undef ZLOAD_X2_SCALE
+        #undef ZLOAD_X1_SCALE
+        #undef ZLOAD_Y4
+        #undef ZLOAD_Y2
+        #undef ZSTORE_Y4
+        #undef ZSTORE_Y2
+    }
+    else
+    {
+        #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_GP
+        #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_GP
+        #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
+        #define ZLOAD_Y4        ZLOAD_Y4_GP
+        #define ZLOAD_Y2        ZLOAD_Y2_GP
+        #define ZSTORE_Y4       ZSTORE_Y4_GP
+        #define ZSTORE_Y2       ZSTORE_Y2_GP
+
+        ZGEMV_N_MSA();
+
+        #undef ZLOAD_X4_SCALE
+        #undef ZLOAD_X2_SCALE
+        #undef ZLOAD_X1_SCALE
+        #undef ZLOAD_Y4
+        #undef ZLOAD_Y2
+        #undef ZSTORE_Y4
+        #undef ZSTORE_Y2
+    }
+    return(0);
+}
+
+#undef OP0
+#undef OP1
+#undef OP2
+#undef OP3
+#undef OP4
diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c
new file mode 100644 (file)
index 0000000..b2147b0
--- /dev/null
@@ -0,0 +1,544 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#undef OP0
+#undef OP1
+#undef OP2
+#undef OP3
+#undef OP4
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+    #define OP0  -=
+    #define OP1  +=
+    #define OP2  +=
+#else
+    #define OP0  +=
+    #define OP1  +=
+    #define OP2  -=
+#endif
+
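+/* As in the non-transposed kernel, OP0/OP1/OP2 encode the CONJ/XCONJ sign
+   choices.  With neither macro defined the kernels accumulate the plain
+   products a * x into the (tpNr, tpNi) accumulators, and ZSCALE_STORE_* then
+   applies y += alpha * temp with the matching signs. */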
+#define ZGEMV_T_4x4()                        \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);      \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);      \
+    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);    \
+    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);  \
+                                             \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
+    PCKEVOD_D2_DP(t3, t2, src1r, src1i);     \
+    PCKEVOD_D2_DP(t5, t4, src2r, src2i);     \
+    PCKEVOD_D2_DP(t7, t6, src3r, src3i);     \
+    PCKEVOD_D2_DP(t9, t8, src4r, src4i);     \
+    PCKEVOD_D2_DP(t11, t10, src5r, src5i);   \
+    PCKEVOD_D2_DP(t13, t12, src6r, src6i);   \
+    PCKEVOD_D2_DP(t15, t14, src7r, src7i);   \
+                                             \
+    tp0r += src0r * x0r;                     \
+    tp0r += src1r * x1r;                     \
+    tp0r OP0 src0i * x0i;                    \
+    tp0r OP0 src1i * x1i;                    \
+                                             \
+    tp1r += src2r * x0r;                     \
+    tp1r += src3r * x1r;                     \
+    tp1r OP0 src2i * x0i;                    \
+    tp1r OP0 src3i * x1i;                    \
+                                             \
+    tp2r += src4r * x0r;                     \
+    tp2r += src5r * x1r;                     \
+    tp2r OP0 src4i * x0i;                    \
+    tp2r OP0 src5i * x1i;                    \
+                                             \
+    tp3r += src6r * x0r;                     \
+    tp3r += src7r * x1r;                     \
+    tp3r OP0 src6i * x0i;                    \
+    tp3r OP0 src7i * x1i;                    \
+                                             \
+    tp0i OP1 src0r * x0i;                    \
+    tp0i OP1 src1r * x1i;                    \
+    tp0i OP2 src0i * x0r;                    \
+    tp0i OP2 src1i * x1r;                    \
+                                             \
+    tp1i OP1 src2r * x0i;                    \
+    tp1i OP1 src3r * x1i;                    \
+    tp1i OP2 src2i * x0r;                    \
+    tp1i OP2 src3i * x1r;                    \
+                                             \
+    tp2i OP1 src4r * x0i;                    \
+    tp2i OP1 src5r * x1i;                    \
+    tp2i OP2 src4i * x0r;                    \
+    tp2i OP2 src5i * x1r;                    \
+                                             \
+    tp3i OP1 src6r * x0i;                    \
+    tp3i OP1 src7r * x1i;                    \
+    tp3i OP2 src6i * x0r;                    \
+    tp3i OP2 src7i * x1r;                    \
+
+#define ZGEMV_T_4x2()                     \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);   \
+    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);   \
+                                          \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
+    PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
+    PCKEVOD_D2_DP(t5, t4, src2r, src2i);  \
+    PCKEVOD_D2_DP(t7, t6, src3r, src3i);  \
+                                          \
+    tp0r += src0r * x0r;                  \
+    tp0r += src1r * x1r;                  \
+    tp0r OP0 src0i * x0i;                 \
+    tp0r OP0 src1i * x1i;                 \
+                                          \
+    tp1r += src2r * x0r;                  \
+    tp1r += src3r * x1r;                  \
+    tp1r OP0 src2i * x0i;                 \
+    tp1r OP0 src3i * x1i;                 \
+                                          \
+    tp0i OP1 src0r * x0i;                 \
+    tp0i OP1 src1r * x1i;                 \
+    tp0i OP2 src0i * x0r;                 \
+    tp0i OP2 src1i * x1r;                 \
+                                          \
+    tp1i OP1 src2r * x0i;                 \
+    tp1i OP1 src3r * x1i;                 \
+    tp1i OP2 src2i * x0r;                 \
+    tp1i OP2 src3i * x1r;                 \
+
+#define ZGEMV_T_4x1()                     \
+    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);   \
+                                          \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
+    PCKEVOD_D2_DP(t3, t2, src1r, src1i);  \
+                                          \
+    tp0r += src0r * x0r;                  \
+    tp0r += src1r * x1r;                  \
+    tp0r OP0 src0i * x0i;                 \
+    tp0r OP0 src1i * x1i;                 \
+                                          \
+    tp0i OP1 src0r * x0i;                 \
+    tp0i OP1 src1r * x1i;                 \
+    tp0i OP2 src0i * x0r;                 \
+    tp0i OP2 src1i * x1r;                 \
+
+#define ZGEMV_T_2x4()                       \
+    LD_DP2(pa0 + k, 2, t0, t1);             \
+    LD_DP2(pa1 + k, 2, t4, t5);             \
+    LD_DP2(pa2 + k, 2, t8, t9);             \
+    LD_DP2(pa3 + k, 2, t12, t13);           \
+                                            \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);    \
+    PCKEVOD_D2_DP(t5, t4, src2r, src2i);    \
+    PCKEVOD_D2_DP(t9, t8, src4r, src4i);    \
+    PCKEVOD_D2_DP(t13, t12, src6r, src6i);  \
+                                            \
+    tp0r += src0r * x0r;                    \
+    tp0r OP0 src0i * x0i;                   \
+                                            \
+    tp1r += src2r * x0r;                    \
+    tp1r OP0 src2i * x0i;                   \
+                                            \
+    tp2r += src4r * x0r;                    \
+    tp2r OP0 src4i * x0i;                   \
+                                            \
+    tp3r += src6r * x0r;                    \
+    tp3r OP0 src6i * x0i;                   \
+                                            \
+    tp0i OP1 src0r * x0i;                   \
+    tp0i OP2 src0i * x0r;                   \
+                                            \
+    tp1i OP1 src2r * x0i;                   \
+    tp1i OP2 src2i * x0r;                   \
+                                            \
+    tp2i OP1 src4r * x0i;                   \
+    tp2i OP2 src4i * x0r;                   \
+                                            \
+    tp3i OP1 src6r * x0i;                   \
+    tp3i OP2 src6i * x0r;                   \
+
+#define ZGEMV_T_2x2()                     \
+    LD_DP2(pa0 + k, 2, t0, t1);           \
+    LD_DP2(pa1 + k, 2, t4, t5);           \
+                                          \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
+    PCKEVOD_D2_DP(t5, t4, src2r, src2i);  \
+                                          \
+    tp0r += src0r * x0r;                  \
+    tp0r OP0 src0i * x0i;                 \
+                                          \
+    tp1r += src2r * x0r;                  \
+    tp1r OP0 src2i * x0i;                 \
+                                          \
+    tp0i OP1 src0r * x0i;                 \
+    tp0i OP2 src0i * x0r;                 \
+                                          \
+    tp1i OP1 src2r * x0i;                 \
+    tp1i OP2 src2i * x0r;                 \
+
+#define ZGEMV_T_2x1()                     \
+    LD_DP2(pa0 + k, 2, t0, t1);           \
+                                          \
+    PCKEVOD_D2_DP(t1, t0, src0r, src0i);  \
+                                          \
+    tp0r += src0r * x0r;                  \
+    tp0r OP0 src0i * x0i;                 \
+                                          \
+    tp0i OP1 src0r * x0i;                 \
+    tp0i OP2 src0i * x0r;                 \
+
+#define ZGEMV_T_1x4()                           \
+    temp0r  += pa0[k + 0] * x[0 * inc_x2];      \
+    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];  \
+    temp1r  += pa1[k + 0] * x[0 * inc_x2];      \
+    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];  \
+    temp2r  += pa2[k + 0] * x[0 * inc_x2];      \
+    temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1];  \
+    temp3r  += pa3[k + 0] * x[0 * inc_x2];      \
+    temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1];  \
+                                                \
+    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];  \
+    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];      \
+    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];  \
+    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];      \
+    temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1];  \
+    temp2i OP2 pa2[k + 1] * x[0 * inc_x2];      \
+    temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1];  \
+    temp3i OP2 pa3[k + 1] * x[0 * inc_x2];      \
+
+#define ZGEMV_T_1x2()                           \
+    temp0r  += pa0[k + 0] * x[0 * inc_x2];      \
+    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];  \
+    temp1r  += pa1[k + 0] * x[0 * inc_x2];      \
+    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];  \
+                                                \
+    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];  \
+    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];      \
+    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];  \
+    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];      \
+
+#define ZGEMV_T_1x1()                           \
+    temp0r  += pa0[k + 0] * x[0 * inc_x2];      \
+    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];  \
+                                                \
+    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];  \
+    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];      \
+
+#define ZSCALE_STORE_Y4_GP()    \
+    res0r = y[0 * inc_y2];      \
+    res1r = y[1 * inc_y2];      \
+    res2r = y[2 * inc_y2];      \
+    res3r = y[3 * inc_y2];      \
+                                \
+    res0i = y[0 * inc_y2 + 1];  \
+    res1i = y[1 * inc_y2 + 1];  \
+    res2i = y[2 * inc_y2 + 1];  \
+    res3i = y[3 * inc_y2 + 1];  \
+                                \
+    res0r  += alphar * temp0r;  \
+    res0r OP0 alphai * temp0i;  \
+    res1r  += alphar * temp1r;  \
+    res1r OP0 alphai * temp1i;  \
+    res2r  += alphar * temp2r;  \
+    res2r OP0 alphai * temp2i;  \
+    res3r  += alphar * temp3r;  \
+    res3r OP0 alphai * temp3i;  \
+                                \
+    res0i OP1 alphar * temp0i;  \
+    res0i OP2 alphai * temp0r;  \
+    res1i OP1 alphar * temp1i;  \
+    res1i OP2 alphai * temp1r;  \
+    res2i OP1 alphar * temp2i;  \
+    res2i OP2 alphai * temp2r;  \
+    res3i OP1 alphar * temp3i;  \
+    res3i OP2 alphai * temp3r;  \
+                                \
+    y[0 * inc_y2] = res0r;      \
+    y[1 * inc_y2] = res1r;      \
+    y[2 * inc_y2] = res2r;      \
+    y[3 * inc_y2] = res3r;      \
+                                \
+    y[0 * inc_y2 + 1] = res0i;  \
+    y[1 * inc_y2 + 1] = res1i;  \
+    y[2 * inc_y2 + 1] = res2i;  \
+    y[3 * inc_y2 + 1] = res3i;  \
+
+#define ZSCALE_STORE_Y2_GP()    \
+    res0r = y[0 * inc_y2];      \
+    res1r = y[1 * inc_y2];      \
+                                \
+    res0i = y[0 * inc_y2 + 1];  \
+    res1i = y[1 * inc_y2 + 1];  \
+                                \
+    res0r  += alphar * temp0r;  \
+    res0r OP0 alphai * temp0i;  \
+    res1r  += alphar * temp1r;  \
+    res1r OP0 alphai * temp1i;  \
+                                \
+    res0i OP1 alphar * temp0i;  \
+    res0i OP2 alphai * temp0r;  \
+    res1i OP1 alphar * temp1i;  \
+    res1i OP2 alphai * temp1r;  \
+                                \
+    y[0 * inc_y2] = res0r;      \
+    y[1 * inc_y2] = res1r;      \
+                                \
+    y[0 * inc_y2 + 1] = res0i;  \
+    y[1 * inc_y2 + 1] = res1i;  \
+
+#define ZSCALE_STORE_Y1_GP()    \
+    res0r = y[0 * inc_y2];      \
+    res0i = y[0 * inc_y2 + 1];  \
+                                \
+    res0r  += alphar * temp0r;  \
+    res0r OP0 alphai * temp0i;  \
+                                \
+    res0i OP1 alphar * temp0i;  \
+    res0i OP2 alphai * temp0r;  \
+                                \
+    y[0 * inc_y2] = res0r;      \
+    y[0 * inc_y2 + 1] = res0i;  \
+
+#define ZLOAD_X4_VECTOR()             \
+    LD_DP4(x, 2, x0, x1, x2, x3);     \
+    PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
+    PCKEVOD_D2_DP(x3, x2, x1r, x1i);  \
+
+#define ZLOAD_X2_VECTOR()             \
+    LD_DP2(x, 2, x0, x1);             \
+    PCKEVOD_D2_DP(x1, x0, x0r, x0i);  \
+
+#define ZLOAD_X4_GP()                                                                      \
+    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));      \
+    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));      \
+    x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2)));      \
+    x1r = (v2f64) __msa_insert_d((v2i64) x1r,  1, *((long long *) (x + 3 * inc_x2)));      \
+    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));  \
+    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));  \
+    x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2 + 1)));  \
+    x1i = (v2f64) __msa_insert_d((v2i64) x1i,  1, *((long long *) (x + 3 * inc_x2 + 1)));  \
+
+#define ZLOAD_X2_GP()                                                                      \
+    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));      \
+    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));      \
+    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));  \
+    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));  \
+
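+/* Driver: four dot products (four columns of A) are formed per outer
+   iteration, walking A and x four complex elements at a time (with an m & 2
+   tail); the vector accumulators are then reduced horizontally, the m & 1
+   scalar remainder is added, and the result is scaled by alpha and added
+   to y. */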
+#define ZGEMV_T_MSA()                      \
+    for (j = (n >> 2); j--;)               \
+    {                                      \
+        tp0r = tp1r = tp2r = tp3r = zero;  \
+        tp0i = tp1i = tp2i = tp3i = zero;  \
+                                           \
+        k = 0;                             \
+        x = srcx_org;                      \
+                                           \
+        for (i = (m >> 2); i--;)           \
+        {                                  \
+            ZLOAD_X4();                    \
+            ZGEMV_T_4x4();                 \
+                                           \
+            k += 2 * 4;                    \
+            x += inc_x2 * 4;               \
+        }                                  \
+                                           \
+        if (m & 2)                         \
+        {                                  \
+            ZLOAD_X2();                    \
+            ZGEMV_T_2x4();                 \
+                                           \
+            k += 2 * 2;                    \
+            x += inc_x2 * 2;               \
+        }                                  \
+                                           \
+        temp0r = tp0r[0] + tp0r[1];        \
+        temp1r = tp1r[0] + tp1r[1];        \
+        temp2r = tp2r[0] + tp2r[1];        \
+        temp3r = tp3r[0] + tp3r[1];        \
+        temp0i = tp0i[0] + tp0i[1];        \
+        temp1i = tp1i[0] + tp1i[1];        \
+        temp2i = tp2i[0] + tp2i[1];        \
+        temp3i = tp3i[0] + tp3i[1];        \
+                                           \
+        if (m & 1)                         \
+        {                                  \
+            ZGEMV_T_1x4();                 \
+                                           \
+            k += 2;                        \
+            x += inc_x2;                   \
+        }                                  \
+                                           \
+        ZSCALE_STORE_Y4_GP();              \
+                                           \
+        pa0 += 4 * lda2;                   \
+        pa1 += 4 * lda2;                   \
+        pa2 += 4 * lda2;                   \
+        pa3 += 4 * lda2;                   \
+        y += 4 * inc_y2;                   \
+    }                                      \
+                                           \
+    if (n & 2)                             \
+    {                                      \
+        tp0r = tp1r = zero;                \
+        tp0i = tp1i = zero;                \
+                                           \
+        k = 0;                             \
+        x = srcx_org;                      \
+                                           \
+        for (i = (m >> 2); i--;)           \
+        {                                  \
+            ZLOAD_X4();                    \
+            ZGEMV_T_4x2();                 \
+                                           \
+            k += 2 * 4;                    \
+            x += inc_x2 * 4;               \
+        }                                  \
+                                           \
+        if (m & 2)                         \
+        {                                  \
+            ZLOAD_X2();                    \
+            ZGEMV_T_2x2();                 \
+                                           \
+            k += 2 * 2;                    \
+            x += inc_x2 * 2;               \
+        }                                  \
+                                           \
+        temp0r = tp0r[0] + tp0r[1];        \
+        temp1r = tp1r[0] + tp1r[1];        \
+        temp0i = tp0i[0] + tp0i[1];        \
+        temp1i = tp1i[0] + tp1i[1];        \
+                                           \
+        if (m & 1)                         \
+        {                                  \
+            ZGEMV_T_1x2();                 \
+                                           \
+            k += 2;                        \
+            x += inc_x2;                   \
+        }                                  \
+                                           \
+        ZSCALE_STORE_Y2_GP();              \
+                                           \
+        pa0 += 2 * lda2;                   \
+        pa1 += 2 * lda2;                   \
+        y += 2 * inc_y2;                   \
+    }                                      \
+                                           \
+    if (n & 1)                             \
+    {                                      \
+        tp0r = zero;                       \
+        tp0i = zero;                       \
+                                           \
+        k = 0;                             \
+        x = srcx_org;                      \
+                                           \
+        for (i = (m >> 2); i--;)           \
+        {                                  \
+            ZLOAD_X4();                    \
+            ZGEMV_T_4x1();                 \
+                                           \
+            k += 2 * 4;                    \
+            x += inc_x2 * 4;               \
+        }                                  \
+                                           \
+        if (m & 2)                         \
+        {                                  \
+            ZLOAD_X2();                    \
+            ZGEMV_T_2x1();                 \
+                                           \
+            k += 2 * 2;                    \
+            x += inc_x2 * 2;               \
+        }                                  \
+                                           \
+        temp0r = tp0r[0] + tp0r[1];        \
+        temp0i = tp0i[0] + tp0i[1];        \
+                                           \
+        if (m & 1)                         \
+        {                                  \
+            ZGEMV_T_1x1();                 \
+                                           \
+            k += 2;                        \
+            x += inc_x2;                   \
+        }                                  \
+                                           \
+        ZSCALE_STORE_Y1_GP();              \
+                                           \
+        pa0  += lda2;                      \
+        y += inc_y2;                       \
+    }                                      \
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
+          FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+          BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i, j, k;
+    BLASLONG inc_x2, inc_y2, lda2;
+    FLOAT *pa0, *pa1, *pa2, *pa3;
+    FLOAT *srcx_org = x;
+    FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i;
+    FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i;
+    v2f64 zero = {0};
+    v2f64 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
+    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
+    v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
+    v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
+    v2f64 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;
+
+    lda2 = 2 * lda;
+
+    pa0 = A;
+    pa1 = A + lda2;
+    pa2 = A + 2 * lda2;
+    pa3 = A + 3 * lda2;
+
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+
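+    /* Only the x accesses have vector and general-purpose variants here; y is
+       read and written element-wise in ZSCALE_STORE_*_GP, so inc_y does not
+       affect the dispatch. */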
+    if (2 == inc_x2)
+    {
+        #define ZLOAD_X4  ZLOAD_X4_VECTOR
+        #define ZLOAD_X2  ZLOAD_X2_VECTOR
+
+        ZGEMV_T_MSA();
+
+        #undef ZLOAD_X4
+        #undef ZLOAD_X2
+    }
+    else
+    {
+        #define ZLOAD_X4  ZLOAD_X4_GP
+        #define ZLOAD_X2  ZLOAD_X2_GP
+
+        ZGEMV_T_MSA();
+
+        #undef ZLOAD_X4
+        #undef ZLOAD_X2
+    }
+
+    return(0);
+}
+
+#undef OP0
+#undef OP1
+#undef OP2
diff --git a/kernel/mips64/KERNEL.P6600 b/kernel/mips64/KERNEL.P6600
new file mode 100644 (file)
index 0000000..abf4481
--- /dev/null
@@ -0,0 +1 @@
+include $(KERNELDIR)/../mips/KERNEL.P5600
diff --git a/param.h b/param.h
index 1a0cc61..555829d 100644 (file)
--- a/param.h
+++ b/param.h
@@ -2174,7 +2174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SYMV_P 16
 #endif
 
-#if defined(I6400)
+#if defined(I6400) || defined(P6600)
 #define SNUMOPT  2
 #define DNUMOPT  2
 
@@ -2190,7 +2190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define CGEMM_DEFAULT_UNROLL_M  8
 #define CGEMM_DEFAULT_UNROLL_N  4
-                                
+
 #define ZGEMM_DEFAULT_UNROLL_M  4
 #define ZGEMM_DEFAULT_UNROLL_N  4