Added CGEMM, ZGEMM, STRMM, DTRMM, CTRMM, ZTRMM. Updated macros in SGEMM, DGEMM, STRMM.
author Shivraj Patil <shivraj.patil@imgtec.com>
Tue, 28 Jun 2016 12:21:10 +0000 (17:51 +0530)
committer Shivraj Patil <shivraj.patil@imgtec.com>
Tue, 28 Jun 2016 12:21:10 +0000 (17:51 +0530)
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
23 files changed:
kernel/mips/KERNEL.P5600
kernel/mips/cgemm_kernel_8x4_msa.c [new file with mode: 0644]
kernel/mips/cgemm_ncopy_4_msa.c [new file with mode: 0644]
kernel/mips/cgemm_ncopy_8_msa.c [new file with mode: 0644]
kernel/mips/cgemm_tcopy_4_msa.c [new file with mode: 0644]
kernel/mips/cgemm_tcopy_8_msa.c [new file with mode: 0644]
kernel/mips/dgemm_kernel_8x4_msa.c
kernel/mips/dgemm_ncopy_4_msa.c
kernel/mips/dgemm_ncopy_8_msa.c
kernel/mips/dgemm_tcopy_4_msa.c
kernel/mips/dgemm_tcopy_8_msa.c
kernel/mips/macros_msa.h
kernel/mips/sgemm_kernel_8x8_msa.c
kernel/mips/sgemm_ncopy_8_msa.c
kernel/mips/sgemm_tcopy_8_msa.c
kernel/mips/strsm_kernel_LN_8x8_msa.c
kernel/mips/strsm_kernel_LT_8x8_msa.c
kernel/mips/strsm_kernel_RN_8x8_msa.c
kernel/mips/strsm_kernel_RT_8x8_msa.c
kernel/mips/zgemm_kernel_4x4_msa.c [new file with mode: 0644]
kernel/mips/zgemm_ncopy_4_msa.c [new file with mode: 0644]
kernel/mips/zgemm_tcopy_4_msa.c [new file with mode: 0644]
param.h

index 802f0e0..5d8bcb9 100644 (file)
@@ -80,11 +80,6 @@ DGEMVTKERNEL = ../mips/gemv_t.c
 CGEMVTKERNEL = ../mips/zgemv_t.c
 ZGEMVTKERNEL = ../mips/zgemv_t.c
 
-STRMMKERNEL  = ../generic/trmmkernel_2x2.c
-DTRMMKERNEL  = ../generic/trmmkernel_2x2.c
-CTRMMKERNEL  = ../generic/ztrmmkernel_2x2.c
-ZTRMMKERNEL  = ../generic/ztrmmkernel_2x2.c
-
 SGEMMKERNEL    = ../mips/sgemm_kernel_8x8_msa.c
 SGEMMONCOPY    = ../mips/sgemm_ncopy_8_msa.c
 SGEMMOTCOPY    = ../mips/sgemm_tcopy_8_msa.c
@@ -101,15 +96,19 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o
 DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 
-CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
-CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
-CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+CGEMMKERNEL    = ../mips/cgemm_kernel_8x4_msa.c
+CGEMMINCOPY    = ../mips/cgemm_ncopy_8_msa.c
+CGEMMITCOPY    = ../mips/cgemm_tcopy_8_msa.c
+CGEMMONCOPY    = ../mips/cgemm_ncopy_4_msa.c
+CGEMMOTCOPY    = ../mips/cgemm_tcopy_4_msa.c
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
 CGEMMONCOPYOBJ = cgemm_oncopy.o
 CGEMMOTCOPYOBJ = cgemm_otcopy.o
 
-ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
-ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
-ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+ZGEMMKERNEL    = ../mips/zgemm_kernel_4x4_msa.c
+ZGEMMONCOPY    = ../mips/zgemm_ncopy_4_msa.c
+ZGEMMOTCOPY    = ../mips/zgemm_tcopy_4_msa.c
 ZGEMMONCOPYOBJ = zgemm_oncopy.o
 ZGEMMOTCOPYOBJ = zgemm_otcopy.o
 
diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c
new file mode 100644 (file)
index 0000000..cd1fa45
--- /dev/null
@@ -0,0 +1,2154 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4)    \
+{   /* Updates res0..res7 (_r/_i) from pa0 and pb0. */   \
+    LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3);  \
+    LD_SP2_INC(pb0, 4, src_b0, src_b1);                  \
+    /* Presumably splits A into real / imag lanes. */    \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);     \
+    PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i);     \
+    /* OP0..OP3 paste onto '='; OP4 is a prefix. */      \
+    /* 0th col */                                        \
+    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);             \
+    res0_r OP0## = src_a0r * src_br;                     \
+    res0_r OP1## = src_a0i * src_bi;                     \
+    res0_i OP2## = (OP4 src_a0r) * src_bi;               \
+    res0_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res1_r OP0## = src_a1r * src_br;                     \
+    res1_r OP1## = src_a1i * src_bi;                     \
+    res1_i OP2## = (OP4 src_a1r) * src_bi;               \
+    res1_i OP3## = src_a1i * src_br;                     \
+                                                         \
+    /* 1st col */                                        \
+    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);             \
+    res2_r OP0## = src_a0r * src_br;                     \
+    res2_r OP1## = src_a0i * src_bi;                     \
+    res2_i OP2## = (OP4 src_a0r) * src_bi;               \
+    res2_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res3_r OP0## = src_a1r * src_br;                     \
+    res3_r OP1## = src_a1i * src_bi;                     \
+    res3_i OP2## = (OP4 src_a1r) * src_bi;               \
+    res3_i OP3## = src_a1i * src_br;                     \
+                                                         \
+    /* 2nd col */                                        \
+    SPLATI_W2_SP(src_b1, 0, src_br, src_bi);             \
+    res4_r OP0## = src_a0r * src_br;                     \
+    res4_r OP1## = src_a0i * src_bi;                     \
+    res4_i OP2## = (OP4 src_a0r) * src_bi;               \
+    res4_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res5_r OP0## = src_a1r * src_br;                     \
+    res5_r OP1## = src_a1i * src_bi;                     \
+    res5_i OP2## = (OP4 src_a1r) * src_bi;               \
+    res5_i OP3## = src_a1i * src_br;                     \
+                                                         \
+    /* 3rd col */                                        \
+    SPLATI_W2_SP(src_b1, 2, src_br, src_bi);             \
+    res6_r OP0## = src_a0r * src_br;                     \
+    res6_r OP1## = src_a0i * src_bi;                     \
+    res6_i OP2## = (OP4 src_a0r) * src_bi;               \
+    res6_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res7_r OP0## = src_a1r * src_br;                     \
+    res7_r OP1## = src_a1i * src_bi;                     \
+    res7_i OP2## = (OP4 src_a1r) * src_bi;               \
+    res7_i OP3## = src_a1i * src_br;                     \
+}
+
+#define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4)    \
+{   /* Updates res0..res3 (_r/_i) from pa0 and pb0. */   \
+    LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3);  \
+    src_b0 = LD_SP(pb0);                                 \
+    /* pb0 is read via LD_SP, without an _INC form. */   \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);     \
+    PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i);     \
+    /* OP0..OP3 paste onto '='; OP4 is a prefix. */      \
+    /* 0th col */                                        \
+    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);             \
+    res0_r OP0## = src_a0r * src_br;                     \
+    res0_r OP1## = src_a0i * src_bi;                     \
+    res0_i OP2## = (OP4 src_a0r) * src_bi;               \
+    res0_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res1_r OP0## = src_a1r * src_br;                     \
+    res1_r OP1## = src_a1i * src_bi;                     \
+    res1_i OP2## = (OP4 src_a1r) * src_bi;               \
+    res1_i OP3## = src_a1i * src_br;                     \
+                                                         \
+    /* 1st col */                                        \
+    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);             \
+    res2_r OP0## = src_a0r * src_br;                     \
+    res2_r OP1## = src_a0i * src_bi;                     \
+    res2_i OP2## = (OP4 src_a0r) * src_bi;               \
+    res2_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res3_r OP0## = src_a1r * src_br;                     \
+    res3_r OP1## = src_a1i * src_bi;                     \
+    res3_i OP2## = (OP4 src_a1r) * src_bi;               \
+    res3_i OP3## = src_a1i * src_br;                     \
+}
+
+#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4)                 \
+{   /* Updates res0..res1 (_r/_i) from pa0 and pb0. */                \
+    LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3);               \
+    src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0));  \
+    SPLATI_W2_SP(src_bi, 0, src_br, src_bi);                          \
+    /* src_bi first holds the raw double-sized load from pb0. */      \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);                  \
+    PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i);                  \
+    /* OP0..OP3 paste onto '='; OP4 is a prefix. */                   \
+    /* 0th col */                                                     \
+    res0_r OP0## = src_a0r * src_br;                                  \
+    res0_r OP1## = src_a0i * src_bi;                                  \
+    res0_i OP2## = (OP4 src_a0r) * src_bi;                            \
+    res0_i OP3## = src_a0i * src_br;                                  \
+                                                                      \
+    res1_r OP0## = src_a1r * src_br;                                  \
+    res1_r OP1## = src_a1i * src_bi;                                  \
+    res1_i OP2## = (OP4 src_a1r) * src_bi;                            \
+    res1_i OP3## = src_a1i * src_br;                                  \
+}
+
+#define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)  \
+{   /* Updates res0/2/4/6 (_r/_i) from pa0, pb0. */    \
+    LD_SP2_INC(pa0, 4, src_a0, src_a1);                \
+    LD_SP2_INC(pb0, 4, src_b0, src_b1);                \
+    /* Presumably splits A into real / imag lanes. */  \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);   \
+    /* OP0..OP3 paste onto '='; OP4 is a prefix. */    \
+    /* 0th col */                                      \
+    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);           \
+    res0_r OP0## = src_a0r * src_br;                   \
+    res0_r OP1## = src_a0i * src_bi;                   \
+    res0_i OP2## = OP4 src_a0r * src_bi;               \
+    res0_i OP3## = src_a0i * src_br;                   \
+                                                       \
+    /* 1st col */                                      \
+    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);           \
+    res2_r OP0## = src_a0r * src_br;                   \
+    res2_r OP1## = src_a0i * src_bi;                   \
+    res2_i OP2## = OP4 src_a0r * src_bi;               \
+    res2_i OP3## = src_a0i * src_br;                   \
+                                                       \
+    /* 2nd col */                                      \
+    SPLATI_W2_SP(src_b1, 0, src_br, src_bi);           \
+    res4_r OP0## = src_a0r * src_br;                   \
+    res4_r OP1## = src_a0i * src_bi;                   \
+    res4_i OP2## = OP4 src_a0r * src_bi;               \
+    res4_i OP3## = src_a0i * src_br;                   \
+                                                       \
+    /* 3rd col */                                      \
+    SPLATI_W2_SP(src_b1, 2, src_br, src_bi);           \
+    res6_r OP0## = src_a0r * src_br;                   \
+    res6_r OP1## = src_a0i * src_bi;                   \
+    res6_i OP2## = OP4 src_a0r * src_bi;               \
+    res6_i OP3## = src_a0i * src_br;                   \
+}
+
+#define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)  \
+{   /* Updates res0/2 (_r/_i) from pa0 and pb0. */     \
+    LD_SP2_INC(pa0, 4, src_a0, src_a1);                \
+    src_b0 = LD_SP(pb0);                               \
+    /* pb0 is read via LD_SP, without an _INC form. */ \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);   \
+    /* OP0..OP3 paste onto '='; OP4 is a prefix. */    \
+    /* 0th col */                                      \
+    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);           \
+    res0_r OP0## = src_a0r * src_br;                   \
+    res0_r OP1## = src_a0i * src_bi;                   \
+    res0_i OP2## = OP4 src_a0r * src_bi;               \
+    res0_i OP3## = src_a0i * src_br;                   \
+                                                       \
+    /* 1st col */                                      \
+    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);           \
+    res2_r OP0## = src_a0r * src_br;                   \
+    res2_r OP1## = src_a0i * src_bi;                   \
+    res2_i OP2## = OP4 src_a0r * src_bi;               \
+    res2_i OP3## = src_a0i * src_br;                   \
+}
+
+#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)                 \
+{   /* Updates res0 (_r/_i) from pa0 and pb0. */                      \
+    LD_SP2_INC(pa0, 4, src_a0, src_a1);                               \
+    src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0));  \
+    SPLATI_W2_SP(src_bi, 0, src_br, src_bi);                          \
+    /* src_bi first holds the raw double-sized load from pb0. */      \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);                  \
+    /* OP0..OP3 paste onto '='; OP4 is a prefix. */                   \
+    /* 0th col */                                                     \
+    res0_r OP0## = src_a0r * src_br;                                  \
+    res0_r OP1## = src_a0i * src_bi;                                  \
+    res0_i OP2## = OP4 src_a0r * src_bi;                              \
+    res0_i OP3## = src_a0i * src_br;                                  \
+}
+
+#define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4)  \
+{   /* 2 A entries x 4 B entries -> res0..15 */    \
+    a0_r = pa0[0];                                 \
+    a0_i = pa0[1];                                 \
+    b0_r = pb0[0];                                 \
+    b0_i = pb0[1];                                 \
+    /* even res: r*r, i*i; odd res: cross terms */ \
+    res0 OP0## = a0_r * b0_r;                      \
+    res0 OP1## = a0_i * b0_i;                      \
+    res1 OP2## = OP4 a0_r * b0_i;                  \
+    res1 OP3## = a0_i * b0_r;                      \
+    /* OP0..OP3 paste onto '='; OP4: prefix. */    \
+    a1_r = pa0[2];                                 \
+    a1_i = pa0[3];                                 \
+    res2 OP0## = a1_r * b0_r;                      \
+    res2 OP1## = a1_i * b0_i;                      \
+    res3 OP2## = OP4 a1_r * b0_i;                  \
+    res3 OP3## = a1_i * b0_r;                      \
+                                                   \
+    /* 1st col */                                  \
+    b1_r = pb0[2];                                 \
+    b1_i = pb0[3];                                 \
+    res4 OP0## = a0_r * b1_r;                      \
+    res4 OP1## = a0_i * b1_i;                      \
+    res5 OP2## = OP4 a0_r * b1_i;                  \
+    res5 OP3## = a0_i * b1_r;                      \
+                                                   \
+    res6 OP0## = a1_r * b1_r;                      \
+    res6 OP1## = a1_i * b1_i;                      \
+    res7 OP2## = OP4 a1_r * b1_i;                  \
+    res7 OP3## = a1_i * b1_r;                      \
+                                                   \
+    /* 2nd col */                                  \
+    b2_r = pb0[4];                                 \
+    b2_i = pb0[5];                                 \
+    res8 OP0## = a0_r * b2_r;                      \
+    res8 OP1## = a0_i * b2_i;                      \
+    res9 OP2## = OP4 a0_r * b2_i;                  \
+    res9 OP3## = a0_i * b2_r;                      \
+                                                   \
+    res10 OP0## = a1_r * b2_r;                     \
+    res10 OP1## = a1_i * b2_i;                     \
+    res11 OP2## = OP4 a1_r * b2_i;                 \
+    res11 OP3## = a1_i * b2_r;                     \
+                                                   \
+    /* 3rd col */                                  \
+    b3_r = pb0[6];                                 \
+    b3_i = pb0[7];                                 \
+    res12 OP0## = a0_r * b3_r;                     \
+    res12 OP1## = a0_i * b3_i;                     \
+    res13 OP2## = OP4 a0_r * b3_i;                 \
+    res13 OP3## = a0_i * b3_r;                     \
+                                                   \
+    res14 OP0## = a1_r * b3_r;                     \
+    res14 OP1## = a1_i * b3_i;                     \
+    res15 OP2## = OP4 a1_r * b3_i;                 \
+    res15 OP3## = a1_i * b3_r;                     \
+}
+
+#define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4)  \
+{   /* 2 A entries x 2 B entries -> res0..7 */     \
+    /* 0th col */                                  \
+    a0_r = pa0[0];                                 \
+    a0_i = pa0[1];                                 \
+    b0_r = pb0[0];                                 \
+    b0_i = pb0[1];                                 \
+    /* even res: r*r, i*i; odd res: cross terms */ \
+    res0 OP0## = a0_r * b0_r;                      \
+    res0 OP1## = a0_i * b0_i;                      \
+    res1 OP2## = OP4 a0_r * b0_i;                  \
+    res1 OP3## = a0_i * b0_r;                      \
+    /* OP0..OP3 paste onto '='; OP4: prefix. */    \
+    a1_r = pa0[2];                                 \
+    a1_i = pa0[3];                                 \
+    res2 OP0## = a1_r * b0_r;                      \
+    res2 OP1## = a1_i * b0_i;                      \
+    res3 OP2## = OP4 a1_r * b0_i;                  \
+    res3 OP3## = a1_i * b0_r;                      \
+                                                   \
+    /* 1st col */                                  \
+    b1_r = pb0[2];                                 \
+    b1_i = pb0[3];                                 \
+    res4 OP0## = a0_r * b1_r;                      \
+    res4 OP1## = a0_i * b1_i;                      \
+    res5 OP2## = OP4 a0_r * b1_i;                  \
+    res5 OP3## = a0_i * b1_r;                      \
+                                                   \
+    res6 OP0## = a1_r * b1_r;                      \
+    res6 OP1## = a1_i * b1_i;                      \
+    res7 OP2## = OP4 a1_r * b1_i;                  \
+    res7 OP3## = a1_i * b1_r;                      \
+}
+
+#define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4)  \
+{   /* 2 A entries x 1 B entry -> res0..3 */       \
+    /* 0th col */                                  \
+    a0_r = pa0[0];                                 \
+    a0_i = pa0[1];                                 \
+    b0_r = pb0[0];                                 \
+    b0_i = pb0[1];                                 \
+    /* even res: r*r, i*i; odd res: cross terms */ \
+    res0 OP0## = a0_r * b0_r;                      \
+    res0 OP1## = a0_i * b0_i;                      \
+    res1 OP2## = OP4 a0_r * b0_i;                  \
+    res1 OP3## = a0_i * b0_r;                      \
+    /* OP0..OP3 paste onto '='; OP4: prefix. */    \
+    a1_r = pa0[2];                                 \
+    a1_i = pa0[3];                                 \
+    res2 OP0## = a1_r * b0_r;                      \
+    res2 OP1## = a1_i * b0_i;                      \
+    res3 OP2## = OP4 a1_r * b0_i;                  \
+    res3 OP3## = a1_i * b0_r;                      \
+}
+
+#define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4)  \
+{   /* 1 A entry x 4 B entries -> res0..7 */       \
+    /* 0th col */                                  \
+    a0_r = pa0[0];                                 \
+    a0_i = pa0[1];                                 \
+    b0_r = pb0[0];                                 \
+    b0_i = pb0[1];                                 \
+    /* even res: r*r, i*i; odd res: cross terms */ \
+    res0 OP0## = a0_r * b0_r;                      \
+    res0 OP1## = a0_i * b0_i;                      \
+    res1 OP2## = OP4 a0_r * b0_i;                  \
+    res1 OP3## = a0_i * b0_r;                      \
+    /* OP0..OP3 paste onto '='; OP4: prefix. */    \
+    /* 1st col */                                  \
+    b1_r = pb0[2];                                 \
+    b1_i = pb0[3];                                 \
+    res2 OP0## = a0_r * b1_r;                      \
+    res2 OP1## = a0_i * b1_i;                      \
+    res3 OP2## = OP4 a0_r * b1_i;                  \
+    res3 OP3## = a0_i * b1_r;                      \
+                                                   \
+    /* 2nd col */                                  \
+    b2_r = pb0[4];                                 \
+    b2_i = pb0[5];                                 \
+    res4 OP0## = a0_r * b2_r;                      \
+    res4 OP1## = a0_i * b2_i;                      \
+    res5 OP2## = OP4 a0_r * b2_i;                  \
+    res5 OP3## = a0_i * b2_r;                      \
+                                                   \
+    /* 3rd col */                                  \
+    b3_r = pb0[6];                                 \
+    b3_i = pb0[7];                                 \
+    res6 OP0## = a0_r * b3_r;                      \
+    res6 OP1## = a0_i * b3_i;                      \
+    res7 OP2## = OP4 a0_r * b3_i;                  \
+    res7 OP3## = a0_i * b3_r;                      \
+}
+
+#define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4)  \
+{   /* 1 A entry x 2 B entries -> res0..3 */       \
+    /* 0th col */                                  \
+    a0_r = pa0[0];                                 \
+    a0_i = pa0[1];                                 \
+    b0_r = pb0[0];                                 \
+    b0_i = pb0[1];                                 \
+    /* even res: r*r, i*i; odd res: cross terms */ \
+    res0 OP0## = a0_r * b0_r;                      \
+    res0 OP1## = a0_i * b0_i;                      \
+    res1 OP2## = OP4 a0_r * b0_i;                  \
+    res1 OP3## = a0_i * b0_r;                      \
+    /* OP0..OP3 paste onto '='; OP4: prefix. */    \
+    /* 1st col */                                  \
+    b1_r = pb0[2];                                 \
+    b1_i = pb0[3];                                 \
+    res2 OP0## = a0_r * b1_r;                      \
+    res2 OP1## = a0_i * b1_i;                      \
+    res3 OP2## = OP4 a0_r * b1_i;                  \
+    res3 OP3## = a0_i * b1_r;                      \
+}
+
+#define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)  \
+{   /* 1 A entry x 1 B entry -> res0, res1 */      \
+    /* 0th col */                                  \
+    a0_r = pa0[0];                                 \
+    a0_i = pa0[1];                                 \
+    b0_r = pb0[0];                                 \
+    b0_i = pb0[1];                                 \
+    /* OP0..OP3 paste onto '='; OP4: prefix. */    \
+    res0 OP0## = a0_r * b0_r;                      \
+    res0 OP1## = a0_i * b0_i;                      \
+    res1 OP2## = OP4 a0_r * b0_i;                  \
+    res1 OP3## = a0_i * b0_r;                      \
+}
+
+#define CGEMM_SCALE_8X4_MSA                      \
+{   /* dst += alpha * res (complex), per pcN */  \
+    LD_SP4(pc0, 4, dst0, dst1, dst2, dst3);      \
+    /* likely: split r/i, update, re-merge */    \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
+    /* pc0 <- res0, res1 */                      \
+    dst0_r += alpha_r * res0_r;                  \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i += alpha_r * res0_i;                  \
+    dst0_i += alpha_i * res0_r;                  \
+                                                 \
+    dst1_r += alpha_r * res1_r;                  \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i += alpha_r * res1_i;                  \
+    dst1_i += alpha_i * res1_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
+    /* pc1 <- res2, res3 */                      \
+    LD_SP4(pc1, 4, dst0, dst1, dst2, dst3);      \
+                                                 \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
+                                                 \
+    dst0_r += alpha_r * res2_r;                  \
+    dst0_r -= alpha_i * res2_i;                  \
+    dst0_i += alpha_r * res2_i;                  \
+    dst0_i += alpha_i * res2_r;                  \
+                                                 \
+    dst1_r += alpha_r * res3_r;                  \
+    dst1_r -= alpha_i * res3_i;                  \
+    dst1_i += alpha_r * res3_i;                  \
+    dst1_i += alpha_i * res3_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);  \
+    /* pc2 <- res4, res5 */                      \
+    LD_SP4(pc2, 4, dst0, dst1, dst2, dst3);      \
+                                                 \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
+                                                 \
+    dst0_r += alpha_r * res4_r;                  \
+    dst0_r -= alpha_i * res4_i;                  \
+    dst0_i += alpha_r * res4_i;                  \
+    dst0_i += alpha_i * res4_r;                  \
+                                                 \
+    dst1_r += alpha_r * res5_r;                  \
+    dst1_r -= alpha_i * res5_i;                  \
+    dst1_i += alpha_r * res5_i;                  \
+    dst1_i += alpha_i * res5_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4);  \
+    /* pc3 <- res6, res7 */                      \
+    LD_SP4(pc3, 4, dst0, dst1, dst2, dst3);      \
+                                                 \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
+                                                 \
+    dst0_r += alpha_r * res6_r;                  \
+    dst0_r -= alpha_i * res6_i;                  \
+    dst0_i += alpha_r * res6_i;                  \
+    dst0_i += alpha_i * res6_r;                  \
+                                                 \
+    dst1_r += alpha_r * res7_r;                  \
+    dst1_r -= alpha_i * res7_i;                  \
+    dst1_i += alpha_r * res7_i;                  \
+    dst1_i += alpha_i * res7_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4);  \
+}
+
+#define CGEMM_SCALE_8X2_MSA                      \
+{   /* dst += alpha * res (complex), per pcN */  \
+    LD_SP4(pc0, 4, dst0, dst1, dst2, dst3);      \
+    /* likely: split r/i, update, re-merge */    \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
+    /* pc0 <- res0, res1 */                      \
+    dst0_r += alpha_r * res0_r;                  \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i += alpha_r * res0_i;                  \
+    dst0_i += alpha_i * res0_r;                  \
+                                                 \
+    dst1_r += alpha_r * res1_r;                  \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i += alpha_r * res1_i;                  \
+    dst1_i += alpha_i * res1_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
+    /* pc1 <- res2, res3 */                      \
+    LD_SP4(pc1, 4, dst0, dst1, dst2, dst3);      \
+                                                 \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
+                                                 \
+    dst0_r += alpha_r * res2_r;                  \
+    dst0_r -= alpha_i * res2_i;                  \
+    dst0_i += alpha_r * res2_i;                  \
+    dst0_i += alpha_i * res2_r;                  \
+                                                 \
+    dst1_r += alpha_r * res3_r;                  \
+    dst1_r -= alpha_i * res3_i;                  \
+    dst1_i += alpha_r * res3_i;                  \
+    dst1_i += alpha_i * res3_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);  \
+}
+
+#define CGEMM_SCALE_8X1_MSA                      \
+{   /* dst += alpha * res (complex), per pcN */  \
+    LD_SP4(pc0, 4, dst0, dst1, dst2, dst3);      \
+    /* likely: split r/i, update, re-merge */    \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);   \
+    /* pc0 <- res0, res1 */                      \
+    dst0_r += alpha_r * res0_r;                  \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i += alpha_r * res0_i;                  \
+    dst0_i += alpha_i * res0_r;                  \
+                                                 \
+    dst1_r += alpha_r * res1_r;                  \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i += alpha_r * res1_i;                  \
+    dst1_i += alpha_i * res1_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
+}
+
+#define CGEMM_SCALE_4X4_MSA                     \
+{   /* C += alpha * res for a 4x4 tile */       \
+    LD_SP2(pc0, 4, dst0, dst1);                 \
+    /* presumably splits re/im - confirm */     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
+    /* complex multiply-accumulate */           \
+    dst0_r += alpha_r * res0_r;                 \
+    dst0_r -= alpha_i * res0_i;                 \
+    dst0_i += alpha_r * res0_i;                 \
+    dst0_i += alpha_i * res0_r;                 \
+    /* presumably re-interleaves - confirm */   \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
+    /* presumably stores, advances pc0 */       \
+    ST_SP2_INC(dst0, dst1, pc0, 4);             \
+    /* 1st col: res2 via pc1 */                 \
+    LD_SP2(pc1, 4, dst0, dst1);                 \
+                                                \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
+                                                \
+    dst0_r += alpha_r * res2_r;                 \
+    dst0_r -= alpha_i * res2_i;                 \
+    dst0_i += alpha_r * res2_i;                 \
+    dst0_i += alpha_i * res2_r;                 \
+                                                \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
+                                                \
+    ST_SP2_INC(dst0, dst1, pc1, 4);             \
+    /* 2nd col: res4 via pc2 */                 \
+    LD_SP2(pc2, 4, dst0, dst1);                 \
+                                                \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
+                                                \
+    dst0_r += alpha_r * res4_r;                 \
+    dst0_r -= alpha_i * res4_i;                 \
+    dst0_i += alpha_r * res4_i;                 \
+    dst0_i += alpha_i * res4_r;                 \
+                                                \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
+                                                \
+    ST_SP2_INC(dst0, dst1, pc2, 4);             \
+    /* 3rd col: res6 via pc3 */                 \
+    LD_SP2(pc3, 4, dst0, dst1);                 \
+                                                \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
+                                                \
+    dst0_r += alpha_r * res6_r;                 \
+    dst0_r -= alpha_i * res6_i;                 \
+    dst0_i += alpha_r * res6_i;                 \
+    dst0_i += alpha_i * res6_r;                 \
+                                                \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
+                                                \
+    ST_SP2_INC(dst0, dst1, pc3, 4);             \
+}
+
+#define CGEMM_SCALE_4X2_MSA                     \
+{   /* C += alpha * res for a 4x2 tile */       \
+    LD_SP2(pc0, 4, dst0, dst1);                 \
+    /* presumably splits re/im - confirm */     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
+    /* complex multiply-accumulate */           \
+    dst0_r += alpha_r * res0_r;                 \
+    dst0_r -= alpha_i * res0_i;                 \
+    dst0_i += alpha_r * res0_i;                 \
+    dst0_i += alpha_i * res0_r;                 \
+    /* presumably re-interleaves - confirm */   \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
+    /* presumably stores, advances pc0 */       \
+    ST_SP2_INC(dst0, dst1, pc0, 4);             \
+    /* 1st col: res2 via pc1 */                 \
+    LD_SP2(pc1, 4, dst0, dst1);                 \
+                                                \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
+                                                \
+    dst0_r += alpha_r * res2_r;                 \
+    dst0_r -= alpha_i * res2_i;                 \
+    dst0_i += alpha_r * res2_i;                 \
+    dst0_i += alpha_i * res2_r;                 \
+                                                \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
+                                                \
+    ST_SP2_INC(dst0, dst1, pc1, 4);             \
+}
+
+#define CGEMM_SCALE_4X1_MSA                     \
+{   /* C += alpha * res for a 4x1 tile */       \
+    LD_SP2(pc0, 4, dst0, dst1);                 \
+    /* presumably splits re/im - confirm */     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);  \
+    /* complex multiply-accumulate */           \
+    dst0_r += alpha_r * res0_r;                 \
+    dst0_r -= alpha_i * res0_i;                 \
+    dst0_i += alpha_r * res0_i;                 \
+    dst0_i += alpha_i * res0_r;                 \
+    /* presumably re-interleaves - confirm */   \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);    \
+    /* presumably stores, advances pc0 */       \
+    ST_SP2_INC(dst0, dst1, pc0, 4);             \
+}
+
+/* Scalar tail for a 2x4 tile: C += alpha * res (complex).
+ * pc0..pc3 each address one column; every column receives two
+ * complex values stored as (real, imag) pairs at offsets 0..3.
+ * alphar/alphai and res0..res15 are locals of the expanding scope. */
+#define CGEMM_SCALE_2X4                    \
+{                                          \
+    /* column 0 -> pc0 */                  \
+    pc0[0] = pc0[0] + alphar * res0;       \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = pc0[1] + alphar * res1;       \
+    pc0[1] = pc0[1] + alphai * res0;       \
+    pc0[2] = pc0[2] + alphar * res2;       \
+    pc0[2] = pc0[2] - alphai * res3;       \
+    pc0[3] = pc0[3] + alphar * res3;       \
+    pc0[3] = pc0[3] + alphai * res2;       \
+                                           \
+    /* column 1 -> pc1 */                  \
+    pc1[0] = pc1[0] + alphar * res4;       \
+    pc1[0] = pc1[0] - alphai * res5;       \
+    pc1[1] = pc1[1] + alphar * res5;       \
+    pc1[1] = pc1[1] + alphai * res4;       \
+    pc1[2] = pc1[2] + alphar * res6;       \
+    pc1[2] = pc1[2] - alphai * res7;       \
+    pc1[3] = pc1[3] + alphar * res7;       \
+    pc1[3] = pc1[3] + alphai * res6;       \
+                                           \
+    /* column 2 -> pc2 */                  \
+    pc2[0] = pc2[0] + alphar * res8;       \
+    pc2[0] = pc2[0] - alphai * res9;       \
+    pc2[1] = pc2[1] + alphar * res9;       \
+    pc2[1] = pc2[1] + alphai * res8;       \
+    pc2[2] = pc2[2] + alphar * res10;      \
+    pc2[2] = pc2[2] - alphai * res11;      \
+    pc2[3] = pc2[3] + alphar * res11;      \
+    pc2[3] = pc2[3] + alphai * res10;      \
+                                           \
+    /* column 3 -> pc3 */                  \
+    pc3[0] = pc3[0] + alphar * res12;      \
+    pc3[0] = pc3[0] - alphai * res13;      \
+    pc3[1] = pc3[1] + alphar * res13;      \
+    pc3[1] = pc3[1] + alphai * res12;      \
+    pc3[2] = pc3[2] + alphar * res14;      \
+    pc3[2] = pc3[2] - alphai * res15;      \
+    pc3[3] = pc3[3] + alphar * res15;      \
+    pc3[3] = pc3[3] + alphai * res14;      \
+}
+
+/* Scalar tail for a 2x2 tile: C += alpha * res (complex).
+ * pc0 and pc1 each address one column holding two (real, imag)
+ * pairs; alphar/alphai and res0..res7 come from the expanding scope. */
+#define CGEMM_SCALE_2X2                    \
+{                                          \
+    /* column 0 -> pc0 */                  \
+    pc0[0] = pc0[0] + alphar * res0;       \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = pc0[1] + alphar * res1;       \
+    pc0[1] = pc0[1] + alphai * res0;       \
+    pc0[2] = pc0[2] + alphar * res2;       \
+    pc0[2] = pc0[2] - alphai * res3;       \
+    pc0[3] = pc0[3] + alphar * res3;       \
+    pc0[3] = pc0[3] + alphai * res2;       \
+                                           \
+    /* column 1 -> pc1 */                  \
+    pc1[0] = pc1[0] + alphar * res4;       \
+    pc1[0] = pc1[0] - alphai * res5;       \
+    pc1[1] = pc1[1] + alphar * res5;       \
+    pc1[1] = pc1[1] + alphai * res4;       \
+    pc1[2] = pc1[2] + alphar * res6;       \
+    pc1[2] = pc1[2] - alphai * res7;       \
+    pc1[3] = pc1[3] + alphar * res7;       \
+    pc1[3] = pc1[3] + alphai * res6;       \
+}
+
+/* Scalar tail for a 2x1 tile: C += alpha * res (complex).
+ * Both complex values live in the single column at pc0, stored as
+ * (real, imag) pairs at offsets 0..3. */
+#define CGEMM_SCALE_2X1                    \
+{                                          \
+    /* row 0 */                            \
+    pc0[0] = pc0[0] + alphar * res0;       \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = pc0[1] + alphar * res1;       \
+    pc0[1] = pc0[1] + alphai * res0;       \
+    /* row 1 */                            \
+    pc0[2] = pc0[2] + alphar * res2;       \
+    pc0[2] = pc0[2] - alphai * res3;       \
+    pc0[3] = pc0[3] + alphar * res3;       \
+    pc0[3] = pc0[3] + alphai * res2;       \
+}
+
+/* Scalar tail for a 1x4 tile: C += alpha * res (complex).
+ * One (real, imag) pair is updated in each of the four columns
+ * addressed by pc0..pc3, always at offsets 0 and 1. */
+#define CGEMM_SCALE_1X4                    \
+{                                          \
+    /* column 0 -> pc0 */                  \
+    pc0[0] = pc0[0] + alphar * res0;       \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = pc0[1] + alphar * res1;       \
+    pc0[1] = pc0[1] + alphai * res0;       \
+    /* column 1 -> pc1 */                  \
+    pc1[0] = pc1[0] + alphar * res2;       \
+    pc1[0] = pc1[0] - alphai * res3;       \
+    pc1[1] = pc1[1] + alphar * res3;       \
+    pc1[1] = pc1[1] + alphai * res2;       \
+    /* column 2 -> pc2 */                  \
+    pc2[0] = pc2[0] + alphar * res4;       \
+    pc2[0] = pc2[0] - alphai * res5;       \
+    pc2[1] = pc2[1] + alphar * res5;       \
+    pc2[1] = pc2[1] + alphai * res4;       \
+    /* column 3 -> pc3 */                  \
+    pc3[0] = pc3[0] + alphar * res6;       \
+    pc3[0] = pc3[0] - alphai * res7;       \
+    pc3[1] = pc3[1] + alphar * res7;       \
+    pc3[1] = pc3[1] + alphai * res6;       \
+}
+
+/* Scalar tail for a 1x2 tile: C += alpha * res (complex).
+ * One (real, imag) pair is updated in each of the two columns
+ * addressed by pc0 and pc1.  A one-row tile has a single complex
+ * element per column, so both columns are written at offsets 0 and 1
+ * (the same layout CGEMM_SCALE_1X4 uses); the previous pc1[2]/pc1[3]
+ * indices addressed the following row instead.
+ * NOTE(review): assumes CGEMM_KERNEL_1X2 leaves the second column in
+ * res2/res3 - confirm against that macro. */
+#define CGEMM_SCALE_1X2       \
+{                             \
+    pc0[0] += alphar * res0;  \
+    pc0[0] -= alphai * res1;  \
+    pc0[1] += alphar * res1;  \
+    pc0[1] += alphai * res0;  \
+                              \
+    pc1[0] += alphar * res2;  \
+    pc1[0] -= alphai * res3;  \
+    pc1[1] += alphar * res3;  \
+    pc1[1] += alphai * res2;  \
+}
+
+/* Scalar tail for a single complex element: C += alpha * res.
+ * pc0[0] is the real part and pc0[1] the imaginary part; res0/res1
+ * are the matching parts of the accumulated product. */
+#define CGEMM_SCALE_1X1                    \
+{                                          \
+    pc0[0] = pc0[0] + alphar * res0;       \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = pc0[1] + alphar * res1;       \
+    pc0[1] = pc0[1] + alphai * res0;       \
+}
+
+#define CGEMM_TRMM_SCALE_8X4_MSA                 \
+{   /* C = alpha * res; C is not read */         \
+    dst0_r = alpha_r * res0_r;                   \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i = alpha_r * res0_i;                   \
+    dst0_i += alpha_i * res0_r;                  \
+    /* same for the second vector pair */        \
+    dst1_r = alpha_r * res1_r;                   \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i = alpha_r * res1_i;                   \
+    dst1_i += alpha_i * res1_r;                  \
+    /* presumably re-interleaves - confirm */    \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+    /* presumably stores, advances pc0 */        \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
+    /* 1st col: res2, res3 via pc1 */            \
+    dst0_r = alpha_r * res2_r;                   \
+    dst0_r -= alpha_i * res2_i;                  \
+    dst0_i = alpha_r * res2_i;                   \
+    dst0_i += alpha_i * res2_r;                  \
+                                                 \
+    dst1_r = alpha_r * res3_r;                   \
+    dst1_r -= alpha_i * res3_i;                  \
+    dst1_i = alpha_r * res3_i;                   \
+    dst1_i += alpha_i * res3_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);  \
+    /* 2nd col: res4, res5 via pc2 */            \
+    dst0_r = alpha_r * res4_r;                   \
+    dst0_r -= alpha_i * res4_i;                  \
+    dst0_i = alpha_r * res4_i;                   \
+    dst0_i += alpha_i * res4_r;                  \
+                                                 \
+    dst1_r = alpha_r * res5_r;                   \
+    dst1_r -= alpha_i * res5_i;                  \
+    dst1_i = alpha_r * res5_i;                   \
+    dst1_i += alpha_i * res5_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4);  \
+    /* 3rd col: res6, res7 via pc3 */            \
+    dst0_r = alpha_r * res6_r;                   \
+    dst0_r -= alpha_i * res6_i;                  \
+    dst0_i = alpha_r * res6_i;                   \
+    dst0_i += alpha_i * res6_r;                  \
+                                                 \
+    dst1_r = alpha_r * res7_r;                   \
+    dst1_r -= alpha_i * res7_i;                  \
+    dst1_i = alpha_r * res7_i;                   \
+    dst1_i += alpha_i * res7_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4);  \
+}
+
+#define CGEMM_TRMM_SCALE_8X2_MSA                 \
+{   /* C = alpha * res; C is not read */         \
+    dst0_r = alpha_r * res0_r;                   \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i = alpha_r * res0_i;                   \
+    dst0_i += alpha_i * res0_r;                  \
+    /* same for the second vector pair */        \
+    dst1_r = alpha_r * res1_r;                   \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i = alpha_r * res1_i;                   \
+    dst1_i += alpha_i * res1_r;                  \
+    /* presumably re-interleaves - confirm */    \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+    /* presumably stores, advances pc0 */        \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
+    /* 1st col: res2, res3 via pc1 */            \
+    dst0_r = alpha_r * res2_r;                   \
+    dst0_r -= alpha_i * res2_i;                  \
+    dst0_i = alpha_r * res2_i;                   \
+    dst0_i += alpha_i * res2_r;                  \
+                                                 \
+    dst1_r = alpha_r * res3_r;                   \
+    dst1_r -= alpha_i * res3_i;                  \
+    dst1_i = alpha_r * res3_i;                   \
+    dst1_i += alpha_i * res3_r;                  \
+                                                 \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+                                                 \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);  \
+}
+
+#define CGEMM_TRMM_SCALE_8X1_MSA                 \
+{   /* C = alpha * res; C is not read */         \
+    dst0_r = alpha_r * res0_r;                   \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i = alpha_r * res0_i;                   \
+    dst0_i += alpha_i * res0_r;                  \
+    /* same for the second vector pair */        \
+    dst1_r = alpha_r * res1_r;                   \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i = alpha_r * res1_i;                   \
+    dst1_i += alpha_i * res1_r;                  \
+    /* presumably re-interleaves - confirm */    \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);     \
+    /* presumably stores, advances pc0 */        \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);  \
+}
+
+#define CGEMM_TRMM_SCALE_4X4_MSA              \
+{   /* C = alpha * res; C is not read */      \
+    dst0_r = alpha_r * res0_r;                \
+    dst0_r -= alpha_i * res0_i;               \
+    dst0_i = alpha_r * res0_i;                \
+    dst0_i += alpha_i * res0_r;               \
+    /* presumably re-interleaves - confirm */ \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
+    /* presumably stores, advances pc0 */     \
+    ST_SP2_INC(dst0, dst1, pc0, 4);           \
+    /* 1st col: res2 via pc1 */               \
+    dst0_r = alpha_r * res2_r;                \
+    dst0_r -= alpha_i * res2_i;               \
+    dst0_i = alpha_r * res2_i;                \
+    dst0_i += alpha_i * res2_r;               \
+                                              \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
+                                              \
+    ST_SP2_INC(dst0, dst1, pc1, 4);           \
+    /* 2nd col: res4 via pc2 */               \
+    dst0_r = alpha_r * res4_r;                \
+    dst0_r -= alpha_i * res4_i;               \
+    dst0_i = alpha_r * res4_i;                \
+    dst0_i += alpha_i * res4_r;               \
+                                              \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
+                                              \
+    ST_SP2_INC(dst0, dst1, pc2, 4);           \
+    /* 3rd col: res6 via pc3 */               \
+    dst0_r = alpha_r * res6_r;                \
+    dst0_r -= alpha_i * res6_i;               \
+    dst0_i = alpha_r * res6_i;                \
+    dst0_i += alpha_i * res6_r;               \
+                                              \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
+                                              \
+    ST_SP2_INC(dst0, dst1, pc3, 4);           \
+}
+
+#define CGEMM_TRMM_SCALE_4X2_MSA              \
+{   /* C = alpha * res; C is not read */      \
+    dst0_r = alpha_r * res0_r;                \
+    dst0_r -= alpha_i * res0_i;               \
+    dst0_i = alpha_r * res0_i;                \
+    dst0_i += alpha_i * res0_r;               \
+    /* presumably re-interleaves - confirm */ \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
+    /* presumably stores, advances pc0 */     \
+    ST_SP2_INC(dst0, dst1, pc0, 4);           \
+    /* 1st col: res2 via pc1 */               \
+    dst0_r = alpha_r * res2_r;                \
+    dst0_r -= alpha_i * res2_i;               \
+    dst0_i = alpha_r * res2_i;                \
+    dst0_i += alpha_i * res2_r;               \
+                                              \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
+                                              \
+    ST_SP2_INC(dst0, dst1, pc1, 4);           \
+}
+
+#define CGEMM_TRMM_SCALE_4X1_MSA              \
+{   /* C = alpha * res; C is not read */      \
+    dst0_r = alpha_r * res0_r;                \
+    dst0_r -= alpha_i * res0_i;               \
+    dst0_i = alpha_r * res0_i;                \
+    dst0_i += alpha_i * res0_r;               \
+    /* presumably re-interleaves - confirm */ \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);  \
+    /* presumably stores, advances pc0 */     \
+    ST_SP2_INC(dst0, dst1, pc0, 4);           \
+}
+
+/* Scalar TRMM tail for a 2x4 tile: C = alpha * res (complex).
+ * The previous contents of C are overwritten, not accumulated into.
+ * pc0..pc3 each address one column holding two (real, imag) pairs;
+ * alphar/alphai and res0..res15 are locals of the expanding scope. */
+#define CGEMM_TRMM_SCALE_2X4               \
+{                                          \
+    /* column 0 -> pc0 */                  \
+    pc0[0] = alphar * res0;                \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = alphar * res1;                \
+    pc0[1] = pc0[1] + alphai * res0;       \
+    pc0[2] = alphar * res2;                \
+    pc0[2] = pc0[2] - alphai * res3;       \
+    pc0[3] = alphar * res3;                \
+    pc0[3] = pc0[3] + alphai * res2;       \
+                                           \
+    /* column 1 -> pc1 */                  \
+    pc1[0] = alphar * res4;                \
+    pc1[0] = pc1[0] - alphai * res5;       \
+    pc1[1] = alphar * res5;                \
+    pc1[1] = pc1[1] + alphai * res4;       \
+    pc1[2] = alphar * res6;                \
+    pc1[2] = pc1[2] - alphai * res7;       \
+    pc1[3] = alphar * res7;                \
+    pc1[3] = pc1[3] + alphai * res6;       \
+                                           \
+    /* column 2 -> pc2 */                  \
+    pc2[0] = alphar * res8;                \
+    pc2[0] = pc2[0] - alphai * res9;       \
+    pc2[1] = alphar * res9;                \
+    pc2[1] = pc2[1] + alphai * res8;       \
+    pc2[2] = alphar * res10;               \
+    pc2[2] = pc2[2] - alphai * res11;      \
+    pc2[3] = alphar * res11;               \
+    pc2[3] = pc2[3] + alphai * res10;      \
+                                           \
+    /* column 3 -> pc3 */                  \
+    pc3[0] = alphar * res12;               \
+    pc3[0] = pc3[0] - alphai * res13;      \
+    pc3[1] = alphar * res13;               \
+    pc3[1] = pc3[1] + alphai * res12;      \
+    pc3[2] = alphar * res14;               \
+    pc3[2] = pc3[2] - alphai * res15;      \
+    pc3[3] = alphar * res15;               \
+    pc3[3] = pc3[3] + alphai * res14;      \
+}
+
+/* Scalar TRMM tail for a 2x2 tile: C = alpha * res (complex).
+ * C is overwritten.  pc0 and pc1 each address one column holding
+ * two (real, imag) pairs. */
+#define CGEMM_TRMM_SCALE_2X2               \
+{                                          \
+    /* column 0 -> pc0 */                  \
+    pc0[0] = alphar * res0;                \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = alphar * res1;                \
+    pc0[1] = pc0[1] + alphai * res0;       \
+    pc0[2] = alphar * res2;                \
+    pc0[2] = pc0[2] - alphai * res3;       \
+    pc0[3] = alphar * res3;                \
+    pc0[3] = pc0[3] + alphai * res2;       \
+                                           \
+    /* column 1 -> pc1 */                  \
+    pc1[0] = alphar * res4;                \
+    pc1[0] = pc1[0] - alphai * res5;       \
+    pc1[1] = alphar * res5;                \
+    pc1[1] = pc1[1] + alphai * res4;       \
+    pc1[2] = alphar * res6;                \
+    pc1[2] = pc1[2] - alphai * res7;       \
+    pc1[3] = alphar * res7;                \
+    pc1[3] = pc1[3] + alphai * res6;       \
+}
+
+/* Scalar TRMM tail for a 2x1 tile: C = alpha * res (complex).
+ * C is overwritten.  Both complex values live in the single column
+ * at pc0, stored as (real, imag) pairs at offsets 0..3. */
+#define CGEMM_TRMM_SCALE_2X1               \
+{                                          \
+    /* row 0 */                            \
+    pc0[0] = alphar * res0;                \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = alphar * res1;                \
+    pc0[1] = pc0[1] + alphai * res0;       \
+    /* row 1 */                            \
+    pc0[2] = alphar * res2;                \
+    pc0[2] = pc0[2] - alphai * res3;       \
+    pc0[3] = alphar * res3;                \
+    pc0[3] = pc0[3] + alphai * res2;       \
+}
+
+/* Scalar TRMM tail for a 1x4 tile: C = alpha * res (complex).
+ * C is overwritten.  One (real, imag) pair is written in each of the
+ * four columns addressed by pc0..pc3, always at offsets 0 and 1. */
+#define CGEMM_TRMM_SCALE_1X4               \
+{                                          \
+    /* column 0 -> pc0 */                  \
+    pc0[0] = alphar * res0;                \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = alphar * res1;                \
+    pc0[1] = pc0[1] + alphai * res0;       \
+    /* column 1 -> pc1 */                  \
+    pc1[0] = alphar * res2;                \
+    pc1[0] = pc1[0] - alphai * res3;       \
+    pc1[1] = alphar * res3;                \
+    pc1[1] = pc1[1] + alphai * res2;       \
+    /* column 2 -> pc2 */                  \
+    pc2[0] = alphar * res4;                \
+    pc2[0] = pc2[0] - alphai * res5;       \
+    pc2[1] = alphar * res5;                \
+    pc2[1] = pc2[1] + alphai * res4;       \
+    /* column 3 -> pc3 */                  \
+    pc3[0] = alphar * res6;                \
+    pc3[0] = pc3[0] - alphai * res7;       \
+    pc3[1] = alphar * res7;                \
+    pc3[1] = pc3[1] + alphai * res6;       \
+}
+
+/* Scalar TRMM tail for a 1x2 tile: C = alpha * res (complex).
+ * C is overwritten.  One (real, imag) pair is written in each of the
+ * two columns addressed by pc0 and pc1.  A one-row tile has a single
+ * complex element per column, so both columns are written at offsets
+ * 0 and 1 (the same layout CGEMM_TRMM_SCALE_1X4 uses); the previous
+ * pc1[2]/pc1[3] indices addressed the following row instead.
+ * NOTE(review): assumes CGEMM_KERNEL_1X2 leaves the second column in
+ * res2/res3 - confirm against that macro. */
+#define CGEMM_TRMM_SCALE_1X2  \
+{                             \
+    pc0[0] = alphar * res0;   \
+    pc0[0] -= alphai * res1;  \
+    pc0[1] = alphar * res1;   \
+    pc0[1] += alphai * res0;  \
+                              \
+    pc1[0] = alphar * res2;   \
+    pc1[0] -= alphai * res3;  \
+    pc1[1] = alphar * res3;   \
+    pc1[1] += alphai * res2;  \
+}
+
+/* Scalar TRMM tail for a single complex element: C = alpha * res.
+ * C is overwritten; pc0[0] receives the real part and pc0[1] the
+ * imaginary part. */
+#define CGEMM_TRMM_SCALE_1X1               \
+{                                          \
+    pc0[0] = alphar * res0;                \
+    pc0[0] = pc0[0] - alphai * res1;       \
+    pc0[1] = alphar * res1;                \
+    pc0[1] = pc0[1] + alphai * res0;       \
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
+          FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc
+#ifdef TRMMKERNEL
+         , BLASLONG offset
+#endif
+          )
+{
+    BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+    BLASLONG off;
+#endif
+    FLOAT *pc0, *pc1, *pc2, *pc3;
+    FLOAT *pa0, *pb0;
+    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
+    FLOAT res8, res9, res10, res11, res12, res13, res14, res15;
+    FLOAT a0_r, a1_r;
+    FLOAT a0_i, a1_i;
+    FLOAT b0_r, b1_r, b2_r, b3_r;
+    FLOAT b0_i, b1_i, b2_i, b3_i;
+    v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1;
+    v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
+    v4f32 dst0, dst1, dst2, dst3;
+    v4f32 alpha_r, alpha_i;
+    v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
+    v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
+    v4f32 dst0_r, dst0_i, dst1_r, dst1_i;
+
+    alpha_r = COPY_FLOAT_TO_VECTOR(alphar);
+    alpha_i = COPY_FLOAT_TO_VECTOR(alphai);
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    off = -offset;
+#endif
+
+    for (j = (n >> 2); j--;)
+    {
+        pc0 = C;
+        pc1 = pc0 + 2 * ldc;
+        pc2 = pc1 + 2 * ldc;
+        pc3 = pc2 + 2 * ldc;
+
+        pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        for (i = (m >> 3); i--;)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 8;
+            pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_8X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_8X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_8X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_8X4_MSA(, -, , -, -);
+#endif
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_8X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_8X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_8X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_8X4_MSA(+, -, -, -,);
+#endif
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_8X4_MSA
+#else
+            CGEMM_SCALE_8X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 2 * 8;
+            pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
+        }
+
+        if (m & 4)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 4;
+            pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_4X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_4X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_4X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_4X4_MSA(, -, , -, -);
+#endif
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_4X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_4X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_4X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_4X4_MSA(+, -, -, -,);
+#endif
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_4X4_MSA
+#else
+            CGEMM_SCALE_4X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 2 * 4;
+            pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
+        }
+
+        if (m & 2)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 2;
+            pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_2X4(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_2X4(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_2X4(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_2X4(, -, , -, -);
+#endif
+
+            pa0 += 4;
+            pb0 += 8;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_2X4(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_2X4(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_2X4(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_2X4(+, -, -, -,);
+#endif
+
+                pa0 += 4;
+                pb0 += 8;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_2X4
+#else
+            CGEMM_SCALE_2X4
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 2 * 2;
+            pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
+
+            pc0 += 4;
+            pc1 += 4;
+            pc2 += 4;
+            pc3 += 4;
+        }
+
+        if (m & 1)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 1;
+            pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_1X4(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_1X4(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_1X4(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_1X4(, -, , -, -);
+#endif
+
+            pa0 += 2;
+            pb0 += 8;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_1X4(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_1X4(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_1X4(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_1X4(+, -, -, -,);
+#endif
+
+                pa0 += 2;
+                pb0 += 8;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_1X4
+#else
+            CGEMM_SCALE_1X4
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 2 * 1;
+            pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
+
+            pc0 += 2;
+            pc1 += 2;
+            pc2 += 2;
+            pc3 += 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4; // number of values in A
+#endif
+
+        l = k << 3;
+        B = B + l;
+        i = ldc << 3;
+        C = C + i;
+    }
+
+    if (n & 2)
+    {
+        pc0 = C;
+        pc1 = pc0 + 2 * ldc;
+
+        pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        for (i = (m >> 3); i--;)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 8;
+            pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_8X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_8X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_8X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_8X2_MSA(, -, , -, -);
+#endif
+
+            pb0 += 4;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_8X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_8X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_8X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_8X2_MSA(+, -, -, -,);
+#endif
+
+                pb0 += 4;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_8X2_MSA
+#else
+            CGEMM_SCALE_8X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 2 * 8;
+            pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
+        }
+
+        if (m & 4)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 4;
+            pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_4X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_4X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_4X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_4X2_MSA(, -, , -, -);
+#endif
+
+            pb0 += 4;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_4X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_4X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_4X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_4X2_MSA(+, -, -, -,);
+#endif
+
+                pb0 += 4;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_4X2_MSA
+#else
+            CGEMM_SCALE_4X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 2 * 4;
+            pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
+        }
+
+        if (m & 2)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 2;
+            pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_2X2(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_2X2(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_2X2(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_2X2(, -, , -, -);
+#endif
+
+            pa0 += 4;
+            pb0 += 4;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_2X2(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_2X2(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_2X2(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_2X2(+, -, -, -,);
+#endif
+
+                pa0 += 4;
+                pb0 += 4;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_2X2
+#else
+            CGEMM_SCALE_2X2
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 2 * 2;
+            pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
+
+            pc0 += 4;
+            pc1 += 4;
+        }
+
+        if (m & 1)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 1;
+            pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_1X2(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_1X2(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_1X2(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_1X2(, -, , -, -);
+#endif
+
+            pa0 += 2;
+            pb0 += 4;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_1X2(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_1X2(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_1X2(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_1X2(+, -, -, -,);
+#endif
+
+                pa0 += 2;
+                pb0 += 4;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_1X2
+#else
+            CGEMM_SCALE_1X2
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 2 * 1;
+            pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
+
+            pc0 += 2;
+            pc1 += 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2; // number of values in A
+#endif
+
+        l = k << 2;
+        B = B + l;
+        i = ldc << 2;
+        C = C + i;
+    }
+
+    if (n & 1)
+    {
+        pc0 = C;
+        pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        for (i = (m >> 3); i--;)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 8;
+            pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_8X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_8X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_8X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_8X1_MSA(, -, , -, -);
+#endif
+
+            pb0 += 2;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_8X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_8X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_8X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_8X1_MSA(+, -, -, -,);
+#endif
+
+                pb0 += 2;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_8X1_MSA
+#else
+            CGEMM_SCALE_8X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 2 * 8;
+            pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
+        }
+
+        if (m & 4)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 4;
+            pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_4X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_4X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_4X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_4X1_MSA(, -, , -, -);
+#endif
+
+            pb0 += 2;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_4X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_4X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_4X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_4X1_MSA(+, -, -, -,);
+#endif
+
+                pb0 += 2;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_4X1_MSA
+#else
+            CGEMM_SCALE_4X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 2 * 4;
+            pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
+        }
+
+        if (m & 2)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 2;
+            pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_2X1(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_2X1(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_2X1(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_2X1(, -, , -, -);
+#endif
+
+            pa0 += 4;
+            pb0 += 2;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_2X1(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_2X1(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_2X1(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_2X1(+, -, -, -,);
+#endif
+
+                pa0 += 4;
+                pb0 += 2;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_2X1
+#else
+            CGEMM_SCALE_2X1
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 2 * 2;
+            pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
+
+            pc0 += 4;
+        }
+
+        if (m & 1)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 1;
+            pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            CGEMM_KERNEL_1X1(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            CGEMM_KERNEL_1X1(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            CGEMM_KERNEL_1X1(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            CGEMM_KERNEL_1X1(, -, , -, -);
+#endif
+
+            pa0 += 2;
+            pb0 += 2;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                CGEMM_KERNEL_1X1(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                CGEMM_KERNEL_1X1(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                CGEMM_KERNEL_1X1(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                CGEMM_KERNEL_1X1(+, -, -, -,);
+#endif
+
+                pa0 += 2;
+                pb0 += 2;
+            }
+
+#if defined(TRMMKERNEL)
+            CGEMM_TRMM_SCALE_1X1
+#else
+            CGEMM_SCALE_1X1
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 2 * 1;
+            pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
+
+            pc0 += 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1; // number of values in A
+#endif
+
+        l = k << 1;
+        B = B + l;
+        i = ldc << 1;
+        C = C + i;
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/cgemm_ncopy_4_msa.c b/kernel/mips/cgemm_ncopy_4_msa.c
new file mode 100644 (file)
index 0000000..b38290b
--- /dev/null
@@ -0,0 +1,195 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) /* Repack n source lines of m FLOAT pairs each into dst, taking lines 4, then 2, then 1 at a time; always returns 0. */
+{ /* src: first line; lda: distance between lines, in pairs; dst: packed output. Presumably each pair is one complex value (re, im) -- confirm with callers. */
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst; /* psrc0: start of the next line group; psrc1..psrc4: cursors into the current lines */
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+    FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 dst0, dst1, dst4, dst5;
+
+    psrc0 = src;
+    pdst = dst;
+    lda *= 2; /* lda is used as a FLOAT-pointer offset below, and every element occupies two FLOATs */
+
+    for (j = (n >> 2); j--;) /* full groups of four source lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda; /* psrc0 now marks the start of the next group */
+
+        for (i = (m >> 2); i--;) /* four pairs (8 FLOATs) from each line per pass */
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1); /* macro from macros_msa.h; presumably loads two v4f32 and advances the pointer by 4 FLOATs per load -- confirm */
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4); /* presumably dst0 = {low 64 bits of src0, low 64 bits of src2} and dst4 = the two high halves -- confirm */
+            ILVRL_D2_SP(src6, src4, dst1, dst5); /* same combination for lines 3 and 4 */
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); /* presumably stores the four vectors back to back, advancing pdst by 4 FLOATs each -- confirm */
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4); /* repeat for the second vector loaded from each line */
+            ILVRL_D2_SP(src7, src5, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        if (m & 2) /* two leftover pairs per line: one vector load each */
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            src4 = LD_SP(psrc3);
+            src6 = LD_SP(psrc4);
+            psrc1 += 4; /* step each cursor past the two pairs just loaded */
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        if (m & 1) /* one leftover pair per line, copied with scalars */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            ctemp05 = *(psrc3 + 0);
+            ctemp06 = *(psrc3 + 1);
+            ctemp07 = *(psrc4 + 0);
+            ctemp08 = *(psrc4 + 1);
+            psrc1 += 2; /* these four bumps are never read: the cursors are reassigned before their next use */
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            *(pdst + 0) = ctemp01; /* output order: the pair from line 1, then lines 2, 3 and 4 */
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            *(pdst + 4) = ctemp05;
+            *(pdst + 5) = ctemp06;
+            *(pdst + 6) = ctemp07;
+            *(pdst + 7) = ctemp08;
+            pdst += 8;
+        }
+    }
+
+    if (n & 2) /* a remaining group of two source lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 2); i--;) /* four pairs from each of the two lines per pass */
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4); /* presumably stores two vectors and advances pdst by 4 FLOATs each -- confirm */
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 2) /* two leftover pairs per line */
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            psrc1 += 4;
+            psrc2 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 1) /* one leftover pair per line */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            psrc1 += 2; /* not read again before psrc1/psrc2 are reassigned */
+            psrc2 += 2;
+
+            *(pdst + 0) = ctemp01; /* pair from line 1, then the pair from line 2 */
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            pdst += 4;
+        }
+    }
+
+    if (n & 1) /* a single remaining source line: no interleaving, data is copied through */
+    {
+        psrc1 = psrc0;
+
+        for (i = (m >> 2); i--;) /* four pairs per pass */
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst, 4);
+        }
+
+        if (m & 2) /* two leftover pairs */
+        {
+            src0 = LD_SP(psrc1);
+            psrc1 += 4;
+
+            ST_SP(src0, pdst); /* pdst is advanced explicitly on the next line */
+            pdst += 4;
+        }
+
+        if (m & 1) /* one leftover pair */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            psrc1 += 2; /* final value is unused */
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            pdst += 2; /* final value is unused */
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/cgemm_ncopy_8_msa.c b/kernel/mips/cgemm_ncopy_8_msa.c
new file mode 100644 (file)
index 0000000..9ea7490
--- /dev/null
@@ -0,0 +1,310 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst) /* Repack n source lines of m FLOAT pairs each into dst, taking lines 8, then 4, then 2, then 1 at a time; always returns 0. */
+{ /* src: first line; lda: distance between lines, in pairs; dst: packed output. Presumably each pair is one complex value (re, im) -- confirm with callers. */
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7; /* psrc0: start of the next line group; psrc1..psrc8: cursors into the current lines */
+    FLOAT *psrc8, *pdst;
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07;
+    FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14;
+    FLOAT ctemp15, ctemp16;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+    v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    psrc0 = src;
+    pdst = dst;
+    lda *= 2; /* lda is used as a FLOAT-pointer offset below, and every element occupies two FLOATs */
+
+    for (j = (n >> 3); j--;) /* full groups of eight source lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc5 = psrc4 + lda;
+        psrc6 = psrc5 + lda;
+        psrc7 = psrc6 + lda;
+        psrc8 = psrc7 + lda;
+        psrc0 += 8 * lda; /* psrc0 now marks the start of the next group */
+
+        for (i = (m >> 2); i--;) /* four pairs (8 FLOATs) from each line per pass */
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1); /* macro from macros_msa.h; presumably loads two v4f32 and advances the pointer by 4 FLOATs per load -- confirm */
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+            LD_SP2_INC(psrc5, 4, src8, src9);
+            LD_SP2_INC(psrc6, 4, src10, src11);
+            LD_SP2_INC(psrc7, 4, src12, src13);
+            LD_SP2_INC(psrc8, 4, src14, src15);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4); /* presumably dst0 = {low 64 bits of src0, low 64 bits of src2} and dst4 = the two high halves -- confirm */
+            ILVRL_D2_SP(src6, src4, dst1, dst5); /* same combination for lines 3/4, then 5/6 and 7/8 */
+            ILVRL_D2_SP(src10, src8, dst2, dst6);
+            ILVRL_D2_SP(src14, src12, dst3, dst7);
+
+            ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4); /* presumably stores the eight vectors back to back, advancing pdst by 4 FLOATs each -- confirm */
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4); /* repeat for the second vector loaded from each line */
+            ILVRL_D2_SP(src7, src5, dst1, dst5);
+            ILVRL_D2_SP(src11, src9, dst2, dst6);
+            ILVRL_D2_SP(src15, src13, dst3, dst7);
+
+            ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
+        }
+
+        if (m & 2) /* two leftover pairs per line: one vector load each */
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            src4 = LD_SP(psrc3);
+            src6 = LD_SP(psrc4);
+            src8 = LD_SP(psrc5);
+            src10 = LD_SP(psrc6);
+            src12 = LD_SP(psrc7);
+            src14 = LD_SP(psrc8);
+            psrc1 += 4; /* step each cursor past the two pairs just loaded */
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+            psrc5 += 4;
+            psrc6 += 4;
+            psrc7 += 4;
+            psrc8 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+            ILVRL_D2_SP(src10, src8, dst2, dst6);
+            ILVRL_D2_SP(src14, src12, dst3, dst7);
+
+            ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
+        }
+
+        if (m & 1) /* one leftover pair per line, copied with scalars */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            ctemp05 = *(psrc3 + 0);
+            ctemp06 = *(psrc3 + 1);
+            ctemp07 = *(psrc4 + 0);
+            ctemp08 = *(psrc4 + 1);
+            ctemp09 = *(psrc5 + 0);
+            ctemp10 = *(psrc5 + 1);
+            ctemp11 = *(psrc6 + 0);
+            ctemp12 = *(psrc6 + 1);
+            ctemp13 = *(psrc7 + 0);
+            ctemp14 = *(psrc7 + 1);
+            ctemp15 = *(psrc8 + 0);
+            ctemp16 = *(psrc8 + 1);
+            psrc1 += 2; /* these eight bumps are never read: the cursors are reassigned before their next use */
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+            psrc5 += 2;
+            psrc6 += 2;
+            psrc7 += 2;
+            psrc8 += 2;
+
+            *(pdst +  0) = ctemp01; /* output order: the pair from line 1, then lines 2 through 8 */
+            *(pdst +  1) = ctemp02;
+            *(pdst +  2) = ctemp03;
+            *(pdst +  3) = ctemp04;
+            *(pdst +  4) = ctemp05;
+            *(pdst +  5) = ctemp06;
+            *(pdst +  6) = ctemp07;
+            *(pdst +  7) = ctemp08;
+            *(pdst +  8) = ctemp09;
+            *(pdst +  9) = ctemp10;
+            *(pdst + 10) = ctemp11;
+            *(pdst + 11) = ctemp12;
+            *(pdst + 12) = ctemp13;
+            *(pdst + 13) = ctemp14;
+            *(pdst + 14) = ctemp15;
+            *(pdst + 15) = ctemp16;
+            pdst += 16;
+        }
+    }
+
+    if (n & 4) /* a remaining group of four source lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        for (i = (m >> 2); i--;) /* four pairs from each of the four lines per pass */
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4); /* presumably stores four vectors and advances pdst by 4 FLOATs each -- confirm */
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+            ILVRL_D2_SP(src7, src5, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        if (m & 2) /* two leftover pairs per line */
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            src4 = LD_SP(psrc3);
+            src6 = LD_SP(psrc4);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        if (m & 1) /* one leftover pair per line */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            ctemp05 = *(psrc3 + 0);
+            ctemp06 = *(psrc3 + 1);
+            ctemp07 = *(psrc4 + 0);
+            ctemp08 = *(psrc4 + 1);
+            psrc1 += 2; /* not read again before the cursors are reassigned */
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            *(pdst + 0) = ctemp01; /* pair from line 1, then lines 2, 3 and 4 */
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            *(pdst + 4) = ctemp05;
+            *(pdst + 5) = ctemp06;
+            *(pdst + 6) = ctemp07;
+            *(pdst + 7) = ctemp08;
+            pdst += 8;
+        }
+    }
+
+    if (n & 2) /* a remaining group of two source lines */
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 2); i--;) /* four pairs from each of the two lines per pass */
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4); /* presumably stores two vectors and advances pdst by 4 FLOATs each -- confirm */
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 2) /* two leftover pairs per line */
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            psrc1 += 4;
+            psrc2 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 1) /* one leftover pair per line */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            psrc1 += 2; /* not read again before psrc1/psrc2 are reassigned */
+            psrc2 += 2;
+
+            *(pdst + 0) = ctemp01; /* pair from line 1, then the pair from line 2 */
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            pdst  += 4;
+        }
+    }
+
+    if (n & 1) /* a single remaining source line: no interleaving, data is copied through */
+    {
+        psrc1 = psrc0;
+
+        for (i = (m >> 2); i--;) /* four pairs per pass */
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst, 4);
+        }
+
+        if (m & 2) /* two leftover pairs */
+        {
+            src0 = LD_SP(psrc1);
+            psrc1 += 4;
+
+            ST_SP(src0, pdst); /* pdst is advanced explicitly on the next line */
+            pdst += 4;
+        }
+
+        if (m & 1) /* one leftover pair */
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            psrc1 += 2; /* final value is unused */
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            pdst += 2; /* final value is unused */
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/cgemm_tcopy_4_msa.c b/kernel/mips/cgemm_tcopy_4_msa.c
new file mode 100644 (file)
index 0000000..12aaa97
--- /dev/null
@@ -0,0 +1,125 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{   /* Copies src to dst in n-direction groups of 4, 2, then 1 FLOAT pair(s), m rows per group; always returns 0. */
+    BLASLONG i, j;
+    FLOAT *psrc0;   /* start of the current group in src */
+    FLOAT *psrc1;   /* cursors on two rows that are lda (after doubling) FLOATs apart */
+    FLOAT *pdst0;   /* write cursor into dst; only ever moved forward */
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+    v4f32 src0, src1, src2, src3;
+    /* NOTE(review): vector paths presumably assume FLOAT is a 32-bit float (v4f32 = 4 FLOATs) - confirm. */
+    psrc0 = src;
+    pdst0 = dst;
+    lda *= 2;   /* presumably converts lda from complex elements to FLOATs - confirm against caller */
+    /* Groups of 8 FLOATs (4 pairs) per row. */
+    for (j = (n >> 2); j--;)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 8;   /* next group starts 8 FLOATs further along the row */
+        /* Two rows per iteration. */
+        for (i = (m >> 1); i--;)
+        {
+            LD_SP2(psrc1, 4, src0, src1);   /* macro from macros_msa.h; presumably loads 2 vectors without moving psrc1 */
+            LD_SP2(psrc2, 4, src2, src3);
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);   /* presumably stores 4 vectors and advances pdst0 - confirm in macros_msa.h */
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+        /* Odd last row: only psrc1 is read. */
+        if (m & 1)
+        {
+            LD_SP2(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst0, 4);
+        }
+    }
+    /* Group of 4 FLOATs (2 pairs) per row: one vector load per row. */
+    if (n & 2)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 4;
+
+        for (i = (m >> 1); i--;)
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            ST_SP2_INC(src0, src1, pdst0, 4);
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+
+        if (m & 1)
+        {
+            src0 = LD_SP(psrc1);
+            ST_SP(src0, pdst0);
+            pdst0 += 4;   /* plain ST_SP is followed by a manual advance, unlike the _INC stores */
+        }
+    }
+    /* Final group of 2 FLOATs (1 pair) per row: scalar copies. */
+    if (n & 1)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 2;   /* psrc0 is not read again after this point */
+
+        for (i = (m >> 1); i--;)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            *(pdst0 + 2) = ctemp03;
+            *(pdst0 + 3) = ctemp04;
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+            pdst0 += 4;
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            pdst0 += 2;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/cgemm_tcopy_8_msa.c b/kernel/mips/cgemm_tcopy_8_msa.c
new file mode 100644 (file)
index 0000000..9f78fa7
--- /dev/null
@@ -0,0 +1,214 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{   /* Copies src to dst in n-direction groups of 8, 4, 2, then 1 FLOAT pair(s), m rows per group; always returns 0. */
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *pdst0;   /* psrc0: group start; psrc1/psrc2: row cursors lda apart; pdst0: write cursor */
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+    /* NOTE(review): vector paths presumably assume FLOAT is a 32-bit float (v4f32 = 4 FLOATs) - confirm. */
+    psrc0 = src;
+    pdst0 = dst;
+    lda *= 2;   /* presumably converts lda from complex elements to FLOATs - confirm against caller */
+    /* Groups of 16 FLOATs (8 pairs) per row. */
+    for (j = (n >> 3); j--;)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 16;   /* next group starts 16 FLOATs further along the row */
+        /* Four rows per iteration: psrc1, psrc2 and the same two offset by 2 * lda. */
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP4(psrc1, 4, src0, src1, src2, src3);   /* macro from macros_msa.h; presumably loads 4 vectors without moving psrc1 */
+            LD_SP4(psrc2, 4, src4, src5, src6, src7);
+            LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11);
+            LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15);
+            ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4);   /* presumably stores 8 vectors and advances pdst0 - confirm in macros_msa.h */
+            ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4);
+            psrc1 += 4 * lda;
+            psrc2 += 4 * lda;
+        }
+        /* Two leftover rows. */
+        if (m & 2)
+        {
+            LD_SP4(psrc1, 4, src0, src1, src2, src3);
+            LD_SP4(psrc2, 4, src4, src5, src6, src7);
+            ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4);
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+        /* Odd last row: only psrc1 is read. */
+        if (m & 1)
+        {
+            LD_SP4(psrc1, 4, src0, src1, src2, src3);
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+        }
+    }
+    /* Group of 8 FLOATs (4 pairs) per row. */
+    if (n & 4)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 8;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2(psrc1, 4, src0, src1);
+            LD_SP2(psrc2, 4, src2, src3);
+            LD_SP2(psrc1 + 2 * lda, 4, src4, src5);
+            LD_SP2(psrc2 + 2 * lda, 4, src6, src7);
+
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+            ST_SP4_INC(src4, src5, src6, src7, pdst0, 4);
+            psrc1 += 4 * lda;
+            psrc2 += 4 * lda;
+        }
+
+        if (m & 2)
+        {
+            LD_SP2(psrc1, 4, src0, src1);
+            LD_SP2(psrc2, 4, src2, src3);
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+
+        if (m & 1)
+        {
+            LD_SP2(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst0, 4);
+        }
+    }
+    /* Group of 4 FLOATs (2 pairs) per row: one vector load per row. */
+    if (n & 2)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 4;
+
+        for (i = (m >> 2); i--;)
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            src2 = LD_SP(psrc1 + 2 * lda);
+            src3 = LD_SP(psrc2 + 2 * lda);
+            ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+
+            psrc1 += 4 * lda;
+            psrc2 += 4 * lda;
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src1 = LD_SP(psrc2);
+            ST_SP2_INC(src0, src1, pdst0, 4);
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+        }
+
+        if (m & 1)
+        {
+            src0 = LD_SP(psrc1);
+            ST_SP(src0, pdst0);
+            pdst0 += 4;   /* plain ST_SP is followed by a manual advance, unlike the _INC stores */
+        }
+    }
+    /* Final group of 2 FLOATs (1 pair) per row: scalar copies. */
+    if (n & 1)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc0 + lda;
+        psrc0 += 2;   /* psrc0 is not read again after this point */
+        /* Four rows per iteration, handled as two identical two-row copies. */
+        for (i = (m >> 2); i--;)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            *(pdst0 + 2) = ctemp03;
+            *(pdst0 + 3) = ctemp04;
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+            pdst0 += 4;
+            /* Second pair of rows in this iteration. */
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            *(pdst0 + 2) = ctemp03;
+            *(pdst0 + 3) = ctemp04;
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+            pdst0 += 4;
+        }
+
+        if (m & 2)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            *(pdst0 + 2) = ctemp03;
+            *(pdst0 + 3) = ctemp04;
+
+            psrc1 += 2 * lda;
+            psrc2 += 2 * lda;
+            pdst0 += 4;
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+
+            *(pdst0 + 0) = ctemp01;
+            *(pdst0 + 1) = ctemp02;
+            pdst0 += 2;
+        }
+    }
+
+    return 0;
+}
index 1f0a2ae..9286e74 100644 (file)
@@ -35,19 +35,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
 #endif
           )
 {
-    BLASLONG i, j, l;
-    FLOAT *pc0, *pc1, *pc2, *pc3;
-    FLOAT *pa0, *pb0;
+    BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+    BLASLONG off;
+#endif
+    FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
     FLOAT tmp0, tmp1, tmp2, tmp3;
-    FLOAT a0;
-    FLOAT b0, b1, b2, b3;
+    FLOAT a0, b0, b1, b2, b3;
     v2f64 v_alpha = {alpha, alpha};
     v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1;
     v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
     v2f64 res0, res1, res2, res3, res4, res5, res6, res7;
     v2f64 res8, res9, res10, res11, res12, res13, res14, res15;
 
-    for (j = (n / 4); j--;)
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    off = -offset;
+#endif
+
+    for (j = (n >> 2); j--;)
     {
         pc0 = C;
         pc1 = pc0 + ldc;
@@ -56,12 +61,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
 
         pa0 = A;
 
-        for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        for (i = (m >> 3); i--;)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 8;
+            pb0 = B + off * 4;
+#endif
 
-            LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
-            LD_DP2(pb0, 2, src_b0, src_b1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+            LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             res0 = src_a0 * src_b;
@@ -87,13 +114,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             res14 = src_a2 * src_b;
             res15 = src_a3 * src_b;
 
-            pa0 += 8;
-            pb0 += 4;
-
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
-                LD_DP2(pb0, 2, src_b0, src_b1);
+                LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+                LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -119,11 +143,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res14 += src_a2 * src_b;
                 res15 += src_a3 * src_b;
 
-                pa0 += 8;
-                pb0 += 4;
-
-                LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
-                LD_DP2(pb0, 2, src_b0, src_b1);
+                LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+                LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -148,15 +169,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res13 += src_a1 * src_b;
                 res14 += src_a2 * src_b;
                 res15 += src_a3 * src_b;
-
-                pa0 += 8;
-                pb0 += 4;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
-                LD_DP2(pb0, 2, src_b0, src_b1);
+                LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+                LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -181,11 +199,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res13 += src_a1 * src_b;
                 res14 += src_a2 * src_b;
                 res15 += src_a3 * src_b;
-
-                pa0 += 8;
-                pb0 += 4;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+            dst4 = res4 * v_alpha;
+            dst5 = res5 * v_alpha;
+            dst6 = res6 * v_alpha;
+            dst7 = res7 * v_alpha;
+#else
             LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
             LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
 
@@ -197,10 +222,20 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst5 += res5 * v_alpha;
             dst6 += res6 * v_alpha;
             dst7 += res7 * v_alpha;
-
-            ST_DP4(dst0, dst1, dst2, dst3, pc0, 2);
-            ST_DP4(dst4, dst5, dst6, dst7, pc1, 2);
-
+#endif
+            ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
+            ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
+
+#if defined(TRMMKERNEL)
+            dst0 = res8 * v_alpha;
+            dst1 = res9 * v_alpha;
+            dst2 = res10 * v_alpha;
+            dst3 = res11 * v_alpha;
+            dst4 = res12 * v_alpha;
+            dst5 = res13 * v_alpha;
+            dst6 = res14 * v_alpha;
+            dst7 = res15 * v_alpha;
+#else
             LD_DP4(pc2, 2, dst0, dst1, dst2, dst3);
             LD_DP4(pc3, 2, dst4, dst5, dst6, dst7);
 
@@ -212,22 +247,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst5 += res13 * v_alpha;
             dst6 += res14 * v_alpha;
             dst7 += res15 * v_alpha;
+#endif
 
-            ST_DP4(dst0, dst1, dst2, dst3, pc2, 2);
-            ST_DP4(dst4, dst5, dst6, dst7, pc3, 2);
+            ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2);
+            ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2);
 
-            pc0 += 8;
-            pc1 += 8;
-            pc2 += 8;
-            pc3 += 8;
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 8;
+            pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 4) / 4); i--;)
+        if (m & 4)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 4;
+            pb0 = B + off * 4;
+#endif
 
-            LD_DP2(pa0, 2, src_a0, src_a1);
-            LD_DP2(pb0, 2, src_b0, src_b1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+            LD_DP2_INC(pa0, 2, src_a0, src_a1);
+            LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             res0 = src_a0 * src_b;
@@ -245,13 +311,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             res6 = src_a0 * src_b;
             res7 = src_a1 * src_b;
 
-            pa0 += 4;
-            pb0 += 4;
-
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_DP2(pa0, 2, src_a0, src_a1);
-                LD_DP2(pb0, 2, src_b0, src_b1);
+                LD_DP2_INC(pa0, 2, src_a0, src_a1);
+                LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -269,11 +332,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res6 += src_a0 * src_b;
                 res7 += src_a1 * src_b;
 
-                pa0 += 4;
-                pb0 += 4;
-
-                LD_DP2(pa0, 2, src_a0, src_a1);
-                LD_DP2(pb0, 2, src_b0, src_b1);
+                LD_DP2_INC(pa0, 2, src_a0, src_a1);
+                LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -290,15 +350,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
                 res6 += src_a0 * src_b;
                 res7 += src_a1 * src_b;
-
-                pa0 += 4;
-                pb0 += 4;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_DP2(pa0, 2, src_a0, src_a1);
-                LD_DP2(pb0, 2, src_b0, src_b1);
+                LD_DP2_INC(pa0, 2, src_a0, src_a1);
+                LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -315,11 +372,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
                 res6 += src_a0 * src_b;
                 res7 += src_a1 * src_b;
-
-                pa0 += 4;
-                pb0 += 4;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+            dst4 = res4 * v_alpha;
+            dst5 = res5 * v_alpha;
+            dst6 = res6 * v_alpha;
+            dst7 = res7 * v_alpha;
+#else
             LD_DP2(pc0, 2, dst0, dst1);
             LD_DP2(pc1, 2, dst2, dst3);
             LD_DP2(pc2, 2, dst4, dst5);
@@ -333,24 +397,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst5 += res5 * v_alpha;
             dst6 += res6 * v_alpha;
             dst7 += res7 * v_alpha;
+#endif
+            ST_DP2_INC(dst0, dst1, pc0, 2);
+            ST_DP2_INC(dst2, dst3, pc1, 2);
+            ST_DP2_INC(dst4, dst5, pc2, 2);
+            ST_DP2_INC(dst6, dst7, pc3, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 4;
+            pb0 += temp * 4;
+#endif
 
-            ST_DP2(dst0, dst1, pc0, 2);
-            ST_DP2(dst2, dst3, pc1, 2);
-            ST_DP2(dst4, dst5, pc2, 2);
-            ST_DP2(dst6, dst7, pc3, 2);
-
-            pc0 += 4;
-            pc1 += 4;
-            pc2 += 4;
-            pc3 += 4;
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 2) / 2); i--;)
+        if (m & 2)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 2;
+            pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             src_a0 = LD_DP(pa0);
-            LD_DP2(pb0, 2, src_b0, src_b1);
+            pa0 += 2;
+            LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             res0 = src_a0 * src_b;
@@ -364,13 +459,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
             res3 = src_a0 * src_b;
 
-            pa0 += 2;
-            pb0 += 4;
-
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 src_a0 = LD_DP(pa0);
-                LD_DP2(pb0, 2, src_b0, src_b1);
+                pa0 += 2;
+                LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -384,11 +477,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
                 res3 += src_a0 * src_b;
 
-                pa0 += 2;
-                pb0 += 4;
-
                 src_a0 = LD_DP(pa0);
-                LD_DP2(pb0, 2, src_b0, src_b1);
+                pa0 += 2;
+                LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -401,15 +492,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
 
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
                 res3 += src_a0 * src_b;
-
-                pa0 += 2;
-                pb0 += 4;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 src_a0 = LD_DP(pa0);
-                LD_DP2(pb0, 2, src_b0, src_b1);
+                pa0 += 2;
+                LD_DP2_INC(pb0, 2, src_b0, src_b1);
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -422,11 +511,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
 
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
                 res3 += src_a0 * src_b;
-
-                pa0 += 2;
-                pb0 += 4;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+#else
             dst0 = LD_DP(pc0);
             dst1 = LD_DP(pc1);
             dst2 = LD_DP(pc2);
@@ -436,21 +528,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst1 += res1 * v_alpha;
             dst2 += res2 * v_alpha;
             dst3 += res3 * v_alpha;
-
+#endif
             ST_DP(dst0, pc0);
             ST_DP(dst1, pc1);
             ST_DP(dst2, pc2);
             ST_DP(dst3, pc3);
 
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 2;
+            pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
             pc0 += 2;
             pc1 += 2;
             pc2 += 2;
             pc3 += 2;
         }
 
-        for (i = (m & 1); i--;)
+        if (m & 1)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 1;
+            pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -468,7 +594,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 1;
             pb0 += 4;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -503,7 +629,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 4;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -527,10 +653,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             tmp2 = alpha * tmp2;
             tmp3 = alpha * tmp3;
 
+#if defined(TRMMKERNEL)
+            pc0[0] = tmp0;
+            pc1[0] = tmp1;
+            pc2[0] = tmp2;
+            pc3[0] = tmp3;
+#else
             pc0[0] += tmp0;
             pc1[0] += tmp1;
             pc2[0] += tmp2;
             pc3[0] += tmp3;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 1;
+            pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
 
             pc0 += 1;
             pc1 += 1;
@@ -538,25 +688,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pc3 += 1;
         }
 
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4; // number of values in A
+#endif
+
         l = (k << 2);
         B = B + l;
         i = (ldc << 2);
         C = C + i;
     }
 
-    for (j = ((n & 2) / 2); j--;)
+    if (n & 2)
     {
         pc0 = C;
         pc1 = pc0 + ldc;
 
         pa0 = A;
 
-        for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        for (i = (m >> 3); i--;)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 8;
+            pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
             pb0 = B;
+            temp = k;
+#endif
+
 
-            LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+            LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
             src_b0 = LD_DP(pb0);
+            pb0 += 2;
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             res0 = src_a0 * src_b;
@@ -570,13 +748,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             res6 = src_a2 * src_b;
             res7 = src_a3 * src_b;
 
-            pa0 += 8;
-            pb0 += 2;
-
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+                LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
                 src_b0 = LD_DP(pb0);
+                pb0 += 2;
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -590,11 +766,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res6 += src_a2 * src_b;
                 res7 += src_a3 * src_b;
 
-                pa0 += 8;
-                pb0 += 2;
-
-                LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+                LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
                 src_b0 = LD_DP(pb0);
+                pb0 += 2;
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -607,15 +781,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res5 += src_a1 * src_b;
                 res6 += src_a2 * src_b;
                 res7 += src_a3 * src_b;
-
-                pa0 += 8;
-                pb0 += 2;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+                LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
                 src_b0 = LD_DP(pb0);
+                pb0 += 2;
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -628,11 +800,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res5 += src_a1 * src_b;
                 res6 += src_a2 * src_b;
                 res7 += src_a3 * src_b;
-
-                pa0 += 8;
-                pb0 += 2;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+            dst4 = res4 * v_alpha;
+            dst5 = res5 * v_alpha;
+            dst6 = res6 * v_alpha;
+            dst7 = res7 * v_alpha;
+#else
             LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
             LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
 
@@ -644,20 +823,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst5 += res5 * v_alpha;
             dst6 += res6 * v_alpha;
             dst7 += res7 * v_alpha;
+#endif
+            ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
+            ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 8;
+            pb0 += temp * 2;
+#endif
 
-            ST_DP4(dst0, dst1, dst2, dst3, pc0, 2);
-            ST_DP4(dst4, dst5, dst6, dst7, pc1, 2);
-
-            pc0 += 8;
-            pc1 += 8;
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 4) / 4); i--;)
+        if (m & 4)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 4;
+            pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
             pb0 = B;
+            temp = k;
+#endif
 
-            LD_DP2(pa0, 2, src_a0, src_a1);
+            LD_DP2_INC(pa0, 2, src_a0, src_a1);
             src_b0 = LD_DP(pb0);
+            pb0 += 2;
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             res0 = src_a0 * src_b;
@@ -667,13 +879,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             res2 = src_a0 * src_b;
             res3 = src_a1 * src_b;
 
-            pa0 += 4;
-            pb0 += 2;
-
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_DP2(pa0, 2, src_a0, src_a1);
+                LD_DP2_INC(pa0, 2, src_a0, src_a1);
                 src_b0 = LD_DP(pb0);
+                pb0 += 2;
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -683,11 +893,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res2 += src_a0 * src_b;
                 res3 += src_a1 * src_b;
 
-                pa0 += 4;
-                pb0 += 2;
-
-                LD_DP2(pa0, 2, src_a0, src_a1);
+                LD_DP2_INC(pa0, 2, src_a0, src_a1);
                 src_b0 = LD_DP(pb0);
+                pb0 += 2;
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -696,15 +904,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
                 res2 += src_a0 * src_b;
                 res3 += src_a1 * src_b;
-
-                pa0 += 4;
-                pb0 += 2;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_DP2(pa0, 2, src_a0, src_a1);
+                LD_DP2_INC(pa0, 2, src_a0, src_a1);
                 src_b0 = LD_DP(pb0);
+                pb0 += 2;
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -713,11 +919,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
                 res2 += src_a0 * src_b;
                 res3 += src_a1 * src_b;
-
-                pa0 += 4;
-                pb0 += 2;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+#else
             LD_DP2(pc0, 2, dst0, dst1);
             LD_DP2(pc1, 2, dst2, dst3);
 
@@ -725,20 +934,54 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst1 += res1 * v_alpha;
             dst2 += res2 * v_alpha;
             dst3 += res3 * v_alpha;
+#endif
+            ST_DP2_INC(dst0, dst1, pc0, 2);
+            ST_DP2_INC(dst2, dst3, pc1, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 4;
+            pb0 += temp * 2;
+#endif
 
-            ST_DP2(dst0, dst1, pc0, 2);
-            ST_DP2(dst2, dst3, pc1, 2);
-
-            pc0 += 4;
-            pc1 += 4;
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 2) / 2); i--;)
+        if (m & 2)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 2;
+            pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             src_a0 = LD_DP(pa0);
+            pa0 += 2;
             src_b0 = LD_DP(pb0);
+            pb0 += 2;
 
             src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
             res0 = src_a0 * src_b;
@@ -746,13 +989,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
             res1 = src_a0 * src_b;
 
-            pa0 += 2;
-            pb0 += 2;
-
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 src_a0 = LD_DP(pa0);
+                pa0 += 2;
                 src_b0 = LD_DP(pb0);
+                pb0 += 2;
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
@@ -760,53 +1002,86 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
                 res1 += src_a0 * src_b;
 
-                pa0 += 2;
-                pb0 += 2;
-
                 src_a0 = LD_DP(pa0);
+                pa0 += 2;
                 src_b0 = LD_DP(pb0);
+                pb0 += 2;
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
 
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
                 res1 += src_a0 * src_b;
-
-                pa0 += 2;
-                pb0 += 2;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 src_a0 = LD_DP(pa0);
+                pa0 += 2;
                 src_b0 = LD_DP(pb0);
+                pb0 += 2;
 
                 src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
                 res0 += src_a0 * src_b;
 
                 src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
                 res1 += src_a0 * src_b;
-
-                pa0 += 2;
-                pb0 += 2;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+#else
             dst0 = LD_DP(pc0);
             dst1 = LD_DP(pc1);
 
             dst0 += res0 * v_alpha;
             dst1 += res1 * v_alpha;
-
+#endif
             ST_DP(dst0, pc0);
             ST_DP(dst1, pc1);
 
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 2;
+            pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
             pc0 += 2;
             pc1 += 2;
         }
 
-        for (i = (m & 1); i--;)
+        if (m & 1)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 1;
+            pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -818,7 +1093,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 1;
             pb0 += 2;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -841,7 +1116,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 2;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -857,29 +1132,77 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             tmp0 = alpha * tmp0;
             tmp1 = alpha * tmp1;
 
+#if defined(TRMMKERNEL)
+            pc0[0] = tmp0;
+            pc1[0] = tmp1;
+#else
             pc0[0] += tmp0;
             pc1[0] += tmp1;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 1;
+            pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
 
             pc0 += 1;
             pc1 += 1;
         }
 
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2; // number of values in B
+#endif
+
         l = (k << 1);
         B = B + l;
         i = (ldc << 1);
         C = C + i;
     }
 
-    for (j = (n & 1); j--;)
+    if (n & 1)
     {
         pc0 = C;
         pa0 = A;
 
-        for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        for (i = (m >> 3); i--;)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 8;
+            pb0 = B + off * 1;
+#endif
 
-            LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+            LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
             src_b[0] = pb0[0];
             src_b[1] = pb0[0];
 
@@ -888,12 +1211,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             res2 = src_a2 * src_b;
             res3 = src_a3 * src_b;
 
-            pa0 += 8;
             pb0 += 1;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+                LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
                 src_b[0] = pb0[0];
                 src_b[1] = pb0[0];
 
@@ -902,10 +1224,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res2 += src_a2 * src_b;
                 res3 += src_a3 * src_b;
 
-                pa0 += 8;
                 pb0 += 1;
 
-                LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+                LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
                 src_b[0] = pb0[0];
                 src_b[1] = pb0[0];
 
@@ -914,13 +1235,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res2 += src_a2 * src_b;
                 res3 += src_a3 * src_b;
 
-                pa0 += 8;
                 pb0 += 1;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+                LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
                 src_b[0] = pb0[0];
                 src_b[1] = pb0[0];
 
@@ -929,85 +1249,156 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res2 += src_a2 * src_b;
                 res3 += src_a3 * src_b;
 
-                pa0 += 8;
                 pb0 += 1;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+#else
             LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
 
             dst0 += res0 * v_alpha;
             dst1 += res1 * v_alpha;
             dst2 += res2 * v_alpha;
             dst3 += res3 * v_alpha;
+#endif
+            ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 8;
+            pb0 += temp * 1;
+#endif
 
-            ST_DP4(dst0, dst1, dst2, dst3, pc0, 2);
-
-            pc0 += 8;
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 4) / 4); i--;)
+        if (m & 4)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 4;
+            pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
             pb0 = B;
+            temp = k;
+#endif
 
-            LD_DP2(pa0, 2, src_a0, src_a1);
+            LD_DP2_INC(pa0, 2, src_a0, src_a1);
             src_b[0] = pb0[0];
             src_b[1] = pb0[0];
 
             res0 = src_a0 * src_b;
             res1 = src_a1 * src_b;
 
-            pa0 += 4;
             pb0 += 1;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_DP2(pa0, 2, src_a0, src_a1);
+                LD_DP2_INC(pa0, 2, src_a0, src_a1);
                 src_b[0] = pb0[0];
                 src_b[1] = pb0[0];
 
                 res0 += src_a0 * src_b;
                 res1 += src_a1 * src_b;
 
-                pa0 += 4;
                 pb0 += 1;
 
-                LD_DP2(pa0, 2, src_a0, src_a1);
+                LD_DP2_INC(pa0, 2, src_a0, src_a1);
                 src_b[0] = pb0[0];
                 src_b[1] = pb0[0];
 
                 res0 += src_a0 * src_b;
                 res1 += src_a1 * src_b;
 
-                pa0 += 4;
                 pb0 += 1;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_DP2(pa0, 2, src_a0, src_a1);
+                LD_DP2_INC(pa0, 2, src_a0, src_a1);
                 src_b[0] = pb0[0];
                 src_b[1] = pb0[0];
 
                 res0 += src_a0 * src_b;
                 res1 += src_a1 * src_b;
 
-                pa0 += 4;
                 pb0 += 1;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+#else
             LD_DP2(pc0, 2, dst0, dst1);
 
             dst0 += res0 * v_alpha;
             dst1 += res1 * v_alpha;
+#endif
+            ST_DP2_INC(dst0, dst1, pc0, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 4;
+            pb0 += temp * 1;
+#endif
 
-            ST_DP2(dst0, dst1, pc0, 2);
-
-            pc0 += 4;
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 2) / 2); i--;)
+        if (m & 2)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2;
+            pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
             pb0 = B;
+            temp = k;
+#endif
 
             src_a0 = LD_DP(pa0);
             src_b[0] = pb0[0];
@@ -1018,7 +1409,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 2;
             pb0 += 1;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 src_a0 = LD_DP(pa0);
                 src_b[0] = pb0[0];
@@ -1039,7 +1430,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 src_a0 = LD_DP(pa0);
                 src_b[0] = pb0[0];
@@ -1051,18 +1442,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+#else
             dst0 = LD_DP(pc0);
 
             dst0 += res0 * v_alpha;
-
+#endif
             ST_DP(dst0, pc0);
 
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 2;
+            pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
             pc0 += 2;
         }
 
-        for (i = (m & 1); i--;)
+        if (m & 1)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 1;
+            pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -1071,7 +1499,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 1;
             pb0 += 1;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1088,7 +1516,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1098,15 +1526,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
+#if defined(TRMMKERNEL)
+            pc0[0] = alpha * tmp0;
+#else
             pc0[0] += alpha * tmp0;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 1;
+            pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
 
             pc0 += 1;
         }
 
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1; // number of values in B
+#endif
+
         l = (k << 0);
         B = B + l;
         i = (ldc << 0);
         C = C + i;
     }
+
     return 0;
 }
index bbd7607..a61b2e8 100644 (file)
@@ -32,8 +32,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
           FLOAT * __restrict dst)
 {
     BLASLONG i, j;
-    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
-    FLOAT *pdst;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst;
     v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
     v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
@@ -50,28 +49,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (m >> 2); i--;)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            LD_DP2(psrc2, 2, src2, src3);
-            LD_DP2(psrc3, 2, src4, src5);
-            LD_DP2(psrc4, 2, src6, src7);
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
 
-            psrc1 += 4;
-            psrc2 += 4;
-            psrc3 += 4;
-            psrc4 += 4;
-
-            dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
-            dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
-            dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
-            dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
-
-            dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
-            dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
-            dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
-            dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
-
-            ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
-            pdst += 16;
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src6, src4, dst1, dst5);
+            ILVRL_D2_DP(src3, src1, dst2, dst6);
+            ILVRL_D2_DP(src7, src5, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
         }
 
         for (i = (m & 3); i--;)
@@ -91,18 +79,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (m >> 2); i--;)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            LD_DP2(psrc2, 2, src2, src3);
-            psrc1 += 4;
-            psrc2 += 4;
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
 
-            dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
-            dst1 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
-            dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
-            dst5 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src3, src1, dst1, dst5);
 
-            ST_DP4(dst0, dst4, dst1, dst5, pdst, 2);
-            pdst += 8;
+            ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2);
         }
 
         for (i = (m & 3); i--;)
index 43c9775..86d019c 100644 (file)
@@ -32,9 +32,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
           FLOAT *  __restrict dst)
 {
     BLASLONG i, j;
-    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
-    FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
-    FLOAT *pdst;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+    FLOAT *psrc8, *pdst;
     v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
     v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
     v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@@ -56,80 +55,51 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (m >> 3); i--;)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            LD_DP2(psrc2, 2, src2, src3);
-            LD_DP2(psrc3, 2, src4, src5);
-            LD_DP2(psrc4, 2, src6, src7);
-            LD_DP2(psrc5, 2, src8, src9);
-            LD_DP2(psrc6, 2, src10, src11);
-            LD_DP2(psrc7, 2, src12, src13);
-            LD_DP2(psrc8, 2, src14, src15);
-
-            dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
-            dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
-            dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8);
-            dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12);
-            dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
-            dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
-            dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8);
-            dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12);
-
-            ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
-
-            dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
-            dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
-            dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9);
-            dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13);
-            dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
-            dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
-            dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9);
-            dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13);
-
-            ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 16,
-                   2);
-
-            LD_DP2(psrc1 + 4, 2, src0, src1);
-            LD_DP2(psrc2 + 4, 2, src2, src3);
-            LD_DP2(psrc3 + 4, 2, src4, src5);
-            LD_DP2(psrc4 + 4, 2, src6, src7);
-            LD_DP2(psrc5 + 4, 2, src8, src9);
-            LD_DP2(psrc6 + 4, 2, src10, src11);
-            LD_DP2(psrc7 + 4, 2, src12, src13);
-            LD_DP2(psrc8 + 4, 2, src14, src15);
-
-            dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
-            dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
-            dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8);
-            dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12);
-            dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
-            dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
-            dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8);
-            dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12);
-
-            ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 32,
-                   2);
-
-            dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
-            dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
-            dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9);
-            dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13);
-            dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
-            dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
-            dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9);
-            dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13);
-
-            ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 48,
-                   2);
-
-            psrc1 += 8;
-            psrc2 += 8;
-            psrc3 += 8;
-            psrc4 += 8;
-            psrc5 += 8;
-            psrc6 += 8;
-            psrc7 += 8;
-            psrc8 += 8;
-            pdst += 64;
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+            LD_DP2_INC(psrc5, 2, src8, src9);
+            LD_DP2_INC(psrc6, 2, src10, src11);
+            LD_DP2_INC(psrc7, 2, src12, src13);
+            LD_DP2_INC(psrc8, 2, src14, src15);
+
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src6, src4, dst1, dst5);
+            ILVRL_D2_DP(src10, src8, dst2, dst6);
+            ILVRL_D2_DP(src14, src12, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+
+            ILVRL_D2_DP(src3, src1, dst0, dst4);
+            ILVRL_D2_DP(src7, src5, dst1, dst5);
+            ILVRL_D2_DP(src11, src9, dst2, dst6);
+            ILVRL_D2_DP(src15, src13, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+            LD_DP2_INC(psrc5, 2, src8, src9);
+            LD_DP2_INC(psrc6, 2, src10, src11);
+            LD_DP2_INC(psrc7, 2, src12, src13);
+            LD_DP2_INC(psrc8, 2, src14, src15);
+
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src6, src4, dst1, dst5);
+            ILVRL_D2_DP(src10, src8, dst2, dst6);
+            ILVRL_D2_DP(src14, src12, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+
+            ILVRL_D2_DP(src3, src1, dst0, dst4);
+            ILVRL_D2_DP(src7, src5, dst1, dst5);
+            ILVRL_D2_DP(src11, src9, dst2, dst6);
+            ILVRL_D2_DP(src15, src13, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
         }
 
         for (i = (m & 7); i--;)
@@ -155,27 +125,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (m >> 2); i--;)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            LD_DP2(psrc2, 2, src2, src3);
-            LD_DP2(psrc3, 2, src4, src5);
-            LD_DP2(psrc4, 2, src6, src7);
-            psrc1 += 4;
-            psrc2 += 4;
-            psrc3 += 4;
-            psrc4 += 4;
-
-            dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
-            dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
-            dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
-            dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
-
-            dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
-            dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
-            dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
-            dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
-
-            ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
-            pdst += 16;
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+
+            ILVRL_D2_DP(src2, src0, dst0, dst4);
+            ILVRL_D2_DP(src6, src4, dst1, dst5);
+            ILVRL_D2_DP(src3, src1, dst2, dst6);
+            ILVRL_D2_DP(src7, src5, dst3, dst7);
+
+            ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
         }
 
         for (i = (m & 3); i--;)
@@ -200,11 +160,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             psrc1 += 2;
             psrc2 += 2;
 
-            dst0 = (v2f64) __msa_ilvr_d((v2i64) src1, (v2i64) src0);
-            dst1 = (v2f64) __msa_ilvl_d((v2i64) src1, (v2i64) src0);
+            ILVRL_D2_DP(src1, src0, dst0, dst1);
 
-            ST_DP2(dst0, dst1, pdst, 2);
-            pdst += 4;
+            ST_DP2_INC(dst0, dst1, pdst, 2);
         }
 
         if (m & 1)
index f147d19..a51c474 100644 (file)
@@ -55,14 +55,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 2); i--;)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            LD_DP2(psrc2, 2, src2, src3);
-            LD_DP2(psrc3, 2, src4, src5);
-            LD_DP2(psrc4, 2, src6, src7);
-            psrc1 += 4;
-            psrc2 += 4;
-            psrc3 += 4;
-            psrc4 += 4;
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
 
             ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
             pdst1 += m * 4;
@@ -79,8 +75,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             psrc3 += 2;
             psrc4 += 2;
 
-            ST_DP4(src0, src1, src2, src3, pdst2, 2);
-            pdst2 += 8;
+            ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
         }
 
         if (n & 1)
@@ -103,10 +98,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 2); i--;)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            LD_DP2(psrc2, 2, src2, src3);
-            psrc1 += 4;
-            psrc2 += 4;
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
 
             ST_DP4(src0, src1, src2, src3, pdst1, 2);
             pdst1 += m * 4;
@@ -119,8 +112,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             psrc1 += 2;
             psrc2 += 2;
 
-            ST_DP2(src0, src1, pdst2, 2);
-            pdst2 += 4;
+            ST_DP2_INC(src0, src1, pdst2, 2);
         }
 
         if (n & 1)
@@ -137,8 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 2); i--;)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            psrc1 += 4;
+            LD_DP2_INC(psrc1, 2, src0, src1);
 
             ST_DP2(src0, src1, pdst1, 2);
             pdst1 += 4 * m;
index d1ac49b..350ecb3 100644 (file)
@@ -62,27 +62,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 3); i--;)
         {
-            LD_DP4(psrc1, 2, src0, src1, src2, src3);
-            LD_DP4(psrc2, 2, src4, src5, src6, src7);
-            LD_DP4(psrc3, 2, src8, src9, src10, src11);
-            LD_DP4(psrc4, 2, src12, src13, src14, src15);
-            psrc1 += 8;
-            psrc2 += 8;
-            psrc3 += 8;
-            psrc4 += 8;
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+            LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+            LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
 
             ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
             ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
                    pdst1 + 16, 2);
 
-            LD_DP4(psrc5, 2, src0, src1, src2, src3);
-            LD_DP4(psrc6, 2, src4, src5, src6, src7);
-            LD_DP4(psrc7, 2, src8, src9, src10, src11);
-            LD_DP4(psrc8, 2, src12, src13, src14, src15);
-            psrc5 += 8;
-            psrc6 += 8;
-            psrc7 += 8;
-            psrc8 += 8;
+            LD_DP4_INC(psrc5, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc6, 2, src4, src5, src6, src7);
+            LD_DP4_INC(psrc7, 2, src8, src9, src10, src11);
+            LD_DP4_INC(psrc8, 2, src12, src13, src14, src15);
 
             ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32,
                    2);
@@ -93,27 +85,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         if (n & 4)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            LD_DP2(psrc2, 2, src2, src3);
-            LD_DP2(psrc3, 2, src4, src5);
-            LD_DP2(psrc4, 2, src6, src7);
-            LD_DP2(psrc5, 2, src8, src9);
-            LD_DP2(psrc6, 2, src10, src11);
-            LD_DP2(psrc7, 2, src12, src13);
-            LD_DP2(psrc8, 2, src14, src15);
-            psrc1 += 4;
-            psrc2 += 4;
-            psrc3 += 4;
-            psrc4 += 4;
-            psrc5 += 4;
-            psrc6 += 4;
-            psrc7 += 4;
-            psrc8 += 4;
-
-            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
-            ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
-                   pdst2 + 16, 2);
-            pdst2 += 32;
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+            LD_DP2_INC(psrc5, 2, src8, src9);
+            LD_DP2_INC(psrc6, 2, src10, src11);
+            LD_DP2_INC(psrc7, 2, src12, src13);
+            LD_DP2_INC(psrc8, 2, src14, src15);
+
+            ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
+            ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15,
+                       pdst2, 2);
         }
 
         if (n & 2)
@@ -135,8 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             psrc7 += 2;
             psrc8 += 2;
 
-            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2);
-            pdst3 += 16;
+            ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2);
         }
 
         if (n & 1)
@@ -165,18 +147,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 3); i--;)
         {
-            LD_DP4(psrc1, 2, src0, src1, src2, src3);
-            LD_DP4(psrc2, 2, src4, src5, src6, src7);
-            LD_DP4(psrc3, 2, src8, src9, src10, src11);
-            LD_DP4(psrc4, 2, src12, src13, src14, src15);
-            psrc1 += 8;
-            psrc2 += 8;
-            psrc3 += 8;
-            psrc4 += 8;
-            psrc5 += 8;
-            psrc6 += 8;
-            psrc7 += 8;
-            psrc8 += 8;
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+            LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+            LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
 
             ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
             ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
@@ -186,17 +160,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         if (n & 4)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            LD_DP2(psrc2, 2, src2, src3);
-            LD_DP2(psrc3, 2, src4, src5);
-            LD_DP2(psrc4, 2, src6, src7);
-            psrc1 += 4;
-            psrc2 += 4;
-            psrc3 += 4;
-            psrc4 += 4;
-
-            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
-            pdst2 += 16;
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+
+            ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
         }
 
         if (n & 2)
@@ -210,8 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             psrc3 += 2;
             psrc4 += 2;
 
-            ST_DP4(src0, src1, src2, src3, pdst3, 2);
-            pdst3 += 8;
+            ST_DP4_INC(src0, src1, src2, src3, pdst3, 2);
         }
 
         if (n & 1)
@@ -234,10 +202,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 3); i--;)
         {
-            LD_DP4(psrc1, 2, src0, src1, src2, src3);
-            LD_DP4(psrc2, 2, src4, src5, src6, src7);
-            psrc1 += 8;
-            psrc2 += 8;
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
 
             ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
             pdst1 += 8 * m;
@@ -245,13 +211,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         if (n & 4)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            LD_DP2(psrc2, 2, src2, src3);
-            psrc1 += 4;
-            psrc2 += 4;
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
 
-            ST_DP4(src0, src1, src2, src3, pdst2, 2);
-            pdst2 += 8;
+            ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
         }
 
         if (n & 2)
@@ -261,8 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             psrc1 += 2;
             psrc2 += 2;
 
-            ST_DP2(src0, src1, pdst3, 2);
-            pdst3 += 4;
+            ST_DP2_INC(src0, src1, pdst3, 2);
         }
 
         if (n & 1)
@@ -282,8 +244,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 3); i--;)
         {
-            LD_DP4(psrc1, 2, src0, src1, src2, src3);
-            psrc1 += 8;
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
 
             ST_DP4(src0, src1, src2, src3, pdst1, 2);
             pdst1 += 8 * m;
@@ -291,11 +252,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         if (n & 4)
         {
-            LD_DP2(psrc1, 2, src0, src1);
-            psrc1 += 4;
+            LD_DP2_INC(psrc1, 2, src0, src1);
 
-            ST_DP2(src0, src1, pdst2, 2);
-            pdst2 += 4;
+            ST_DP2_INC(src0, src1, pdst2, 2);
         }
 
         if (n & 2)
index 0efca78..dbc1853 100644 (file)
@@ -42,10 +42,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
 #define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
 
-#define COPY_FLOAT_TO_VECTOR(a, b)            \
-    b = __msa_cast_to_vector_float(a);        \
-    b = (v4f32) __msa_splati_w((v4i32) b, 0);
+#define COPY_FLOAT_TO_VECTOR(a) ( {                \
+    v4f32  out;                                    \
+    out = __msa_cast_to_vector_float(a);           \
+    out = (v4f32) __msa_splati_w((v4i32) out, 0);  \
+    out;                                           \
+} )
 
+#define COPY_DOUBLE_TO_VECTOR(a) ( {               \
+    v2f64  out;                                    \
+    out = __msa_cast_to_vector_double(a);          \
+    out = (v2f64) __msa_splati_d((v2i64) out, 0);  \
+    out;                                           \
+} )
+
+/* Description : Load 2 GP variables with stride (psrc is advanced after each load)
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+*/
+#define LD_GP2_INC(psrc, stride, out0, out1)  \
+{                                             \
+    out0 = *(psrc);                           \
+    (psrc) += stride;                         \
+    out1 = *(psrc);                           \
+    (psrc) += stride;                         \
+}
+
+#define LD_GP3_INC(psrc, stride, out0,     \
+                   out1, out2)             \
+{                                          \
+    LD_GP2_INC(psrc, stride, out0, out1);  \
+    out2 = *(psrc);                        \
+    (psrc) += stride;                      \
+}
+
+#define LD_GP4_INC(psrc, stride, out0,     \
+                   out1, out2, out3)       \
+{                                          \
+    LD_GP2_INC(psrc, stride, out0, out1);  \
+    LD_GP2_INC(psrc, stride, out2, out3);  \
+}
+
+#define LD_GP5_INC(psrc, stride, out0,      \
+                   out1, out2, out3, out4)  \
+{                                           \
+    LD_GP2_INC(psrc, stride, out0, out1);   \
+    LD_GP2_INC(psrc, stride, out2, out3);   \
+    out4 = *(psrc);                         \
+    (psrc) += stride;                       \
+}
+
+#define LD_GP6_INC(psrc, stride, out0,     \
+                   out1, out2, out3,       \
+                   out4, out5)             \
+{                                          \
+    LD_GP2_INC(psrc, stride, out0, out1);  \
+    LD_GP2_INC(psrc, stride, out2, out3);  \
+    LD_GP2_INC(psrc, stride, out4, out5);  \
+}
+
+#define LD_GP7_INC(psrc, stride, out0,     \
+                   out1, out2, out3,       \
+                   out4, out5, out6)       \
+{                                          \
+    LD_GP2_INC(psrc, stride, out0, out1);  \
+    LD_GP2_INC(psrc, stride, out2, out3);  \
+    LD_GP2_INC(psrc, stride, out4, out5);  \
+    out6 = *(psrc);                        \
+    (psrc) += stride;                      \
+}
+
+#define LD_GP8_INC(psrc, stride, out0, out1, out2,     \
+                   out3, out4, out5, out6, out7)       \
+{                                                      \
+    LD_GP4_INC(psrc, stride, out0, out1, out2, out3);  \
+    LD_GP4_INC(psrc, stride, out4, out5, out6, out7);  \
+}
 
 /* Description : Load 2 vectors of single precision floating point elements with stride
    Arguments   : Inputs  - psrc, stride
@@ -58,6 +130,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     out1 = LD_SP((psrc) + stride);        \
 }
 
+#define LD_SP4(psrc, stride, out0, out1, out2, out3)  \
+{                                                     \
+    LD_SP2(psrc, stride, out0, out1)                  \
+    LD_SP2(psrc + 2 * stride, stride, out2, out3)     \
+}
+
+#define LD_SP2_INC(psrc, stride, out0, out1)  \
+{                                             \
+    out0 = LD_SP((psrc));                     \
+    (psrc) += stride;                         \
+    out1 = LD_SP((psrc));                     \
+    (psrc) += stride;                         \
+}
+
+#define LD_SP3_INC(psrc, stride, out0,     \
+                   out1, out2)             \
+{                                          \
+    LD_SP2_INC(psrc, stride, out0, out1);  \
+    out2 = LD_SP((psrc));                  \
+    (psrc) += stride;                      \
+}
+
+#define LD_SP4_INC(psrc, stride, out0,     \
+                   out1, out2, out3)       \
+{                                          \
+    LD_SP2_INC(psrc, stride, out0, out1);  \
+    LD_SP2_INC(psrc, stride, out2, out3);  \
+}
+
+#define LD_SP5_INC(psrc, stride, out0,      \
+                   out1, out2, out3, out4)  \
+{                                           \
+    LD_SP2_INC(psrc, stride, out0, out1);   \
+    LD_SP2_INC(psrc, stride, out2, out3);   \
+    out4 = LD_SP((psrc));                   \
+    (psrc) += stride;                       \
+}
+
+#define LD_SP6_INC(psrc, stride, out0,     \
+                   out1, out2, out3,       \
+                   out4, out5)             \
+{                                          \
+    LD_SP2_INC(psrc, stride, out0, out1);  \
+    LD_SP2_INC(psrc, stride, out2, out3);  \
+    LD_SP2_INC(psrc, stride, out4, out5);  \
+}
+
+#define LD_SP7_INC(psrc, stride, out0,     \
+                   out1, out2, out3,       \
+                   out4, out5, out6)       \
+{                                          \
+    LD_SP2_INC(psrc, stride, out0, out1);  \
+    LD_SP2_INC(psrc, stride, out2, out3);  \
+    LD_SP2_INC(psrc, stride, out4, out5);  \
+    out6 = LD_SP((psrc));                  \
+    (psrc) += stride;                      \
+}
+
+#define LD_SP8_INC(psrc, stride, out0, out1, out2,     \
+                   out3, out4, out5, out6, out7)       \
+{                                                      \
+    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);  \
+    LD_SP4_INC(psrc, stride, out4, out5, out6, out7);  \
+}
+
+#define LD_SP16_INC(psrc, stride, out0, out1, out2,      \
+                    out3, out4, out5, out6, out7, out8,  \
+                    out9, out10, out11, out12, out13,    \
+                    out14, out15)                        \
+{                                                        \
+    LD_SP8_INC(psrc, stride, out0, out1, out2,           \
+               out3, out4, out5, out6, out7);            \
+    LD_SP8_INC(psrc, stride, out8, out9, out10,          \
+               out11, out12, out13, out14, out15);       \
+}
+
 /* Description : Load 2 vectors of double precision floating point elements with stride
    Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
@@ -75,6 +223,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LD_DP2(psrc + 2 * stride, stride, out2, out3)     \
 }
 
+#define LD_DP2_INC(psrc, stride, out0, out1)  \
+{                                             \
+    out0 = LD_DP(psrc);                       \
+    (psrc) += stride;                         \
+    out1 = LD_DP(psrc);                       \
+    (psrc) += stride;                         \
+}
+
+#define LD_DP3_INC(psrc, stride, out0,     \
+                   out1, out2)             \
+{                                          \
+    LD_DP2_INC(psrc, stride, out0, out1);  \
+    out2 = LD_DP((psrc));                  \
+    (psrc) += stride;                      \
+}
+
+#define LD_DP4_INC(psrc, stride, out0,     \
+                   out1, out2, out3)       \
+{                                          \
+    LD_DP2_INC(psrc, stride, out0, out1);  \
+    LD_DP2_INC(psrc, stride, out2, out3);  \
+}
+
+#define LD_DP5_INC(psrc, stride, out0,      \
+                   out1, out2, out3, out4)  \
+{                                           \
+    LD_DP2_INC(psrc, stride, out0, out1);   \
+    LD_DP2_INC(psrc, stride, out2, out3);   \
+    out4 = LD_DP((psrc));                   \
+    (psrc) += stride;                       \
+}
+
+#define LD_DP6_INC(psrc, stride, out0,     \
+                   out1, out2, out3,       \
+                   out4, out5)             \
+{                                          \
+    LD_DP2_INC(psrc, stride, out0, out1);  \
+    LD_DP2_INC(psrc, stride, out2, out3);  \
+    LD_DP2_INC(psrc, stride, out4, out5);  \
+}
+
+#define LD_DP7_INC(psrc, stride, out0,     \
+                   out1, out2, out3,       \
+                   out4, out5, out6)       \
+{                                          \
+    LD_DP2_INC(psrc, stride, out0, out1);  \
+    LD_DP2_INC(psrc, stride, out2, out3);  \
+    LD_DP2_INC(psrc, stride, out4, out5);  \
+    out6 = LD_DP((psrc));                  \
+    (psrc) += stride;                      \
+}
+
+#define LD_DP8_INC(psrc, stride, out0, out1, out2,     \
+                   out3, out4, out5, out6, out7)       \
+{                                                      \
+    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);  \
+    LD_DP4_INC(psrc, stride, out4, out5, out6, out7);  \
+}
+
+#define LD_DP16_INC(psrc, stride, out0, out1, out2,      \
+                    out3, out4, out5, out6, out7, out8,  \
+                    out9, out10, out11, out12, out13,    \
+                    out14, out15)                        \
+{                                                        \
+    LD_DP8_INC(psrc, stride, out0, out1, out2,           \
+               out3, out4, out5, out6, out7);            \
+    LD_DP8_INC(psrc, stride, out8, out9, out10,          \
+               out11, out12, out13, out14, out15);       \
+}
+
+/* Description : Store GP variable with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 'in0' to *(pdst), then advance 'pdst' by 'stride'
+                 Store 'in1' to *(pdst), then advance 'pdst' by 'stride'
+*/
+#define ST_GP2_INC(in0, in1,      \
+                   pdst, stride)  \
+{                                 \
+    *(pdst) = in0;                \
+    (pdst) += stride;             \
+    *(pdst) = in1;                \
+    (pdst) += stride;             \
+}
+
+#define ST_GP3_INC(in0, in1, in2,        \
+                   pdst, stride)         \
+{                                        \
+    ST_GP2_INC(in0, in1, pdst, stride);  \
+    *(pdst) = in2;                       \
+    (pdst) += stride;                    \
+}
+
+#define ST_GP4_INC(in0, in1, in2, in3,   \
+                   pdst, stride)         \
+{                                        \
+    ST_GP2_INC(in0, in1, pdst, stride);  \
+    ST_GP2_INC(in2, in3, pdst, stride);  \
+}
+
+#define ST_GP5_INC(in0, in1, in2, in3,   \
+                   in4, pdst, stride)    \
+{                                        \
+    ST_GP2_INC(in0, in1, pdst, stride);  \
+    ST_GP2_INC(in2, in3, pdst, stride);  \
+    *(pdst) = in4;                       \
+    (pdst) += stride;                    \
+}
+
+#define ST_GP6_INC(in0, in1, in2, in3,     \
+                   in4, in5, pdst, stride) \
+{                                          \
+    ST_GP2_INC(in0, in1, pdst, stride);    \
+    ST_GP2_INC(in2, in3, pdst, stride);    \
+    ST_GP2_INC(in4, in5, pdst, stride);    \
+}
+
+#define ST_GP7_INC(in0, in1, in2, in3, in4,  \
+                   in5, in6, pdst, stride)   \
+{                                            \
+    ST_GP2_INC(in0, in1, pdst, stride);      \
+    ST_GP2_INC(in2, in3, pdst, stride);      \
+    ST_GP2_INC(in4, in5, pdst, stride);      \
+    *(pdst) = in6;                           \
+    (pdst) += stride;                        \
+}
+
+#define ST_GP8_INC(in0, in1, in2, in3, in4, in5,   \
+                   in6, in7, pdst, stride)         \
+{                                                  \
+    ST_GP4_INC(in0, in1, in2, in3, pdst, stride);  \
+    ST_GP4_INC(in4, in5, in6, in7, pdst, stride);  \
+}
+
 /* Description : Store vectors of single precision floating point elements with stride
    Arguments   : Inputs - in0, in1, pdst, stride
    Details     : Store 4 single precision floating point elements from 'in0' to (pdst)
@@ -98,6 +379,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride);          \
 }
 
+#define ST_SP2_INC(in0, in1, pdst, stride)  \
+{                                           \
+    ST_SP(in0, (pdst));                     \
+    (pdst) += stride;                       \
+    ST_SP(in1, (pdst));                     \
+    (pdst) += stride;                       \
+}
+
+#define ST_SP3_INC(in0, in1, in2,        \
+                   pdst, stride)         \
+{                                        \
+    ST_SP2_INC(in0, in1, pdst, stride);  \
+    ST_SP(in2, (pdst));                  \
+    (pdst) += stride;                    \
+}
+
+#define ST_SP4_INC(in0, in1, in2, in3,   \
+                   pdst, stride)         \
+{                                        \
+    ST_SP2_INC(in0, in1, pdst, stride);  \
+    ST_SP2_INC(in2, in3, pdst, stride);  \
+}
+
+#define ST_SP5_INC(in0, in1, in2, in3,   \
+                   in4, pdst, stride)    \
+{                                        \
+    ST_SP2_INC(in0, in1, pdst, stride);  \
+    ST_SP2_INC(in2, in3, pdst, stride);  \
+    ST_SP(in4, (pdst));                  \
+    (pdst) += stride;                    \
+}
+
+#define ST_SP6_INC(in0, in1, in2, in3,     \
+                   in4, in5, pdst, stride) \
+{                                          \
+    ST_SP2_INC(in0, in1, pdst, stride);    \
+    ST_SP2_INC(in2, in3, pdst, stride);    \
+    ST_SP2_INC(in4, in5, pdst, stride);    \
+}
+
+#define ST_SP7_INC(in0, in1, in2, in3, in4,  \
+                   in5, in6, pdst, stride)   \
+{                                            \
+    ST_SP2_INC(in0, in1, pdst, stride);      \
+    ST_SP2_INC(in2, in3, pdst, stride);      \
+    ST_SP2_INC(in4, in5, pdst, stride);      \
+    ST_SP(in6, (pdst));                      \
+    (pdst) += stride;                        \
+}
+
+#define ST_SP8_INC(in0, in1, in2, in3, in4, in5,   \
+                   in6, in7, pdst, stride)         \
+{                                                  \
+    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);  \
+    ST_SP4_INC(in4, in5, in6, in7, pdst, stride);  \
+}
+
+#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6,  \
+                    in7, in8, in9, in10, in11, in12,    \
+                    in13, in14, in15, pdst, stride)     \
+{                                                       \
+    ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6,       \
+               in7, pdst, stride);                      \
+    ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14,  \
+               in15, pdst, stride);                     \
+}
+
 /* Description : Store vectors of double precision floating point elements with stride
    Arguments   : Inputs - in0, in1, pdst, stride
    Details     : Store 2 double precision floating point elements from 'in0' to (pdst)
@@ -121,6 +469,104 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
 }
 
+#define ST_DP2_INC(in0, in1, pdst, stride)  \
+{                                           \
+    ST_DP(in0, (pdst));                     \
+    (pdst) += stride;                       \
+    ST_DP(in1, (pdst));                     \
+    (pdst) += stride;                       \
+}
+
+#define ST_DP3_INC(in0, in1, in2,        \
+                   pdst, stride)         \
+{                                        \
+    ST_DP2_INC(in0, in1, pdst, stride);  \
+    ST_DP(in2, (pdst));                  \
+    (pdst) += stride;                    \
+}
+
+#define ST_DP4_INC(in0, in1, in2, in3,   \
+                   pdst, stride)         \
+{                                        \
+    ST_DP2_INC(in0, in1, pdst, stride);  \
+    ST_DP2_INC(in2, in3, pdst, stride);  \
+}
+
+#define ST_DP5_INC(in0, in1, in2, in3,   \
+                   in4, pdst, stride)    \
+{                                        \
+    ST_DP2_INC(in0, in1, pdst, stride);  \
+    ST_DP2_INC(in2, in3, pdst, stride);  \
+    ST_DP(in4, (pdst));                  \
+    (pdst) += stride;                    \
+}
+
+#define ST_DP6_INC(in0, in1, in2, in3,     \
+                   in4, in5, pdst, stride) \
+{                                          \
+    ST_DP2_INC(in0, in1, pdst, stride);    \
+    ST_DP2_INC(in2, in3, pdst, stride);    \
+    ST_DP2_INC(in4, in5, pdst, stride);    \
+}
+
+#define ST_DP7_INC(in0, in1, in2, in3, in4,  \
+                   in5, in6, pdst, stride)   \
+{                                            \
+    ST_DP2_INC(in0, in1, pdst, stride);      \
+    ST_DP2_INC(in2, in3, pdst, stride);      \
+    ST_DP2_INC(in4, in5, pdst, stride);      \
+    ST_DP(in6, (pdst));                      \
+    (pdst) += stride;                        \
+}
+
+#define ST_DP8_INC(in0, in1, in2, in3, in4, in5,   \
+                   in6, in7, pdst, stride)         \
+{                                                  \
+    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);  \
+    ST_DP4_INC(in4, in5, in6, in7, pdst, stride);  \
+}
+
+#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6,  \
+                    in7, in8, in9, in10, in11, in12,    \
+                    in13, in14, in15, pdst, stride)     \
+{                                                       \
+    ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6,       \
+               in7, pdst, stride);                      \
+    ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14,  \
+               in15, pdst, stride);                     \
+}
+
+/* Description : Shuffle word elements in each vector as per shf_val
+   Arguments   : Inputs  - in0, in1, shf_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+*/
+#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val)   \
+{                                                      \
+    out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val);  \
+    out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val);  \
+}
+#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
+#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)
+
+#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2,  \
+               shf_val)                                 \
+{                                                       \
+    out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val);   \
+    out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val);   \
+    out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val);   \
+}
+#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)
+
+#define SHF_W4(RTYPE, in0, in1, in2, in3,           \
+               out0, out1, out2, out3, shf_val)     \
+{                                                   \
+    SHF_W2(RTYPE, in0, in1, out0, out1, shf_val);   \
+    SHF_W2(RTYPE, in2, in3, out2, out3, shf_val);   \
+}
+#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
+#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
+
 /* Description : Interleave both left and right half of input vectors
    Arguments   : Inputs  - in0, in1
                  Outputs - out0, out1
@@ -134,12 +580,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
 }
 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__)
 
 #define ILVRL_D2(RTYPE, in0, in1, out0, out1)               \
 {                                                           \
     out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1);  \
     out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1);  \
 }
+#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
 #define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
 
 /* Description : Indexed word element values are replicated to all
@@ -158,6 +606,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
     out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
 }
+#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__)
 
 #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
 {                                                     \
@@ -166,22 +615,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 }
 #define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
 
+#define SPLATI_D2(RTYPE, in, out0, out1)           \
+{                                                  \
+    out0 = (RTYPE) __msa_splati_d((v2i64) in, 0);  \
+    out1 = (RTYPE) __msa_splati_d((v2i64) in, 1);  \
+}
+#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
+
+/* Description : Pack even double word elements of 2, 3 or 4 vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double word elements of 'in0' are copied to the left half
+                 of 'out0' & even double word elements of 'in1' are copied to
+                 the right half of 'out0'; 'in2' & 'in3' likewise fill 'out1'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
+{                                                            \
+    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
+    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
+}
+#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
+#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)
+
+#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5,        \
+                 out0, out1, out2)                           \
+{                                                            \
+    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
+    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
+    out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5);  \
+}
+#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)
+
+#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3)                         \
+{                                                                \
+    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)
+
+/* Description : Pack both even and odd elements of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even elements (words for _W2, double words for _D2) of 'in0'
+                 and 'in1' are packed into 'out0' & the odd elements of
+                 'in0' and 'in1' are packed into 'out1'.
+*/
+#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1)              \
+{                                                            \
+    out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1);  \
+    out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1);  \
+}
+#define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__)
+
+#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1)              \
+{                                                            \
+    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
+    out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
+}
+#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__)
+
+/* Description : Multiplication of 2, 3 or 4 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3 (expanded without parentheses)
+                 Outputs - out0, out1
+   Details     : 'in0' is multiplied by 'in1' and the result is written to
+                 'out0'; likewise 'in2' * 'in3' is written to 'out1'.
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 * in1;                         \
+    out1 = in2 * in3;                         \
+}
+#define MUL3(in0, in1, in2, in3, in4, in5,  \
+             out0, out1, out2)              \
+{                                           \
+    out0 = in0 * in1;                       \
+    out1 = in2 * in3;                       \
+    out2 = in4 * in5;                       \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    MUL2(in0, in1, in2, in3, out0, out1);             \
+    MUL2(in4, in5, in6, in7, out2, out3);             \
+}
+
+/* Description : Addition of 2, 3 or 4 pairs of variables
+   Arguments   : Inputs  - in0, in1, in2, in3 (expanded without parentheses)
+                 Outputs - out0, out1
+   Details     : 'in0' is added to 'in1' and the result is written to
+                 'out0'; likewise 'in2' + 'in3' is written to 'out1'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 + in1;                         \
+    out1 = in2 + in3;                         \
+}
+#define ADD3(in0, in1, in2, in3, in4, in5,  \
+             out0, out1, out2)              \
+{                                           \
+    out0 = in0 + in1;                       \
+    out1 = in2 + in3;                       \
+    out2 = in4 + in5;                       \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    ADD2(in0, in1, in2, in3, out0, out1);             \
+    ADD2(in4, in5, in6, in7, out2, out3);             \
+}
+
 /* Description : Transpose 4x4 block with word elements in vectors
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1, out2, out3
                  Return Type - as per RTYPE
 */
-#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
-{                                                                   \
-    v4i32 s0_m, s1_m, s2_m, s3_m;                                   \
-                                                                    \
-    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                              \
-    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                              \
-                                                                    \
-    out0 = (RTYPE) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);        \
-    out1 = (RTYPE) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);        \
-    out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);        \
-    out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);        \
+#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,  \
+                       out0, out1, out2, out3)     \
+{                                                  \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                  \
+                                                   \
+    ILVRL_W2_SW(in1, in0, s0_m, s1_m);             \
+    ILVRL_W2_SW(in3, in2, s2_m, s3_m);             \
+    ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1);       \
+    ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3);       \
 }
 #define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
 
index 611ebab..1695471 100644 (file)
@@ -35,20 +35,26 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
 #endif
           )
 {
-    BLASLONG i, j, l;
+    BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+    BLASLONG off;
+#endif
     FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
     FLOAT *pa0, *pb0;
     FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
-    FLOAT a0, a1;
-    FLOAT b0, b1, b2, b3, b4, b5, b6, b7;
+    FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7;
     v4f32 v_alpha = {alpha, alpha, alpha, alpha};
     v4f32 src_a0, src_a1, src_b, src_b0, src_b1;
     v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
     v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
     v4f32 res8, res9, res10, res11, res12, res13, res14, res15;
 
-    for (j = (n / 8); j--;)
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    off = -offset;
+#endif
+
+    for (j = (n >> 3); j--;)
     {
         pc0 = C;
         pc1 = pc0 + ldc;
@@ -59,13 +65,35 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
         pc6 = pc5 + ldc;
         pc7 = pc6 + ldc;
 
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
         pa0 = A;
-        for (i = (m / 8); i--;)
+        for (i = (m >> 3); i--;)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 8;
+            pb0 = B + off * 8;
+#endif
 
-            LD_SP2(pa0, 4, src_a0, src_a1);
-            LD_SP2(pb0, 4, src_b0, src_b1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 8; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+            LD_SP2_INC(pa0, 4, src_a0, src_a1);
+            LD_SP2_INC(pb0, 4, src_b0, src_b1);
 
             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
             res0 = src_a0 * src_b;
@@ -99,13 +127,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             res14 = src_a0 * src_b;
             res15 = src_a1 * src_b;
 
-            pa0 += 8;
-            pb0 += 8;
-
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_SP2(pa0, 4, src_a0, src_a1);
-                LD_SP2(pb0, 4, src_b0, src_b1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pb0, 4, src_b0, src_b1);
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
                 res0 += src_a0 * src_b;
@@ -139,11 +164,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res14 += src_a0 * src_b;
                 res15 += src_a1 * src_b;
 
-                pa0 += 8;
-                pb0 += 8;
-
-                LD_SP2(pa0, 4, src_a0, src_a1);
-                LD_SP2(pb0, 4, src_b0, src_b1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pb0, 4, src_b0, src_b1);
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
                 res0 += src_a0 * src_b;
@@ -176,15 +198,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
                 res14 += src_a0 * src_b;
                 res15 += src_a1 * src_b;
-
-                pa0 += 8;
-                pb0 += 8;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_SP2(pa0, 4, src_a0, src_a1);
-                LD_SP2(pb0, 4, src_b0, src_b1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pb0, 4, src_b0, src_b1);
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
                 res0 += src_a0 * src_b;
@@ -217,11 +236,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
                 res14 += src_a0 * src_b;
                 res15 += src_a1 * src_b;
-
-                pa0 += 8;
-                pb0 += 8;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+            dst4 = res4 * v_alpha;
+            dst5 = res5 * v_alpha;
+            dst6 = res6 * v_alpha;
+            dst7 = res7 * v_alpha;
+#else
             LD_SP2(pc0, 4, dst0, dst1);
             LD_SP2(pc1, 4, dst2, dst3);
             LD_SP2(pc2, 4, dst4, dst5);
@@ -235,12 +261,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst5 += res5 * v_alpha;
             dst6 += res6 * v_alpha;
             dst7 += res7 * v_alpha;
-
-            ST_SP2(dst0, dst1, pc0, 4);
-            ST_SP2(dst2, dst3, pc1, 4);
-            ST_SP2(dst4, dst5, pc2, 4);
-            ST_SP2(dst6, dst7, pc3, 4);
-
+#endif
+            ST_SP2_INC(dst0, dst1, pc0, 4);
+            ST_SP2_INC(dst2, dst3, pc1, 4);
+            ST_SP2_INC(dst4, dst5, pc2, 4);
+            ST_SP2_INC(dst6, dst7, pc3, 4);
+
+#if defined(TRMMKERNEL)
+            dst0 = res8 * v_alpha;
+            dst1 = res9 * v_alpha;
+            dst2 = res10 * v_alpha;
+            dst3 = res11 * v_alpha;
+            dst4 = res12 * v_alpha;
+            dst5 = res13 * v_alpha;
+            dst6 = res14 * v_alpha;
+            dst7 = res15 * v_alpha;
+#else
             LD_SP2(pc4, 4, dst0, dst1);
             LD_SP2(pc5, 4, dst2, dst3);
             LD_SP2(pc6, 4, dst4, dst5);
@@ -254,28 +290,54 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst5 += res13 * v_alpha;
             dst6 += res14 * v_alpha;
             dst7 += res15 * v_alpha;
+#endif
+            ST_SP2_INC(dst0, dst1, pc4, 4);
+            ST_SP2_INC(dst2, dst3, pc5, 4);
+            ST_SP2_INC(dst4, dst5, pc6, 4);
+            ST_SP2_INC(dst6, dst7, pc7, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            pa0 += temp * 8;
+            pb0 += temp * 8;
+#endif
 
-            ST_SP2(dst0, dst1, pc4, 4);
-            ST_SP2(dst2, dst3, pc5, 4);
-            ST_SP2(dst4, dst5, pc6, 4);
-            ST_SP2(dst6, dst7, pc7, 4);
-
-            pc0 += 8;
-            pc1 += 8;
-            pc2 += 8;
-            pc3 += 8;
-            pc4 += 8;
-            pc5 += 8;
-            pc6 += 8;
-            pc7 += 8;
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 4) / 4); i--;)
+        if (m & 4)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 4;
+            pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 8; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             src_a0 = LD_SP(pa0);
-            LD_SP2(pb0, 4, src_b0, src_b1);
+            LD_SP2_INC(pb0, 4, src_b0, src_b1);
 
             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
             res0 = src_a0 * src_b;
@@ -302,12 +364,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             res7 = src_a0 * src_b;
 
             pa0 += 4;
-            pb0 += 8;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 src_a0 = LD_SP(pa0);
-                LD_SP2(pb0, 4, src_b0, src_b1);
+                LD_SP2_INC(pb0, 4, src_b0, src_b1);
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
                 res0 += src_a0 * src_b;
@@ -334,10 +395,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res7 += src_a0 * src_b;
 
                 pa0 += 4;
-                pb0 += 8;
 
                 src_a0 = LD_SP(pa0);
-                LD_SP2(pb0, 4, src_b0, src_b1);
+                LD_SP2_INC(pb0, 4, src_b0, src_b1);
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
                 res0 += src_a0 * src_b;
@@ -364,13 +424,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res7 += src_a0 * src_b;
 
                 pa0 += 4;
-                pb0 += 8;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 src_a0 = LD_SP(pa0);
-                LD_SP2(pb0, 4, src_b0, src_b1);
+                LD_SP2_INC(pb0, 4, src_b0, src_b1);
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
                 res0 += src_a0 * src_b;
@@ -397,9 +456,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res7 += src_a0 * src_b;
 
                 pa0 += 4;
-                pb0 += 8;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+#else
             dst0 = LD_SP(pc0);
             dst1 = LD_SP(pc1);
             dst2 = LD_SP(pc2);
@@ -409,12 +473,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst1 += res1 * v_alpha;
             dst2 += res2 * v_alpha;
             dst3 += res3 * v_alpha;
-
+#endif
             ST_SP(dst0, pc0);
             ST_SP(dst1, pc1);
             ST_SP(dst2, pc2);
             ST_SP(dst3, pc3);
 
+#if defined(TRMMKERNEL)
+            dst0 = res4 * v_alpha;
+            dst1 = res5 * v_alpha;
+            dst2 = res6 * v_alpha;
+            dst3 = res7 * v_alpha;
+#else
             dst0 = LD_SP(pc4);
             dst1 = LD_SP(pc5);
             dst2 = LD_SP(pc6);
@@ -424,12 +494,29 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst1 += res5 * v_alpha;
             dst2 += res6 * v_alpha;
             dst3 += res7 * v_alpha;
-
+#endif
             ST_SP(dst0, pc4);
             ST_SP(dst1, pc5);
             ST_SP(dst2, pc6);
             ST_SP(dst3, pc7);
 
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            pa0 += temp * 4;
+            pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
+
             pc0 += 4;
             pc1 += 4;
             pc2 += 4;
@@ -440,9 +527,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pc7 += 4;
         }
 
-        for (i = ((m & 2) / 2); i--;)
+        if (m & 2)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 2;
+            pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 8; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -482,7 +587,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 2;
             pb0 += 8;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -561,7 +666,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 8;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -611,6 +716,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             tmp12 = alpha * tmp12;
             tmp14 = alpha * tmp14;
 
+#if defined(TRMMKERNEL)
+            pc0[0] = tmp0;
+            pc1[0] = tmp2;
+            pc2[0] = tmp4;
+            pc3[0] = tmp6;
+            pc4[0] = tmp8;
+            pc5[0] = tmp10;
+            pc6[0] = tmp12;
+            pc7[0] = tmp14;
+#else
             pc0[0] += tmp0;
             pc1[0] += tmp2;
             pc2[0] += tmp4;
@@ -619,7 +734,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pc5[0] += tmp10;
             pc6[0] += tmp12;
             pc7[0] += tmp14;
-
+#endif
             tmp1 = alpha * tmp1;
             tmp3 = alpha * tmp3;
             tmp5 = alpha * tmp5;
@@ -629,6 +744,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             tmp13 = alpha * tmp13;
             tmp15 = alpha * tmp15;
 
+#if defined(TRMMKERNEL)
+            pc0[1] = tmp1;
+            pc1[1] = tmp3;
+            pc2[1] = tmp5;
+            pc3[1] = tmp7;
+            pc4[1] = tmp9;
+            pc5[1] = tmp11;
+            pc6[1] = tmp13;
+            pc7[1] = tmp15;
+#else
             pc0[1] += tmp1;
             pc1[1] += tmp3;
             pc2[1] += tmp5;
@@ -637,6 +762,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pc5[1] += tmp11;
             pc6[1] += tmp13;
             pc7[1] += tmp15;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            pa0 += temp * 2;
+            pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
 
             pc0 += 2;
             pc1 += 2;
@@ -648,9 +791,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pc7 += 2;
         }
 
-        for (i = (m & 1); i--;)
+        if (m & 1)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 1;
+            pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 8; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -680,7 +841,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 1;
             pb0 += 8;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -739,14 +900,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 8;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
                 tmp0 += a0 * b0;
 
                 b1 = pb0[1];
-                tmp1  += a0 * b1;
+                tmp1 += a0 * b1;
 
                 b2 = pb0[2];
                 tmp2 += a0 * b2;
@@ -779,6 +940,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             tmp6 = alpha * tmp6;
             tmp7 = alpha * tmp7;
 
+#if defined(TRMMKERNEL)
+            pc0[0] = tmp0;
+            pc1[0] = tmp1;
+            pc2[0] = tmp2;
+            pc3[0] = tmp3;
+            pc4[0] = tmp4;
+            pc5[0] = tmp5;
+            pc6[0] = tmp6;
+            pc7[0] = tmp7;
+#else
             pc0[0] += tmp0;
             pc1[0] += tmp1;
             pc2[0] += tmp2;
@@ -787,7 +958,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pc5[0] += tmp5;
             pc6[0] += tmp6;
             pc7[0] += tmp7;
+#endif
 
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            pa0 += temp * 1;
+            pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
             pc0 += 1;
             pc1 += 1;
             pc2 += 1;
@@ -798,13 +986,17 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pc7 += 1;
         }
 
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 8; // number of values in B
+#endif
+
         l = (k << 3);
         B = B + l;
         i = (ldc << 3);
         C = C + i;
     }
 
-    for (j = ((n & 4) / 4); j--;)
+    if (n & 4)
     {
         pc0 = C;
         pc1 = pc0 + ldc;
@@ -813,11 +1005,33 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
 
         pa0 = A;
 
-        for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        for (i = (m >> 3); i--;)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 8;
+            pb0 = B + off * 4;
+#endif
 
-            LD_SP2(pa0, 4, src_a0, src_a1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+            LD_SP2_INC(pa0, 4, src_a0, src_a1);
             src_b0 = LD_SP(pb0);
 
             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
@@ -836,12 +1050,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             res6 = src_a0 * src_b;
             res7 = src_a1 * src_b;
 
-            pa0 += 8;
             pb0 += 4;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
                 src_b0 = LD_SP(pb0);
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
@@ -860,10 +1073,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res6 += src_a0 * src_b;
                 res7 += src_a1 * src_b;
 
-                pa0 += 8;
                 pb0 += 4;
 
-                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
                 src_b0 = LD_SP(pb0);
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
@@ -882,13 +1094,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res6 += src_a0 * src_b;
                 res7 += src_a1 * src_b;
 
-                pa0 += 8;
                 pb0 += 4;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
                 src_b0 = LD_SP(pb0);
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
@@ -907,10 +1118,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res6 += src_a0 * src_b;
                 res7 += src_a1 * src_b;
 
-                pa0 += 8;
                 pb0 += 4;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+            dst4 = res4 * v_alpha;
+            dst5 = res5 * v_alpha;
+            dst6 = res6 * v_alpha;
+            dst7 = res7 * v_alpha;
+#else
             LD_SP2(pc0, 4, dst0, dst1);
             LD_SP2(pc1, 4, dst2, dst3);
             LD_SP2(pc2, 4, dst4, dst5);
@@ -924,21 +1144,52 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst5 += res5 * v_alpha;
             dst6 += res6 * v_alpha;
             dst7 += res7 * v_alpha;
+#endif
 
-            ST_SP2(dst0, dst1, pc0, 4);
-            ST_SP2(dst2, dst3, pc1, 4);
-            ST_SP2(dst4, dst5, pc2, 4);
-            ST_SP2(dst6, dst7, pc3, 4);
+            ST_SP2_INC(dst0, dst1, pc0, 4);
+            ST_SP2_INC(dst2, dst3, pc1, 4);
+            ST_SP2_INC(dst4, dst5, pc2, 4);
+            ST_SP2_INC(dst6, dst7, pc3, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 8;
+            pb0 += temp * 4;
+#endif
 
-            pc0 += 8;
-            pc1 += 8;
-            pc2 += 8;
-            pc3 += 8;
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 4) / 4); i--;)
+        if (m & 4)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 4;
+            pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
             pb0 = B;
+            temp = k;
+#endif
 
             src_a0 = LD_SP(pa0);
             src_b0 = LD_SP(pb0);
@@ -958,7 +1209,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 4;
             pb0 += 4;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 src_a0 = LD_SP(pa0);
                 src_b0 = LD_SP(pb0);
@@ -997,7 +1248,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 4;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 src_a0 = LD_SP(pa0);
                 src_b0 = LD_SP(pb0);
@@ -1017,7 +1268,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pa0 += 4;
                 pb0 += 4;
             }
-
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+#else
             dst0 = LD_SP(pc0);
             dst1 = LD_SP(pc1);
             dst2 = LD_SP(pc2);
@@ -1027,21 +1283,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst1 += res1 * v_alpha;
             dst2 += res2 * v_alpha;
             dst3 += res3 * v_alpha;
-
+#endif
             ST_SP(dst0, pc0);
             ST_SP(dst1, pc1);
             ST_SP(dst2, pc2);
             ST_SP(dst3, pc3);
 
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 4;
+            pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
             pc0 += 4;
             pc1 += 4;
             pc2 += 4;
             pc3 += 4;
         }
 
-        for (i = ((m & 2) / 2); i--;)
+        if (m & 2)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 2;
+            pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -1065,7 +1355,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 2;
             pb0 += 4;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1112,7 +1402,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 4;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1142,20 +1432,50 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             tmp4 = alpha * tmp4;
             tmp6 = alpha * tmp6;
 
+#if defined(TRMMKERNEL)
+            pc0[0] = tmp0;
+            pc1[0] = tmp2;
+            pc2[0] = tmp4;
+            pc3[0] = tmp6;
+#else
             pc0[0] += tmp0;
             pc1[0] += tmp2;
             pc2[0] += tmp4;
             pc3[0] += tmp6;
-
+#endif
             tmp1 = alpha * tmp1;
             tmp3 = alpha * tmp3;
             tmp5 = alpha * tmp5;
             tmp7 = alpha * tmp7;
 
+#if defined(TRMMKERNEL)
+            pc0[1] = tmp1;
+            pc1[1] = tmp3;
+            pc2[1] = tmp5;
+            pc3[1] = tmp7;
+#else
             pc0[1] += tmp1;
             pc1[1] += tmp3;
             pc2[1] += tmp5;
             pc3[1] += tmp7;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 2;
+            pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
 
             pc0 += 2;
             pc1 += 2;
@@ -1163,9 +1483,27 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pc3 += 2;
         }
 
-        for (i = (m & 1); i--;)
+        if (m & 1)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 1;
+            pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -1183,7 +1521,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 1;
             pb0 += 4;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1218,7 +1556,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 4;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1242,35 +1580,84 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             tmp2 = alpha * tmp2;
             tmp3 = alpha * tmp3;
 
+#if defined(TRMMKERNEL)
+            pc0[0] = tmp0;
+            pc1[0] = tmp1;
+            pc2[0] = tmp2;
+            pc3[0] = tmp3;
+#else
             pc0[0] += tmp0;
             pc1[0] += tmp1;
             pc2[0] += tmp2;
             pc3[0] += tmp3;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 1;
+            pb0 += temp * 4;
+#endif
 
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
             pc0 += 1;
             pc1 += 1;
             pc2 += 1;
             pc3 += 1;
         }
 
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4; // number of values in B
+#endif
+
         l = (k << 2);
         B = B + l;
         i = (ldc << 2);
         C = C + i;
     }
 
-    for (j = ((n & 2) / 2); j--;)
+    if (n & 2)
     {
         pc0 = C;
         pc1 = pc0 + ldc;
 
         pa0 = A;
 
-        for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        for (i = (m >> 3); i--;)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 8;
+            pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
-            LD_SP2(pa0, 4, src_a0, src_a1);
+            LD_SP2_INC(pa0, 4, src_a0, src_a1);
             src_b0[0] = pb0[0];
             src_b0[1] = pb0[1];
 
@@ -1282,12 +1669,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             res2 = src_a0 * src_b;
             res3 = src_a1 * src_b;
 
-            pa0 += 8;
             pb0 += 2;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
                 src_b0[0] = pb0[0];
                 src_b0[1] = pb0[1];
 
@@ -1299,10 +1685,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res2 += src_a0 * src_b;
                 res3 += src_a1 * src_b;
 
-                pa0 += 8;
                 pb0 += 2;
 
-                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
                 src_b0[0] = pb0[0];
                 src_b0[1] = pb0[1];
 
@@ -1314,13 +1699,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res2 += src_a0 * src_b;
                 res3 += src_a1 * src_b;
 
-                pa0 += 8;
                 pb0 += 2;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
                 src_b0[0] = pb0[0];
                 src_b0[1] = pb0[1];
 
@@ -1332,10 +1716,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 res2 += src_a0 * src_b;
                 res3 += src_a1 * src_b;
 
-                pa0 += 8;
                 pb0 += 2;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+            dst2 = res2 * v_alpha;
+            dst3 = res3 * v_alpha;
+#else
             LD_SP2(pc0, 4, dst0, dst1);
             LD_SP2(pc1, 4, dst2, dst3);
 
@@ -1343,17 +1732,49 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             dst1 += res1 * v_alpha;
             dst2 += res2 * v_alpha;
             dst3 += res3 * v_alpha;
+#endif
+            ST_SP2_INC(dst0, dst1, pc0, 4);
+            ST_SP2_INC(dst2, dst3, pc1, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 8;
+            pb0 += temp * 2;
+#endif
 
-            ST_SP2(dst0, dst1, pc0, 4);
-            ST_SP2(dst2, dst3, pc1, 4);
-
-            pc0 += 8;
-            pc1 += 8;
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 4) / 4); i--;)
+        if (m & 4)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 4;
+            pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
             pb0 = B;
+            temp = k;
+#endif
 
             src_a0 = LD_SP(pa0);
             src_b0[0] = pb0[0];
@@ -1368,7 +1789,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 4;
             pb0 += 2;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 src_a0 = LD_SP(pa0);
                 src_b0[0] = pb0[0];
@@ -1397,7 +1818,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 2;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 src_a0 = LD_SP(pa0);
                 src_b0[0] = pb0[0];
@@ -1413,22 +1834,60 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 2;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+#else
             dst0 = LD_SP(pc0);
             dst1 = LD_SP(pc1);
 
             dst0 += res0 * v_alpha;
             dst1 += res1 * v_alpha;
-
+#endif
             ST_SP(dst0, pc0);
             ST_SP(dst1, pc1);
 
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 4;
+            pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
             pc0 += 4;
             pc1 += 4;
         }
 
-        for (i = ((m & 2) / 2); i--;)
+        if (m & 2)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 2;
+            pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -1444,7 +1903,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 2;
             pb0 += 2;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1475,7 +1934,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 2;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1493,24 +1952,64 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             }
 
             tmp0 = alpha * tmp0;
+            tmp1 = alpha * tmp1;
             tmp2 = alpha * tmp2;
+            tmp3 = alpha * tmp3;
 
+#if defined(TRMMKERNEL)
+            pc0[0] = tmp0;
+            pc1[0] = tmp2;
+            pc0[1] = tmp1;
+            pc1[1] = tmp3;
+#else
             pc0[0] += tmp0;
             pc1[0] += tmp2;
-
-            tmp1 = alpha * tmp1;
-            tmp3 = alpha * tmp3;
-
             pc0[1] += tmp1;
             pc1[1] += tmp3;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 2;
+            pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
 
             pc0 += 2;
             pc1 += 2;
         }
 
-        for (i = (m & 1); i--;)
+        if (m & 1)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 1;
+            pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -1522,7 +2021,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 1;
             pb0 += 2;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1545,7 +2044,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 2;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1561,87 +2060,166 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             tmp0 = alpha * tmp0;
             tmp1 = alpha * tmp1;
 
+#if defined(TRMMKERNEL)
+            pc0[0] = tmp0;
+            pc1[0] = tmp1;
+#else
             pc0[0] += tmp0;
             pc1[0] += tmp1;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 1;
+            pb0 += temp * 2;
+#endif
 
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
             pc0 += 1;
             pc1 += 1;
         }
 
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2; // number of values in B
+#endif
         l = (k << 1);
         B = B + l;
         i = (ldc << 1);
         C = C + i;
     }
 
-    for (j = (n & 1); j--;)
+    if (n & 1)
     {
         pc0 = C;
         pa0 = A;
 
-        for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        for (i = (m >> 3); i--;)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 8;
+            pb0 = B + off * 1;
+#endif
 
-            LD_SP2(pa0, 4, src_a0, src_a1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 8; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+            LD_SP2_INC(pa0, 4, src_a0, src_a1);
             src_b0[0] = pb0[0];
 
             src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
             res0 = src_a0 * src_b;
             res1 = src_a1 * src_b;
 
-            pa0 += 8;
             pb0 += 1;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
-                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
                 src_b0[0] = pb0[0];
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
                 res0 += src_a0 * src_b;
                 res1 += src_a1 * src_b;
 
-                pa0 += 8;
                 pb0 += 1;
 
-                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
                 src_b0[0] = pb0[0];
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
                 res0 += src_a0 * src_b;
                 res1 += src_a1 * src_b;
 
-                pa0 += 8;
                 pb0 += 1;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
-                LD_SP2(pa0, 4, src_a0, src_a1);
+                LD_SP2_INC(pa0, 4, src_a0, src_a1);
                 src_b0[0] = pb0[0];
 
                 src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
                 res0 += src_a0 * src_b;
                 res1 += src_a1 * src_b;
 
-                pa0 += 8;
                 pb0 += 1;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+            dst1 = res1 * v_alpha;
+#else
             LD_SP2(pc0, 4, dst0, dst1);
 
             dst0 += res0 * v_alpha;
             dst1 += res1 * v_alpha;
+#endif
+            ST_SP2_INC(dst0, dst1, pc0, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 8; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 8;
+            pb0 += temp * 1;
+#endif
 
-            ST_SP2(dst0, dst1, pc0, 4);
-
-            pc0 += 8;
+#ifdef LEFT
+            off += 8; // number of values in A
+#endif
+#endif
         }
 
-        for (i = ((m & 4) / 4); i--;)
+        if (m & 4)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 4;
+            pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
             pb0 = B;
+            temp = k;
+#endif
 
             src_a0 = LD_SP(pa0);
             src_b0[0] = pb0[0];
@@ -1652,7 +2230,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 4;
             pb0 += 1;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 src_a0 = LD_SP(pa0);
                 src_b0[0] = pb0[0];
@@ -1673,7 +2251,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 src_a0 = LD_SP(pa0);
                 src_b0[0] = pb0[0];
@@ -1685,18 +2263,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
+#if defined(TRMMKERNEL)
+            dst0 = res0 * v_alpha;
+#else
             dst0 = LD_SP(pc0);
 
             dst0 += res0 * v_alpha;
-
+#endif
             ST_SP(dst0, pc0);
 
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 4;
+            pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
             pc0 += 4;
         }
 
-        for (i = (m & 2) / 2; i--;)
+        if (m & 2)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2;
+            pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
             pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -1708,7 +2323,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 2;
             pb0 += 1;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1731,7 +2346,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1744,18 +2359,55 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
-            tmp0 = alpha * tmp0;
-            pc0[0] += tmp0;
-
-            tmp1 = alpha * tmp1;
-            pc0[1] += tmp1;
+#if defined(TRMMKERNEL)
+            pc0[0] = alpha * tmp0; // TRMM: overwrite C with the alpha-scaled product
+            pc0[1] = alpha * tmp1;
+#else
+            pc0[0] += alpha * tmp0; // GEMM: accumulate the alpha-scaled product into C
+            pc0[1] += alpha * tmp1;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 2;
+            pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
 
             pc0 += 2;
         }
 
-        for (i = (m & 1); i--;)
+        if (m & 1)
         {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             pb0 = B;
+#else
+            pa0 += off * 1;
+            pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
 
             a0 = pa0[0];
             b0 = pb0[0];
@@ -1764,7 +2416,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
             pa0 += 1;
             pb0 += 1;
 
-            for (l = ((k - 1) / 2); l--;)
+            for (l = ((temp - 1) >> 1); l--;)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1781,7 +2433,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
-            if ((k - 1) & 1)
+            if ((temp - 1) & 1)
             {
                 a0 = pa0[0];
                 b0 = pb0[0];
@@ -1791,11 +2443,35 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
                 pb0 += 1;
             }
 
+#if defined(TRMMKERNEL)
+            pc0[0] = alpha * tmp0;
+#else
             pc0[0] += alpha * tmp0;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 1;
+            pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
 
             pc0 += 1;
         }
 
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1; // number of values in B
+#endif
         l = (k << 0);
         B = B + l;
         i = (ldc << 0);
index 71048f1..8618c44 100644 (file)
@@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include "macros_msa.h"
 
-int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
-          FLOAT * __restrict dst)
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
 {
     BLASLONG i, j;
-    FLOAT *psrc0;
-    FLOAT *psrc1, *psrc2, *psrc3, *psrc4;
-    FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
-    FLOAT *pdst;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+    FLOAT *psrc8, *pdst;
     v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
     v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
     v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
@@ -58,22 +55,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (m >> 3); i--;)
         {
-            LD_SP2(psrc1, 4, src0, src1);
-            LD_SP2(psrc2, 4, src2, src3);
-            LD_SP2(psrc3, 4, src4, src5);
-            LD_SP2(psrc4, 4, src6, src7);
-            LD_SP2(psrc5, 4, src8, src9);
-            LD_SP2(psrc6, 4, src10, src11);
-            LD_SP2(psrc7, 4, src12, src13);
-            LD_SP2(psrc8, 4, src14, src15);
-            psrc1 += 8;
-            psrc2 += 8;
-            psrc3 += 8;
-            psrc4 += 8;
-            psrc5 += 8;
-            psrc6 += 8;
-            psrc7 += 8;
-            psrc8 += 8;
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+            LD_SP2_INC(psrc5, 4, src8, src9);
+            LD_SP2_INC(psrc6, 4, src10, src11);
+            LD_SP2_INC(psrc7, 4, src12, src13);
+            LD_SP2_INC(psrc8, 4, src14, src15);
 
             TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6);
             TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5,
@@ -83,15 +72,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13,
                                dst15);
 
-            ST_SP2(dst0, dst1, pdst, 4);
-            ST_SP2(dst2, dst3, pdst + 8, 4);
-            ST_SP2(dst4, dst5, pdst + 16, 4);
-            ST_SP2(dst6, dst7, pdst + 24, 4);
-            ST_SP2(dst8, dst9, pdst + 32, 4);
-            ST_SP2(dst10, dst11, pdst + 40, 4);
-            ST_SP2(dst12, dst13, pdst + 48, 4);
-            ST_SP2(dst14, dst15, pdst + 56, 4);
-            pdst += 64;
+            ST_SP2_INC(dst0, dst1, pdst, 4);
+            ST_SP2_INC(dst2, dst3, pdst, 4);
+            ST_SP2_INC(dst4, dst5, pdst, 4);
+            ST_SP2_INC(dst6, dst7, pdst, 4);
+            ST_SP2_INC(dst8, dst9, pdst, 4);
+            ST_SP2_INC(dst10, dst11, pdst, 4);
+            ST_SP2_INC(dst12, dst13, pdst, 4);
+            ST_SP2_INC(dst14, dst15, pdst, 4);
         }
 
         for (i = (m & 7); i--;)
@@ -128,9 +116,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
             TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
 
-            ST_SP2(dst0, dst1, pdst, 4);
-            ST_SP2(dst2, dst3, pdst + 8, 4);
-            pdst += 16;
+            ST_SP2_INC(dst0, dst1, pdst, 4);
+            ST_SP2_INC(dst2, dst3, pdst, 4);
         }
 
         for (i = (m & 3); i--;)
index 7d4aecb..3542eca 100644 (file)
@@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include "macros_msa.h"
 
-int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
-          FLOAT * __restrict dst)
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
 {
     BLASLONG i, j;
-    FLOAT *psrc0;
-    FLOAT *psrc1, *psrc2, *psrc3, *psrc4;
-    FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
-    FLOAT *pdst0,  *pdst1, *pdst2, *pdst3, *pdst4;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+    FLOAT *psrc8, *pdst0,  *pdst1, *pdst2, *pdst3, *pdst4;
     v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
     v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
 
@@ -63,22 +60,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 3); i--;)
         {
-            LD_SP2(psrc1, 4, src0, src1);
-            LD_SP2(psrc2, 4, src2, src3);
-            LD_SP2(psrc3, 4, src4, src5);
-            LD_SP2(psrc4, 4, src6, src7);
-            LD_SP2(psrc5, 4, src8, src9);
-            LD_SP2(psrc6, 4, src10, src11);
-            LD_SP2(psrc7, 4, src12, src13);
-            LD_SP2(psrc8, 4, src14, src15);
-            psrc1 += 8;
-            psrc2 += 8;
-            psrc3 += 8;
-            psrc4 += 8;
-            psrc5 += 8;
-            psrc6 += 8;
-            psrc7 += 8;
-            psrc8 += 8;
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+            LD_SP2_INC(psrc5, 4, src8, src9);
+            LD_SP2_INC(psrc6, 4, src10, src11);
+            LD_SP2_INC(psrc7, 4, src12, src13);
+            LD_SP2_INC(psrc8, 4, src14, src15);
 
             ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4);
             ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15,
@@ -105,8 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             psrc7 += 4;
             psrc8 += 4;
 
-            ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4);
-            pdst2 += 32;
+            ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4);
         }
 
         if (n & 2)
@@ -155,14 +143,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 3); i--;)
         {
-            LD_SP2(psrc1, 4, src0, src1);
-            LD_SP2(psrc2, 4, src2, src3);
-            LD_SP2(psrc3, 4, src4, src5);
-            LD_SP2(psrc4, 4, src6, src7);
-            psrc1 += 8;
-            psrc2 += 8;
-            psrc3 += 8;
-            psrc4 += 8;
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
 
             ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4);
             pdst1 += 8 * m;
@@ -179,8 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             psrc3 += 4;
             psrc4 += 4;
 
-            ST_SP4(src0, src1, src2, src3, pdst2, 4);
-            pdst2 += 16;
+            ST_SP4_INC(src0, src1, src2, src3, pdst2, 4);
         }
 
         if (n & 2)
@@ -215,10 +198,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 3); i--;)
         {
-            LD_SP2(psrc1, 4, src0, src1);
-            LD_SP2(psrc2, 4, src2, src3);
-            psrc1 += 8;
-            psrc2 += 8;
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
 
             ST_SP4(src0, src1, src2, src3, pdst1, 4);
             pdst1 += 8 * m;
@@ -231,8 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
             psrc1 += 4;
             psrc2 += 4;
 
-            ST_SP2(src0, src1, pdst2, 4);
-            pdst2 += 8;
+            ST_SP2_INC(src0, src1, pdst2, 4);
         }
 
         if (n & 2)
@@ -260,8 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
 
         for (i = (n >> 3); i--;)
         {
-            LD_SP2(psrc1, 4, src0, src1);
-            psrc1 += 8;
+            LD_SP2_INC(psrc1, 4, src0, src1);
 
             ST_SP2(src0, src1, pdst1, 4);
             pdst1 += 8 * m;
@@ -288,5 +267,5 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
         }
     }
 
-  return 0;
+    return 0;
 }
index 516b975..53891e6 100644 (file)
@@ -166,7 +166,7 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
     src_a = LD_SP(a + 32);
     SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
-    COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36);
+    src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));
 
     res_c4 *= src_a36;
     res_c12 *= src_a36;
@@ -220,9 +220,9 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     res_c0 -= res_c2 * src_a16;
     res_c8 -= res_c10 * src_a16;
 
-    COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9);
-    COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8);
-    COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
+    src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
+    src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
+    src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
 
     res_c1 *= src_a9;
     res_c9 *= src_a9;
@@ -306,7 +306,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         bb += 4;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         LD_SP2(aa, 4, src_a0, src_a1);
 
@@ -374,7 +374,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
     src_a = LD_SP(a + 32);
     SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
-    COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36);
+    src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));
 
     res_c4 *= src_a36;
     res_c3 -= res_c4 * src_a35;
@@ -399,9 +399,9 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     res_c1 -= res_c2 * src_a17;
     res_c0 -= res_c2 * src_a16;
 
-    COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9);
-    COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8);
-    COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
+    src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
+    src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
+    src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
 
     res_c1 *= src_a9;
     res_c0 -= res_c1 * src_a8;
@@ -826,9 +826,9 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
     src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
 
-    COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5);
-    COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4);
-    COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
+    src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
+    src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
+    src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
 
     res_c3 *= src_a15;
     res_c7 *= src_a15;
@@ -916,7 +916,7 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         bb += 4;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         src_a0 = LD_SP(aa);
 
@@ -940,9 +940,9 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2);
     src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
     src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
-    COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5);
-    COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4);
-    COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
+    src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
+    src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
+    src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
 
     res_c3 *= src_a15;
     res_c2 -= res_c3 * src_a14;
index c087fda..5834d77 100644 (file)
@@ -162,7 +162,7 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
     src_a = LD_SP(a + 27);
     SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
-    COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31);
+    src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
 
     res_c3 *= src_a27;
     res_c11 *= src_a27;
@@ -216,9 +216,9 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     res_c7 -= res_c5 * src_a47;
     res_c15 -= res_c13 * src_a47;
 
-    COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54);
-    COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55);
-    COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63);
+    src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
+    src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
+    src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
 
     res_c6 *= src_a54;
     res_c14 *= src_a54;
@@ -334,7 +334,7 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
     src_a = LD_SP(a + 27);
     SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
-    COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31);
+    src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
 
     res_c3 *= src_a27;
     res_c4 -= res_c3 * src_a28;
@@ -359,9 +359,9 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     res_c6 -= res_c5 * src_a46;
     res_c7 -= res_c5 * src_a47;
 
-    COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54);
-    COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55);
-    COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63);
+    src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
+    src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
+    src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
 
     res_c6 *= src_a54;
     res_c7 -= res_c6 * src_a55;
@@ -780,7 +780,7 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         b += 8;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         src_a0 = LD_SP(a);
 
@@ -813,9 +813,9 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
     src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
     src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
-    COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10);
-    COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11);
-    COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15);
+    src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
+    src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
+    src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
 
     res_c0 *= src_a0;
     res_c4 *= src_a0;
@@ -902,7 +902,7 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         b += 4;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         src_a0 = LD_SP(a);
 
@@ -926,9 +926,9 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
     src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
     src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
-    COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10);
-    COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11);
-    COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15);
+    src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
+    src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
+    src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
 
     res_c0 *= src_a0;
     res_c1 -= res_c0 * src_a1;
index 69d7b5f..642ee37 100644 (file)
@@ -144,7 +144,7 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
     src_b = LD_SP(b + 27);
     SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
-    COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31);
+    src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
 
     src_c4 *= src_b18;
     src_c5 *= src_b18;
@@ -184,9 +184,9 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
     src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
 
-    COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54);
-    COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55);
-    COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63);
+    src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
+    src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
+    src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
 
     src_c8 *= src_b36;
     src_c9 *= src_b36;
@@ -275,7 +275,7 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         b += 4;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         LD_SP2(a, 4, src_a0, src_a1);
 
@@ -300,9 +300,9 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
     src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
     src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
-    COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10);
-    COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11);
-    COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15);
+    src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
+    src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
+    src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
 
     src_c0 *= src_b0;
     src_c1 *= src_b0;
@@ -351,8 +351,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     {
         LD_SP2(a, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
-        COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -364,8 +364,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
         LD_SP2(a, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
-        COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -376,12 +376,12 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         b += 2;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         LD_SP2(a, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
-        COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -392,9 +392,9 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         b += 2;
     }
 
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
-    COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
-    COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+    src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
+    src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
 
     src_c0 *= src_b0;
     src_c1 *= src_b0;
@@ -419,7 +419,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     {
         LD_SP2(a, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -429,7 +429,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
         LD_SP2(a, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -439,7 +439,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
         LD_SP2(a, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -449,7 +449,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
         LD_SP2(a, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -458,13 +458,13 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         b += 1;
     }
 
-    if (bk & 3)
+    if ((bk & 3) && (bk > 0))
     {
         if (bk & 2)
         {
             LD_SP2(a, 4, src_a0, src_a1);
 
-            COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
             src_c0 -= src_a0 * src_b0;
             src_c1 -= src_a1 * src_b0;
@@ -474,7 +474,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
             LD_SP2(a, 4, src_a0, src_a1);
 
-            COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
             src_c0 -= src_a0 * src_b0;
             src_c1 -= src_a1 * src_b0;
@@ -487,7 +487,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         {
             LD_SP2(a, 4, src_a0, src_a1);
 
-            COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
             src_c0 -= src_a0 * src_b0;
             src_c1 -= src_a1 * src_b0;
@@ -497,7 +497,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         }
     }
 
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
     src_c0 *= src_b0;
     src_c1 *= src_b0;
@@ -574,7 +574,7 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
     src_b = LD_SP(b + 27);
     SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
-    COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31);
+    src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
 
     src_b = LD_SP(b + 36);
     SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
@@ -584,9 +584,9 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
     src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
 
-    COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54);
-    COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55);
-    COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63);
+    src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
+    src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
+    src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
 
     src_c0 *= src_b0;
     src_c1 -= src_c0 * src_b1;
@@ -686,7 +686,7 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         b += 4;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         src_a0 = LD_SP(a);
 
@@ -707,9 +707,9 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
     src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
     src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
-    COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10);
-    COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11);
-    COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15);
+    src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
+    src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
+    src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
 
     src_c0 *= src_b0;
     src_c1 -= src_c0 * src_b1;
@@ -789,7 +789,7 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         b += 2;
     }
 
-    if (bk & 3)
+    if ((bk & 3) && (bk > 0))
     {
         if (bk & 2)
         {
@@ -831,9 +831,9 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         }
     }
 
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
-    COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
-    COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+    src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
+    src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
 
     src_c0 *= src_b0;
     src_c1 -= src_c0 * src_b1;
index eefd3a6..21e41c8 100644 (file)
@@ -158,7 +158,7 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
     src_b = LD_SP(b + 32);
     SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
-    COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36);
+    src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
 
     src_c8 *= src_b36;
     src_c9 *= src_b36;
@@ -203,9 +203,9 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     ST_SP2(src_c4, src_c5, c_nxt2line, 4);
     ST_SP2(src_c6, src_c7, c_nxt3line, 4);
 
-    COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9);
-    COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8);
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+    src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
+    src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
     src_c2 *= src_b9;
     src_c3 *= src_b9;
@@ -273,7 +273,7 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         bb += 4;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         LD_SP2(aa, 4, src_a0, src_a1);
 
@@ -298,9 +298,9 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
     src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
     src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
-    COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5);
-    COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4);
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+    src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
+    src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
     src_c7 *= src_b15;
     src_c6 *= src_b15;
@@ -350,8 +350,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     {
         LD_SP2(aa, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
-        COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -363,8 +363,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
         LD_SP2(aa, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
-        COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -375,12 +375,12 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         bb += 2;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         LD_SP2(aa, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
-        COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+        src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -391,9 +391,9 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     a -= 16;
     b -= 4;
 
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
-    COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2);
-    COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+    src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
+    src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
 
     src_c2 *= src_b3;
     src_c3 *= src_b3;
@@ -419,7 +419,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
     {
         LD_SP2(aa, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -429,7 +429,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
 
         LD_SP2(aa, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -439,7 +439,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
 
         LD_SP2(aa, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -449,7 +449,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
 
         LD_SP2(aa, 4, src_a0, src_a1);
 
-        COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+        src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
 
         src_c0 -= src_a0 * src_b0;
         src_c1 -= src_a1 * src_b0;
@@ -458,13 +458,13 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
         bb += 1;
     }
 
-    if (bk & 3)
+    if ((bk & 3) && (bk > 0))
     {
         if (bk & 2)
         {
             LD_SP2(aa, 4, src_a0, src_a1);
 
-            COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
 
             src_c0 -= src_a0 * src_b0;
             src_c1 -= src_a1 * src_b0;
@@ -474,7 +474,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
 
             LD_SP2(aa, 4, src_a0, src_a1);
 
-            COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
 
             src_c0 -= src_a0 * src_b0;
             src_c1 -= src_a1 * src_b0;
@@ -487,7 +487,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
         {
             LD_SP2(aa, 4, src_a0, src_a1);
 
-            COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+            src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
 
             src_c0 -= src_a0 * src_b0;
             src_c1 -= src_a1 * src_b0;
@@ -497,7 +497,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
     a -= 8;
     b -= 1;
 
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
     src_c0 *= src_b0;
     src_c1 *= src_b0;
@@ -579,7 +579,7 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
 
     src_b = LD_SP(b + 32);
     SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
-    COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36);
+    src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
 
     src_b = LD_SP(b + 24);
     SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
@@ -589,9 +589,9 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
     src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
 
-    COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9);
-    COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8);
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+    src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
+    src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
     src_c7 *= src_b63;
     src_c6 -= src_c7 * src_b62;
@@ -695,7 +695,7 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         bb += 4;
     }
 
-    if (bk & 1)
+    if ((bk & 1) && (bk > 0))
     {
         src_a = LD_SP(aa);
 
@@ -717,9 +717,9 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
     src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
     src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
-    COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5);
-    COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4);
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+    src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
+    src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
     src_c3 *= src_b15;
     src_c2 -= src_c3 * src_b14;
@@ -800,7 +800,7 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
         bb += 2;
     }
 
-    if (bk & 3)
+    if ((bk & 3) && (bk > 0))
     {
         if (bk & 2)
         {
@@ -842,9 +842,9 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
     a -= 8;
     b -= 4;
 
-    COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
-    COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2);
-    COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+    src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
+    src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
+    src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
 
     src_c1 *= src_b3;
     src_c0 -= src_c1 * src_b2;
diff --git a/kernel/mips/zgemm_kernel_4x4_msa.c b/kernel/mips/zgemm_kernel_4x4_msa.c
new file mode 100644 (file)
index 0000000..a185c69
--- /dev/null
@@ -0,0 +1,1589 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)    \
+{   /* Update res0..res7 from 4 A and 4 B vectors */     \
+    LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);  \
+    LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3);  \
+    /* Split A into real (..r) and imaginary (..i) */    \
+    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);     \
+    PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i);     \
+    /* OPn## = pastes OPn onto '='; OP4 is a prefix */   \
+    /* col 0: b0 re/im splat -> res0 (a0), res1 (a1) */  \
+    SPLATI_D2_DP(src_b0, src_br, src_bi);                \
+    res0_r OP0## = src_a0r * src_br;                     \
+    res0_r OP1## = src_a0i * src_bi;                     \
+    res0_i OP2## = OP4 src_a0r * src_bi;                 \
+    res0_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res1_r OP0## = src_a1r * src_br;                     \
+    res1_r OP1## = src_a1i * src_bi;                     \
+    res1_i OP2## = OP4 src_a1r * src_bi;                 \
+    res1_i OP3## = src_a1i * src_br;                     \
+                                                         \
+    /* col 1: b1 re/im splat -> res2 (a0), res3 (a1) */  \
+    SPLATI_D2_DP(src_b1, src_br, src_bi);                \
+    res2_r OP0## = src_a0r * src_br;                     \
+    res2_r OP1## = src_a0i * src_bi;                     \
+    res2_i OP2## = OP4 src_a0r * src_bi;                 \
+    res2_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res3_r OP0## = src_a1r * src_br;                     \
+    res3_r OP1## = src_a1i * src_bi;                     \
+    res3_i OP2## = OP4 src_a1r * src_bi;                 \
+    res3_i OP3## = src_a1i * src_br;                     \
+                                                         \
+    /* col 2: b2 re/im splat -> res4 (a0), res5 (a1) */  \
+    SPLATI_D2_DP(src_b2, src_br, src_bi);                \
+    res4_r OP0## = src_a0r * src_br;                     \
+    res4_r OP1## = src_a0i * src_bi;                     \
+    res4_i OP2## = OP4 src_a0r * src_bi;                 \
+    res4_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res5_r OP0## = src_a1r * src_br;                     \
+    res5_r OP1## = src_a1i * src_bi;                     \
+    res5_i OP2## = OP4 src_a1r * src_bi;                 \
+    res5_i OP3## = src_a1i * src_br;                     \
+                                                         \
+    /* col 3: b3 re/im splat -> res6 (a0), res7 (a1) */  \
+    SPLATI_D2_DP(src_b3, src_br, src_bi);                \
+    res6_r OP0## = src_a0r * src_br;                     \
+    res6_r OP1## = src_a0i * src_bi;                     \
+    res6_i OP2## = OP4 src_a0r * src_bi;                 \
+    res6_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res7_r OP0## = src_a1r * src_br;                     \
+    res7_r OP1## = src_a1i * src_bi;                     \
+    res7_i OP2## = OP4 src_a1r * src_bi;                 \
+    res7_i OP3## = src_a1i * src_br;                     \
+}
+
+#define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4)    \
+{   /* Update res0/2/4/6 from 2 A and 4 B vectors */     \
+    LD_DP2_INC(pa0, 2, src_a0, src_a1);                  \
+    LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3);  \
+    /* Split A into real (..r) and imaginary (..i) */    \
+    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);     \
+    /* OPn## = pastes OPn onto '='; OP4 is a prefix */   \
+    /* col 0: b0 re/im splat -> res0 */                  \
+    SPLATI_D2_DP(src_b0, src_br, src_bi);                \
+    res0_r OP0## = src_a0r * src_br;                     \
+    res0_r OP1## = src_a0i * src_bi;                     \
+    res0_i OP2## = OP4 src_a0r * src_bi;                 \
+    res0_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    /* col 1: b1 re/im splat -> res2 */                  \
+    SPLATI_D2_DP(src_b1, src_br, src_bi);                \
+    res2_r OP0## = src_a0r * src_br;                     \
+    res2_r OP1## = src_a0i * src_bi;                     \
+    res2_i OP2## = OP4 src_a0r * src_bi;                 \
+    res2_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    /* col 2: b2 re/im splat -> res4 */                  \
+    SPLATI_D2_DP(src_b2, src_br, src_bi);                \
+    res4_r OP0## = src_a0r * src_br;                     \
+    res4_r OP1## = src_a0i * src_bi;                     \
+    res4_i OP2## = OP4 src_a0r * src_bi;                 \
+    res4_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    /* col 3: b3 re/im splat -> res6 */                  \
+    SPLATI_D2_DP(src_b3, src_br, src_bi);                \
+    res6_r OP0## = src_a0r * src_br;                     \
+    res6_r OP1## = src_a0i * src_bi;                     \
+    res6_i OP2## = OP4 src_a0r * src_bi;                 \
+    res6_i OP3## = src_a0i * src_br;                     \
+}
+
+#define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4)    \
+{   /* Update res0/res1 from 1 A and 4 B vectors */      \
+    src_a0 = LD_DP(pa0);                                 \
+    LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3);  \
+    /* NOTE: pa0 is only read here, not advanced */      \
+    PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i);     \
+    /* OPn## = pastes OPn onto '='; OP4 is a prefix */   \
+    /* cols 0,1: pack b0,b1 re/im -> res0 */             \
+    PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi);       \
+    res0_r OP0## = src_a0r * src_br;                     \
+    res0_r OP1## = src_a0i * src_bi;                     \
+    res0_i OP2## = OP4 src_a0r * src_bi;                 \
+    res0_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    /* cols 2,3: pack b2,b3 re/im -> res1 */             \
+    PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi);       \
+    res1_r OP0## = src_a0r * src_br;                     \
+    res1_r OP1## = src_a0i * src_bi;                     \
+    res1_i OP2## = OP4 src_a0r * src_bi;                 \
+    res1_i OP3## = src_a0i * src_br;                     \
+}
+
+#define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)    \
+{   /* Update res0..res3 from 4 A and 2 B vectors */     \
+    LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);  \
+    LD_DP2_INC(pb0, 2, src_b0, src_b1);                  \
+    /* Split A into real (..r) and imaginary (..i) */    \
+    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);     \
+    PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i);     \
+    /* OPn## = pastes OPn onto '='; OP4 is a prefix */   \
+    /* col 0: b0 re/im splat -> res0 (a0), res1 (a1) */  \
+    SPLATI_D2_DP(src_b0, src_br, src_bi);                \
+    res0_r OP0## = src_a0r * src_br;                     \
+    res0_r OP1## = src_a0i * src_bi;                     \
+    res0_i OP2## = OP4 src_a0r * src_bi;                 \
+    res0_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res1_r OP0## = src_a1r * src_br;                     \
+    res1_r OP1## = src_a1i * src_bi;                     \
+    res1_i OP2## = OP4 src_a1r * src_bi;                 \
+    res1_i OP3## = src_a1i * src_br;                     \
+                                                         \
+    /* col 1: b1 re/im splat -> res2 (a0), res3 (a1) */  \
+    SPLATI_D2_DP(src_b1, src_br, src_bi);                \
+    res2_r OP0## = src_a0r * src_br;                     \
+    res2_r OP1## = src_a0i * src_bi;                     \
+    res2_i OP2## = OP4 src_a0r * src_bi;                 \
+    res2_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res3_r OP0## = src_a1r * src_br;                     \
+    res3_r OP1## = src_a1i * src_bi;                     \
+    res3_i OP2## = OP4 src_a1r * src_bi;                 \
+    res3_i OP3## = src_a1i * src_br;                     \
+}
+
+#define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4)  \
+{   /* Update res0/res2 from 2 A and 2 B vectors */    \
+    LD_DP2_INC(pa0, 2, src_a0, src_a1);                \
+    LD_DP2_INC(pb0, 2, src_b0, src_b1);                \
+    /* Split A into real (..r) and imaginary (..i) */  \
+    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);   \
+    /* OPn## = pastes OPn onto '='; OP4 is a prefix */ \
+    /* col 0: b0 re/im splat -> res0 */                \
+    SPLATI_D2_DP(src_b0, src_br, src_bi);              \
+    res0_r OP0## = src_a0r * src_br;                   \
+    res0_r OP1## = src_a0i * src_bi;                   \
+    res0_i OP2## = OP4 src_a0r * src_bi;               \
+    res0_i OP3## = src_a0i * src_br;                   \
+                                                       \
+    /* col 1: b1 re/im splat -> res2 */                \
+    SPLATI_D2_DP(src_b1, src_br, src_bi);              \
+    res2_r OP0## = src_a0r * src_br;                   \
+    res2_r OP1## = src_a0i * src_bi;                   \
+    res2_i OP2## = OP4 src_a0r * src_bi;               \
+    res2_i OP3## = src_a0i * src_br;                   \
+}
+
+#define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4)  \
+{   /* Update res0 from 1 A and 2 B vectors */         \
+    src_a0 = LD_DP(pa0);                               \
+    LD_DP2_INC(pb0, 2, src_b0, src_b1);                \
+    /* NOTE: pa0 is only read here, not advanced */    \
+    PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i);   \
+    /* OPn## = pastes OPn onto '='; OP4 is a prefix */ \
+    /* cols 0,1: pack b0,b1 re/im -> res0 */           \
+    PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi);     \
+    res0_r OP0## = src_a0r * src_br;                   \
+    res0_r OP1## = src_a0i * src_bi;                   \
+    res0_i OP2## = OP4 src_a0r * src_bi;               \
+    res0_i OP3## = src_a0i * src_br;                   \
+}
+
+#define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)    \
+{   /* Update res0/res1 from 4 A and 1 B vector */       \
+    LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);  \
+    src_b0 = LD_DP(pb0);                                 \
+    /* NOTE: pb0 is only read here, not advanced */      \
+    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);     \
+    PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i);     \
+    /* OPn## = pastes OPn onto '='; OP4 is a prefix */   \
+    /* col 0: b0 re/im splat -> res0 (a0), res1 (a1) */  \
+    SPLATI_D2_DP(src_b0, src_br, src_bi);                \
+    res0_r OP0## = src_a0r * src_br;                     \
+    res0_r OP1## = src_a0i * src_bi;                     \
+    res0_i OP2## = OP4 src_a0r * src_bi;                 \
+    res0_i OP3## = src_a0i * src_br;                     \
+                                                         \
+    res1_r OP0## = src_a1r * src_br;                     \
+    res1_r OP1## = src_a1i * src_bi;                     \
+    res1_i OP2## = OP4 src_a1r * src_bi;                 \
+    res1_i OP3## = src_a1i * src_br;                     \
+}
+
+#define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4)  \
+{   /* Update res0 from 2 A and 1 B vector */          \
+    LD_DP2_INC(pa0, 2, src_a0, src_a1);                \
+    src_b0 = LD_DP(pb0);                               \
+    /* NOTE: pb0 is only read here, not advanced */    \
+    PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i);   \
+    /* OPn## = pastes OPn onto '='; OP4 is a prefix */ \
+    /* col 0: b0 re/im splat -> res0 */                \
+    SPLATI_D2_DP(src_b0, src_br, src_bi);              \
+    res0_r OP0## = src_a0r * src_br;                   \
+    res0_r OP1## = src_a0i * src_bi;                   \
+    res0_i OP2## = OP4 src_a0r * src_bi;               \
+    res0_i OP3## = src_a0i * src_br;                   \
+}
+
+#define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)  \
+{   /* Scalar update: res0 real, res1 imaginary */ \
+    /* pa0/pb0 are read as {re, im} pairs */       \
+    a0_r = pa0[0];                                 \
+    a0_i = pa0[1];                                 \
+    b0_r = pb0[0];                                 \
+    b0_i = pb0[1];                                 \
+    /* OPn pastes onto '='; OP4 is a prefix */     \
+    res0 OP0## = a0_r * b0_r;                      \
+    res0 OP1## = a0_i * b0_i;                      \
+    res1 OP2## = OP4 a0_r * b0_i;                  \
+    res1 OP3## = a0_i * b0_r;                      \
+}
+
+#define ZGEMM_SCALE_4X4_MSA                      \
+{   /* C += alpha * res for pc0..pc3 */          \
+    LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);      \
+    /* split C into real / imaginary parts */    \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i);   \
+    /* dst += alpha * res (complex product) */   \
+    dst0_r += alpha_r * res0_r;                  \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i += alpha_r * res0_i;                  \
+    dst0_i += alpha_i * res0_r;                  \
+                                                 \
+    dst1_r += alpha_r * res1_r;                  \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i += alpha_r * res1_i;                  \
+    dst1_i += alpha_i * res1_r;                  \
+    /* merge real / imaginary back together */   \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
+    /* same for pc1 with res2 / res3 */          \
+    LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);      \
+                                                 \
+    PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i);   \
+    PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i);   \
+                                                 \
+    dst0_r += alpha_r * res2_r;                  \
+    dst0_r -= alpha_i * res2_i;                  \
+    dst0_i += alpha_r * res2_i;                  \
+    dst0_i += alpha_i * res2_r;                  \
+                                                 \
+    dst1_r += alpha_r * res3_r;                  \
+    dst1_r -= alpha_i * res3_i;                  \
+    dst1_i += alpha_r * res3_i;                  \
+    dst1_i += alpha_i * res3_r;                  \
+                                                 \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
+    /* store; _INC presumably advances pcN */    \
+    ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
+    ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);  \
+    /* same for pc2 with res4 / res5 */          \
+    LD_DP4(pc2, 2, dst0, dst1, dst2, dst3);      \
+                                                 \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i);   \
+                                                 \
+    dst0_r += alpha_r * res4_r;                  \
+    dst0_r -= alpha_i * res4_i;                  \
+    dst0_i += alpha_r * res4_i;                  \
+    dst0_i += alpha_i * res4_r;                  \
+                                                 \
+    dst1_r += alpha_r * res5_r;                  \
+    dst1_r -= alpha_i * res5_i;                  \
+    dst1_i += alpha_r * res5_i;                  \
+    dst1_i += alpha_i * res5_r;                  \
+                                                 \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
+    /* same for pc3 with res6 / res7 */          \
+    LD_DP4(pc3, 2, dst4, dst5, dst6, dst7);      \
+                                                 \
+    PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i);   \
+    PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i);   \
+                                                 \
+    dst0_r += alpha_r * res6_r;                  \
+    dst0_r -= alpha_i * res6_i;                  \
+    dst0_i += alpha_r * res6_i;                  \
+    dst0_i += alpha_i * res6_r;                  \
+                                                 \
+    dst1_r += alpha_r * res7_r;                  \
+    dst1_r -= alpha_i * res7_i;                  \
+    dst1_i += alpha_r * res7_i;                  \
+    dst1_i += alpha_i * res7_r;                  \
+                                                 \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
+    /* store; _INC presumably advances pcN */    \
+    ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2);  \
+    ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2);  \
+}
+
+#define ZGEMM_SCALE_2X4_MSA                     \
+{   /* C += alpha * res for pc0..pc3 */         \
+    LD_DP2(pc0, 2, dst0, dst1);                 \
+    /* split C into real / imaginary parts */   \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
+    /* dst += alpha * res (complex product) */  \
+    dst0_r += alpha_r * res0_r;                 \
+    dst0_r -= alpha_i * res0_i;                 \
+    dst0_i += alpha_r * res0_i;                 \
+    dst0_i += alpha_i * res0_r;                 \
+    /* merge real / imaginary back together */  \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
+    /* same for pc1 with res2 */                \
+    LD_DP2(pc1, 2, dst2, dst3);                 \
+                                                \
+    PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i);  \
+                                                \
+    dst0_r += alpha_r * res2_r;                 \
+    dst0_r -= alpha_i * res2_i;                 \
+    dst0_i += alpha_r * res2_i;                 \
+    dst0_i += alpha_i * res2_r;                 \
+                                                \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);    \
+    /* store; _INC presumably advances pcN */   \
+    ST_DP2_INC(dst0, dst1, pc0, 2);             \
+    ST_DP2_INC(dst2, dst3, pc1, 2);             \
+    /* same for pc2 with res4 */                \
+    LD_DP2(pc2, 2, dst0, dst1);                 \
+                                                \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
+                                                \
+    dst0_r += alpha_r * res4_r;                 \
+    dst0_r -= alpha_i * res4_i;                 \
+    dst0_i += alpha_r * res4_i;                 \
+    dst0_i += alpha_i * res4_r;                 \
+                                                \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
+    /* same for pc3 with res6 */                \
+    LD_DP2(pc3, 2, dst2, dst3);                 \
+                                                \
+    PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i);  \
+                                                \
+    dst0_r += alpha_r * res6_r;                 \
+    dst0_r -= alpha_i * res6_i;                 \
+    dst0_i += alpha_r * res6_i;                 \
+    dst0_i += alpha_i * res6_r;                 \
+                                                \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);    \
+    /* store; _INC presumably advances pcN */   \
+    ST_DP2_INC(dst0, dst1, pc2, 2);             \
+    ST_DP2_INC(dst2, dst3, pc3, 2);             \
+}
+
+#define ZGEMM_SCALE_1X4_MSA                     \
+{   /* C += alpha * res for pc0..pc3 */         \
+    dst0 = LD_DP(pc0);                          \
+    dst1 = LD_DP(pc1);                          \
+    /* pc0 and pc1 are updated from res0 */     \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
+    /* dst += alpha * res (complex product) */  \
+    dst0_r += alpha_r * res0_r;                 \
+    dst0_r -= alpha_i * res0_i;                 \
+    dst0_i += alpha_r * res0_i;                 \
+    dst0_i += alpha_i * res0_r;                 \
+                                                \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
+    /* pc2 and pc3 are updated from res1 */     \
+    dst2 = LD_DP(pc2);                          \
+    dst3 = LD_DP(pc3);                          \
+                                                \
+    PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i);  \
+                                                \
+    dst0_r += alpha_r * res1_r;                 \
+    dst0_r -= alpha_i * res1_i;                 \
+    dst0_i += alpha_r * res1_i;                 \
+    dst0_i += alpha_i * res1_r;                 \
+                                                \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);    \
+    /* plain stores: pcN are not advanced */    \
+    ST_DP(dst0, pc0);                           \
+    ST_DP(dst1, pc1);                           \
+    ST_DP(dst2, pc2);                           \
+    ST_DP(dst3, pc3);                           \
+}
+
+#define ZGEMM_SCALE_4X2_MSA                      \
+{   /* C += alpha * res for pc0 and pc1 */       \
+    LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);      \
+    /* split C into real / imaginary parts */    \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i);   \
+    /* dst += alpha * res (complex product) */   \
+    dst0_r += alpha_r * res0_r;                  \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i += alpha_r * res0_i;                  \
+    dst0_i += alpha_i * res0_r;                  \
+                                                 \
+    dst1_r += alpha_r * res1_r;                  \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i += alpha_r * res1_i;                  \
+    dst1_i += alpha_i * res1_r;                  \
+    /* merge real / imaginary back together */   \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
+    /* same for pc1 with res2 / res3 */          \
+    LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);      \
+                                                 \
+    PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i);   \
+    PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i);   \
+                                                 \
+    dst0_r += alpha_r * res2_r;                  \
+    dst0_r -= alpha_i * res2_i;                  \
+    dst0_i += alpha_r * res2_i;                  \
+    dst0_i += alpha_i * res2_r;                  \
+                                                 \
+    dst1_r += alpha_r * res3_r;                  \
+    dst1_r -= alpha_i * res3_i;                  \
+    dst1_i += alpha_r * res3_i;                  \
+    dst1_i += alpha_i * res3_r;                  \
+                                                 \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
+    /* store; _INC presumably advances pcN */    \
+    ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
+    ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);  \
+}
+
+#define ZGEMM_SCALE_2X2_MSA                     \
+{   /* C += alpha * res for pc0 and pc1 */      \
+    LD_DP2(pc0, 2, dst0, dst1);                 \
+    /* split C into real / imaginary parts */   \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
+    /* dst += alpha * res (complex product) */  \
+    dst0_r += alpha_r * res0_r;                 \
+    dst0_r -= alpha_i * res0_i;                 \
+    dst0_i += alpha_r * res0_i;                 \
+    dst0_i += alpha_i * res0_r;                 \
+    /* merge real / imaginary back together */  \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
+    /* store; _INC presumably advances pcN */   \
+    ST_DP2_INC(dst0, dst1, pc0, 2);             \
+    /* same for pc1 with res2 */                \
+    LD_DP2(pc1, 2, dst2, dst3);                 \
+                                                \
+    PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i);  \
+                                                \
+    dst0_r += alpha_r * res2_r;                 \
+    dst0_r -= alpha_i * res2_i;                 \
+    dst0_i += alpha_r * res2_i;                 \
+    dst0_i += alpha_i * res2_r;                 \
+                                                \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);    \
+                                                \
+    ST_DP2_INC(dst2, dst3, pc1, 2);             \
+}
+
+#define ZGEMM_SCALE_1X2_MSA                     \
+{   /* C += alpha * res for pc0 and pc1 */      \
+    dst0 = LD_DP(pc0);                          \
+    dst1 = LD_DP(pc1);                          \
+    /* pc0 and pc1 are updated from res0 */     \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
+    /* dst += alpha * res (complex product) */  \
+    dst0_r += alpha_r * res0_r;                 \
+    dst0_r -= alpha_i * res0_i;                 \
+    dst0_i += alpha_r * res0_i;                 \
+    dst0_i += alpha_i * res0_r;                 \
+    /* merge real / imaginary back together */  \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
+    /* plain stores: pcN are not advanced */    \
+    ST_DP(dst0, pc0);                           \
+    ST_DP(dst1, pc1);                           \
+}
+
+#define ZGEMM_SCALE_4X1_MSA                      \
+{   /* C += alpha * res for pc0 */               \
+    LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);      \
+    /* split C into real / imaginary parts */    \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);   \
+    PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i);   \
+    /* dst += alpha * res (complex product) */   \
+    dst0_r += alpha_r * res0_r;                  \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i += alpha_r * res0_i;                  \
+    dst0_i += alpha_i * res0_r;                  \
+                                                 \
+    dst1_r += alpha_r * res1_r;                  \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i += alpha_r * res1_i;                  \
+    dst1_i += alpha_i * res1_r;                  \
+    /* merge real / imaginary back together */   \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
+    /* store; _INC presumably advances pcN */    \
+    ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
+}
+
+#define ZGEMM_SCALE_2X1_MSA                     \
+{   /* C += alpha * res for pc0 */              \
+    LD_DP2(pc0, 2, dst0, dst1);                 \
+    /* split C into real / imaginary parts */   \
+    PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i);  \
+    /* dst += alpha * res (complex product) */  \
+    dst0_r += alpha_r * res0_r;                 \
+    dst0_r -= alpha_i * res0_i;                 \
+    dst0_i += alpha_r * res0_i;                 \
+    dst0_i += alpha_i * res0_r;                 \
+    /* merge real / imaginary back together */  \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);    \
+    /* store; _INC presumably advances pcN */   \
+    ST_DP2_INC(dst0, dst1, pc0, 2);             \
+}
+
+#define ZGEMM_SCALE_1X1       \
+{   /* pc0 += alpha * res; uses alphar/alphai, not alpha_r/alpha_i (presumably the scalar alpha - confirm in caller) */  \
+    pc0[0] += alphar * res0;  \
+    pc0[0] -= alphai * res1;  \
+    pc0[1] += alphar * res1;  \
+    pc0[1] += alphai * res0;  \
+}
+
+#define ZGEMM_TRMM_SCALE_4X4_MSA                 \
+{   /* C = alpha * res (C is not read) */        \
+    dst0_r = alpha_r * res0_r;                   \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i = alpha_r * res0_i;                   \
+    dst0_i += alpha_i * res0_r;                  \
+                                                 \
+    dst1_r = alpha_r * res1_r;                   \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i = alpha_r * res1_i;                   \
+    dst1_i += alpha_i * res1_r;                  \
+    /* merge real / imaginary back together */   \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
+    /* pc1 gets res2 / res3 */                   \
+    dst0_r = alpha_r * res2_r;                   \
+    dst0_r -= alpha_i * res2_i;                  \
+    dst0_i = alpha_r * res2_i;                   \
+    dst0_i += alpha_i * res2_r;                  \
+                                                 \
+    dst1_r = alpha_r * res3_r;                   \
+    dst1_r -= alpha_i * res3_i;                  \
+    dst1_i = alpha_r * res3_i;                   \
+    dst1_i += alpha_i * res3_r;                  \
+                                                 \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
+    /* store; _INC presumably advances pcN */    \
+    ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
+    ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);  \
+    /* pc2 gets res4 / res5 */                   \
+    dst0_r = alpha_r * res4_r;                   \
+    dst0_r -= alpha_i * res4_i;                  \
+    dst0_i = alpha_r * res4_i;                   \
+    dst0_i += alpha_i * res4_r;                  \
+                                                 \
+    dst1_r = alpha_r * res5_r;                   \
+    dst1_r -= alpha_i * res5_i;                  \
+    dst1_i = alpha_r * res5_i;                   \
+    dst1_i += alpha_i * res5_r;                  \
+                                                 \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
+    /* pc3 gets res6 / res7 */                   \
+    dst0_r = alpha_r * res6_r;                   \
+    dst0_r -= alpha_i * res6_i;                  \
+    dst0_i = alpha_r * res6_i;                   \
+    dst0_i += alpha_i * res6_r;                  \
+                                                 \
+    dst1_r = alpha_r * res7_r;                   \
+    dst1_r -= alpha_i * res7_i;                  \
+    dst1_i = alpha_r * res7_i;                   \
+    dst1_i += alpha_i * res7_r;                  \
+                                                 \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
+    /* store; _INC presumably advances pcN */    \
+    ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2);  \
+    ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2);  \
+}
+
+#define ZGEMM_TRMM_SCALE_2X4_MSA              \
+{   /* per name: 2x4 tile, C = alpha*res */   \
+    dst0_r = alpha_r * res0_r;                \
+    dst0_r -= alpha_i * res0_i;               \
+    dst0_i = alpha_r * res0_i;                \
+    dst0_i += alpha_i * res0_r;               \
+    /* presumably interleaves re/im lanes */  \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
+    /* alpha * res2 (complex product) */      \
+    dst0_r = alpha_r * res2_r;                \
+    dst0_r -= alpha_i * res2_i;               \
+    dst0_i = alpha_r * res2_i;                \
+    dst0_i += alpha_i * res2_r;               \
+                                              \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);  \
+    /* store to pc0/pc1; no load from C */    \
+    ST_DP2_INC(dst0, dst1, pc0, 2);           \
+    ST_DP2_INC(dst2, dst3, pc1, 2);           \
+    /* same for res4/res6 -> pc2/pc3 */       \
+    dst0_r = alpha_r * res4_r;                \
+    dst0_r -= alpha_i * res4_i;               \
+    dst0_i = alpha_r * res4_i;                \
+    dst0_i += alpha_i * res4_r;               \
+                                              \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
+                                              \
+    dst0_r = alpha_r * res6_r;                \
+    dst0_r -= alpha_i * res6_i;               \
+    dst0_i = alpha_r * res6_i;                \
+    dst0_i += alpha_i * res6_r;               \
+                                              \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);  \
+    /* dst0..dst3 were reused after store */  \
+    ST_DP2_INC(dst0, dst1, pc2, 2);           \
+    ST_DP2_INC(dst2, dst3, pc3, 2);           \
+}
+
+#define ZGEMM_TRMM_SCALE_1X4_MSA              \
+{   /* per name: 1x4 tile, C = alpha*res */   \
+    dst0_r = alpha_r * res0_r;                \
+    dst0_r -= alpha_i * res0_i;               \
+    dst0_i = alpha_r * res0_i;                \
+    dst0_i += alpha_i * res0_r;               \
+    /* presumably interleaves re/im lanes */  \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
+    /* alpha * res1 (complex product) */      \
+    dst0_r = alpha_r * res1_r;                \
+    dst0_r -= alpha_i * res1_i;               \
+    dst0_i = alpha_r * res1_i;                \
+    dst0_i += alpha_i * res1_r;               \
+                                              \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);  \
+    /* one vector per pcN; pcN unchanged */   \
+    ST_DP(dst0, pc0);                         \
+    ST_DP(dst1, pc1);                         \
+    ST_DP(dst2, pc2);                         \
+    ST_DP(dst3, pc3);                         \
+}
+
+#define ZGEMM_TRMM_SCALE_4X2_MSA                 \
+{   /* per name: 4x2 tile, C = alpha * res */    \
+    dst0_r = alpha_r * res0_r;                   \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i = alpha_r * res0_i;                   \
+    dst0_i += alpha_i * res0_r;                  \
+    /* alpha * res1 (complex product) */         \
+    dst1_r = alpha_r * res1_r;                   \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i = alpha_r * res1_i;                   \
+    dst1_i += alpha_i * res1_r;                  \
+    /* presumably interleaves re/im lanes */     \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
+    /* res2/res3 -> dst4..dst7 (for pc1) */      \
+    dst0_r = alpha_r * res2_r;                   \
+    dst0_r -= alpha_i * res2_i;                  \
+    dst0_i = alpha_r * res2_i;                   \
+    dst0_i += alpha_i * res2_r;                  \
+                                                 \
+    dst1_r = alpha_r * res3_r;                   \
+    dst1_r -= alpha_i * res3_i;                  \
+    dst1_i = alpha_r * res3_i;                   \
+    dst1_i += alpha_i * res3_r;                  \
+                                                 \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7);     \
+    /* store to pc0/pc1; no load from C */       \
+    ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
+    ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);  \
+}
+
+#define ZGEMM_TRMM_SCALE_2X2_MSA              \
+{   /* per name: 2x2 tile, C = alpha*res */   \
+    dst0_r = alpha_r * res0_r;                \
+    dst0_r -= alpha_i * res0_i;               \
+    dst0_i = alpha_r * res0_i;                \
+    dst0_i += alpha_i * res0_r;               \
+    /* presumably interleaves re/im lanes */  \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
+    /* store to pc0; no load from C */        \
+    ST_DP2_INC(dst0, dst1, pc0, 2);           \
+    /* alpha * res2, written through pc1 */   \
+    dst0_r = alpha_r * res2_r;                \
+    dst0_r -= alpha_i * res2_i;               \
+    dst0_i = alpha_r * res2_i;                \
+    dst0_i += alpha_i * res2_r;               \
+                                              \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3);  \
+                                              \
+    ST_DP2_INC(dst2, dst3, pc1, 2);           \
+}
+
+#define ZGEMM_TRMM_SCALE_1X2_MSA              \
+{   /* per name: 1x2 tile, C = alpha*res */   \
+    dst0_r = alpha_r * res0_r;                \
+    dst0_r -= alpha_i * res0_i;               \
+    dst0_i = alpha_r * res0_i;                \
+    dst0_i += alpha_i * res0_r;               \
+    /* presumably interleaves re/im lanes */  \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
+    /* one vector per pcN; pcN unchanged */   \
+    ST_DP(dst0, pc0);                         \
+    ST_DP(dst1, pc1);                         \
+}
+
+#define ZGEMM_TRMM_SCALE_4X1_MSA                 \
+{   /* per name: 4x1 tile, C = alpha * res */    \
+    dst0_r = alpha_r * res0_r;                   \
+    dst0_r -= alpha_i * res0_i;                  \
+    dst0_i = alpha_r * res0_i;                   \
+    dst0_i += alpha_i * res0_r;                  \
+    /* alpha * res1 (complex product) */         \
+    dst1_r = alpha_r * res1_r;                   \
+    dst1_r -= alpha_i * res1_i;                  \
+    dst1_i = alpha_r * res1_i;                   \
+    dst1_i += alpha_i * res1_r;                  \
+    /* presumably interleaves re/im lanes */     \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);     \
+    ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3);     \
+    /* store to pc0; no load from C */           \
+    ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);  \
+}
+
+#define ZGEMM_TRMM_SCALE_2X1_MSA              \
+{   /* per name: 2x1 tile, C = alpha*res */   \
+    dst0_r = alpha_r * res0_r;                \
+    dst0_r -= alpha_i * res0_i;               \
+    dst0_i = alpha_r * res0_i;                \
+    dst0_i += alpha_i * res0_r;               \
+    /* presumably interleaves re/im lanes */  \
+    ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1);  \
+    /* store to pc0; no load from C */        \
+    ST_DP2_INC(dst0, dst1, pc0, 2);           \
+}
+
+#define ZGEMM_TRMM_SCALE_1X1  \
+{   /* scalar path: pc0[0..1] = (alphar + i*alphai) * (res0 + i*res1); C is overwritten, not read */ \
+    pc0[0] = alphar * res0;   \
+    pc0[0] -= alphai * res1;  \
+    pc0[1] = alphar * res1;   \
+    pc0[1] += alphai * res0;  \
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
+          FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
+#ifdef TRMMKERNEL
+          , BLASLONG offset
+#endif
+          )
+{   /* Tiled complex kernel on v2f64 vectors: n is consumed in groups of 4, 2, 1 (pc0..pc3 are 2*ldc FLOATs apart) and, inside each, m in groups of 4, 2, 1. Always returns 0. */
+    BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+    BLASLONG off;
+#endif
+    FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
+    FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i;
+    v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3;
+    v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
+    v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+    v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i;
+    v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
+    v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
+    /* Vector copies of the scalar alpha parts; COPY_DOUBLE_TO_VECTOR is defined elsewhere (presumably a splat -- TODO confirm). */
+    alpha_r = COPY_DOUBLE_TO_VECTOR(alphar);
+    alpha_i = COPY_DOUBLE_TO_VECTOR(alphai);
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    off = -offset;
+#endif
+    /* Pass 1: groups of four columns (n >> 2 iterations). */
+    for (j = (n >> 2); j--;)
+    {
+        pc0 = C;
+        pc1 = pc0 + 2 * ldc;
+        pc2 = pc1 + 2 * ldc;
+        pc3 = pc2 + 2 * ldc;
+
+        pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+        /* 4x4 tiles (m >> 2 iterations). */
+        for (i = (m >> 2); i--;)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 4; /* skip off k-steps: 2 FLOATs per complex value x 4 values of A */
+            pb0 = B + off * 2 * 4; /* skip off k-steps: 2 FLOATs per complex value x 4 values of B */
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+            /* First k-step. The operator arguments pick one sign pattern per NN/NR/RN/RR group (presumably the conjugation variants); the macro is defined outside this view, and the empty OP0/OP2 here presumably make it assign rather than accumulate -- TODO confirm. */
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            ZGEMM_KERNEL_4X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            ZGEMM_KERNEL_4X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            ZGEMM_KERNEL_4X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            ZGEMM_KERNEL_4X4_MSA(, -, , -, -);
+#endif
+            /* Remaining temp - 1 k-steps. NOTE(review): assumes temp >= 1; with temp == 0 the countdown would start at -1 -- confirm callers never produce that. */
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                ZGEMM_KERNEL_4X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                ZGEMM_KERNEL_4X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                ZGEMM_KERNEL_4X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                ZGEMM_KERNEL_4X4_MSA(+, -, -, -,);
+#endif
+            }
+            /* Scale the accumulators by alpha and write the tile through pc0..pc3. */
+#if defined(TRMMKERNEL)
+            ZGEMM_TRMM_SCALE_4X4_MSA
+#else
+            ZGEMM_SCALE_4X4_MSA
+#endif
+            /* TRMM only: step pa0/pb0 over the k-steps this tile did not consume, then bump off for LEFT. */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 2 * 4;
+            pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
+        }
+        /* 2x4 tile, taken when bit 1 of m is set (2 values of A, 4 values of B per k-step). */
+        if (m & 2)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 2;
+            pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            ZGEMM_KERNEL_2X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            ZGEMM_KERNEL_2X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            ZGEMM_KERNEL_2X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            ZGEMM_KERNEL_2X4_MSA(, -, , -, -);
+#endif
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                ZGEMM_KERNEL_2X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                ZGEMM_KERNEL_2X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                ZGEMM_KERNEL_2X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                ZGEMM_KERNEL_2X4_MSA(+, -, -, -,);
+#endif
+            }
+
+#if defined(TRMMKERNEL)
+            ZGEMM_TRMM_SCALE_2X4_MSA
+#else
+            ZGEMM_SCALE_2X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 2 * 2;
+            pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
+        }
+        /* 1x4 tile, taken when m is odd (1 value of A, 4 values of B per k-step). */
+        if (m & 1)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 1;
+            pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 4; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            ZGEMM_KERNEL_1X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            ZGEMM_KERNEL_1X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            ZGEMM_KERNEL_1X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            ZGEMM_KERNEL_1X4_MSA(, -, , -, -);
+#endif
+
+            pa0 += 2; /* pa0 is advanced here, one complex value (2 FLOATs) per k-step */
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                ZGEMM_KERNEL_1X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                ZGEMM_KERNEL_1X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                ZGEMM_KERNEL_1X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                ZGEMM_KERNEL_1X4_MSA(+, -, -, -,);
+#endif
+
+                pa0 += 2;
+            }
+
+#if defined(TRMMKERNEL)
+            ZGEMM_TRMM_SCALE_1X4_MSA
+#else
+            ZGEMM_SCALE_1X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            pa0 += temp * 2 * 1;
+            pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
+            /* The TRMM 1x4 store macro uses plain ST_DP, so the column pointers are advanced here. */
+            pc0 += 2;
+            pc1 += 2;
+            pc2 += 2;
+            pc3 += 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4; // number of values in B
+#endif
+
+        l = k << 3; /* 8 * k FLOATs: same 2 * 4 per-k-step stride used for pb0 above */
+        B = B + l;
+        i = ldc << 3; /* 8 * ldc FLOATs: four columns, each 2 * ldc apart (see pc1..pc3) */
+        C = C + i;
+    }
+    /* Pass 2: one remaining pair of columns, when bit 1 of n is set. */
+    if (n & 2)
+    {
+        pc0 = C;
+        pc1 = pc0 + 2 * ldc;
+
+        pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+        /* 4x2 tiles (m >> 2 iterations). */
+        for (i = (m >> 2); i--;)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 4;
+            pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            ZGEMM_KERNEL_4X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            ZGEMM_KERNEL_4X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            ZGEMM_KERNEL_4X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            ZGEMM_KERNEL_4X2_MSA(, -, , -, -);
+#endif
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                ZGEMM_KERNEL_4X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                ZGEMM_KERNEL_4X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                ZGEMM_KERNEL_4X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                ZGEMM_KERNEL_4X2_MSA(+, -, -, -,);
+#endif
+            }
+
+#if defined(TRMMKERNEL)
+            ZGEMM_TRMM_SCALE_4X2_MSA
+#else
+            ZGEMM_SCALE_4X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 2 * 4;
+            pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
+        }
+        /* 2x2 tile, taken when bit 1 of m is set. */
+        if (m & 2)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 2;
+            pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            ZGEMM_KERNEL_2X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            ZGEMM_KERNEL_2X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            ZGEMM_KERNEL_2X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            ZGEMM_KERNEL_2X2_MSA(, -, , -, -);
+#endif
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                ZGEMM_KERNEL_2X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                ZGEMM_KERNEL_2X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                ZGEMM_KERNEL_2X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                ZGEMM_KERNEL_2X2_MSA(+, -, -, -,);
+#endif
+            }
+
+#if defined(TRMMKERNEL)
+            ZGEMM_TRMM_SCALE_2X2_MSA
+#else
+            ZGEMM_SCALE_2X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 2 * 2;
+            pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
+        }
+        /* 1x2 tile, taken when m is odd. */
+        if (m & 1)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 1;
+            pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 2; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            ZGEMM_KERNEL_1X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            ZGEMM_KERNEL_1X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            ZGEMM_KERNEL_1X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            ZGEMM_KERNEL_1X2_MSA(, -, , -, -);
+#endif
+
+            pa0 += 2; /* pa0 is advanced here, one complex value (2 FLOATs) per k-step */
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                ZGEMM_KERNEL_1X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                ZGEMM_KERNEL_1X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                ZGEMM_KERNEL_1X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                ZGEMM_KERNEL_1X2_MSA(+, -, -, -,);
+#endif
+
+                pa0 += 2;
+            }
+
+#if defined(TRMMKERNEL)
+            ZGEMM_TRMM_SCALE_1X2_MSA
+#else
+            ZGEMM_SCALE_1X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            pa0 += temp * 2 * 1;
+            pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
+
+            pc0 += 2;
+            pc1 += 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2; // number of values in B
+#endif
+
+        l = k << 2; /* 4 * k FLOATs: same 2 * 2 per-k-step stride used for pb0 above */
+        B = B + l;
+        i = ldc << 2; /* 4 * ldc FLOATs: two columns, each 2 * ldc apart */
+        C = C + i;
+    }
+    /* Pass 3: a single remaining column, when n is odd. */
+    if (n & 1)
+    {
+        pc0 = C;
+        pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+        /* 4x1 tiles (m >> 2 iterations). */
+        for (i = (m >> 2); i--;)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 4;
+            pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 4; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            ZGEMM_KERNEL_4X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            ZGEMM_KERNEL_4X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            ZGEMM_KERNEL_4X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            ZGEMM_KERNEL_4X1_MSA(, -, , -, -);
+#endif
+
+            pb0 += 2; /* pb0 is advanced here, one complex value (2 FLOATs) per k-step */
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                ZGEMM_KERNEL_4X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                ZGEMM_KERNEL_4X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                ZGEMM_KERNEL_4X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                ZGEMM_KERNEL_4X1_MSA(+, -, -, -,);
+#endif
+
+                pb0 += 2;
+            }
+
+#if defined(TRMMKERNEL)
+            ZGEMM_TRMM_SCALE_4X1_MSA
+#else
+            ZGEMM_SCALE_4X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 2 * 4;
+            pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+#endif
+        }
+        /* 2x1 tile, taken when bit 1 of m is set. */
+        if (m & 2)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 2;
+            pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 2; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            ZGEMM_KERNEL_2X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            ZGEMM_KERNEL_2X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            ZGEMM_KERNEL_2X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            ZGEMM_KERNEL_2X1_MSA(, -, , -, -);
+#endif
+
+            pb0 += 2;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                ZGEMM_KERNEL_2X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                ZGEMM_KERNEL_2X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                ZGEMM_KERNEL_2X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                ZGEMM_KERNEL_2X1_MSA(+, -, -, -,);
+#endif
+
+                pb0 += 2;
+            }
+
+#if defined(TRMMKERNEL)
+            ZGEMM_TRMM_SCALE_2X1_MSA
+#else
+            ZGEMM_SCALE_2X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 2 * 2;
+            pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+#endif
+        }
+        /* 1x1 tile, taken when m is odd; this one uses the non-MSA (scalar) macros. */
+        if (m & 1)
+        {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            pb0 = B;
+#else
+            pa0 += off * 2 * 1;
+            pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = k - off;
+#elif defined(LEFT)
+            temp = off + 1; // number of values in A
+#else
+            temp = off + 1; // number of values in B
+#endif
+#else
+            pb0 = B;
+            temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+            ZGEMM_KERNEL_1X1(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+            ZGEMM_KERNEL_1X1(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+            ZGEMM_KERNEL_1X1(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+            ZGEMM_KERNEL_1X1(, -, , -, -);
+#endif
+
+            pa0 += 2;
+            pb0 += 2;
+
+            for (l = (temp - 1); l--;)
+            {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                ZGEMM_KERNEL_1X1(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                ZGEMM_KERNEL_1X1(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                ZGEMM_KERNEL_1X1(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                ZGEMM_KERNEL_1X1(+, -, -, -,);
+#endif
+
+                pa0 += 2;
+                pb0 += 2;
+            }
+
+#if defined(TRMMKERNEL)
+            ZGEMM_TRMM_SCALE_1X1
+#else
+            ZGEMM_SCALE_1X1
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = k - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            pa0 += temp * 2 * 1;
+            pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+#endif
+
+            pc0 += 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1; // number of values in B
+#endif
+
+        l = k << 1; /* 2 * k FLOATs: one complex value of B per k-step */
+        B = B + l;
+        i = ldc << 1; /* 2 * ldc FLOATs: one column */
+        C = C + i;
+    }
+    return 0;
+}
diff --git a/kernel/mips/zgemm_ncopy_4_msa.c b/kernel/mips/zgemm_ncopy_4_msa.c
new file mode 100644 (file)
index 0000000..3ef46a5
--- /dev/null
@@ -0,0 +1,144 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{   /* Pack src into dst: n taken 4/2/1 source lines at a time, m 4/2/1 vectors at a time. */
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst;   /* psrc0: start of next line group */
+    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+    v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+    /* NOTE(review): LD_DPn/ST_DPn are macros_msa.h macros; _INC forms presumably advance the pointer. */
+    psrc0 = src;
+    pdst = dst;   /* the only output cursor; every store below goes through it */
+    lda *= 2;   /* presumably two FLOATs (re, im) per element -- confirm against caller */
+    /* Groups of four source lines, consecutive lines lda FLOATs apart. */
+    for (j = (n >> 2); j--;)
+    {
+        psrc1  = psrc0;
+        psrc2  = psrc1 + lda;
+        psrc3  = psrc2 + lda;
+        psrc4  = psrc3 + lda;
+        psrc0 += 4 * lda;   /* start of the following group */
+        /* Main part of m: LD_DP4_INC on each of the four lines per pass. */
+        for (i = (m >> 2); i--;)
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+            LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+            LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
+            /* Store arguments take one vector from each line in turn (src0, src4, src8, src12, ...). */
+            ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2);
+            ST_DP8_INC(src2, src6, src10, src14, src3, src7, src11, src15,
+                       pdst, 2);
+        }
+        /* m & 2: LD_DP2_INC on each line, stored in the same interleaved order. */
+        if (m & 2)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src4, src5);
+            LD_DP2_INC(psrc3, 2, src8, src9);
+            LD_DP2_INC(psrc4, 2, src12, src13);
+
+            ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2);
+        }
+        /* m & 1: one last LD_DP per line. */
+        if (m & 1)
+        {
+            src0 = LD_DP(psrc1);
+            src4 = LD_DP(psrc2);
+            src8 = LD_DP(psrc3);
+            src12 = LD_DP(psrc4);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            ST_DP4_INC(src0, src4, src8, src12, pdst, 2);
+        }
+    }
+    /* n & 2: a leftover pair of source lines, same scheme with two pointers. */
+    if (n & 2)
+    {
+        psrc1  = psrc0;
+        psrc2  = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+
+            ST_DP8_INC(src0, src4, src1, src5, src2, src6, src3, src7, pdst, 2);
+        }
+
+        if (m & 2)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src4, src5);
+
+            ST_DP4_INC(src0, src4, src1, src5, pdst, 2);
+        }
+
+        if (m & 1)
+        {
+            src0 = LD_DP(psrc1);
+            src4 = LD_DP(psrc2);
+            psrc1 += 2;
+            psrc2 += 2;
+
+            ST_DP2_INC(src0, src4, pdst, 2);
+        }
+    }
+    /* n & 1: a final single line; store arguments keep the load order. */
+    if (n & 1)
+    {
+        psrc1  = psrc0;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            ST_DP4_INC(src0, src1, src2, src3, pdst, 2);
+        }
+
+        if (m & 2)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            ST_DP2_INC(src0, src1, pdst, 2);
+        }
+
+        if (m & 1)
+        {
+            src0 = LD_DP(psrc1);
+            ST_DP(src0, pdst);
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/mips/zgemm_tcopy_4_msa.c b/kernel/mips/zgemm_tcopy_4_msa.c
new file mode 100644 (file)
index 0000000..70314cb
--- /dev/null
@@ -0,0 +1,161 @@
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{   /* Pack src into dst: m taken 4/2/1 source lines at a time; n in 4-, 2- and 1-vector blocks. */
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
+    FLOAT *pdst0, *pdst1, *pdst2, *pdst3;   /* pdst2: n & 2 outputs, pdst3: n & 1 outputs */
+    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+    v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+    /* NOTE(review): LD_DPn/ST_DPn are macros_msa.h macros; _INC forms presumably advance the pointer. */
+    psrc0 = src;
+    pdst0 = dst;
+    lda *= 2;   /* presumably two FLOATs (re, im) per element -- confirm against caller */
+    /* pdst2 / pdst3 start 2 * m FLOATs times n rounded down to a multiple of 4 / 2 past dst. */
+    pdst2 = dst + 2 * m  * (n & ~3);
+    pdst3 = dst + 2 * m  * (n & ~1);
+    /* Groups of four source lines, consecutive lines lda FLOATs apart. */
+    for (j = (m >> 2); j--;)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+        /* pdst1 walks this group's outputs; pdst0 moves on to where the next group starts. */
+        pdst1 = pdst0;
+        pdst0 += 32;   /* room for the 16 vectors stored per pass (2 FLOATs each, presumably) */
+
+        for (i = (n >> 2); i--;)
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+            LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+            LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
+            /* Lines 1 and 2 go to pdst1, lines 3 and 4 to pdst1 + 16, each line's vectors kept together. */
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+            ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+                   pdst1 + 16, 2);
+            pdst1 += m * 8;   /* explicit step to the next output block */
+        }
+        /* n & 2: LD_DP2_INC on each line, appended at pdst2. */
+        if (n & 2)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+            LD_DP2_INC(psrc3, 2, src4, src5);
+            LD_DP2_INC(psrc4, 2, src6, src7);
+
+            ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
+        }
+        /* n & 1: one LD_DP per line, appended at pdst3. */
+        if (n & 1)
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+            src2 = LD_DP(psrc3);
+            src3 = LD_DP(psrc4);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            ST_DP4_INC(src0, src1, src2, src3, pdst3, 2);
+        }
+    }
+    /* m & 2: a leftover pair of source lines, same scheme with two pointers. */
+    if (m & 2)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        pdst1 = pdst0;
+        pdst0 += 16;   /* half of the 32 used by a four-line group */
+
+        for (i = (n >> 2); i--;)
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+
+            ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+
+            pdst1 += m * 8;
+        }
+
+        if (n & 2)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            LD_DP2_INC(psrc2, 2, src2, src3);
+
+            ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
+        }
+
+        if (n & 1)
+        {
+            src0 = LD_DP(psrc1);
+            src1 = LD_DP(psrc2);
+
+            ST_DP2_INC(src0, src1, pdst3, 2);
+
+            psrc1 += 2;
+            psrc2 += 2;
+        }
+    }
+    /* m & 1: a final single source line. */
+    if (m & 1)
+    {
+        psrc1 = psrc0;
+        pdst1 = pdst0;
+
+        for (i = (n >> 2); i--;)
+        {
+            LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+            ST_DP4(src0, src1, src2, src3, pdst1, 2);
+
+            pdst1 += m * 8;
+        }
+
+        if (n & 2)
+        {
+            LD_DP2_INC(psrc1, 2, src0, src1);
+            ST_DP2_INC(src0, src1, pdst2, 2);
+        }
+
+        if (n & 1)
+        {
+            src0 = LD_DP(psrc1);
+            ST_DP(src0, pdst3);
+        }
+    }
+
+    return 0;
+}
diff --git a/param.h b/param.h
index fdc9d11..dd58744 100644 (file)
--- a/param.h
+++ b/param.h
@@ -2188,11 +2188,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_UNROLL_M  8
 #define DGEMM_DEFAULT_UNROLL_N  4
 
-#define CGEMM_DEFAULT_UNROLL_M  2
-#define CGEMM_DEFAULT_UNROLL_N  2
-
-#define ZGEMM_DEFAULT_UNROLL_M  2
-#define ZGEMM_DEFAULT_UNROLL_N  2
+#define CGEMM_DEFAULT_UNROLL_M  8  /* presumably the 8 of cgemm_kernel_8x4_msa.c -- confirm */
+#define CGEMM_DEFAULT_UNROLL_N  4  /* presumably the 4 of cgemm_kernel_8x4_msa.c -- confirm */
+                                
+#define ZGEMM_DEFAULT_UNROLL_M  4  /* presumably the first 4 of zgemm_kernel_4x4_msa.c -- confirm */
+#define ZGEMM_DEFAULT_UNROLL_N  4  /* presumably the second 4 of zgemm_kernel_4x4_msa.c -- confirm */
 
 #define SGEMM_DEFAULT_P  128
 #define DGEMM_DEFAULT_P  128
@@ -2227,11 +2227,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_UNROLL_M  8
 #define DGEMM_DEFAULT_UNROLL_N  4
 
-#define CGEMM_DEFAULT_UNROLL_M  2
-#define CGEMM_DEFAULT_UNROLL_N  2
+#define CGEMM_DEFAULT_UNROLL_M  8  /* presumably the 8 of cgemm_kernel_8x4_msa.c -- confirm */
+#define CGEMM_DEFAULT_UNROLL_N  4  /* presumably the 4 of cgemm_kernel_8x4_msa.c -- confirm */
 
-#define ZGEMM_DEFAULT_UNROLL_M  2
-#define ZGEMM_DEFAULT_UNROLL_N  2
+#define ZGEMM_DEFAULT_UNROLL_M  4  /* presumably the first 4 of zgemm_kernel_4x4_msa.c -- confirm */
+#define ZGEMM_DEFAULT_UNROLL_N  4  /* presumably the second 4 of zgemm_kernel_4x4_msa.c -- confirm */
 
 #define SGEMM_DEFAULT_P  128
 #define DGEMM_DEFAULT_P  128