CGEMVTKERNEL = ../mips/zgemv_t.c
ZGEMVTKERNEL = ../mips/zgemv_t.c
-STRMMKERNEL = ../generic/trmmkernel_2x2.c
-DTRMMKERNEL = ../generic/trmmkernel_2x2.c
-CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
-ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
-
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c
SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
-CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
-CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c
+CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c
+CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c
+CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c
+CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
-ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
-ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c
+ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c
+ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Inner-loop multiply-accumulate macros for single-precision complex GEMM.
+ * OP0..OP3 are token-pasted in front of '=' so that "res OPn## =" expands
+ * to "=", "+=" or "-="; OP4 is an optional unary '+'/'-' applied to the
+ * real part of A.  The four OP combinations used at the call sites select
+ * the NN/NR/RN/RR conjugation variants.  In the _MSA macros,
+ * PCKEVOD_W2_SP de-interleaves packed (re,im) vectors into separate
+ * real/imag vectors and SPLATI_W2_SP broadcasts one B element's real and
+ * imaginary parts; non-_MSA macros are the scalar tail versions. */
+
+/* 8 rows of A (4 vectors, split into real/imag) times 4 columns of B. */
+#define CGEMM_KERNEL_8X4_MSA(OP0, OP1, OP2, OP3, OP4)   \
+{                                                       \
+    LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
+    LD_SP2_INC(pb0, 4, src_b0, src_b1);                 \
+                                                        \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);    \
+    PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i);    \
+                                                        \
+    /* 0th col */                                       \
+    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);            \
+    res0_r OP0## = src_a0r * src_br;                    \
+    res0_r OP1## = src_a0i * src_bi;                    \
+    res0_i OP2## = (OP4 src_a0r) * src_bi;              \
+    res0_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    res1_r OP0## = src_a1r * src_br;                    \
+    res1_r OP1## = src_a1i * src_bi;                    \
+    res1_i OP2## = (OP4 src_a1r) * src_bi;              \
+    res1_i OP3## = src_a1i * src_br;                    \
+                                                        \
+    /* 1st col */                                       \
+    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);            \
+    res2_r OP0## = src_a0r * src_br;                    \
+    res2_r OP1## = src_a0i * src_bi;                    \
+    res2_i OP2## = (OP4 src_a0r) * src_bi;              \
+    res2_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    res3_r OP0## = src_a1r * src_br;                    \
+    res3_r OP1## = src_a1i * src_bi;                    \
+    res3_i OP2## = (OP4 src_a1r) * src_bi;              \
+    res3_i OP3## = src_a1i * src_br;                    \
+                                                        \
+    /* 2nd col */                                       \
+    SPLATI_W2_SP(src_b1, 0, src_br, src_bi);            \
+    res4_r OP0## = src_a0r * src_br;                    \
+    res4_r OP1## = src_a0i * src_bi;                    \
+    res4_i OP2## = (OP4 src_a0r) * src_bi;              \
+    res4_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    res5_r OP0## = src_a1r * src_br;                    \
+    res5_r OP1## = src_a1i * src_bi;                    \
+    res5_i OP2## = (OP4 src_a1r) * src_bi;              \
+    res5_i OP3## = src_a1i * src_br;                    \
+                                                        \
+    /* 3rd col */                                       \
+    SPLATI_W2_SP(src_b1, 2, src_br, src_bi);            \
+    res6_r OP0## = src_a0r * src_br;                    \
+    res6_r OP1## = src_a0i * src_bi;                    \
+    res6_i OP2## = (OP4 src_a0r) * src_bi;              \
+    res6_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    res7_r OP0## = src_a1r * src_br;                    \
+    res7_r OP1## = src_a1i * src_bi;                    \
+    res7_i OP2## = (OP4 src_a1r) * src_bi;              \
+    res7_i OP3## = src_a1i * src_br;                    \
+}
+
+/* 8 rows of A times 2 columns of B. */
+#define CGEMM_KERNEL_8X2_MSA(OP0, OP1, OP2, OP3, OP4)   \
+{                                                       \
+    LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
+    src_b0 = LD_SP(pb0);                                \
+                                                        \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);    \
+    PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i);    \
+                                                        \
+    /* 0th col */                                       \
+    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);            \
+    res0_r OP0## = src_a0r * src_br;                    \
+    res0_r OP1## = src_a0i * src_bi;                    \
+    res0_i OP2## = (OP4 src_a0r) * src_bi;              \
+    res0_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    res1_r OP0## = src_a1r * src_br;                    \
+    res1_r OP1## = src_a1i * src_bi;                    \
+    res1_i OP2## = (OP4 src_a1r) * src_bi;              \
+    res1_i OP3## = src_a1i * src_br;                    \
+                                                        \
+    /* 1st col */                                       \
+    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);            \
+    res2_r OP0## = src_a0r * src_br;                    \
+    res2_r OP1## = src_a0i * src_bi;                    \
+    res2_i OP2## = (OP4 src_a0r) * src_bi;              \
+    res2_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    res3_r OP0## = src_a1r * src_br;                    \
+    res3_r OP1## = src_a1i * src_bi;                    \
+    res3_i OP2## = (OP4 src_a1r) * src_bi;              \
+    res3_i OP3## = src_a1i * src_br;                    \
+}
+
+/* 8 rows of A times 1 column of B.  The single B element (one re/im pair,
+ * 64 bits) is loaded via a double cast and then splatted. */
+#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4)                 \
+{                                                                     \
+    LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3);               \
+    src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0));  \
+    SPLATI_W2_SP(src_bi, 0, src_br, src_bi);                          \
+                                                                      \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);                  \
+    PCKEVOD_W2_SP(src_a3, src_a2, src_a1r, src_a1i);                  \
+                                                                      \
+    /* 0th col */                                                     \
+    res0_r OP0## = src_a0r * src_br;                                  \
+    res0_r OP1## = src_a0i * src_bi;                                  \
+    res0_i OP2## = (OP4 src_a0r) * src_bi;                            \
+    res0_i OP3## = src_a0i * src_br;                                  \
+                                                                      \
+    res1_r OP0## = src_a1r * src_br;                                  \
+    res1_r OP1## = src_a1i * src_bi;                                  \
+    res1_i OP2## = (OP4 src_a1r) * src_bi;                            \
+    res1_i OP3## = src_a1i * src_br;                                  \
+}
+
+/* 4 rows of A times 4 columns of B. */
+#define CGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4)   \
+{                                                       \
+    LD_SP2_INC(pa0, 4, src_a0, src_a1);                 \
+    LD_SP2_INC(pb0, 4, src_b0, src_b1);                 \
+                                                        \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);    \
+                                                        \
+    /* 0th col */                                       \
+    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);            \
+    res0_r OP0## = src_a0r * src_br;                    \
+    res0_r OP1## = src_a0i * src_bi;                    \
+    res0_i OP2## = OP4 src_a0r * src_bi;                \
+    res0_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    /* 1st col */                                       \
+    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);            \
+    res2_r OP0## = src_a0r * src_br;                    \
+    res2_r OP1## = src_a0i * src_bi;                    \
+    res2_i OP2## = OP4 src_a0r * src_bi;                \
+    res2_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    /* 2nd col */                                       \
+    SPLATI_W2_SP(src_b1, 0, src_br, src_bi);            \
+    res4_r OP0## = src_a0r * src_br;                    \
+    res4_r OP1## = src_a0i * src_bi;                    \
+    res4_i OP2## = OP4 src_a0r * src_bi;                \
+    res4_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    /* 3rd col */                                       \
+    SPLATI_W2_SP(src_b1, 2, src_br, src_bi);            \
+    res6_r OP0## = src_a0r * src_br;                    \
+    res6_r OP1## = src_a0i * src_bi;                    \
+    res6_i OP2## = OP4 src_a0r * src_bi;                \
+    res6_i OP3## = src_a0i * src_br;                    \
+}
+
+/* 4 rows of A times 2 columns of B. */
+#define CGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4)   \
+{                                                       \
+    LD_SP2_INC(pa0, 4, src_a0, src_a1);                 \
+    src_b0 = LD_SP(pb0);                                \
+                                                        \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);    \
+                                                        \
+    /* 0th col */                                       \
+    SPLATI_W2_SP(src_b0, 0, src_br, src_bi);            \
+    res0_r OP0## = src_a0r * src_br;                    \
+    res0_r OP1## = src_a0i * src_bi;                    \
+    res0_i OP2## = OP4 src_a0r * src_bi;                \
+    res0_i OP3## = src_a0i * src_br;                    \
+                                                        \
+    /* 1st col */                                       \
+    SPLATI_W2_SP(src_b0, 2, src_br, src_bi);            \
+    res2_r OP0## = src_a0r * src_br;                    \
+    res2_r OP1## = src_a0i * src_bi;                    \
+    res2_i OP2## = OP4 src_a0r * src_bi;                \
+    res2_i OP3## = src_a0i * src_br;                    \
+}
+
+/* 4 rows of A times 1 column of B (64-bit load of the single B element). */
+#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4)                 \
+{                                                                     \
+    LD_SP2_INC(pa0, 4, src_a0, src_a1);                               \
+    src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0));  \
+    SPLATI_W2_SP(src_bi, 0, src_br, src_bi);                          \
+                                                                      \
+    PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i);                  \
+                                                                      \
+    /* 0th col */                                                     \
+    res0_r OP0## = src_a0r * src_br;                                  \
+    res0_r OP1## = src_a0i * src_bi;                                  \
+    res0_i OP2## = OP4 src_a0r * src_bi;                              \
+    res0_i OP3## = src_a0i * src_br;                                  \
+}
+
+/* Scalar: 2 rows of A times 4 columns of B (res0..res15 = re/im pairs). */
+#define CGEMM_KERNEL_2X4(OP0, OP1, OP2, OP3, OP4)   \
+{                                                   \
+    /* 0th col */                                   \
+    a0_r = pa0[0];                                  \
+    a0_i = pa0[1];                                  \
+    b0_r = pb0[0];                                  \
+    b0_i = pb0[1];                                  \
+                                                    \
+    res0 OP0## = a0_r * b0_r;                       \
+    res0 OP1## = a0_i * b0_i;                       \
+    res1 OP2## = OP4 a0_r * b0_i;                   \
+    res1 OP3## = a0_i * b0_r;                       \
+                                                    \
+    a1_r = pa0[2];                                  \
+    a1_i = pa0[3];                                  \
+    res2 OP0## = a1_r * b0_r;                       \
+    res2 OP1## = a1_i * b0_i;                       \
+    res3 OP2## = OP4 a1_r * b0_i;                   \
+    res3 OP3## = a1_i * b0_r;                       \
+                                                    \
+    /* 1st col */                                   \
+    b1_r = pb0[2];                                  \
+    b1_i = pb0[3];                                  \
+    res4 OP0## = a0_r * b1_r;                       \
+    res4 OP1## = a0_i * b1_i;                       \
+    res5 OP2## = OP4 a0_r * b1_i;                   \
+    res5 OP3## = a0_i * b1_r;                       \
+                                                    \
+    res6 OP0## = a1_r * b1_r;                       \
+    res6 OP1## = a1_i * b1_i;                       \
+    res7 OP2## = OP4 a1_r * b1_i;                   \
+    res7 OP3## = a1_i * b1_r;                       \
+                                                    \
+    /* 2nd col */                                   \
+    b2_r = pb0[4];                                  \
+    b2_i = pb0[5];                                  \
+    res8 OP0## = a0_r * b2_r;                       \
+    res8 OP1## = a0_i * b2_i;                       \
+    res9 OP2## = OP4 a0_r * b2_i;                   \
+    res9 OP3## = a0_i * b2_r;                       \
+                                                    \
+    res10 OP0## = a1_r * b2_r;                      \
+    res10 OP1## = a1_i * b2_i;                      \
+    res11 OP2## = OP4 a1_r * b2_i;                  \
+    res11 OP3## = a1_i * b2_r;                      \
+                                                    \
+    /* 3rd col */                                   \
+    b3_r = pb0[6];                                  \
+    b3_i = pb0[7];                                  \
+    res12 OP0## = a0_r * b3_r;                      \
+    res12 OP1## = a0_i * b3_i;                      \
+    res13 OP2## = OP4 a0_r * b3_i;                  \
+    res13 OP3## = a0_i * b3_r;                      \
+                                                    \
+    res14 OP0## = a1_r * b3_r;                      \
+    res14 OP1## = a1_i * b3_i;                      \
+    res15 OP2## = OP4 a1_r * b3_i;                  \
+    res15 OP3## = a1_i * b3_r;                      \
+}
+
+/* Scalar: 2 rows of A times 2 columns of B. */
+#define CGEMM_KERNEL_2X2(OP0, OP1, OP2, OP3, OP4)   \
+{                                                   \
+    /* 0th col */                                   \
+    a0_r = pa0[0];                                  \
+    a0_i = pa0[1];                                  \
+    b0_r = pb0[0];                                  \
+    b0_i = pb0[1];                                  \
+                                                    \
+    res0 OP0## = a0_r * b0_r;                       \
+    res0 OP1## = a0_i * b0_i;                       \
+    res1 OP2## = OP4 a0_r * b0_i;                   \
+    res1 OP3## = a0_i * b0_r;                       \
+                                                    \
+    a1_r = pa0[2];                                  \
+    a1_i = pa0[3];                                  \
+    res2 OP0## = a1_r * b0_r;                       \
+    res2 OP1## = a1_i * b0_i;                       \
+    res3 OP2## = OP4 a1_r * b0_i;                   \
+    res3 OP3## = a1_i * b0_r;                       \
+                                                    \
+    /* 1st col */                                   \
+    b1_r = pb0[2];                                  \
+    b1_i = pb0[3];                                  \
+    res4 OP0## = a0_r * b1_r;                       \
+    res4 OP1## = a0_i * b1_i;                       \
+    res5 OP2## = OP4 a0_r * b1_i;                   \
+    res5 OP3## = a0_i * b1_r;                       \
+                                                    \
+    res6 OP0## = a1_r * b1_r;                       \
+    res6 OP1## = a1_i * b1_i;                       \
+    res7 OP2## = OP4 a1_r * b1_i;                   \
+    res7 OP3## = a1_i * b1_r;                       \
+}
+
+/* Scalar: 2 rows of A times 1 column of B. */
+#define CGEMM_KERNEL_2X1(OP0, OP1, OP2, OP3, OP4)   \
+{                                                   \
+    /* 0th col */                                   \
+    a0_r = pa0[0];                                  \
+    a0_i = pa0[1];                                  \
+    b0_r = pb0[0];                                  \
+    b0_i = pb0[1];                                  \
+                                                    \
+    res0 OP0## = a0_r * b0_r;                       \
+    res0 OP1## = a0_i * b0_i;                       \
+    res1 OP2## = OP4 a0_r * b0_i;                   \
+    res1 OP3## = a0_i * b0_r;                       \
+                                                    \
+    a1_r = pa0[2];                                  \
+    a1_i = pa0[3];                                  \
+    res2 OP0## = a1_r * b0_r;                       \
+    res2 OP1## = a1_i * b0_i;                       \
+    res3 OP2## = OP4 a1_r * b0_i;                   \
+    res3 OP3## = a1_i * b0_r;                       \
+}
+
+/* Scalar: 1 row of A times 4 columns of B. */
+#define CGEMM_KERNEL_1X4(OP0, OP1, OP2, OP3, OP4)   \
+{                                                   \
+    /* 0th col */                                   \
+    a0_r = pa0[0];                                  \
+    a0_i = pa0[1];                                  \
+    b0_r = pb0[0];                                  \
+    b0_i = pb0[1];                                  \
+                                                    \
+    res0 OP0## = a0_r * b0_r;                       \
+    res0 OP1## = a0_i * b0_i;                       \
+    res1 OP2## = OP4 a0_r * b0_i;                   \
+    res1 OP3## = a0_i * b0_r;                       \
+                                                    \
+    /* 1st col */                                   \
+    b1_r = pb0[2];                                  \
+    b1_i = pb0[3];                                  \
+    res2 OP0## = a0_r * b1_r;                       \
+    res2 OP1## = a0_i * b1_i;                       \
+    res3 OP2## = OP4 a0_r * b1_i;                   \
+    res3 OP3## = a0_i * b1_r;                       \
+                                                    \
+    /* 2nd col */                                   \
+    b2_r = pb0[4];                                  \
+    b2_i = pb0[5];                                  \
+    res4 OP0## = a0_r * b2_r;                       \
+    res4 OP1## = a0_i * b2_i;                       \
+    res5 OP2## = OP4 a0_r * b2_i;                   \
+    res5 OP3## = a0_i * b2_r;                       \
+                                                    \
+    /* 3rd col */                                   \
+    b3_r = pb0[6];                                  \
+    b3_i = pb0[7];                                  \
+    res6 OP0## = a0_r * b3_r;                       \
+    res6 OP1## = a0_i * b3_i;                       \
+    res7 OP2## = OP4 a0_r * b3_i;                   \
+    res7 OP3## = a0_i * b3_r;                       \
+}
+
+/* Scalar: 1 row of A times 2 columns of B. */
+#define CGEMM_KERNEL_1X2(OP0, OP1, OP2, OP3, OP4)   \
+{                                                   \
+    /* 0th col */                                   \
+    a0_r = pa0[0];                                  \
+    a0_i = pa0[1];                                  \
+    b0_r = pb0[0];                                  \
+    b0_i = pb0[1];                                  \
+                                                    \
+    res0 OP0## = a0_r * b0_r;                       \
+    res0 OP1## = a0_i * b0_i;                       \
+    res1 OP2## = OP4 a0_r * b0_i;                   \
+    res1 OP3## = a0_i * b0_r;                       \
+                                                    \
+    /* 1st col */                                   \
+    b1_r = pb0[2];                                  \
+    b1_i = pb0[3];                                  \
+    res2 OP0## = a0_r * b1_r;                       \
+    res2 OP1## = a0_i * b1_i;                       \
+    res3 OP2## = OP4 a0_r * b1_i;                   \
+    res3 OP3## = a0_i * b1_r;                       \
+}
+
+/* Scalar: 1 row of A times 1 column of B. */
+#define CGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4)   \
+{                                                   \
+    /* 0th col */                                   \
+    a0_r = pa0[0];                                  \
+    a0_i = pa0[1];                                  \
+    b0_r = pb0[0];                                  \
+    b0_i = pb0[1];                                  \
+                                                    \
+    res0 OP0## = a0_r * b0_r;                       \
+    res0 OP1## = a0_i * b0_i;                       \
+    res1 OP2## = OP4 a0_r * b0_i;                   \
+    res1 OP3## = a0_i * b0_r;                       \
+}
+
+/* GEMM-path store macros: C += alpha * res, where alpha and res are
+ * complex, i.e.  c_r += alpha_r*res_r - alpha_i*res_i and
+ * c_i += alpha_r*res_i + alpha_i*res_r.  The _MSA variants load C,
+ * de-interleave (PCKEVOD), update the real/imag vectors, re-interleave
+ * (ILVRL) and store; the plain variants are the scalar tails. */
+
+/* Store an 8x4 block into columns pc0..pc3. */
+#define CGEMM_SCALE_8X4_MSA                          \
+{                                                    \
+    LD_SP4(pc0, 4, dst0, dst1, dst2, dst3);          \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);       \
+                                                     \
+    dst0_r += alpha_r * res0_r;                      \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i += alpha_r * res0_i;                      \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    dst1_r += alpha_r * res1_r;                      \
+    dst1_r -= alpha_i * res1_i;                      \
+    dst1_i += alpha_r * res1_i;                      \
+    dst1_i += alpha_i * res1_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);      \
+                                                     \
+    LD_SP4(pc1, 4, dst0, dst1, dst2, dst3);          \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);       \
+                                                     \
+    dst0_r += alpha_r * res2_r;                      \
+    dst0_r -= alpha_i * res2_i;                      \
+    dst0_i += alpha_r * res2_i;                      \
+    dst0_i += alpha_i * res2_r;                      \
+                                                     \
+    dst1_r += alpha_r * res3_r;                      \
+    dst1_r -= alpha_i * res3_i;                      \
+    dst1_i += alpha_r * res3_i;                      \
+    dst1_i += alpha_i * res3_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);      \
+                                                     \
+    LD_SP4(pc2, 4, dst0, dst1, dst2, dst3);          \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);       \
+                                                     \
+    dst0_r += alpha_r * res4_r;                      \
+    dst0_r -= alpha_i * res4_i;                      \
+    dst0_i += alpha_r * res4_i;                      \
+    dst0_i += alpha_i * res4_r;                      \
+                                                     \
+    dst1_r += alpha_r * res5_r;                      \
+    dst1_r -= alpha_i * res5_i;                      \
+    dst1_i += alpha_r * res5_i;                      \
+    dst1_i += alpha_i * res5_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4);      \
+                                                     \
+    LD_SP4(pc3, 4, dst0, dst1, dst2, dst3);          \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);       \
+                                                     \
+    dst0_r += alpha_r * res6_r;                      \
+    dst0_r -= alpha_i * res6_i;                      \
+    dst0_i += alpha_r * res6_i;                      \
+    dst0_i += alpha_i * res6_r;                      \
+                                                     \
+    dst1_r += alpha_r * res7_r;                      \
+    dst1_r -= alpha_i * res7_i;                      \
+    dst1_i += alpha_r * res7_i;                      \
+    dst1_i += alpha_i * res7_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4);      \
+}
+
+/* Store an 8x2 block into columns pc0, pc1. */
+#define CGEMM_SCALE_8X2_MSA                          \
+{                                                    \
+    LD_SP4(pc0, 4, dst0, dst1, dst2, dst3);          \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);       \
+                                                     \
+    dst0_r += alpha_r * res0_r;                      \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i += alpha_r * res0_i;                      \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    dst1_r += alpha_r * res1_r;                      \
+    dst1_r -= alpha_i * res1_i;                      \
+    dst1_i += alpha_r * res1_i;                      \
+    dst1_i += alpha_i * res1_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);      \
+                                                     \
+    LD_SP4(pc1, 4, dst0, dst1, dst2, dst3);          \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);       \
+                                                     \
+    dst0_r += alpha_r * res2_r;                      \
+    dst0_r -= alpha_i * res2_i;                      \
+    dst0_i += alpha_r * res2_i;                      \
+    dst0_i += alpha_i * res2_r;                      \
+                                                     \
+    dst1_r += alpha_r * res3_r;                      \
+    dst1_r -= alpha_i * res3_i;                      \
+    dst1_i += alpha_r * res3_i;                      \
+    dst1_i += alpha_i * res3_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);      \
+}
+
+/* Store an 8x1 block into column pc0. */
+#define CGEMM_SCALE_8X1_MSA                          \
+{                                                    \
+    LD_SP4(pc0, 4, dst0, dst1, dst2, dst3);          \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+    PCKEVOD_W2_SP(dst3, dst2, dst1_r, dst1_i);       \
+                                                     \
+    dst0_r += alpha_r * res0_r;                      \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i += alpha_r * res0_i;                      \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    dst1_r += alpha_r * res1_r;                      \
+    dst1_r -= alpha_i * res1_i;                      \
+    dst1_i += alpha_r * res1_i;                      \
+    dst1_i += alpha_i * res1_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);      \
+}
+
+/* Store a 4x4 block into columns pc0..pc3. */
+#define CGEMM_SCALE_4X4_MSA                          \
+{                                                    \
+    LD_SP2(pc0, 4, dst0, dst1);                      \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+                                                     \
+    dst0_r += alpha_r * res0_r;                      \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i += alpha_r * res0_i;                      \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc0, 4);                  \
+                                                     \
+    LD_SP2(pc1, 4, dst0, dst1);                      \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+                                                     \
+    dst0_r += alpha_r * res2_r;                      \
+    dst0_r -= alpha_i * res2_i;                      \
+    dst0_i += alpha_r * res2_i;                      \
+    dst0_i += alpha_i * res2_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc1, 4);                  \
+                                                     \
+    LD_SP2(pc2, 4, dst0, dst1);                      \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+                                                     \
+    dst0_r += alpha_r * res4_r;                      \
+    dst0_r -= alpha_i * res4_i;                      \
+    dst0_i += alpha_r * res4_i;                      \
+    dst0_i += alpha_i * res4_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc2, 4);                  \
+                                                     \
+    LD_SP2(pc3, 4, dst0, dst1);                      \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+                                                     \
+    dst0_r += alpha_r * res6_r;                      \
+    dst0_r -= alpha_i * res6_i;                      \
+    dst0_i += alpha_r * res6_i;                      \
+    dst0_i += alpha_i * res6_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc3, 4);                  \
+}
+
+/* Store a 4x2 block into columns pc0, pc1. */
+#define CGEMM_SCALE_4X2_MSA                          \
+{                                                    \
+    LD_SP2(pc0, 4, dst0, dst1);                      \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+                                                     \
+    dst0_r += alpha_r * res0_r;                      \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i += alpha_r * res0_i;                      \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc0, 4);                  \
+                                                     \
+    LD_SP2(pc1, 4, dst0, dst1);                      \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+                                                     \
+    dst0_r += alpha_r * res2_r;                      \
+    dst0_r -= alpha_i * res2_i;                      \
+    dst0_i += alpha_r * res2_i;                      \
+    dst0_i += alpha_i * res2_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc1, 4);                  \
+}
+
+/* Store a 4x1 block into column pc0. */
+#define CGEMM_SCALE_4X1_MSA                          \
+{                                                    \
+    LD_SP2(pc0, 4, dst0, dst1);                      \
+                                                     \
+    PCKEVOD_W2_SP(dst1, dst0, dst0_r, dst0_i);       \
+                                                     \
+    dst0_r += alpha_r * res0_r;                      \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i += alpha_r * res0_i;                      \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc0, 4);                  \
+}
+
+/* Scalar store of a 2x4 block (res2k/res2k+1 are the re/im parts). */
+#define CGEMM_SCALE_2X4                              \
+{                                                    \
+    /* 0th col */                                    \
+    pc0[0] += alphar * res0;                         \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] += alphar * res1;                         \
+    pc0[1] += alphai * res0;                         \
+    pc0[2] += alphar * res2;                         \
+    pc0[2] -= alphai * res3;                         \
+    pc0[3] += alphar * res3;                         \
+    pc0[3] += alphai * res2;                         \
+                                                     \
+    /* 1st col */                                    \
+    pc1[0] += alphar * res4;                         \
+    pc1[0] -= alphai * res5;                         \
+    pc1[1] += alphar * res5;                         \
+    pc1[1] += alphai * res4;                         \
+    pc1[2] += alphar * res6;                         \
+    pc1[2] -= alphai * res7;                         \
+    pc1[3] += alphar * res7;                         \
+    pc1[3] += alphai * res6;                         \
+                                                     \
+    /* 2nd col */                                    \
+    pc2[0] += alphar * res8;                         \
+    pc2[0] -= alphai * res9;                         \
+    pc2[1] += alphar * res9;                         \
+    pc2[1] += alphai * res8;                         \
+    pc2[2] += alphar * res10;                        \
+    pc2[2] -= alphai * res11;                        \
+    pc2[3] += alphar * res11;                        \
+    pc2[3] += alphai * res10;                        \
+                                                     \
+    /* 3rd col */                                    \
+    pc3[0] += alphar * res12;                        \
+    pc3[0] -= alphai * res13;                        \
+    pc3[1] += alphar * res13;                        \
+    pc3[1] += alphai * res12;                        \
+    pc3[2] += alphar * res14;                        \
+    pc3[2] -= alphai * res15;                        \
+    pc3[3] += alphar * res15;                        \
+    pc3[3] += alphai * res14;                        \
+}
+
+/* Scalar store of a 2x2 block. */
+#define CGEMM_SCALE_2X2                              \
+{                                                    \
+    /* 0th col */                                    \
+    pc0[0] += alphar * res0;                         \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] += alphar * res1;                         \
+    pc0[1] += alphai * res0;                         \
+    pc0[2] += alphar * res2;                         \
+    pc0[2] -= alphai * res3;                         \
+    pc0[3] += alphar * res3;                         \
+    pc0[3] += alphai * res2;                         \
+                                                     \
+    /* 1st col */                                    \
+    pc1[0] += alphar * res4;                         \
+    pc1[0] -= alphai * res5;                         \
+    pc1[1] += alphar * res5;                         \
+    pc1[1] += alphai * res4;                         \
+    pc1[2] += alphar * res6;                         \
+    pc1[2] -= alphai * res7;                         \
+    pc1[3] += alphar * res7;                         \
+    pc1[3] += alphai * res6;                         \
+}
+
+/* Scalar store of a 2x1 block (two complex elements in column pc0). */
+#define CGEMM_SCALE_2X1                              \
+{                                                    \
+    pc0[0] += alphar * res0;                         \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] += alphar * res1;                         \
+    pc0[1] += alphai * res0;                         \
+                                                     \
+    pc0[2] += alphar * res2;                         \
+    pc0[2] -= alphai * res3;                         \
+    pc0[3] += alphar * res3;                         \
+    pc0[3] += alphai * res2;                         \
+}
+
+/* Scalar store of a 1x4 block (one complex element per column). */
+#define CGEMM_SCALE_1X4                              \
+{                                                    \
+    pc0[0] += alphar * res0;                         \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] += alphar * res1;                         \
+    pc0[1] += alphai * res0;                         \
+                                                     \
+    pc1[0] += alphar * res2;                         \
+    pc1[0] -= alphai * res3;                         \
+    pc1[1] += alphar * res3;                         \
+    pc1[1] += alphai * res2;                         \
+                                                     \
+    pc2[0] += alphar * res4;                         \
+    pc2[0] -= alphai * res5;                         \
+    pc2[1] += alphar * res5;                         \
+    pc2[1] += alphai * res4;                         \
+                                                     \
+    pc3[0] += alphar * res6;                         \
+    pc3[0] -= alphai * res7;                         \
+    pc3[1] += alphar * res7;                         \
+    pc3[1] += alphai * res6;                         \
+}
+
+/* Scalar store of a 1x2 block: one complex element in each of the two
+ * columns pc0 and pc1.  C += alpha * res (complex multiply).
+ * Fix: the second column's single element lives at pc1[0]/pc1[1]; the
+ * original wrote pc1[2]/pc1[3], which updates the wrong C element (and
+ * leaves pc1[0..1] unscaled) — compare CGEMM_SCALE_1X4. */
+#define CGEMM_SCALE_1X2                              \
+{                                                    \
+    pc0[0] += alphar * res0;                         \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] += alphar * res1;                         \
+    pc0[1] += alphai * res0;                         \
+                                                     \
+    pc1[0] += alphar * res2;                         \
+    pc1[0] -= alphai * res3;                         \
+    pc1[1] += alphar * res3;                         \
+    pc1[1] += alphai * res2;                         \
+}
+
+/* Scalar store of a 1x1 block: single complex element, C += alpha * res. */
+#define CGEMM_SCALE_1X1                              \
+{                                                    \
+    pc0[0] += alphar * res0;                         \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] += alphar * res1;                         \
+    pc0[1] += alphai * res0;                         \
+}
+
+/* TRMM-path store macros: C = alpha * res (C is overwritten, not
+ * read-modify-written as in the CGEMM_SCALE_* macros above). */
+
+/* Overwrite an 8x4 block in columns pc0..pc3. */
+#define CGEMM_TRMM_SCALE_8X4_MSA                     \
+{                                                    \
+    dst0_r = alpha_r * res0_r;                       \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i = alpha_r * res0_i;                       \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    dst1_r = alpha_r * res1_r;                       \
+    dst1_r -= alpha_i * res1_i;                      \
+    dst1_i = alpha_r * res1_i;                       \
+    dst1_i += alpha_i * res1_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);      \
+                                                     \
+    dst0_r = alpha_r * res2_r;                       \
+    dst0_r -= alpha_i * res2_i;                      \
+    dst0_i = alpha_r * res2_i;                       \
+    dst0_i += alpha_i * res2_r;                      \
+                                                     \
+    dst1_r = alpha_r * res3_r;                       \
+    dst1_r -= alpha_i * res3_i;                      \
+    dst1_i = alpha_r * res3_i;                       \
+    dst1_i += alpha_i * res3_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);      \
+                                                     \
+    dst0_r = alpha_r * res4_r;                       \
+    dst0_r -= alpha_i * res4_i;                      \
+    dst0_i = alpha_r * res4_i;                       \
+    dst0_i += alpha_i * res4_r;                      \
+                                                     \
+    dst1_r = alpha_r * res5_r;                       \
+    dst1_r -= alpha_i * res5_i;                      \
+    dst1_i = alpha_r * res5_i;                       \
+    dst1_i += alpha_i * res5_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc2, 4);      \
+                                                     \
+    dst0_r = alpha_r * res6_r;                       \
+    dst0_r -= alpha_i * res6_i;                      \
+    dst0_i = alpha_r * res6_i;                       \
+    dst0_i += alpha_i * res6_r;                      \
+                                                     \
+    dst1_r = alpha_r * res7_r;                       \
+    dst1_r -= alpha_i * res7_i;                      \
+    dst1_i = alpha_r * res7_i;                       \
+    dst1_i += alpha_i * res7_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc3, 4);      \
+}
+
+/* Overwrite an 8x2 block in columns pc0, pc1. */
+#define CGEMM_TRMM_SCALE_8X2_MSA                     \
+{                                                    \
+    dst0_r = alpha_r * res0_r;                       \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i = alpha_r * res0_i;                       \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    dst1_r = alpha_r * res1_r;                       \
+    dst1_r -= alpha_i * res1_i;                      \
+    dst1_i = alpha_r * res1_i;                       \
+    dst1_i += alpha_i * res1_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);      \
+                                                     \
+    dst0_r = alpha_r * res2_r;                       \
+    dst0_r -= alpha_i * res2_i;                      \
+    dst0_i = alpha_r * res2_i;                       \
+    dst0_i += alpha_i * res2_r;                      \
+                                                     \
+    dst1_r = alpha_r * res3_r;                       \
+    dst1_r -= alpha_i * res3_i;                      \
+    dst1_i = alpha_r * res3_i;                       \
+    dst1_i += alpha_i * res3_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc1, 4);      \
+}
+
+/* Overwrite an 8x1 block in column pc0. */
+#define CGEMM_TRMM_SCALE_8X1_MSA                     \
+{                                                    \
+    dst0_r = alpha_r * res0_r;                       \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i = alpha_r * res0_i;                       \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    dst1_r = alpha_r * res1_r;                       \
+    dst1_r -= alpha_i * res1_i;                      \
+    dst1_i = alpha_r * res1_i;                       \
+    dst1_i += alpha_i * res1_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+    ILVRL_W2_SP(dst1_i, dst1_r, dst2, dst3);         \
+                                                     \
+    ST_SP4_INC(dst0, dst1, dst2, dst3, pc0, 4);      \
+}
+
+/* Overwrite a 4x4 block in columns pc0..pc3. */
+#define CGEMM_TRMM_SCALE_4X4_MSA                     \
+{                                                    \
+    dst0_r = alpha_r * res0_r;                       \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i = alpha_r * res0_i;                       \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc0, 4);                  \
+                                                     \
+    dst0_r = alpha_r * res2_r;                       \
+    dst0_r -= alpha_i * res2_i;                      \
+    dst0_i = alpha_r * res2_i;                       \
+    dst0_i += alpha_i * res2_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc1, 4);                  \
+                                                     \
+    dst0_r = alpha_r * res4_r;                       \
+    dst0_r -= alpha_i * res4_i;                      \
+    dst0_i = alpha_r * res4_i;                       \
+    dst0_i += alpha_i * res4_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc2, 4);                  \
+                                                     \
+    dst0_r = alpha_r * res6_r;                       \
+    dst0_r -= alpha_i * res6_i;                      \
+    dst0_i = alpha_r * res6_i;                       \
+    dst0_i += alpha_i * res6_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc3, 4);                  \
+}
+
+/* Overwrite a 4x2 block in columns pc0, pc1. */
+#define CGEMM_TRMM_SCALE_4X2_MSA                     \
+{                                                    \
+    dst0_r = alpha_r * res0_r;                       \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i = alpha_r * res0_i;                       \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc0, 4);                  \
+                                                     \
+    dst0_r = alpha_r * res2_r;                       \
+    dst0_r -= alpha_i * res2_i;                      \
+    dst0_i = alpha_r * res2_i;                       \
+    dst0_i += alpha_i * res2_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc1, 4);                  \
+}
+
+/* Overwrite a 4x1 block in column pc0. */
+#define CGEMM_TRMM_SCALE_4X1_MSA                     \
+{                                                    \
+    dst0_r = alpha_r * res0_r;                       \
+    dst0_r -= alpha_i * res0_i;                      \
+    dst0_i = alpha_r * res0_i;                       \
+    dst0_i += alpha_i * res0_r;                      \
+                                                     \
+    ILVRL_W2_SP(dst0_i, dst0_r, dst0, dst1);         \
+                                                     \
+    ST_SP2_INC(dst0, dst1, pc0, 4);                  \
+}
+
+/* Scalar overwrite of a 2x4 block. */
+#define CGEMM_TRMM_SCALE_2X4                         \
+{                                                    \
+    /* 0th col */                                    \
+    pc0[0] = alphar * res0;                          \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] = alphar * res1;                          \
+    pc0[1] += alphai * res0;                         \
+    pc0[2] = alphar * res2;                          \
+    pc0[2] -= alphai * res3;                         \
+    pc0[3] = alphar * res3;                          \
+    pc0[3] += alphai * res2;                         \
+                                                     \
+    /* 1st col */                                    \
+    pc1[0] = alphar * res4;                          \
+    pc1[0] -= alphai * res5;                         \
+    pc1[1] = alphar * res5;                          \
+    pc1[1] += alphai * res4;                         \
+    pc1[2] = alphar * res6;                          \
+    pc1[2] -= alphai * res7;                         \
+    pc1[3] = alphar * res7;                          \
+    pc1[3] += alphai * res6;                         \
+                                                     \
+    /* 2nd col */                                    \
+    pc2[0] = alphar * res8;                          \
+    pc2[0] -= alphai * res9;                         \
+    pc2[1] = alphar * res9;                          \
+    pc2[1] += alphai * res8;                         \
+    pc2[2] = alphar * res10;                         \
+    pc2[2] -= alphai * res11;                        \
+    pc2[3] = alphar * res11;                         \
+    pc2[3] += alphai * res10;                        \
+                                                     \
+    /* 3rd col */                                    \
+    pc3[0] = alphar * res12;                         \
+    pc3[0] -= alphai * res13;                        \
+    pc3[1] = alphar * res13;                         \
+    pc3[1] += alphai * res12;                        \
+    pc3[2] = alphar * res14;                         \
+    pc3[2] -= alphai * res15;                        \
+    pc3[3] = alphar * res15;                         \
+    pc3[3] += alphai * res14;                        \
+}
+
+/* Scalar overwrite of a 2x2 block. */
+#define CGEMM_TRMM_SCALE_2X2                         \
+{                                                    \
+    /* 0th col */                                    \
+    pc0[0] = alphar * res0;                          \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] = alphar * res1;                          \
+    pc0[1] += alphai * res0;                         \
+    pc0[2] = alphar * res2;                          \
+    pc0[2] -= alphai * res3;                         \
+    pc0[3] = alphar * res3;                          \
+    pc0[3] += alphai * res2;                         \
+                                                     \
+    /* 1st col */                                    \
+    pc1[0] = alphar * res4;                          \
+    pc1[0] -= alphai * res5;                         \
+    pc1[1] = alphar * res5;                          \
+    pc1[1] += alphai * res4;                         \
+    pc1[2] = alphar * res6;                          \
+    pc1[2] -= alphai * res7;                         \
+    pc1[3] = alphar * res7;                          \
+    pc1[3] += alphai * res6;                         \
+}
+
+/* Scalar overwrite of a 2x1 block (two complex elements in column pc0). */
+#define CGEMM_TRMM_SCALE_2X1                         \
+{                                                    \
+    pc0[0] = alphar * res0;                          \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] = alphar * res1;                          \
+    pc0[1] += alphai * res0;                         \
+                                                     \
+    pc0[2] = alphar * res2;                          \
+    pc0[2] -= alphai * res3;                         \
+    pc0[3] = alphar * res3;                          \
+    pc0[3] += alphai * res2;                         \
+}
+
+/* Scalar overwrite of a 1x4 block (one complex element per column). */
+#define CGEMM_TRMM_SCALE_1X4                         \
+{                                                    \
+    pc0[0] = alphar * res0;                          \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] = alphar * res1;                          \
+    pc0[1] += alphai * res0;                         \
+                                                     \
+    pc1[0] = alphar * res2;                          \
+    pc1[0] -= alphai * res3;                         \
+    pc1[1] = alphar * res3;                          \
+    pc1[1] += alphai * res2;                         \
+                                                     \
+    pc2[0] = alphar * res4;                          \
+    pc2[0] -= alphai * res5;                         \
+    pc2[1] = alphar * res5;                          \
+    pc2[1] += alphai * res4;                         \
+                                                     \
+    pc3[0] = alphar * res6;                          \
+    pc3[0] -= alphai * res7;                         \
+    pc3[1] = alphar * res7;                          \
+    pc3[1] += alphai * res6;                         \
+}
+
+/* Scalar overwrite of a 1x2 block: one complex element in each of the two
+ * columns pc0 and pc1.  C = alpha * res (complex multiply).
+ * Fix: the second column's single element lives at pc1[0]/pc1[1]; the
+ * original wrote pc1[2]/pc1[3], which stores to the wrong C element and
+ * leaves pc1[0..1] unwritten — compare CGEMM_TRMM_SCALE_1X4. */
+#define CGEMM_TRMM_SCALE_1X2                         \
+{                                                    \
+    pc0[0] = alphar * res0;                          \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] = alphar * res1;                          \
+    pc0[1] += alphai * res0;                         \
+                                                     \
+    pc1[0] = alphar * res2;                          \
+    pc1[0] -= alphai * res3;                         \
+    pc1[1] = alphar * res3;                          \
+    pc1[1] += alphai * res2;                         \
+}
+
+/* Scalar overwrite of a 1x1 block: single complex element, C = alpha * res. */
+#define CGEMM_TRMM_SCALE_1X1                         \
+{                                                    \
+    pc0[0] = alphar * res0;                          \
+    pc0[0] -= alphai * res1;                         \
+    pc0[1] = alphar * res1;                          \
+    pc0[1] += alphai * res0;                         \
+}
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
+ FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc
+#ifdef TRMMKERNEL
+ , BLASLONG offset
+#endif
+ )
+{
+ BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
+ FLOAT *pc0, *pc1, *pc2, *pc3;
+ FLOAT *pa0, *pb0;
+ FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
+ FLOAT res8, res9, res10, res11, res12, res13, res14, res15;
+ FLOAT a0_r, a1_r;
+ FLOAT a0_i, a1_i;
+ FLOAT b0_r, b1_r, b2_r, b3_r;
+ FLOAT b0_i, b1_i, b2_i, b3_i;
+ v4f32 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1;
+ v4f32 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
+ v4f32 dst0, dst1, dst2, dst3;
+ v4f32 alpha_r, alpha_i;
+ v4f32 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
+ v4f32 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
+ v4f32 dst0_r, dst0_i, dst1_r, dst1_i;
+
+ alpha_r = COPY_FLOAT_TO_VECTOR(alphar);
+ alpha_i = COPY_FLOAT_TO_VECTOR(alphai);
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+
+ for (j = (n >> 2); j--;)
+ {
+ pc0 = C;
+ pc1 = pc0 + 2 * ldc;
+ pc2 = pc1 + 2 * ldc;
+ pc3 = pc2 + 2 * ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 8;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X4_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X4_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_8X4_MSA
+#else
+ CGEMM_SCALE_8X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 8;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X4_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X4_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_4X4_MSA
+#else
+ CGEMM_SCALE_4X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X4(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X4(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X4(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X4(, -, , -, -);
+#endif
+
+ pa0 += 4;
+ pb0 += 8;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X4(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X4(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X4(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X4(+, -, -, -,);
+#endif
+
+ pa0 += 4;
+ pb0 += 8;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_2X4
+#else
+ CGEMM_SCALE_2X4
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 4;
+ pc1 += 4;
+ pc2 += 4;
+ pc3 += 4;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X4(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X4(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X4(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X4(, -, , -, -);
+#endif
+
+ pa0 += 2;
+ pb0 += 8;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X4(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X4(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X4(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X4(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ pb0 += 8;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_1X4
+#else
+ CGEMM_SCALE_1X4
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ pc2 += 2;
+ pc3 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in A
+#endif
+
+ l = k << 3;
+ B = B + l;
+ i = ldc << 3;
+ C = C + i;
+ }
+
+ if (n & 2)
+ {
+ pc0 = C;
+ pc1 = pc0 + 2 * ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 8;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X2_MSA(, -, , -, -);
+#endif
+
+ pb0 += 4;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X2_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_8X2_MSA
+#else
+ CGEMM_SCALE_8X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 8;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X2_MSA(, -, , -, -);
+#endif
+
+ pb0 += 4;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X2_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_4X2_MSA
+#else
+ CGEMM_SCALE_4X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X2(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X2(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X2(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X2(, -, , -, -);
+#endif
+
+ pa0 += 4;
+ pb0 += 4;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X2(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X2(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X2(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X2(+, -, -, -,);
+#endif
+
+ pa0 += 4;
+ pb0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_2X2
+#else
+ CGEMM_SCALE_2X2
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 4;
+ pc1 += 4;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X2(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X2(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X2(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X2(, -, , -, -);
+#endif
+
+ pa0 += 2;
+ pb0 += 4;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X2(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X2(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X2(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X2(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ pb0 += 4;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_1X2
+#else
+ CGEMM_SCALE_1X2
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in A
+#endif
+
+ l = k << 2;
+ B = B + l;
+ i = ldc << 2;
+ C = C + i;
+ }
+
+ if (n & 1)
+ {
+ pc0 = C;
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 8;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X1_MSA(, -, , -, -);
+#endif
+
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_8X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_8X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_8X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_8X1_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_8X1_MSA
+#else
+ CGEMM_SCALE_8X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 8;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 4)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X1_MSA(, -, , -, -);
+#endif
+
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_4X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_4X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_4X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_4X1_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_4X1_MSA
+#else
+ CGEMM_SCALE_4X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X1(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X1(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X1(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X1(, -, , -, -);
+#endif
+
+ pa0 += 4;
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_2X1(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_2X1(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_2X1(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_2X1(+, -, -, -,);
+#endif
+
+ pa0 += 4;
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_2X1
+#else
+ CGEMM_SCALE_2X1
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+
+ pc0 += 4;
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X1(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X1(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X1(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X1(, -, , -, -);
+#endif
+
+ pa0 += 2;
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ CGEMM_KERNEL_1X1(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ CGEMM_KERNEL_1X1(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ CGEMM_KERNEL_1X1(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ CGEMM_KERNEL_1X1(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ CGEMM_TRMM_SCALE_1X1
+#else
+ CGEMM_SCALE_1X1
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in A
+#endif
+
+ l = k << 1;
+ B = B + l;
+ i = ldc << 1;
+ C = C + i;
+ }
+
+ return 0;
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Pack an m x n column-major single-precision complex sub-matrix into the
+   contiguous panel layout used by the 4-column GEMM copy ("ncopy_4") path.
+   Columns are taken four at a time; within a group the data are written row
+   by row, so one packed row holds one complex element from each of the four
+   columns.  A complex element is two consecutive FLOATs (re, im).  */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst;
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+    FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 dst0, dst1, dst4, dst5;
+
+    psrc0 = src;
+    pdst = dst;
+    /* Column stride in FLOATs: two FLOATs per complex element.  */
+    lda *= 2;
+
+    /* Main loop: full groups of four columns.  */
+    for (j = (n >> 2); j--;)
+    {
+        /* psrc1..psrc4 walk the four columns of this group in parallel.  */
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        /* Four rows per iteration; each LD_SP2_INC pulls 8 FLOATs
+           (4 complex elements) from one column.  */
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+
+            /* ILVRL_D2_SP interleaves the 64-bit (one-complex) lanes of a
+               column pair, i.e. transposes 2x2 blocks of complex elements
+               so each output vector holds one row across two columns.  */
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+            ILVRL_D2_SP(src7, src5, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        /* Tail: two remaining rows (one vector per column).  */
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            src4 = LD_SP(psrc3);
+            src6 = LD_SP(psrc4);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        /* Tail: last row — copy the four (re, im) scalar pairs.  */
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            ctemp05 = *(psrc3 + 0);
+            ctemp06 = *(psrc3 + 1);
+            ctemp07 = *(psrc4 + 0);
+            ctemp08 = *(psrc4 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            *(pdst + 4) = ctemp05;
+            *(pdst + 5) = ctemp06;
+            *(pdst + 6) = ctemp07;
+            *(pdst + 7) = ctemp08;
+            pdst += 8;
+        }
+    }
+
+    /* Remaining pair of columns, same transpose scheme on two pointers.  */
+    if (n & 2)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            psrc1 += 4;
+            psrc2 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            pdst += 4;
+        }
+    }
+
+    /* Last single column: no transpose needed, straight vector/scalar copy.  */
+    if (n & 1)
+    {
+        psrc1 = psrc0;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            psrc1 += 4;
+
+            ST_SP(src0, pdst);
+            pdst += 4;
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            psrc1 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            pdst += 2;
+        }
+    }
+
+    return 0;
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Pack an m x n column-major single-precision complex sub-matrix into the
+   contiguous panel layout used by the 8-column GEMM copy ("ncopy_8") path.
+   Columns are taken eight at a time (then 4/2/1 tails); within a group the
+   data are written row by row, so one packed row holds one complex element
+   from each column of the group.  A complex element is two FLOATs (re, im).  */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+    BLASLONG i, j;
+    FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+    FLOAT *psrc8, *pdst;
+    FLOAT ctemp01, ctemp02, ctemp03, ctemp04, ctemp05, ctemp06, ctemp07;
+    FLOAT ctemp08, ctemp09, ctemp10, ctemp11, ctemp12, ctemp13, ctemp14;
+    FLOAT ctemp15, ctemp16;
+    v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+    v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+    v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+    psrc0 = src;
+    pdst = dst;
+    /* Column stride in FLOATs: two FLOATs per complex element.  */
+    lda *= 2;
+
+    /* Main loop: full groups of eight columns.  */
+    for (j = (n >> 3); j--;)
+    {
+        /* psrc1..psrc8 walk the eight columns of this group in parallel.  */
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc5 = psrc4 + lda;
+        psrc6 = psrc5 + lda;
+        psrc7 = psrc6 + lda;
+        psrc8 = psrc7 + lda;
+        psrc0 += 8 * lda;
+
+        /* Four rows per iteration; each LD_SP2_INC pulls 8 FLOATs
+           (4 complex elements) from one column.  */
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+            LD_SP2_INC(psrc5, 4, src8, src9);
+            LD_SP2_INC(psrc6, 4, src10, src11);
+            LD_SP2_INC(psrc7, 4, src12, src13);
+            LD_SP2_INC(psrc8, 4, src14, src15);
+
+            /* ILVRL_D2_SP interleaves the 64-bit (one-complex) lanes of a
+               column pair, i.e. transposes 2x2 blocks of complex elements
+               so each output vector holds one row across two columns.  */
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+            ILVRL_D2_SP(src10, src8, dst2, dst6);
+            ILVRL_D2_SP(src14, src12, dst3, dst7);
+
+            ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+            ILVRL_D2_SP(src7, src5, dst1, dst5);
+            ILVRL_D2_SP(src11, src9, dst2, dst6);
+            ILVRL_D2_SP(src15, src13, dst3, dst7);
+
+            ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
+        }
+
+        /* Tail: two remaining rows (one vector per column).  */
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            src4 = LD_SP(psrc3);
+            src6 = LD_SP(psrc4);
+            src8 = LD_SP(psrc5);
+            src10 = LD_SP(psrc6);
+            src12 = LD_SP(psrc7);
+            src14 = LD_SP(psrc8);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+            psrc5 += 4;
+            psrc6 += 4;
+            psrc7 += 4;
+            psrc8 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+            ILVRL_D2_SP(src10, src8, dst2, dst6);
+            ILVRL_D2_SP(src14, src12, dst3, dst7);
+
+            ST_SP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 4);
+        }
+
+        /* Tail: last row — copy the eight (re, im) scalar pairs.  */
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            ctemp05 = *(psrc3 + 0);
+            ctemp06 = *(psrc3 + 1);
+            ctemp07 = *(psrc4 + 0);
+            ctemp08 = *(psrc4 + 1);
+            ctemp09 = *(psrc5 + 0);
+            ctemp10 = *(psrc5 + 1);
+            ctemp11 = *(psrc6 + 0);
+            ctemp12 = *(psrc6 + 1);
+            ctemp13 = *(psrc7 + 0);
+            ctemp14 = *(psrc7 + 1);
+            ctemp15 = *(psrc8 + 0);
+            ctemp16 = *(psrc8 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+            psrc5 += 2;
+            psrc6 += 2;
+            psrc7 += 2;
+            psrc8 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            *(pdst + 4) = ctemp05;
+            *(pdst + 5) = ctemp06;
+            *(pdst + 6) = ctemp07;
+            *(pdst + 7) = ctemp08;
+            *(pdst + 8) = ctemp09;
+            *(pdst + 9) = ctemp10;
+            *(pdst + 10) = ctemp11;
+            *(pdst + 11) = ctemp12;
+            *(pdst + 12) = ctemp13;
+            *(pdst + 13) = ctemp14;
+            *(pdst + 14) = ctemp15;
+            *(pdst + 15) = ctemp16;
+            pdst += 16;
+        }
+    }
+
+    /* Remaining group of four columns, same transpose scheme.  */
+    if (n & 4)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc3 = psrc2 + lda;
+        psrc4 = psrc3 + lda;
+        psrc0 += 4 * lda;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+            LD_SP2_INC(psrc3, 4, src4, src5);
+            LD_SP2_INC(psrc4, 4, src6, src7);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+            ILVRL_D2_SP(src7, src5, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            src4 = LD_SP(psrc3);
+            src6 = LD_SP(psrc4);
+            psrc1 += 4;
+            psrc2 += 4;
+            psrc3 += 4;
+            psrc4 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+            ILVRL_D2_SP(src6, src4, dst1, dst5);
+
+            ST_SP4_INC(dst0, dst1, dst4, dst5, pdst, 4);
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            ctemp05 = *(psrc3 + 0);
+            ctemp06 = *(psrc3 + 1);
+            ctemp07 = *(psrc4 + 0);
+            ctemp08 = *(psrc4 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+            psrc3 += 2;
+            psrc4 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            *(pdst + 4) = ctemp05;
+            *(pdst + 5) = ctemp06;
+            *(pdst + 6) = ctemp07;
+            *(pdst + 7) = ctemp08;
+            pdst += 8;
+        }
+    }
+
+    /* Remaining pair of columns.  */
+    if (n & 2)
+    {
+        psrc1 = psrc0;
+        psrc2 = psrc1 + lda;
+        psrc0 += 2 * lda;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            LD_SP2_INC(psrc2, 4, src2, src3);
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+
+            ILVRL_D2_SP(src3, src1, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            src2 = LD_SP(psrc2);
+            psrc1 += 4;
+            psrc2 += 4;
+
+            ILVRL_D2_SP(src2, src0, dst0, dst4);
+
+            ST_SP2_INC(dst0, dst4, pdst, 4);
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            ctemp03 = *(psrc2 + 0);
+            ctemp04 = *(psrc2 + 1);
+            psrc1 += 2;
+            psrc2 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            *(pdst + 2) = ctemp03;
+            *(pdst + 3) = ctemp04;
+            pdst += 4;
+        }
+    }
+
+    /* Last single column: no transpose needed, straight vector/scalar copy.  */
+    if (n & 1)
+    {
+        psrc1 = psrc0;
+
+        for (i = (m >> 2); i--;)
+        {
+            LD_SP2_INC(psrc1, 4, src0, src1);
+            ST_SP2_INC(src0, src1, pdst, 4);
+        }
+
+        if (m & 2)
+        {
+            src0 = LD_SP(psrc1);
+            psrc1 += 4;
+
+            ST_SP(src0, pdst);
+            pdst += 4;
+        }
+
+        if (m & 1)
+        {
+            ctemp01 = *(psrc1 + 0);
+            ctemp02 = *(psrc1 + 1);
+            psrc1 += 2;
+
+            *(pdst + 0) = ctemp01;
+            *(pdst + 1) = ctemp02;
+            pdst += 2;
+        }
+    }
+
+    return 0;
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ FLOAT *psrc0;
+ FLOAT *psrc1, *psrc2;
+ FLOAT *pdst0;
+ FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ v4f32 src0, src1, src2, src3;
+
+ psrc0 = src;
+ pdst0 = dst;
+ lda *= 2;
+
+ for (j = (n >> 2); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 8;
+
+ for (i = (m >> 1); i--;)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ LD_SP2(psrc2, 4, src2, src3);
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ ST_SP2_INC(src0, src1, pdst0, 4);
+ }
+ }
+
+ if (n & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 4;
+
+ for (i = (m >> 1); i--;)
+ {
+ src0 = LD_SP(psrc1);
+ src1 = LD_SP(psrc2);
+ ST_SP2_INC(src0, src1, pdst0, 4);
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_SP(psrc1);
+ ST_SP(src0, pdst0);
+ pdst0 += 4;
+ }
+ }
+
+ if (n & 1)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 2;
+
+ for (i = (m >> 1); i--;)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ *(pdst0 + 2) = ctemp03;
+ *(pdst0 + 3) = ctemp04;
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ pdst0 += 4;
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ pdst0 += 2;
+ }
+ }
+
+ return 0;
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ FLOAT *psrc0, *psrc1, *psrc2, *pdst0;
+ FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+ v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
+ v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
+
+ psrc0 = src;
+ pdst0 = dst;
+ lda *= 2;
+
+ for (j = (n >> 3); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 16;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP4(psrc1, 4, src0, src1, src2, src3);
+ LD_SP4(psrc2, 4, src4, src5, src6, src7);
+ LD_SP4(psrc1 + 2 * lda, 4, src8, src9, src10, src11);
+ LD_SP4(psrc2 + 2 * lda, 4, src12, src13, src14, src15);
+ ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4);
+ ST_SP8_INC(src8, src9, src10, src11, src12, src13, src14, src15, pdst0, 4);
+ psrc1 += 4 * lda;
+ psrc2 += 4 * lda;
+ }
+
+ if (m & 2)
+ {
+ LD_SP4(psrc1, 4, src0, src1, src2, src3);
+ LD_SP4(psrc2, 4, src4, src5, src6, src7);
+ ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst0, 4);
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ LD_SP4(psrc1, 4, src0, src1, src2, src3);
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+ }
+ }
+
+ if (n & 4)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 8;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ LD_SP2(psrc2, 4, src2, src3);
+ LD_SP2(psrc1 + 2 * lda, 4, src4, src5);
+ LD_SP2(psrc2 + 2 * lda, 4, src6, src7);
+
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+ ST_SP4_INC(src4, src5, src6, src7, pdst0, 4);
+ psrc1 += 4 * lda;
+ psrc2 += 4 * lda;
+ }
+
+ if (m & 2)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ LD_SP2(psrc2, 4, src2, src3);
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ LD_SP2(psrc1, 4, src0, src1);
+ ST_SP2_INC(src0, src1, pdst0, 4);
+ }
+ }
+
+ if (n & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 4;
+
+ for (i = (m >> 2); i--;)
+ {
+ src0 = LD_SP(psrc1);
+ src1 = LD_SP(psrc2);
+ src2 = LD_SP(psrc1 + 2 * lda);
+ src3 = LD_SP(psrc2 + 2 * lda);
+ ST_SP4_INC(src0, src1, src2, src3, pdst0, 4);
+
+ psrc1 += 4 * lda;
+ psrc2 += 4 * lda;
+ }
+
+ if (m & 2)
+ {
+ src0 = LD_SP(psrc1);
+ src1 = LD_SP(psrc2);
+ ST_SP2_INC(src0, src1, pdst0, 4);
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_SP(psrc1);
+ ST_SP(src0, pdst0);
+ pdst0 += 4;
+ }
+ }
+
+ if (n & 1)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc0 + lda;
+ psrc0 += 2;
+
+ for (i = (m >> 2); i--;)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ *(pdst0 + 2) = ctemp03;
+ *(pdst0 + 3) = ctemp04;
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ pdst0 += 4;
+
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ *(pdst0 + 2) = ctemp03;
+ *(pdst0 + 3) = ctemp04;
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ pdst0 += 4;
+ }
+
+ if (m & 2)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+ ctemp03 = *(psrc2 + 0);
+ ctemp04 = *(psrc2 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ *(pdst0 + 2) = ctemp03;
+ *(pdst0 + 3) = ctemp04;
+
+ psrc1 += 2 * lda;
+ psrc2 += 2 * lda;
+ pdst0 += 4;
+ }
+
+ if (m & 1)
+ {
+ ctemp01 = *(psrc1 + 0);
+ ctemp02 = *(psrc1 + 1);
+
+ *(pdst0 + 0) = ctemp01;
+ *(pdst0 + 1) = ctemp02;
+ pdst0 += 2;
+ }
+ }
+
+ return 0;
+}
#endif
)
{
- BLASLONG i, j, l;
- FLOAT *pc0, *pc1, *pc2, *pc3;
- FLOAT *pa0, *pb0;
+ BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
+ FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
FLOAT tmp0, tmp1, tmp2, tmp3;
- FLOAT a0;
- FLOAT b0, b1, b2, b3;
+ FLOAT a0, b0, b1, b2, b3;
v2f64 v_alpha = {alpha, alpha};
v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1;
v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
v2f64 res0, res1, res2, res3, res4, res5, res6, res7;
v2f64 res8, res9, res10, res11, res12, res13, res14, res15;
- for (j = (n / 4); j--;)
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+
+ for (j = (n >> 2); j--;)
{
pc0 = C;
pc1 = pc0 + ldc;
pa0 = A;
- for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 4;
+#endif
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
- LD_DP2(pb0, 2, src_b0, src_b1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 = src_a0 * src_b;
res14 = src_a2 * src_b;
res15 = src_a3 * src_b;
- pa0 += 8;
- pb0 += 4;
-
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res14 += src_a2 * src_b;
res15 += src_a3 * src_b;
- pa0 += 8;
- pb0 += 4;
-
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res13 += src_a1 * src_b;
res14 += src_a2 * src_b;
res15 += src_a3 * src_b;
-
- pa0 += 8;
- pb0 += 4;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res13 += src_a1 * src_b;
res14 += src_a2 * src_b;
res15 += src_a3 * src_b;
-
- pa0 += 8;
- pb0 += 4;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
dst5 += res5 * v_alpha;
dst6 += res6 * v_alpha;
dst7 += res7 * v_alpha;
-
- ST_DP4(dst0, dst1, dst2, dst3, pc0, 2);
- ST_DP4(dst4, dst5, dst6, dst7, pc1, 2);
-
+#endif
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
+
+#if defined(TRMMKERNEL)
+ dst0 = res8 * v_alpha;
+ dst1 = res9 * v_alpha;
+ dst2 = res10 * v_alpha;
+ dst3 = res11 * v_alpha;
+ dst4 = res12 * v_alpha;
+ dst5 = res13 * v_alpha;
+ dst6 = res14 * v_alpha;
+ dst7 = res15 * v_alpha;
+#else
LD_DP4(pc2, 2, dst0, dst1, dst2, dst3);
LD_DP4(pc3, 2, dst4, dst5, dst6, dst7);
dst5 += res13 * v_alpha;
dst6 += res14 * v_alpha;
dst7 += res15 * v_alpha;
+#endif
- ST_DP4(dst0, dst1, dst2, dst3, pc2, 2);
- ST_DP4(dst4, dst5, dst6, dst7, pc3, 2);
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2);
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2);
- pc0 += 8;
- pc1 += 8;
- pc2 += 8;
- pc3 += 8;
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 4) / 4); i--;)
+ if (m & 4)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 4;
+#endif
- LD_DP2(pa0, 2, src_a0, src_a1);
- LD_DP2(pb0, 2, src_b0, src_b1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 = src_a0 * src_b;
res6 = src_a0 * src_b;
res7 = src_a1 * src_b;
- pa0 += 4;
- pb0 += 4;
-
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_DP2(pa0, 2, src_a0, src_a1);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res6 += src_a0 * src_b;
res7 += src_a1 * src_b;
- pa0 += 4;
- pb0 += 4;
-
- LD_DP2(pa0, 2, src_a0, src_a1);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res6 += src_a0 * src_b;
res7 += src_a1 * src_b;
-
- pa0 += 4;
- pb0 += 4;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_DP2(pa0, 2, src_a0, src_a1);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res6 += src_a0 * src_b;
res7 += src_a1 * src_b;
-
- pa0 += 4;
- pb0 += 4;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
LD_DP2(pc0, 2, dst0, dst1);
LD_DP2(pc1, 2, dst2, dst3);
LD_DP2(pc2, 2, dst4, dst5);
dst5 += res5 * v_alpha;
dst6 += res6 * v_alpha;
dst7 += res7 * v_alpha;
+#endif
+ ST_DP2_INC(dst0, dst1, pc0, 2);
+ ST_DP2_INC(dst2, dst3, pc1, 2);
+ ST_DP2_INC(dst4, dst5, pc2, 2);
+ ST_DP2_INC(dst6, dst7, pc3, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 4;
+#endif
- ST_DP2(dst0, dst1, pc0, 2);
- ST_DP2(dst2, dst3, pc1, 2);
- ST_DP2(dst4, dst5, pc2, 2);
- ST_DP2(dst6, dst7, pc3, 2);
-
- pc0 += 4;
- pc1 += 4;
- pc2 += 4;
- pc3 += 4;
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 2) / 2); i--;)
+ if (m & 2)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
src_a0 = LD_DP(pa0);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ pa0 += 2;
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 = src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res3 = src_a0 * src_b;
- pa0 += 2;
- pb0 += 4;
-
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
src_a0 = LD_DP(pa0);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ pa0 += 2;
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res3 += src_a0 * src_b;
- pa0 += 2;
- pb0 += 4;
-
src_a0 = LD_DP(pa0);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ pa0 += 2;
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res3 += src_a0 * src_b;
-
- pa0 += 2;
- pb0 += 4;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
src_a0 = LD_DP(pa0);
- LD_DP2(pb0, 2, src_b0, src_b1);
+ pa0 += 2;
+ LD_DP2_INC(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res3 += src_a0 * src_b;
-
- pa0 += 2;
- pb0 += 4;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
dst0 = LD_DP(pc0);
dst1 = LD_DP(pc1);
dst2 = LD_DP(pc2);
dst1 += res1 * v_alpha;
dst2 += res2 * v_alpha;
dst3 += res3 * v_alpha;
-
+#endif
ST_DP(dst0, pc0);
ST_DP(dst1, pc1);
ST_DP(dst2, pc2);
ST_DP(dst3, pc3);
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
pc0 += 2;
pc1 += 2;
pc2 += 2;
pc3 += 2;
}
- for (i = (m & 1); i--;)
+ if (m & 1)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 1;
pb0 += 4;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 4;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
tmp2 = alpha * tmp2;
tmp3 = alpha * tmp3;
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+ pc2[0] = tmp2;
+ pc3[0] = tmp3;
+#else
pc0[0] += tmp0;
pc1[0] += tmp1;
pc2[0] += tmp2;
pc3[0] += tmp3;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
pc0 += 1;
pc1 += 1;
pc3 += 1;
}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in A
+#endif
+
l = (k << 2);
B = B + l;
i = (ldc << 2);
C = C + i;
}
- for (j = ((n & 2) / 2); j--;)
+ if (n & 2)
{
pc0 = C;
pc1 = pc0 + ldc;
pa0 = A;
- for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
pb0 = B;
+ temp = k;
+#endif
+
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 = src_a0 * src_b;
res6 = src_a2 * src_b;
res7 = src_a3 * src_b;
- pa0 += 8;
- pb0 += 2;
-
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res6 += src_a2 * src_b;
res7 += src_a3 * src_b;
- pa0 += 8;
- pb0 += 2;
-
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res5 += src_a1 * src_b;
res6 += src_a2 * src_b;
res7 += src_a3 * src_b;
-
- pa0 += 8;
- pb0 += 2;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res5 += src_a1 * src_b;
res6 += src_a2 * src_b;
res7 += src_a3 * src_b;
-
- pa0 += 8;
- pb0 += 2;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
LD_DP4(pc1, 2, dst4, dst5, dst6, dst7);
dst5 += res5 * v_alpha;
dst6 += res6 * v_alpha;
dst7 += res7 * v_alpha;
+#endif
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 2;
+#endif
- ST_DP4(dst0, dst1, dst2, dst3, pc0, 2);
- ST_DP4(dst4, dst5, dst6, dst7, pc1, 2);
-
- pc0 += 8;
- pc1 += 8;
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 4) / 4); i--;)
+ if (m & 4)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
pb0 = B;
+ temp = k;
+#endif
- LD_DP2(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 = src_a0 * src_b;
res2 = src_a0 * src_b;
res3 = src_a1 * src_b;
- pa0 += 4;
- pb0 += 2;
-
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_DP2(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
- pa0 += 4;
- pb0 += 2;
-
- LD_DP2(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
-
- pa0 += 4;
- pb0 += 2;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_DP2(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
-
- pa0 += 4;
- pb0 += 2;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
LD_DP2(pc0, 2, dst0, dst1);
LD_DP2(pc1, 2, dst2, dst3);
dst1 += res1 * v_alpha;
dst2 += res2 * v_alpha;
dst3 += res3 * v_alpha;
+#endif
+ ST_DP2_INC(dst0, dst1, pc0, 2);
+ ST_DP2_INC(dst2, dst3, pc1, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 2;
+#endif
- ST_DP2(dst0, dst1, pc0, 2);
- ST_DP2(dst2, dst3, pc1, 2);
-
- pc0 += 4;
- pc1 += 4;
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 2) / 2); i--;)
+ if (m & 2)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
src_a0 = LD_DP(pa0);
+ pa0 += 2;
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 = src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res1 = src_a0 * src_b;
- pa0 += 2;
- pb0 += 2;
-
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
src_a0 = LD_DP(pa0);
+ pa0 += 2;
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res1 += src_a0 * src_b;
- pa0 += 2;
- pb0 += 2;
-
src_a0 = LD_DP(pa0);
+ pa0 += 2;
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res1 += src_a0 * src_b;
-
- pa0 += 2;
- pb0 += 2;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
src_a0 = LD_DP(pa0);
+ pa0 += 2;
src_b0 = LD_DP(pb0);
+ pb0 += 2;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res1 += src_a0 * src_b;
-
- pa0 += 2;
- pb0 += 2;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+#else
dst0 = LD_DP(pc0);
dst1 = LD_DP(pc1);
dst0 += res0 * v_alpha;
dst1 += res1 * v_alpha;
-
+#endif
ST_DP(dst0, pc0);
ST_DP(dst1, pc1);
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
pc0 += 2;
pc1 += 2;
}
- for (i = (m & 1); i--;)
+ if (m & 1)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 1;
pb0 += 2;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 2;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
tmp0 = alpha * tmp0;
tmp1 = alpha * tmp1;
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+#else
pc0[0] += tmp0;
pc1[0] += tmp1;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
pc0 += 1;
pc1 += 1;
}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in A
+#endif
+
l = (k << 1);
B = B + l;
i = (ldc << 1);
C = C + i;
}
- for (j = (n & 1); j--;)
+ if (n & 1)
{
pc0 = C;
pa0 = A;
- for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 1;
+#endif
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res2 = src_a2 * src_b;
res3 = src_a3 * src_b;
- pa0 += 8;
pb0 += 1;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res2 += src_a2 * src_b;
res3 += src_a3 * src_b;
- pa0 += 8;
pb0 += 1;
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res2 += src_a2 * src_b;
res3 += src_a3 * src_b;
- pa0 += 8;
pb0 += 1;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res2 += src_a2 * src_b;
res3 += src_a3 * src_b;
- pa0 += 8;
pb0 += 1;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
LD_DP4(pc0, 2, dst0, dst1, dst2, dst3);
dst0 += res0 * v_alpha;
dst1 += res1 * v_alpha;
dst2 += res2 * v_alpha;
dst3 += res3 * v_alpha;
+#endif
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 1;
+#endif
- ST_DP4(dst0, dst1, dst2, dst3, pc0, 2);
-
- pc0 += 8;
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 4) / 4); i--;)
+ if (m & 4)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
pb0 = B;
+ temp = k;
+#endif
- LD_DP2(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 = src_a0 * src_b;
res1 = src_a1 * src_b;
- pa0 += 4;
pb0 += 1;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_DP2(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
- pa0 += 4;
pb0 += 1;
- LD_DP2(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
- pa0 += 4;
pb0 += 1;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_DP2(pa0, 2, src_a0, src_a1);
+ LD_DP2_INC(pa0, 2, src_a0, src_a1);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
- pa0 += 4;
pb0 += 1;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+#else
LD_DP2(pc0, 2, dst0, dst1);
dst0 += res0 * v_alpha;
dst1 += res1 * v_alpha;
+#endif
+ ST_DP2_INC(dst0, dst1, pc0, 2);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 1;
+#endif
- ST_DP2(dst0, dst1, pc0, 2);
-
- pc0 += 4;
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 2) / 2); i--;)
+ if (m & 2)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
pb0 = B;
+ temp = k;
+#endif
src_a0 = LD_DP(pa0);
src_b[0] = pb0[0];
pa0 += 2;
pb0 += 1;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
src_a0 = LD_DP(pa0);
src_b[0] = pb0[0];
pb0 += 1;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
src_a0 = LD_DP(pa0);
src_b[0] = pb0[0];
pb0 += 1;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+#else
dst0 = LD_DP(pc0);
dst0 += res0 * v_alpha;
-
+#endif
ST_DP(dst0, pc0);
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
pc0 += 2;
}
- for (i = (m & 1); i--;)
+ if (m & 1)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 1;
pb0 += 1;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 1;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 1;
}
+#if defined(TRMMKERNEL)
+ pc0[0] = alpha * tmp0;
+#else
pc0[0] += alpha * tmp0;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
pc0 += 1;
}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in B (one column of B consumed by this j-block)
+#endif
+
l = (k << 0);
B = B + l;
i = (ldc << 0);
C = C + i;
}
+
return 0;
}
FLOAT * __restrict dst)
{
BLASLONG i, j;
- FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
- FLOAT *pdst;
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst;
v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
for (i = (m >> 2); i--;)
{
- LD_DP2(psrc1, 2, src0, src1);
- LD_DP2(psrc2, 2, src2, src3);
- LD_DP2(psrc3, 2, src4, src5);
- LD_DP2(psrc4, 2, src6, src7);
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+ LD_DP2_INC(psrc3, 2, src4, src5);
+ LD_DP2_INC(psrc4, 2, src6, src7);
- psrc1 += 4;
- psrc2 += 4;
- psrc3 += 4;
- psrc4 += 4;
-
- dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
- dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
- dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
- dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
-
- dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
- dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
- dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
- dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
-
- ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
- pdst += 16;
+ ILVRL_D2_DP(src2, src0, dst0, dst4);
+ ILVRL_D2_DP(src6, src4, dst1, dst5);
+ ILVRL_D2_DP(src3, src1, dst2, dst6);
+ ILVRL_D2_DP(src7, src5, dst3, dst7);
+
+ ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
}
for (i = (m & 3); i--;)
for (i = (m >> 2); i--;)
{
- LD_DP2(psrc1, 2, src0, src1);
- LD_DP2(psrc2, 2, src2, src3);
- psrc1 += 4;
- psrc2 += 4;
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
- dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
- dst1 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
- dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
- dst5 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
+ ILVRL_D2_DP(src2, src0, dst0, dst4);
+ ILVRL_D2_DP(src3, src1, dst1, dst5);
- ST_DP4(dst0, dst4, dst1, dst5, pdst, 2);
- pdst += 8;
+ ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2);
}
for (i = (m & 3); i--;)
FLOAT * __restrict dst)
{
BLASLONG i, j;
- FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
- FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
- FLOAT *pdst;
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+ FLOAT *psrc8, *pdst;
v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
for (i = (m >> 3); i--;)
{
- LD_DP2(psrc1, 2, src0, src1);
- LD_DP2(psrc2, 2, src2, src3);
- LD_DP2(psrc3, 2, src4, src5);
- LD_DP2(psrc4, 2, src6, src7);
- LD_DP2(psrc5, 2, src8, src9);
- LD_DP2(psrc6, 2, src10, src11);
- LD_DP2(psrc7, 2, src12, src13);
- LD_DP2(psrc8, 2, src14, src15);
-
- dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
- dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
- dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8);
- dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12);
- dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
- dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
- dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8);
- dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12);
-
- ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
-
- dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
- dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
- dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9);
- dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13);
- dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
- dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
- dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9);
- dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13);
-
- ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 16,
- 2);
-
- LD_DP2(psrc1 + 4, 2, src0, src1);
- LD_DP2(psrc2 + 4, 2, src2, src3);
- LD_DP2(psrc3 + 4, 2, src4, src5);
- LD_DP2(psrc4 + 4, 2, src6, src7);
- LD_DP2(psrc5 + 4, 2, src8, src9);
- LD_DP2(psrc6 + 4, 2, src10, src11);
- LD_DP2(psrc7 + 4, 2, src12, src13);
- LD_DP2(psrc8 + 4, 2, src14, src15);
-
- dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
- dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
- dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8);
- dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12);
- dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
- dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
- dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8);
- dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12);
-
- ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 32,
- 2);
-
- dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
- dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
- dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9);
- dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13);
- dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
- dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
- dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9);
- dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13);
-
- ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 48,
- 2);
-
- psrc1 += 8;
- psrc2 += 8;
- psrc3 += 8;
- psrc4 += 8;
- psrc5 += 8;
- psrc6 += 8;
- psrc7 += 8;
- psrc8 += 8;
- pdst += 64;
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+ LD_DP2_INC(psrc3, 2, src4, src5);
+ LD_DP2_INC(psrc4, 2, src6, src7);
+ LD_DP2_INC(psrc5, 2, src8, src9);
+ LD_DP2_INC(psrc6, 2, src10, src11);
+ LD_DP2_INC(psrc7, 2, src12, src13);
+ LD_DP2_INC(psrc8, 2, src14, src15);
+
+ ILVRL_D2_DP(src2, src0, dst0, dst4);
+ ILVRL_D2_DP(src6, src4, dst1, dst5);
+ ILVRL_D2_DP(src10, src8, dst2, dst6);
+ ILVRL_D2_DP(src14, src12, dst3, dst7);
+
+ ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+
+ ILVRL_D2_DP(src3, src1, dst0, dst4);
+ ILVRL_D2_DP(src7, src5, dst1, dst5);
+ ILVRL_D2_DP(src11, src9, dst2, dst6);
+ ILVRL_D2_DP(src15, src13, dst3, dst7);
+
+ ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+ LD_DP2_INC(psrc3, 2, src4, src5);
+ LD_DP2_INC(psrc4, 2, src6, src7);
+ LD_DP2_INC(psrc5, 2, src8, src9);
+ LD_DP2_INC(psrc6, 2, src10, src11);
+ LD_DP2_INC(psrc7, 2, src12, src13);
+ LD_DP2_INC(psrc8, 2, src14, src15);
+
+ ILVRL_D2_DP(src2, src0, dst0, dst4);
+ ILVRL_D2_DP(src6, src4, dst1, dst5);
+ ILVRL_D2_DP(src10, src8, dst2, dst6);
+ ILVRL_D2_DP(src14, src12, dst3, dst7);
+
+ ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
+
+ ILVRL_D2_DP(src3, src1, dst0, dst4);
+ ILVRL_D2_DP(src7, src5, dst1, dst5);
+ ILVRL_D2_DP(src11, src9, dst2, dst6);
+ ILVRL_D2_DP(src15, src13, dst3, dst7);
+
+ ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
}
for (i = (m & 7); i--;)
for (i = (m >> 2); i--;)
{
- LD_DP2(psrc1, 2, src0, src1);
- LD_DP2(psrc2, 2, src2, src3);
- LD_DP2(psrc3, 2, src4, src5);
- LD_DP2(psrc4, 2, src6, src7);
- psrc1 += 4;
- psrc2 += 4;
- psrc3 += 4;
- psrc4 += 4;
-
- dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
- dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
- dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
- dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
-
- dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
- dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
- dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
- dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
-
- ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
- pdst += 16;
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+ LD_DP2_INC(psrc3, 2, src4, src5);
+ LD_DP2_INC(psrc4, 2, src6, src7);
+
+ ILVRL_D2_DP(src2, src0, dst0, dst4);
+ ILVRL_D2_DP(src6, src4, dst1, dst5);
+ ILVRL_D2_DP(src3, src1, dst2, dst6);
+ ILVRL_D2_DP(src7, src5, dst3, dst7);
+
+ ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
}
for (i = (m & 3); i--;)
psrc1 += 2;
psrc2 += 2;
- dst0 = (v2f64) __msa_ilvr_d((v2i64) src1, (v2i64) src0);
- dst1 = (v2f64) __msa_ilvl_d((v2i64) src1, (v2i64) src0);
+ ILVRL_D2_DP(src1, src0, dst0, dst1);
- ST_DP2(dst0, dst1, pdst, 2);
- pdst += 4;
+ ST_DP2_INC(dst0, dst1, pdst, 2);
}
if (m & 1)
for (i = (n >> 2); i--;)
{
- LD_DP2(psrc1, 2, src0, src1);
- LD_DP2(psrc2, 2, src2, src3);
- LD_DP2(psrc3, 2, src4, src5);
- LD_DP2(psrc4, 2, src6, src7);
- psrc1 += 4;
- psrc2 += 4;
- psrc3 += 4;
- psrc4 += 4;
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+ LD_DP2_INC(psrc3, 2, src4, src5);
+ LD_DP2_INC(psrc4, 2, src6, src7);
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
pdst1 += m * 4;
psrc3 += 2;
psrc4 += 2;
- ST_DP4(src0, src1, src2, src3, pdst2, 2);
- pdst2 += 8;
+ ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
}
if (n & 1)
for (i = (n >> 2); i--;)
{
- LD_DP2(psrc1, 2, src0, src1);
- LD_DP2(psrc2, 2, src2, src3);
- psrc1 += 4;
- psrc2 += 4;
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
ST_DP4(src0, src1, src2, src3, pdst1, 2);
pdst1 += m * 4;
psrc1 += 2;
psrc2 += 2;
- ST_DP2(src0, src1, pdst2, 2);
- pdst2 += 4;
+ ST_DP2_INC(src0, src1, pdst2, 2);
}
if (n & 1)
for (i = (n >> 2); i--;)
{
- LD_DP2(psrc1, 2, src0, src1);
- psrc1 += 4;
+ LD_DP2_INC(psrc1, 2, src0, src1);
ST_DP2(src0, src1, pdst1, 2);
pdst1 += 4 * m;
for (i = (n >> 3); i--;)
{
- LD_DP4(psrc1, 2, src0, src1, src2, src3);
- LD_DP4(psrc2, 2, src4, src5, src6, src7);
- LD_DP4(psrc3, 2, src8, src9, src10, src11);
- LD_DP4(psrc4, 2, src12, src13, src14, src15);
- psrc1 += 8;
- psrc2 += 8;
- psrc3 += 8;
- psrc4 += 8;
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+ LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+ LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
pdst1 + 16, 2);
- LD_DP4(psrc5, 2, src0, src1, src2, src3);
- LD_DP4(psrc6, 2, src4, src5, src6, src7);
- LD_DP4(psrc7, 2, src8, src9, src10, src11);
- LD_DP4(psrc8, 2, src12, src13, src14, src15);
- psrc5 += 8;
- psrc6 += 8;
- psrc7 += 8;
- psrc8 += 8;
+ LD_DP4_INC(psrc5, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc6, 2, src4, src5, src6, src7);
+ LD_DP4_INC(psrc7, 2, src8, src9, src10, src11);
+ LD_DP4_INC(psrc8, 2, src12, src13, src14, src15);
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32,
2);
if (n & 4)
{
- LD_DP2(psrc1, 2, src0, src1);
- LD_DP2(psrc2, 2, src2, src3);
- LD_DP2(psrc3, 2, src4, src5);
- LD_DP2(psrc4, 2, src6, src7);
- LD_DP2(psrc5, 2, src8, src9);
- LD_DP2(psrc6, 2, src10, src11);
- LD_DP2(psrc7, 2, src12, src13);
- LD_DP2(psrc8, 2, src14, src15);
- psrc1 += 4;
- psrc2 += 4;
- psrc3 += 4;
- psrc4 += 4;
- psrc5 += 4;
- psrc6 += 4;
- psrc7 += 4;
- psrc8 += 4;
-
- ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
- ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
- pdst2 + 16, 2);
- pdst2 += 32;
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+ LD_DP2_INC(psrc3, 2, src4, src5);
+ LD_DP2_INC(psrc4, 2, src6, src7);
+ LD_DP2_INC(psrc5, 2, src8, src9);
+ LD_DP2_INC(psrc6, 2, src10, src11);
+ LD_DP2_INC(psrc7, 2, src12, src13);
+ LD_DP2_INC(psrc8, 2, src14, src15);
+
+ ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
+ ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15,
+ pdst2, 2);
}
if (n & 2)
psrc7 += 2;
psrc8 += 2;
- ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2);
- pdst3 += 16;
+ ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2);
}
if (n & 1)
for (i = (n >> 3); i--;)
{
- LD_DP4(psrc1, 2, src0, src1, src2, src3);
- LD_DP4(psrc2, 2, src4, src5, src6, src7);
- LD_DP4(psrc3, 2, src8, src9, src10, src11);
- LD_DP4(psrc4, 2, src12, src13, src14, src15);
- psrc1 += 8;
- psrc2 += 8;
- psrc3 += 8;
- psrc4 += 8;
- psrc5 += 8;
- psrc6 += 8;
- psrc7 += 8;
- psrc8 += 8;
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+ LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+ LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
if (n & 4)
{
- LD_DP2(psrc1, 2, src0, src1);
- LD_DP2(psrc2, 2, src2, src3);
- LD_DP2(psrc3, 2, src4, src5);
- LD_DP2(psrc4, 2, src6, src7);
- psrc1 += 4;
- psrc2 += 4;
- psrc3 += 4;
- psrc4 += 4;
-
- ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
- pdst2 += 16;
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+ LD_DP2_INC(psrc3, 2, src4, src5);
+ LD_DP2_INC(psrc4, 2, src6, src7);
+
+ ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
}
if (n & 2)
psrc3 += 2;
psrc4 += 2;
- ST_DP4(src0, src1, src2, src3, pdst3, 2);
- pdst3 += 8;
+ ST_DP4_INC(src0, src1, src2, src3, pdst3, 2);
}
if (n & 1)
for (i = (n >> 3); i--;)
{
- LD_DP4(psrc1, 2, src0, src1, src2, src3);
- LD_DP4(psrc2, 2, src4, src5, src6, src7);
- psrc1 += 8;
- psrc2 += 8;
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
pdst1 += 8 * m;
if (n & 4)
{
- LD_DP2(psrc1, 2, src0, src1);
- LD_DP2(psrc2, 2, src2, src3);
- psrc1 += 4;
- psrc2 += 4;
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
- ST_DP4(src0, src1, src2, src3, pdst2, 2);
- pdst2 += 8;
+ ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
}
if (n & 2)
psrc1 += 2;
psrc2 += 2;
- ST_DP2(src0, src1, pdst3, 2);
- pdst3 += 4;
+ ST_DP2_INC(src0, src1, pdst3, 2);
}
if (n & 1)
for (i = (n >> 3); i--;)
{
- LD_DP4(psrc1, 2, src0, src1, src2, src3);
- psrc1 += 8;
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
ST_DP4(src0, src1, src2, src3, pdst1, 2);
pdst1 += 8 * m;
if (n & 4)
{
- LD_DP2(psrc1, 2, src0, src1);
- psrc1 += 4;
+ LD_DP2_INC(psrc1, 2, src0, src1);
- ST_DP2(src0, src1, pdst2, 2);
- pdst2 += 4;
+ ST_DP2_INC(src0, src1, pdst2, 2);
}
if (n & 2)
#define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
-#define COPY_FLOAT_TO_VECTOR(a, b) \
- b = __msa_cast_to_vector_float(a); \
- b = (v4f32) __msa_splati_w((v4i32) b, 0);
+#define COPY_FLOAT_TO_VECTOR(a) ( { \
+ v4f32 out; \
+ out = __msa_cast_to_vector_float(a); \
+ out = (v4f32) __msa_splati_w((v4i32) out, 0); \
+ out; \
+} )
+#define COPY_DOUBLE_TO_VECTOR(a) ( { \
+ v2f64 out; \
+ out = __msa_cast_to_vector_double(a); \
+ out = (v2f64) __msa_splati_d((v2i64) out, 0); \
+ out; \
+} )
+
+/* Description : Load 2 variables with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+*/
+#define LD_GP2_INC(psrc, stride, out0, out1) \
+{ \
+ out0 = *(psrc); \
+ (psrc) += stride; \
+ out1 = *(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_GP3_INC(psrc, stride, out0, \
+ out1, out2) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ out2 = *(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_GP4_INC(psrc, stride, out0, \
+ out1, out2, out3) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ LD_GP2_INC(psrc, stride, out2, out3); \
+}
+
+#define LD_GP5_INC(psrc, stride, out0, \
+ out1, out2, out3, out4) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ LD_GP2_INC(psrc, stride, out2, out3); \
+ out4 = *(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_GP6_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ LD_GP2_INC(psrc, stride, out2, out3); \
+ LD_GP2_INC(psrc, stride, out4, out5); \
+}
+
+#define LD_GP7_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5, out6) \
+{ \
+ LD_GP2_INC(psrc, stride, out0, out1); \
+ LD_GP2_INC(psrc, stride, out2, out3); \
+ LD_GP2_INC(psrc, stride, out4, out5); \
+ out6 = *(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_GP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+{ \
+ LD_GP4_INC(psrc, stride, out0, out1, out2, out3); \
+ LD_GP4_INC(psrc, stride, out4, out5, out6, out7); \
+}
/* Description : Load 2 vectors of single precision floating point elements with stride
Arguments : Inputs - psrc, stride
out1 = LD_SP((psrc) + stride); \
}
+#define LD_SP4(psrc, stride, out0, out1, out2, out3) \
+{ \
+ LD_SP2(psrc, stride, out0, out1) \
+ LD_SP2(psrc + 2 * stride, stride, out2, out3) \
+}
+
+#define LD_SP2_INC(psrc, stride, out0, out1) \
+{ \
+ out0 = LD_SP((psrc)); \
+ (psrc) += stride; \
+ out1 = LD_SP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_SP3_INC(psrc, stride, out0, \
+ out1, out2) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ out2 = LD_SP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_SP4_INC(psrc, stride, out0, \
+ out1, out2, out3) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ LD_SP2_INC(psrc, stride, out2, out3); \
+}
+
+#define LD_SP5_INC(psrc, stride, out0, \
+ out1, out2, out3, out4) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ LD_SP2_INC(psrc, stride, out2, out3); \
+ out4 = LD_SP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_SP6_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ LD_SP2_INC(psrc, stride, out2, out3); \
+ LD_SP2_INC(psrc, stride, out4, out5); \
+}
+
+#define LD_SP7_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5, out6) \
+{ \
+ LD_SP2_INC(psrc, stride, out0, out1); \
+ LD_SP2_INC(psrc, stride, out2, out3); \
+ LD_SP2_INC(psrc, stride, out4, out5); \
+ out6 = LD_SP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_SP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+{ \
+ LD_SP4_INC(psrc, stride, out0, out1, out2, out3); \
+ LD_SP4_INC(psrc, stride, out4, out5, out6, out7); \
+}
+
+#define LD_SP16_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7, out8, \
+ out9, out10, out11, out12, out13, \
+ out14, out15) \
+{ \
+ LD_SP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7); \
+ LD_SP8_INC(psrc, stride, out8, out9, out10, \
+ out11, out12, out13, out14, out15); \
+}
+
/* Description : Load 2 vectors of double precision floating point elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
LD_DP2(psrc + 2 * stride, stride, out2, out3) \
}
+#define LD_DP2_INC(psrc, stride, out0, out1) \
+{ \
+ out0 = LD_DP(psrc); \
+ (psrc) += stride; \
+ out1 = LD_DP(psrc); \
+ (psrc) += stride; \
+}
+
+#define LD_DP3_INC(psrc, stride, out0, \
+ out1, out2) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ out2 = LD_DP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_DP4_INC(psrc, stride, out0, \
+ out1, out2, out3) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ LD_DP2_INC(psrc, stride, out2, out3); \
+}
+
+#define LD_DP5_INC(psrc, stride, out0, \
+ out1, out2, out3, out4) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ LD_DP2_INC(psrc, stride, out2, out3); \
+ out4 = LD_DP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_DP6_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ LD_DP2_INC(psrc, stride, out2, out3); \
+ LD_DP2_INC(psrc, stride, out4, out5); \
+}
+
+#define LD_DP7_INC(psrc, stride, out0, \
+ out1, out2, out3, \
+ out4, out5, out6) \
+{ \
+ LD_DP2_INC(psrc, stride, out0, out1); \
+ LD_DP2_INC(psrc, stride, out2, out3); \
+ LD_DP2_INC(psrc, stride, out4, out5); \
+ out6 = LD_DP((psrc)); \
+ (psrc) += stride; \
+}
+
+#define LD_DP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+{ \
+ LD_DP4_INC(psrc, stride, out0, out1, out2, out3); \
+ LD_DP4_INC(psrc, stride, out4, out5, out6, out7); \
+}
+
+#define LD_DP16_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7, out8, \
+ out9, out10, out11, out12, out13, \
+ out14, out15) \
+{ \
+ LD_DP8_INC(psrc, stride, out0, out1, out2, \
+ out3, out4, out5, out6, out7); \
+ LD_DP8_INC(psrc, stride, out8, out9, out10, \
+ out11, out12, out13, out14, out15); \
+}
+
+/* Description : Store GP (scalar) variables with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store scalar 'in0' to (pdst) and scalar 'in1' to
+ (pdst + stride); 'pdst' is advanced by 'stride' after each store
+*/
+#define ST_GP2_INC(in0, in1, \
+ pdst, stride) \
+{ \
+ *(pdst) = in0; \
+ (pdst) += stride; \
+ *(pdst) = in1; \
+ (pdst) += stride; \
+}
+
+#define ST_GP3_INC(in0, in1, in2, \
+ pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ *(pdst) = in2; \
+ (pdst) += stride; \
+}
+
+#define ST_GP4_INC(in0, in1, in2, in3, \
+ pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ ST_GP2_INC(in2, in3, pdst, stride); \
+}
+
+#define ST_GP5_INC(in0, in1, in2, in3, \
+ in4, pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ ST_GP2_INC(in2, in3, pdst, stride); \
+ *(pdst) = in4; \
+ (pdst) += stride; \
+}
+
+#define ST_GP6_INC(in0, in1, in2, in3, \
+ in4, in5, pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ ST_GP2_INC(in2, in3, pdst, stride); \
+ ST_GP2_INC(in4, in5, pdst, stride); \
+}
+
+#define ST_GP7_INC(in0, in1, in2, in3, in4, \
+ in5, in6, pdst, stride) \
+{ \
+ ST_GP2_INC(in0, in1, pdst, stride); \
+ ST_GP2_INC(in2, in3, pdst, stride); \
+ ST_GP2_INC(in4, in5, pdst, stride); \
+ *(pdst) = in6; \
+ (pdst) += stride; \
+}
+
+#define ST_GP8_INC(in0, in1, in2, in3, in4, in5, \
+ in6, in7, pdst, stride) \
+{ \
+ ST_GP4_INC(in0, in1, in2, in3, pdst, stride); \
+ ST_GP4_INC(in4, in5, in6, in7, pdst, stride); \
+}
+
/* Description : Store vectors of single precision floating point elements with stride
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 4 single precision floating point elements from 'in0' to (pdst)
ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \
}
+#define ST_SP2_INC(in0, in1, pdst, stride) \
+{ \
+ ST_SP(in0, (pdst)); \
+ (pdst) += stride; \
+ ST_SP(in1, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_SP3_INC(in0, in1, in2, \
+ pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP(in2, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_SP4_INC(in0, in1, in2, in3, \
+ pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP2_INC(in2, in3, pdst, stride); \
+}
+
+#define ST_SP5_INC(in0, in1, in2, in3, \
+ in4, pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP2_INC(in2, in3, pdst, stride); \
+ ST_SP(in4, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_SP6_INC(in0, in1, in2, in3, \
+ in4, in5, pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP2_INC(in2, in3, pdst, stride); \
+ ST_SP2_INC(in4, in5, pdst, stride); \
+}
+
+#define ST_SP7_INC(in0, in1, in2, in3, in4, \
+ in5, in6, pdst, stride) \
+{ \
+ ST_SP2_INC(in0, in1, pdst, stride); \
+ ST_SP2_INC(in2, in3, pdst, stride); \
+ ST_SP2_INC(in4, in5, pdst, stride); \
+ ST_SP(in6, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_SP8_INC(in0, in1, in2, in3, in4, in5, \
+ in6, in7, pdst, stride) \
+{ \
+ ST_SP4_INC(in0, in1, in2, in3, pdst, stride); \
+ ST_SP4_INC(in4, in5, in6, in7, pdst, stride); \
+}
+
+#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6, \
+ in7, in8, in9, in10, in11, in12, \
+ in13, in14, in15, pdst, stride) \
+{ \
+ ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6, \
+ in7, pdst, stride); \
+ ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14, \
+ in15, pdst, stride); \
+}
+
/* Description : Store vectors of double precision floating point elements with stride
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 2 double precision floating point elements from 'in0' to (pdst)
ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
+#define ST_DP2_INC(in0, in1, pdst, stride) \
+{ \
+ ST_DP(in0, (pdst)); \
+ (pdst) += stride; \
+ ST_DP(in1, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_DP3_INC(in0, in1, in2, \
+ pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP(in2, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_DP4_INC(in0, in1, in2, in3, \
+ pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP2_INC(in2, in3, pdst, stride); \
+}
+
+#define ST_DP5_INC(in0, in1, in2, in3, \
+ in4, pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP2_INC(in2, in3, pdst, stride); \
+ ST_DP(in4, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_DP6_INC(in0, in1, in2, in3, \
+ in4, in5, pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP2_INC(in2, in3, pdst, stride); \
+ ST_DP2_INC(in4, in5, pdst, stride); \
+}
+
+#define ST_DP7_INC(in0, in1, in2, in3, in4, \
+ in5, in6, pdst, stride) \
+{ \
+ ST_DP2_INC(in0, in1, pdst, stride); \
+ ST_DP2_INC(in2, in3, pdst, stride); \
+ ST_DP2_INC(in4, in5, pdst, stride); \
+ ST_DP(in6, (pdst)); \
+ (pdst) += stride; \
+}
+
+#define ST_DP8_INC(in0, in1, in2, in3, in4, in5, \
+ in6, in7, pdst, stride) \
+{ \
+ ST_DP4_INC(in0, in1, in2, in3, pdst, stride); \
+ ST_DP4_INC(in4, in5, in6, in7, pdst, stride); \
+}
+
+#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6, \
+ in7, in8, in9, in10, in11, in12, \
+ in13, in14, in15, pdst, stride) \
+{ \
+ ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6, \
+ in7, pdst, stride); \
+ ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14, \
+ in15, pdst, stride); \
+}
+
+/* Description : Shuffle word elements of input vectors as per the
+ 'shf_val' control constant
+ Arguments : Inputs - in0, in1, shf_val
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+*/
+#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val) \
+{ \
+ out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \
+ out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \
+}
+#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
+#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)
+
+#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2, \
+ shf_val) \
+{ \
+ out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \
+ out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \
+ out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val); \
+}
+#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)
+
+#define SHF_W4(RTYPE, in0, in1, in2, in3, \
+ out0, out1, out2, out3, shf_val) \
+{ \
+ SHF_W2(RTYPE, in0, in1, out0, out1, shf_val); \
+ SHF_W2(RTYPE, in2, in3, out2, out3, shf_val); \
+}
+#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
+#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
+
/* Description : Interleave both left and right half of input vectors
Arguments : Inputs - in0, in1
Outputs - out0, out1
out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
}
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__)
#define ILVRL_D2(RTYPE, in0, in1, out0, out1) \
{ \
out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
}
+#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
/* Description : Indexed word element values are replicated to all
out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
}
+#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__)
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
{ \
}
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
+/* Description : Replicate each double word element of 'in' to all
+                 elements of an output vector
+   Arguments   : Input   - in
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : 'out0' is filled with copies of element 0 of 'in' and
+                 'out1' with copies of element 1
+*/
+#define SPLATI_D2(RTYPE, in, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_splati_d((v2i64) in, 0); \
+ out1 = (RTYPE) __msa_splati_d((v2i64) in, 1); \
+}
+#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' are copied to the left half
+ of 'out0' & even double word elements of 'in1' are copied to
+ the right half of 'out0'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
+ out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
+}
+#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
+#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)
+
+/* Description : Pack even double word elements of three vector pairs;
+                 three-output variant of PCKEV_D2
+*/
+#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5, \
+ out0, out1, out2) \
+{ \
+ out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
+ out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
+ out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5); \
+}
+#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)
+
+/* Description : Pack even double word elements of four vector pairs;
+                 composed from two PCKEV_D2 invocations
+*/
+#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) \
+{ \
+ PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+}
+#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)
+
+/* Description : Pack both even and odd elements of the input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even-indexed elements of 'in0' and 'in1' are copied to
+                 'out0' and odd-indexed elements of 'in0' and 'in1' are
+                 copied to 'out1' (element width as per the macro used)
+*/
+#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); \
+ out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); \
+}
+#define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__)
+
+#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1) \
+{ \
+ out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
+ out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
+}
+#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element of 'in0' is multiplied with the corresponding
+                 element of 'in1' and the result is written to 'out0';
+                 likewise 'in2' * 'in3' is written to 'out1'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+}
+#define MUL3(in0, in1, in2, in3, in4, in5, \
+ out0, out1, out2) \
+{ \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+ out2 = in4 * in5; \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) \
+{ \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+}
+
+/* Description : Addition of pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element of 'in0' is added to the corresponding element
+                 of 'in1' and the result is written to 'out0'; likewise
+                 'in2' + 'in3' is written to 'out1'
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) \
+{ \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+}
+#define ADD3(in0, in1, in2, in3, in4, in5, \
+ out0, out1, out2) \
+{ \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+ out2 = in4 + in5; \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) \
+{ \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+}
+
/* Description : Transpose 4x4 block with word elements in vectors
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1, out2, out3
Return Type - as per RTYPE
*/
-#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
-{ \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
- ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
- \
- out0 = (RTYPE) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
- out1 = (RTYPE) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
- out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
- out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
+/* Interleave words, then double words, of the four input rows; the final
+   64-bit interleave step is expressed via the ILVRL_D2 helper */
+#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \
+ out0, out1, out2, out3) \
+{ \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
+ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
+ ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1); \
+ ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3); \
+}
#define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
#endif
)
{
- BLASLONG i, j, l;
+ BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
FLOAT *pa0, *pb0;
FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
- FLOAT a0, a1;
- FLOAT b0, b1, b2, b3, b4, b5, b6, b7;
+ FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7;
v4f32 v_alpha = {alpha, alpha, alpha, alpha};
v4f32 src_a0, src_a1, src_b, src_b0, src_b1;
v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
v4f32 res8, res9, res10, res11, res12, res13, res14, res15;
- for (j = (n / 8); j--;)
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+
+ for (j = (n >> 3); j--;)
{
pc0 = C;
pc1 = pc0 + ldc;
pc6 = pc5 + ldc;
pc7 = pc6 + ldc;
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
pa0 = A;
- for (i = (m / 8); i--;)
+ for (i = (m >> 3); i--;)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 8;
+#endif
- LD_SP2(pa0, 4, src_a0, src_a1);
- LD_SP2(pb0, 4, src_b0, src_b1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 8; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 = src_a0 * src_b;
res14 = src_a0 * src_b;
res15 = src_a1 * src_b;
- pa0 += 8;
- pb0 += 8;
-
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_SP2(pa0, 4, src_a0, src_a1);
- LD_SP2(pb0, 4, src_b0, src_b1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 += src_a0 * src_b;
res14 += src_a0 * src_b;
res15 += src_a1 * src_b;
- pa0 += 8;
- pb0 += 8;
-
- LD_SP2(pa0, 4, src_a0, src_a1);
- LD_SP2(pb0, 4, src_b0, src_b1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 += src_a0 * src_b;
src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
res14 += src_a0 * src_b;
res15 += src_a1 * src_b;
-
- pa0 += 8;
- pb0 += 8;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_SP2(pa0, 4, src_a0, src_a1);
- LD_SP2(pb0, 4, src_b0, src_b1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 += src_a0 * src_b;
src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
res14 += src_a0 * src_b;
res15 += src_a1 * src_b;
-
- pa0 += 8;
- pb0 += 8;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
LD_SP2(pc0, 4, dst0, dst1);
LD_SP2(pc1, 4, dst2, dst3);
LD_SP2(pc2, 4, dst4, dst5);
dst5 += res5 * v_alpha;
dst6 += res6 * v_alpha;
dst7 += res7 * v_alpha;
-
- ST_SP2(dst0, dst1, pc0, 4);
- ST_SP2(dst2, dst3, pc1, 4);
- ST_SP2(dst4, dst5, pc2, 4);
- ST_SP2(dst6, dst7, pc3, 4);
-
+#endif
+ ST_SP2_INC(dst0, dst1, pc0, 4);
+ ST_SP2_INC(dst2, dst3, pc1, 4);
+ ST_SP2_INC(dst4, dst5, pc2, 4);
+ ST_SP2_INC(dst6, dst7, pc3, 4);
+
+#if defined(TRMMKERNEL)
+ dst0 = res8 * v_alpha;
+ dst1 = res9 * v_alpha;
+ dst2 = res10 * v_alpha;
+ dst3 = res11 * v_alpha;
+ dst4 = res12 * v_alpha;
+ dst5 = res13 * v_alpha;
+ dst6 = res14 * v_alpha;
+ dst7 = res15 * v_alpha;
+#else
LD_SP2(pc4, 4, dst0, dst1);
LD_SP2(pc5, 4, dst2, dst3);
LD_SP2(pc6, 4, dst4, dst5);
dst5 += res13 * v_alpha;
dst6 += res14 * v_alpha;
dst7 += res15 * v_alpha;
+#endif
+ ST_SP2_INC(dst0, dst1, pc4, 4);
+ ST_SP2_INC(dst2, dst3, pc5, 4);
+ ST_SP2_INC(dst4, dst5, pc6, 4);
+ ST_SP2_INC(dst6, dst7, pc7, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 8; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 8;
+#endif
- ST_SP2(dst0, dst1, pc4, 4);
- ST_SP2(dst2, dst3, pc5, 4);
- ST_SP2(dst4, dst5, pc6, 4);
- ST_SP2(dst6, dst7, pc7, 4);
-
- pc0 += 8;
- pc1 += 8;
- pc2 += 8;
- pc3 += 8;
- pc4 += 8;
- pc5 += 8;
- pc6 += 8;
- pc7 += 8;
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 4) / 4); i--;)
+ if (m & 4)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 8; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
src_a0 = LD_SP(pa0);
- LD_SP2(pb0, 4, src_b0, src_b1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 = src_a0 * src_b;
res7 = src_a0 * src_b;
pa0 += 4;
- pb0 += 8;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
src_a0 = LD_SP(pa0);
- LD_SP2(pb0, 4, src_b0, src_b1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 += src_a0 * src_b;
res7 += src_a0 * src_b;
pa0 += 4;
- pb0 += 8;
src_a0 = LD_SP(pa0);
- LD_SP2(pb0, 4, src_b0, src_b1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 += src_a0 * src_b;
res7 += src_a0 * src_b;
pa0 += 4;
- pb0 += 8;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
src_a0 = LD_SP(pa0);
- LD_SP2(pb0, 4, src_b0, src_b1);
+ LD_SP2_INC(pb0, 4, src_b0, src_b1);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 += src_a0 * src_b;
res7 += src_a0 * src_b;
pa0 += 4;
- pb0 += 8;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
dst0 = LD_SP(pc0);
dst1 = LD_SP(pc1);
dst2 = LD_SP(pc2);
dst1 += res1 * v_alpha;
dst2 += res2 * v_alpha;
dst3 += res3 * v_alpha;
-
+#endif
ST_SP(dst0, pc0);
ST_SP(dst1, pc1);
ST_SP(dst2, pc2);
ST_SP(dst3, pc3);
+#if defined(TRMMKERNEL)
+ dst0 = res4 * v_alpha;
+ dst1 = res5 * v_alpha;
+ dst2 = res6 * v_alpha;
+ dst3 = res7 * v_alpha;
+#else
dst0 = LD_SP(pc4);
dst1 = LD_SP(pc5);
dst2 = LD_SP(pc6);
dst1 += res5 * v_alpha;
dst2 += res6 * v_alpha;
dst3 += res7 * v_alpha;
-
+#endif
ST_SP(dst0, pc4);
ST_SP(dst1, pc5);
ST_SP(dst2, pc6);
ST_SP(dst3, pc7);
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 8; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+
pc0 += 4;
pc1 += 4;
pc2 += 4;
pc7 += 4;
}
- for (i = ((m & 2) / 2); i--;)
+ if (m & 2)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 8; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 2;
pb0 += 8;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 8;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
tmp12 = alpha * tmp12;
tmp14 = alpha * tmp14;
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp2;
+ pc2[0] = tmp4;
+ pc3[0] = tmp6;
+ pc4[0] = tmp8;
+ pc5[0] = tmp10;
+ pc6[0] = tmp12;
+ pc7[0] = tmp14;
+#else
pc0[0] += tmp0;
pc1[0] += tmp2;
pc2[0] += tmp4;
pc5[0] += tmp10;
pc6[0] += tmp12;
pc7[0] += tmp14;
-
+#endif
tmp1 = alpha * tmp1;
tmp3 = alpha * tmp3;
tmp5 = alpha * tmp5;
tmp13 = alpha * tmp13;
tmp15 = alpha * tmp15;
+#if defined(TRMMKERNEL)
+ pc0[1] = tmp1;
+ pc1[1] = tmp3;
+ pc2[1] = tmp5;
+ pc3[1] = tmp7;
+ pc4[1] = tmp9;
+ pc5[1] = tmp11;
+ pc6[1] = tmp13;
+ pc7[1] = tmp15;
+#else
pc0[1] += tmp1;
pc1[1] += tmp3;
pc2[1] += tmp5;
pc5[1] += tmp11;
pc6[1] += tmp13;
pc7[1] += tmp15;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 8; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
pc0 += 2;
pc1 += 2;
pc7 += 2;
}
- for (i = (m & 1); i--;)
+ if (m & 1)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 8;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 8; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 1;
pb0 += 8;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 8;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
tmp0 += a0 * b0;
b1 = pb0[1];
- tmp1 += a0 * b1;
+ tmp1 += a0 * b1;
b2 = pb0[2];
tmp2 += a0 * b2;
tmp6 = alpha * tmp6;
tmp7 = alpha * tmp7;
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+ pc2[0] = tmp2;
+ pc3[0] = tmp3;
+ pc4[0] = tmp4;
+ pc5[0] = tmp5;
+ pc6[0] = tmp6;
+ pc7[0] = tmp7;
+#else
pc0[0] += tmp0;
pc1[0] += tmp1;
pc2[0] += tmp2;
pc5[0] += tmp5;
pc6[0] += tmp6;
pc7[0] += tmp7;
+#endif
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 8; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 8;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
pc0 += 1;
pc1 += 1;
pc2 += 1;
pc7 += 1;
}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 8; // number of values in A
+#endif
+
l = (k << 3);
B = B + l;
i = (ldc << 3);
C = C + i;
}
- for (j = ((n & 4) / 4); j--;)
+ if (n & 4)
{
pc0 = C;
pc1 = pc0 + ldc;
pa0 = A;
- for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 4;
+#endif
- LD_SP2(pa0, 4, src_a0, src_a1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0 = LD_SP(pb0);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res6 = src_a0 * src_b;
res7 = src_a1 * src_b;
- pa0 += 8;
pb0 += 4;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0 = LD_SP(pb0);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res6 += src_a0 * src_b;
res7 += src_a1 * src_b;
- pa0 += 8;
pb0 += 4;
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0 = LD_SP(pb0);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res6 += src_a0 * src_b;
res7 += src_a1 * src_b;
- pa0 += 8;
pb0 += 4;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0 = LD_SP(pb0);
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res6 += src_a0 * src_b;
res7 += src_a1 * src_b;
- pa0 += 8;
pb0 += 4;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+ dst4 = res4 * v_alpha;
+ dst5 = res5 * v_alpha;
+ dst6 = res6 * v_alpha;
+ dst7 = res7 * v_alpha;
+#else
LD_SP2(pc0, 4, dst0, dst1);
LD_SP2(pc1, 4, dst2, dst3);
LD_SP2(pc2, 4, dst4, dst5);
dst5 += res5 * v_alpha;
dst6 += res6 * v_alpha;
dst7 += res7 * v_alpha;
+#endif
- ST_SP2(dst0, dst1, pc0, 4);
- ST_SP2(dst2, dst3, pc1, 4);
- ST_SP2(dst4, dst5, pc2, 4);
- ST_SP2(dst6, dst7, pc3, 4);
+ ST_SP2_INC(dst0, dst1, pc0, 4);
+ ST_SP2_INC(dst2, dst3, pc1, 4);
+ ST_SP2_INC(dst4, dst5, pc2, 4);
+ ST_SP2_INC(dst6, dst7, pc3, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 4;
+#endif
- pc0 += 8;
- pc1 += 8;
- pc2 += 8;
- pc3 += 8;
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 4) / 4); i--;)
+ if (m & 4)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
pb0 = B;
+ temp = k;
+#endif
src_a0 = LD_SP(pa0);
src_b0 = LD_SP(pb0);
pa0 += 4;
pb0 += 4;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
src_a0 = LD_SP(pa0);
src_b0 = LD_SP(pb0);
pb0 += 4;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
src_a0 = LD_SP(pa0);
src_b0 = LD_SP(pb0);
pa0 += 4;
pb0 += 4;
}
-
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
dst0 = LD_SP(pc0);
dst1 = LD_SP(pc1);
dst2 = LD_SP(pc2);
dst1 += res1 * v_alpha;
dst2 += res2 * v_alpha;
dst3 += res3 * v_alpha;
-
+#endif
ST_SP(dst0, pc0);
ST_SP(dst1, pc1);
ST_SP(dst2, pc2);
ST_SP(dst3, pc3);
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
pc0 += 4;
pc1 += 4;
pc2 += 4;
pc3 += 4;
}
- for (i = ((m & 2) / 2); i--;)
+ if (m & 2)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 2;
pb0 += 4;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 4;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
tmp4 = alpha * tmp4;
tmp6 = alpha * tmp6;
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp2;
+ pc2[0] = tmp4;
+ pc3[0] = tmp6;
+#else
pc0[0] += tmp0;
pc1[0] += tmp2;
pc2[0] += tmp4;
pc3[0] += tmp6;
-
+#endif
tmp1 = alpha * tmp1;
tmp3 = alpha * tmp3;
tmp5 = alpha * tmp5;
tmp7 = alpha * tmp7;
+#if defined(TRMMKERNEL)
+ pc0[1] = tmp1;
+ pc1[1] = tmp3;
+ pc2[1] = tmp5;
+ pc3[1] = tmp7;
+#else
pc0[1] += tmp1;
pc1[1] += tmp3;
pc2[1] += tmp5;
pc3[1] += tmp7;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 4;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
pc0 += 2;
pc1 += 2;
pc3 += 2;
}
- for (i = (m & 1); i--;)
+ if (m & 1)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 1;
pb0 += 4;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 4;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
tmp2 = alpha * tmp2;
tmp3 = alpha * tmp3;
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+ pc2[0] = tmp2;
+ pc3[0] = tmp3;
+#else
pc0[0] += tmp0;
pc1[0] += tmp1;
pc2[0] += tmp2;
pc3[0] += tmp3;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 4;
+#endif
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
pc0 += 1;
pc1 += 1;
pc2 += 1;
pc3 += 1;
}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in A
+#endif
+
l = (k << 2);
B = B + l;
i = (ldc << 2);
C = C + i;
}
- for (j = ((n & 2) / 2); j--;)
+ if (n & 2)
{
pc0 = C;
pc1 = pc0 + ldc;
pa0 = A;
- for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0[0] = pb0[0];
src_b0[1] = pb0[1];
res2 = src_a0 * src_b;
res3 = src_a1 * src_b;
- pa0 += 8;
pb0 += 2;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0[0] = pb0[0];
src_b0[1] = pb0[1];
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
- pa0 += 8;
pb0 += 2;
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0[0] = pb0[0];
src_b0[1] = pb0[1];
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
- pa0 += 8;
pb0 += 2;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0[0] = pb0[0];
src_b0[1] = pb0[1];
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
- pa0 += 8;
pb0 += 2;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+ dst2 = res2 * v_alpha;
+ dst3 = res3 * v_alpha;
+#else
LD_SP2(pc0, 4, dst0, dst1);
LD_SP2(pc1, 4, dst2, dst3);
dst1 += res1 * v_alpha;
dst2 += res2 * v_alpha;
dst3 += res3 * v_alpha;
+#endif
+ ST_SP2_INC(dst0, dst1, pc0, 4);
+ ST_SP2_INC(dst2, dst3, pc1, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 2;
+#endif
- ST_SP2(dst0, dst1, pc0, 4);
- ST_SP2(dst2, dst3, pc1, 4);
-
- pc0 += 8;
- pc1 += 8;
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 4) / 4); i--;)
+ if (m & 4)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
pb0 = B;
+ temp = k;
+#endif
src_a0 = LD_SP(pa0);
src_b0[0] = pb0[0];
pa0 += 4;
pb0 += 2;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
src_a0 = LD_SP(pa0);
src_b0[0] = pb0[0];
pb0 += 2;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
src_a0 = LD_SP(pa0);
src_b0[0] = pb0[0];
pb0 += 2;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+#else
dst0 = LD_SP(pc0);
dst1 = LD_SP(pc1);
dst0 += res0 * v_alpha;
dst1 += res1 * v_alpha;
-
+#endif
ST_SP(dst0, pc0);
ST_SP(dst1, pc1);
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
pc0 += 4;
pc1 += 4;
}
- for (i = ((m & 2) / 2); i--;)
+ if (m & 2)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 2;
pb0 += 2;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 2;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
}
tmp0 = alpha * tmp0;
+ tmp1 = alpha * tmp1;
tmp2 = alpha * tmp2;
+ tmp3 = alpha * tmp3;
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp2;
+ pc0[1] = tmp1;
+ pc1[1] = tmp3;
+#else
pc0[0] += tmp0;
pc1[0] += tmp2;
-
- tmp1 = alpha * tmp1;
- tmp3 = alpha * tmp3;
-
pc0[1] += tmp1;
pc1[1] += tmp3;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 2;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
pc0 += 2;
pc1 += 2;
}
- for (i = (m & 1); i--;)
+ if (m & 1)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 1;
pb0 += 2;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 2;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
tmp0 = alpha * tmp0;
tmp1 = alpha * tmp1;
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc1[0] = tmp1;
+#else
pc0[0] += tmp0;
pc1[0] += tmp1;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 2;
+#endif
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
pc0 += 1;
pc1 += 1;
}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in A
+#endif
l = (k << 1);
B = B + l;
i = (ldc << 1);
C = C + i;
}
- for (j = (n & 1); j--;)
+ if (n & 1)
{
pc0 = C;
pa0 = A;
- for (i = (m / 8); i--;)
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 3); i--;)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 8;
+ pb0 = B + off * 1;
+#endif
- LD_SP2(pa0, 4, src_a0, src_a1);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 8; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0[0] = pb0[0];
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 = src_a0 * src_b;
res1 = src_a1 * src_b;
- pa0 += 8;
pb0 += 1;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0[0] = pb0[0];
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
- pa0 += 8;
pb0 += 1;
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0[0] = pb0[0];
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
- pa0 += 8;
pb0 += 1;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
- LD_SP2(pa0, 4, src_a0, src_a1);
+ LD_SP2_INC(pa0, 4, src_a0, src_a1);
src_b0[0] = pb0[0];
src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
- pa0 += 8;
pb0 += 1;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+ dst1 = res1 * v_alpha;
+#else
LD_SP2(pc0, 4, dst0, dst1);
dst0 += res0 * v_alpha;
dst1 += res1 * v_alpha;
+#endif
+ ST_SP2_INC(dst0, dst1, pc0, 4);
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 8; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 8;
+ pb0 += temp * 1;
+#endif
- ST_SP2(dst0, dst1, pc0, 4);
-
- pc0 += 8;
+#ifdef LEFT
+ off += 8; // number of values in A
+#endif
+#endif
}
- for (i = ((m & 4) / 4); i--;)
+ if (m & 4)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 4;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
pb0 = B;
+ temp = k;
+#endif
src_a0 = LD_SP(pa0);
src_b0[0] = pb0[0];
pa0 += 4;
pb0 += 1;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
src_a0 = LD_SP(pa0);
src_b0[0] = pb0[0];
pb0 += 1;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
src_a0 = LD_SP(pa0);
src_b0[0] = pb0[0];
pb0 += 1;
}
+#if defined(TRMMKERNEL)
+ dst0 = res0 * v_alpha;
+#else
dst0 = LD_SP(pc0);
dst0 += res0 * v_alpha;
-
+#endif
ST_SP(dst0, pc0);
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 4;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
pc0 += 4;
}
- for (i = (m & 2) / 2; i--;)
+ if (m & 2)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 2;
pb0 += 1;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 1;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 1;
}
- tmp0 = alpha * tmp0;
+#if defined(TRMMKERNEL)
+ pc0[0] = tmp0;
+ pc0[1] = tmp1;
+#else
pc0[0] += tmp0;
-
- tmp1 = alpha * tmp1;
pc0[1] += tmp1;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
pc0 += 2;
}
- for (i = (m & 1); i--;)
+ if (m & 1)
{
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
pb0 = B;
+#else
+ pa0 += off * 1;
+ pb0 = B + off * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
a0 = pa0[0];
b0 = pb0[0];
pa0 += 1;
pb0 += 1;
- for (l = ((k - 1) / 2); l--;)
+ for (l = ((temp - 1) >> 1); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 1;
}
- if ((k - 1) & 1)
+ if ((temp - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
pb0 += 1;
}
+#if defined(TRMMKERNEL)
+ pc0[0] = alpha * tmp0;
+#else
pc0[0] += alpha * tmp0;
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 1;
+ pb0 += temp * 1;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
pc0 += 1;
}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in A
+#endif
l = (k << 0);
B = B + l;
i = (ldc << 0);
#include "common.h"
#include "macros_msa.h"
-int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
- FLOAT * __restrict dst)
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
BLASLONG i, j;
- FLOAT *psrc0;
- FLOAT *psrc1, *psrc2, *psrc3, *psrc4;
- FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
- FLOAT *pdst;
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+ FLOAT *psrc8, *pdst;
v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
for (i = (m >> 3); i--;)
{
- LD_SP2(psrc1, 4, src0, src1);
- LD_SP2(psrc2, 4, src2, src3);
- LD_SP2(psrc3, 4, src4, src5);
- LD_SP2(psrc4, 4, src6, src7);
- LD_SP2(psrc5, 4, src8, src9);
- LD_SP2(psrc6, 4, src10, src11);
- LD_SP2(psrc7, 4, src12, src13);
- LD_SP2(psrc8, 4, src14, src15);
- psrc1 += 8;
- psrc2 += 8;
- psrc3 += 8;
- psrc4 += 8;
- psrc5 += 8;
- psrc6 += 8;
- psrc7 += 8;
- psrc8 += 8;
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ LD_SP2_INC(psrc2, 4, src2, src3);
+ LD_SP2_INC(psrc3, 4, src4, src5);
+ LD_SP2_INC(psrc4, 4, src6, src7);
+ LD_SP2_INC(psrc5, 4, src8, src9);
+ LD_SP2_INC(psrc6, 4, src10, src11);
+ LD_SP2_INC(psrc7, 4, src12, src13);
+ LD_SP2_INC(psrc8, 4, src14, src15);
TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6);
TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5,
TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13,
dst15);
- ST_SP2(dst0, dst1, pdst, 4);
- ST_SP2(dst2, dst3, pdst + 8, 4);
- ST_SP2(dst4, dst5, pdst + 16, 4);
- ST_SP2(dst6, dst7, pdst + 24, 4);
- ST_SP2(dst8, dst9, pdst + 32, 4);
- ST_SP2(dst10, dst11, pdst + 40, 4);
- ST_SP2(dst12, dst13, pdst + 48, 4);
- ST_SP2(dst14, dst15, pdst + 56, 4);
- pdst += 64;
+ ST_SP2_INC(dst0, dst1, pdst, 4);
+ ST_SP2_INC(dst2, dst3, pdst, 4);
+ ST_SP2_INC(dst4, dst5, pdst, 4);
+ ST_SP2_INC(dst6, dst7, pdst, 4);
+ ST_SP2_INC(dst8, dst9, pdst, 4);
+ ST_SP2_INC(dst10, dst11, pdst, 4);
+ ST_SP2_INC(dst12, dst13, pdst, 4);
+ ST_SP2_INC(dst14, dst15, pdst, 4);
}
for (i = (m & 7); i--;)
TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
- ST_SP2(dst0, dst1, pdst, 4);
- ST_SP2(dst2, dst3, pdst + 8, 4);
- pdst += 16;
+ ST_SP2_INC(dst0, dst1, pdst, 4);
+ ST_SP2_INC(dst2, dst3, pdst, 4);
}
for (i = (m & 3); i--;)
#include "common.h"
#include "macros_msa.h"
-int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
- FLOAT * __restrict dst)
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
BLASLONG i, j;
- FLOAT *psrc0;
- FLOAT *psrc1, *psrc2, *psrc3, *psrc4;
- FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
- FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4;
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
+ FLOAT *psrc8, *pdst0, *pdst1, *pdst2, *pdst3, *pdst4;
v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
for (i = (n >> 3); i--;)
{
- LD_SP2(psrc1, 4, src0, src1);
- LD_SP2(psrc2, 4, src2, src3);
- LD_SP2(psrc3, 4, src4, src5);
- LD_SP2(psrc4, 4, src6, src7);
- LD_SP2(psrc5, 4, src8, src9);
- LD_SP2(psrc6, 4, src10, src11);
- LD_SP2(psrc7, 4, src12, src13);
- LD_SP2(psrc8, 4, src14, src15);
- psrc1 += 8;
- psrc2 += 8;
- psrc3 += 8;
- psrc4 += 8;
- psrc5 += 8;
- psrc6 += 8;
- psrc7 += 8;
- psrc8 += 8;
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ LD_SP2_INC(psrc2, 4, src2, src3);
+ LD_SP2_INC(psrc3, 4, src4, src5);
+ LD_SP2_INC(psrc4, 4, src6, src7);
+ LD_SP2_INC(psrc5, 4, src8, src9);
+ LD_SP2_INC(psrc6, 4, src10, src11);
+ LD_SP2_INC(psrc7, 4, src12, src13);
+ LD_SP2_INC(psrc8, 4, src14, src15);
ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4);
ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15,
psrc7 += 4;
psrc8 += 4;
- ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4);
- pdst2 += 32;
+ ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4);
}
if (n & 2)
for (i = (n >> 3); i--;)
{
- LD_SP2(psrc1, 4, src0, src1);
- LD_SP2(psrc2, 4, src2, src3);
- LD_SP2(psrc3, 4, src4, src5);
- LD_SP2(psrc4, 4, src6, src7);
- psrc1 += 8;
- psrc2 += 8;
- psrc3 += 8;
- psrc4 += 8;
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ LD_SP2_INC(psrc2, 4, src2, src3);
+ LD_SP2_INC(psrc3, 4, src4, src5);
+ LD_SP2_INC(psrc4, 4, src6, src7);
ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4);
pdst1 += 8 * m;
psrc3 += 4;
psrc4 += 4;
- ST_SP4(src0, src1, src2, src3, pdst2, 4);
- pdst2 += 16;
+ ST_SP4_INC(src0, src1, src2, src3, pdst2, 4);
}
if (n & 2)
for (i = (n >> 3); i--;)
{
- LD_SP2(psrc1, 4, src0, src1);
- LD_SP2(psrc2, 4, src2, src3);
- psrc1 += 8;
- psrc2 += 8;
+ LD_SP2_INC(psrc1, 4, src0, src1);
+ LD_SP2_INC(psrc2, 4, src2, src3);
ST_SP4(src0, src1, src2, src3, pdst1, 4);
pdst1 += 8 * m;
psrc1 += 4;
psrc2 += 4;
- ST_SP2(src0, src1, pdst2, 4);
- pdst2 += 8;
+ ST_SP2_INC(src0, src1, pdst2, 4);
}
if (n & 2)
for (i = (n >> 3); i--;)
{
- LD_SP2(psrc1, 4, src0, src1);
- psrc1 += 8;
+ LD_SP2_INC(psrc1, 4, src0, src1);
ST_SP2(src0, src1, pdst1, 4);
pdst1 += 8 * m;
}
}
- return 0;
+ return 0;
}
src_a = LD_SP(a + 32);
SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
- COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36);
+ src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));
res_c4 *= src_a36;
res_c12 *= src_a36;
res_c0 -= res_c2 * src_a16;
res_c8 -= res_c10 * src_a16;
- COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9);
- COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8);
- COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
+ src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
+ src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
+ src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
res_c1 *= src_a9;
res_c9 *= src_a9;
bb += 4;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
LD_SP2(aa, 4, src_a0, src_a1);
src_a = LD_SP(a + 32);
SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
- COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36);
+ src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));
res_c4 *= src_a36;
res_c3 -= res_c4 * src_a35;
res_c1 -= res_c2 * src_a17;
res_c0 -= res_c2 * src_a16;
- COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9);
- COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8);
- COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
+ src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
+ src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
+ src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
res_c1 *= src_a9;
res_c0 -= res_c1 * src_a8;
src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
- COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5);
- COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4);
- COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
+ src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
+ src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
+ src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
res_c3 *= src_a15;
res_c7 *= src_a15;
bb += 4;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
src_a0 = LD_SP(aa);
src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2);
src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
- COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5);
- COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4);
- COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
+ src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
+ src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
+ src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
res_c3 *= src_a15;
res_c2 -= res_c3 * src_a14;
src_a = LD_SP(a + 27);
SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
- COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31);
+ src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
res_c3 *= src_a27;
res_c11 *= src_a27;
res_c7 -= res_c5 * src_a47;
res_c15 -= res_c13 * src_a47;
- COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54);
- COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55);
- COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63);
+ src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
+ src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
+ src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
res_c6 *= src_a54;
res_c14 *= src_a54;
src_a = LD_SP(a + 27);
SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
- COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31);
+ src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
res_c3 *= src_a27;
res_c4 -= res_c3 * src_a28;
res_c6 -= res_c5 * src_a46;
res_c7 -= res_c5 * src_a47;
- COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54);
- COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55);
- COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63);
+ src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
+ src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
+ src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
res_c6 *= src_a54;
res_c7 -= res_c6 * src_a55;
b += 8;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
src_a0 = LD_SP(a);
src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
- COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10);
- COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11);
- COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15);
+ src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
+ src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
+ src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
res_c0 *= src_a0;
res_c4 *= src_a0;
b += 4;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
src_a0 = LD_SP(a);
src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
- COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10);
- COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11);
- COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15);
+ src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
+ src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
+ src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
res_c0 *= src_a0;
res_c1 -= res_c0 * src_a1;
src_b = LD_SP(b + 27);
SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
- COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31);
+ src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
src_c4 *= src_b18;
src_c5 *= src_b18;
src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
- COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54);
- COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55);
- COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63);
+ src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
+ src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
+ src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
src_c8 *= src_b36;
src_c9 *= src_b36;
b += 4;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
LD_SP2(a, 4, src_a0, src_a1);
src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
- COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10);
- COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11);
- COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15);
+ src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
+ src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
+ src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
src_c0 *= src_b0;
src_c1 *= src_b0;
{
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
- COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+ src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
- COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+ src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
b += 2;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
- COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+ src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
b += 2;
}
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
- COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
- COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+ src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
+ src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
src_c0 *= src_b0;
src_c1 *= src_b0;
{
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
b += 1;
}
- if (bk & 3)
+ if ((bk & 3) && (bk > 0))
{
if (bk & 2)
{
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
{
LD_SP2(a, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
}
}
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c0 *= src_b0;
src_c1 *= src_b0;
src_b = LD_SP(b + 27);
SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
- COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31);
+ src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
src_b = LD_SP(b + 36);
SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
- COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54);
- COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55);
- COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63);
+ src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
+ src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
+ src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
src_c0 *= src_b0;
src_c1 -= src_c0 * src_b1;
b += 4;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
src_a0 = LD_SP(a);
src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
- COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10);
- COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11);
- COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15);
+ src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
+ src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
+ src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
src_c0 *= src_b0;
src_c1 -= src_c0 * src_b1;
b += 2;
}
- if (bk & 3)
+ if ((bk & 3) && (bk > 0))
{
if (bk & 2)
{
}
}
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
- COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
- COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+ src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
+ src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
src_c0 *= src_b0;
src_c1 -= src_c0 * src_b1;
src_b = LD_SP(b + 32);
SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
- COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36);
+ src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
src_c8 *= src_b36;
src_c9 *= src_b36;
ST_SP2(src_c4, src_c5, c_nxt2line, 4);
ST_SP2(src_c6, src_c7, c_nxt3line, 4);
- COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9);
- COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
+ src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c2 *= src_b9;
src_c3 *= src_b9;
bb += 4;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
LD_SP2(aa, 4, src_a0, src_a1);
src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
- COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5);
- COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
+ src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c7 *= src_b15;
src_c6 *= src_b15;
{
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
- COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+ src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
- COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+ src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
bb += 2;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
- COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
+ src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
a -= 16;
b -= 4;
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
- COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2);
- COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
+ src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
+ src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
src_c2 *= src_b3;
src_c3 *= src_b3;
{
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
bb += 1;
}
- if (bk & 3)
+ if ((bk & 3) && (bk > 0))
{
if (bk & 2)
{
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
{
LD_SP2(aa, 4, src_a0, src_a1);
- COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
src_c0 -= src_a0 * src_b0;
src_c1 -= src_a1 * src_b0;
a -= 8;
b -= 1;
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c0 *= src_b0;
src_c1 *= src_b0;
src_b = LD_SP(b + 32);
SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
- COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36);
+ src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
src_b = LD_SP(b + 24);
SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
- COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9);
- COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
+ src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c7 *= src_b63;
src_c6 -= src_c7 * src_b62;
bb += 4;
}
- if (bk & 1)
+ if ((bk & 1) && (bk > 0))
{
src_a = LD_SP(aa);
src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
- COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5);
- COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
+ src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c3 *= src_b15;
src_c2 -= src_c3 * src_b14;
bb += 2;
}
- if (bk & 3)
+ if ((bk & 3) && (bk > 0))
{
if (bk & 2)
{
a -= 8;
b -= 4;
- COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
- COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2);
- COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
+ src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
+ src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
+ src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
src_c1 *= src_b3;
src_c0 -= src_c1 * src_b2;
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* One k-iteration of a 4x4 complex-double micro-tile: loads 4 complex
+   elements of A (pa0) and 4 of B (pb0), separates real/imag lanes, and
+   accumulates into res0..res7 {_r,_i}.  OP0..OP3 are '+'/'-' tokens pasted
+   onto '=' (compound assign) and OP4 an optional unary sign, selecting the
+   sign pattern for the NN/NR/RN/RR conjugation variants.
+   NOTE(review): PCKEVOD_D2_DP presumably deinterleaves packed complex into
+   even (real) / odd (imag) lanes -- confirm against macros_msa.h. */
+#define ZGEMM_KERNEL_4X4_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
+ LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ res1_r OP0## = src_a1r * src_br; \
+ res1_r OP1## = src_a1i * src_bi; \
+ res1_i OP2## = OP4 src_a1r * src_bi; \
+ res1_i OP3## = src_a1i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_D2_DP(src_b1, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+ \
+ res3_r OP0## = src_a1r * src_br; \
+ res3_r OP1## = src_a1i * src_bi; \
+ res3_i OP2## = OP4 src_a1r * src_bi; \
+ res3_i OP3## = src_a1i * src_br; \
+ \
+ /* 2nd col */ \
+ SPLATI_D2_DP(src_b2, src_br, src_bi); \
+ res4_r OP0## = src_a0r * src_br; \
+ res4_r OP1## = src_a0i * src_bi; \
+ res4_i OP2## = OP4 src_a0r * src_bi; \
+ res4_i OP3## = src_a0i * src_br; \
+ \
+ res5_r OP0## = src_a1r * src_br; \
+ res5_r OP1## = src_a1i * src_bi; \
+ res5_i OP2## = OP4 src_a1r * src_bi; \
+ res5_i OP3## = src_a1i * src_br; \
+ \
+ /* 3rd col */ \
+ SPLATI_D2_DP(src_b3, src_br, src_bi); \
+ res6_r OP0## = src_a0r * src_br; \
+ res6_r OP1## = src_a0i * src_bi; \
+ res6_i OP2## = OP4 src_a0r * src_bi; \
+ res6_i OP3## = src_a0i * src_br; \
+ \
+ res7_r OP0## = src_a1r * src_br; \
+ res7_r OP1## = src_a1i * src_bi; \
+ res7_i OP2## = OP4 src_a1r * src_bi; \
+ res7_i OP3## = src_a1i * src_br; \
+}
+
+/* One k-iteration of a 2x4 micro-tile: 2 complex elements of A against
+   4 columns of B; same sign-token scheme (OP0..OP4) as the 4x4 variant.
+   Accumulates into res0/res2/res4/res6 {_r,_i} (one pair per B column). */
+#define ZGEMM_KERNEL_2X4_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP2_INC(pa0, 2, src_a0, src_a1); \
+ LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_D2_DP(src_b1, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+ \
+ /* 2nd col */ \
+ SPLATI_D2_DP(src_b2, src_br, src_bi); \
+ res4_r OP0## = src_a0r * src_br; \
+ res4_r OP1## = src_a0i * src_bi; \
+ res4_i OP2## = OP4 src_a0r * src_bi; \
+ res4_i OP3## = src_a0i * src_br; \
+ \
+ /* 3rd col */ \
+ SPLATI_D2_DP(src_b3, src_br, src_bi); \
+ res6_r OP0## = src_a0r * src_br; \
+ res6_r OP1## = src_a0i * src_bi; \
+ res6_i OP2## = OP4 src_a0r * src_bi; \
+ res6_i OP3## = src_a0i * src_br; \
+}
+
+/* One k-iteration of a 1x4 micro-tile: a single complex A element
+   (broadcast into src_a0r/src_a0i) against 4 B columns, two columns packed
+   per vector via PCKEVOD.  Note pa0 is NOT advanced here (no _INC load);
+   the caller presumably steps it -- confirm at the call site. */
+#define ZGEMM_KERNEL_1X4_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ src_a0 = LD_DP(pa0); \
+ LD_DP4_INC(pb0, 2, src_b0, src_b1, src_b2, src_b3); \
+ \
+ PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th and 1st col */ \
+ PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ /* 2nd and 3rd col */ \
+ PCKEVOD_D2_DP(src_b3, src_b2, src_br, src_bi); \
+ res1_r OP0## = src_a0r * src_br; \
+ res1_r OP1## = src_a0i * src_bi; \
+ res1_i OP2## = OP4 src_a0r * src_bi; \
+ res1_i OP3## = src_a0i * src_br; \
+}
+
+/* One k-iteration of a 4x2 micro-tile: 4 complex A elements against 2 B
+   columns; accumulates into res0/res1 (col 0) and res2/res3 (col 1). */
+#define ZGEMM_KERNEL_4X2_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
+ LD_DP2_INC(pb0, 2, src_b0, src_b1); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ res1_r OP0## = src_a1r * src_br; \
+ res1_r OP1## = src_a1i * src_bi; \
+ res1_i OP2## = OP4 src_a1r * src_bi; \
+ res1_i OP3## = src_a1i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_D2_DP(src_b1, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+ \
+ res3_r OP0## = src_a1r * src_br; \
+ res3_r OP1## = src_a1i * src_bi; \
+ res3_i OP2## = OP4 src_a1r * src_bi; \
+ res3_i OP3## = src_a1i * src_br; \
+}
+
+/* One k-iteration of a 2x2 micro-tile: 2 complex A elements against 2 B
+   columns; accumulates into res0 (col 0) and res2 (col 1). */
+#define ZGEMM_KERNEL_2X2_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP2_INC(pa0, 2, src_a0, src_a1); \
+ LD_DP2_INC(pb0, 2, src_b0, src_b1); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ /* 1st col */ \
+ SPLATI_D2_DP(src_b1, src_br, src_bi); \
+ res2_r OP0## = src_a0r * src_br; \
+ res2_r OP1## = src_a0i * src_bi; \
+ res2_i OP2## = OP4 src_a0r * src_bi; \
+ res2_i OP3## = src_a0i * src_br; \
+}
+
+/* One k-iteration of a 1x2 micro-tile: one complex A element against 2 B
+   columns packed into a single vector pair.  pa0 is not advanced here
+   (plain LD_DP); presumably stepped by the caller -- confirm. */
+#define ZGEMM_KERNEL_1X2_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ src_a0 = LD_DP(pa0); \
+ LD_DP2_INC(pb0, 2, src_b0, src_b1); \
+ \
+ PCKEVOD_D2_DP(src_a0, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th and 1st col */ \
+ PCKEVOD_D2_DP(src_b1, src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+}
+
+/* One k-iteration of a 4x1 micro-tile: 4 complex A elements against one B
+   element (split into real/imag broadcasts).  pb0 is not advanced here
+   (plain LD_DP); presumably stepped by the caller -- confirm. */
+#define ZGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP4_INC(pa0, 2, src_a0, src_a1, src_a2, src_a3); \
+ src_b0 = LD_DP(pb0); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ PCKEVOD_D2_DP(src_a3, src_a2, src_a1r, src_a1i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+ \
+ res1_r OP0## = src_a1r * src_br; \
+ res1_r OP1## = src_a1i * src_bi; \
+ res1_i OP2## = OP4 src_a1r * src_bi; \
+ res1_i OP3## = src_a1i * src_br; \
+}
+
+/* One k-iteration of a 2x1 micro-tile: 2 complex A elements against one B
+   element.  pb0 is not advanced here; presumably stepped by the caller. */
+#define ZGEMM_KERNEL_2X1_MSA(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ LD_DP2_INC(pa0, 2, src_a0, src_a1); \
+ src_b0 = LD_DP(pb0); \
+ \
+ PCKEVOD_D2_DP(src_a1, src_a0, src_a0r, src_a0i); \
+ \
+ /* 0th col */ \
+ SPLATI_D2_DP(src_b0, src_br, src_bi); \
+ res0_r OP0## = src_a0r * src_br; \
+ res0_r OP1## = src_a0i * src_bi; \
+ res0_i OP2## = OP4 src_a0r * src_bi; \
+ res0_i OP3## = src_a0i * src_br; \
+}
+
+/* Scalar (non-MSA) 1x1 tail: one complex multiply-accumulate.
+   res0 collects the real part, res1 the imaginary part; OP0..OP4 select
+   the same sign pattern as the vector kernels.  Neither pa0 nor pb0 is
+   advanced here -- presumably done by the caller. */
+#define ZGEMM_KERNEL_1X1(OP0, OP1, OP2, OP3, OP4) \
+{ \
+ /* 0th col */ \
+ a0_r = pa0[0]; \
+ a0_i = pa0[1]; \
+ b0_r = pb0[0]; \
+ b0_i = pb0[1]; \
+ \
+ res0 OP0## = a0_r * b0_r; \
+ res0 OP1## = a0_i * b0_i; \
+ res1 OP2## = OP4 a0_r * b0_i; \
+ res1 OP3## = a0_i * b0_r; \
+}
+
+/* Epilogue for the 4x4 tile: for each of the four C columns (pc0..pc3),
+   loads the existing C values, deinterleaves into real/imag, adds the
+   complex-scaled accumulators -- C_r += ar*res_r - ai*res_i and
+   C_i += ar*res_i + ai*res_r with alpha = (alpha_r, alpha_i) -- then
+   re-interleaves (ILVRL) and stores back, advancing pc0..pc3. */
+#define ZGEMM_SCALE_4X4_MSA \
+{ \
+ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r += alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i += alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \
+ \
+ PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r += alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i += alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
+ \
+ LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i += alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ dst1_r += alpha_r * res5_r; \
+ dst1_r -= alpha_i * res5_i; \
+ dst1_i += alpha_r * res5_i; \
+ dst1_i += alpha_i * res5_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); \
+ \
+ PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i += alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ dst1_r += alpha_r * res7_r; \
+ dst1_r -= alpha_i * res7_i; \
+ dst1_i += alpha_r * res7_i; \
+ dst1_i += alpha_i * res7_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \
+}
+
+/* Epilogue for the 2x4 tile: same complex alpha-scale-and-add as the 4x4
+   version (C += alpha * res, complex arithmetic done on split re/im
+   vectors) but with one vector pair per column of pc0..pc3. */
+#define ZGEMM_SCALE_2X4_MSA \
+{ \
+ LD_DP2(pc0, 2, dst0, dst1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ LD_DP2(pc1, 2, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+ ST_DP2_INC(dst2, dst3, pc1, 2); \
+ \
+ LD_DP2(pc2, 2, dst0, dst1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i += alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ LD_DP2(pc3, 2, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i += alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst0, dst1, pc2, 2); \
+ ST_DP2_INC(dst2, dst3, pc3, 2); \
+}
+
+/* Epilogue for a 1x4 double-complex tile of C: one complex element per
+   column pointer.  Columns are processed in pairs (pc0/pc1 with res0,
+   pc2/pc3 with res1), packed so each real/imag vector holds the two
+   column values side by side.  Pointers are not advanced here; the
+   caller bumps pc0..pc3. */
+#define ZGEMM_SCALE_1X4_MSA \
+{ \
+ dst0 = LD_DP(pc0); \
+ dst1 = LD_DP(pc1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ dst2 = LD_DP(pc2); \
+ dst3 = LD_DP(pc3); \
+ \
+ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res1_r; \
+ dst0_r -= alpha_i * res1_i; \
+ dst0_i += alpha_r * res1_i; \
+ dst0_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP(dst0, pc0); \
+ ST_DP(dst1, pc1); \
+ ST_DP(dst2, pc2); \
+ ST_DP(dst3, pc3); \
+}
+
+/* Epilogue for a 4x2 double-complex tile of C: four rows in column pc0
+   (res0/res1) and four rows in column pc1 (res2/res3).  Same
+   load / de-interleave / complex-axpy / re-interleave / store pattern
+   as the other SCALE macros; pc0 and pc1 advance past the tile. */
+#define ZGEMM_SCALE_4X2_MSA \
+{ \
+ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r += alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i += alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); \
+ \
+ PCKEVOD_D2_DP(dst5, dst4, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst7, dst6, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r += alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i += alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
+}
+
+/* Epilogue for a 2x2 double-complex tile of C: column pc0 uses res0,
+   column pc1 uses res2.  Load, complex-axpy with alpha, store;
+   pc0 and pc1 advance past the tile. */
+#define ZGEMM_SCALE_2X2_MSA \
+{ \
+ LD_DP2(pc0, 2, dst0, dst1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+ \
+ LD_DP2(pc1, 2, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst3, dst2, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i += alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst2, dst3, pc1, 2); \
+}
+
+/* Epilogue for a 1x2 double-complex tile of C: the single elements of
+   columns pc0 and pc1 are packed into one real and one imag vector
+   and updated together with res0.  Pointers are not advanced. */
+#define ZGEMM_SCALE_1X2_MSA \
+{ \
+ dst0 = LD_DP(pc0); \
+ dst1 = LD_DP(pc1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP(dst0, pc0); \
+ ST_DP(dst1, pc1); \
+}
+
+/* Epilogue for a 4x1 double-complex tile of C: four rows of the single
+   column pc0, split across res0 (rows 0-1) and res1 (rows 2-3).
+   pc0 advances past the tile. */
+#define ZGEMM_SCALE_4X1_MSA \
+{ \
+ LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ PCKEVOD_D2_DP(dst3, dst2, dst1_r, dst1_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r += alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i += alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+}
+
+/* Epilogue for a 2x1 double-complex tile of C: two rows of column pc0
+   updated with res0; pc0 advances past the tile. */
+#define ZGEMM_SCALE_2X1_MSA \
+{ \
+ LD_DP2(pc0, 2, dst0, dst1); \
+ \
+ PCKEVOD_D2_DP(dst1, dst0, dst0_r, dst0_i); \
+ \
+ dst0_r += alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i += alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+}
+
+/* Scalar epilogue for a single complex element of C:
+   pc0 += (alphar + i*alphai) * (res0 + i*res1), with pc0[0] the real
+   part and pc0[1] the imaginary part. */
+#define ZGEMM_SCALE_1X1 \
+{ \
+ pc0[0] += alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] += alphar * res1; \
+ pc0[1] += alphai * res0; \
+}
+
+/* TRMM epilogue for a 4x4 double-complex tile: identical data layout to
+   ZGEMM_SCALE_4X4_MSA, but C is overwritten (=) instead of accumulated
+   (+=), as TRMM computes C = alpha*op(A)*B without reading C. */
+#define ZGEMM_TRMM_SCALE_4X4_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r = alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i = alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r = alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i = alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
+ \
+ dst0_r = alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i = alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ dst1_r = alpha_r * res5_r; \
+ dst1_r -= alpha_i * res5_i; \
+ dst1_i = alpha_r * res5_i; \
+ dst1_i += alpha_i * res5_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ dst0_r = alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i = alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ dst1_r = alpha_r * res7_r; \
+ dst1_r -= alpha_i * res7_i; \
+ dst1_i = alpha_r * res7_i; \
+ dst1_i += alpha_i * res7_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc2, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc3, 2); \
+}
+
+/* TRMM epilogue for a 2x4 tile: overwrite variant of
+   ZGEMM_SCALE_2X4_MSA (no load/accumulate of C). */
+#define ZGEMM_TRMM_SCALE_2X4_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+ ST_DP2_INC(dst2, dst3, pc1, 2); \
+ \
+ dst0_r = alpha_r * res4_r; \
+ dst0_r -= alpha_i * res4_i; \
+ dst0_i = alpha_r * res4_i; \
+ dst0_i += alpha_i * res4_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ dst0_r = alpha_r * res6_r; \
+ dst0_r -= alpha_i * res6_i; \
+ dst0_i = alpha_r * res6_i; \
+ dst0_i += alpha_i * res6_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst0, dst1, pc2, 2); \
+ ST_DP2_INC(dst2, dst3, pc3, 2); \
+}
+
+/* TRMM epilogue for a 1x4 tile: overwrite variant of
+   ZGEMM_SCALE_1X4_MSA (no load/accumulate of C). */
+#define ZGEMM_TRMM_SCALE_1X4_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ dst0_r = alpha_r * res1_r; \
+ dst0_r -= alpha_i * res1_i; \
+ dst0_i = alpha_r * res1_i; \
+ dst0_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP(dst0, pc0); \
+ ST_DP(dst1, pc1); \
+ ST_DP(dst2, pc2); \
+ ST_DP(dst3, pc3); \
+}
+
+/* TRMM epilogue for a 4x2 tile: overwrite variant of
+   ZGEMM_SCALE_4X2_MSA (no load/accumulate of C). */
+#define ZGEMM_TRMM_SCALE_4X2_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r = alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i = alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ dst1_r = alpha_r * res3_r; \
+ dst1_r -= alpha_i * res3_i; \
+ dst1_i = alpha_r * res3_i; \
+ dst1_i += alpha_i * res3_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst4, dst5); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst6, dst7); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+ ST_DP4_INC(dst4, dst5, dst6, dst7, pc1, 2); \
+}
+
+/* TRMM epilogue for a 2x2 tile: overwrite variant of
+   ZGEMM_SCALE_2X2_MSA (no load/accumulate of C). */
+#define ZGEMM_TRMM_SCALE_2X2_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+ \
+ dst0_r = alpha_r * res2_r; \
+ dst0_r -= alpha_i * res2_i; \
+ dst0_i = alpha_r * res2_i; \
+ dst0_i += alpha_i * res2_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst2, dst3); \
+ \
+ ST_DP2_INC(dst2, dst3, pc1, 2); \
+}
+
+/* TRMM epilogue for a 1x2 tile: overwrite variant of
+   ZGEMM_SCALE_1X2_MSA (no load/accumulate of C). */
+#define ZGEMM_TRMM_SCALE_1X2_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP(dst0, pc0); \
+ ST_DP(dst1, pc1); \
+}
+
+/* TRMM epilogue for a 4x1 tile: overwrite variant of
+   ZGEMM_SCALE_4X1_MSA (no load/accumulate of C). */
+#define ZGEMM_TRMM_SCALE_4X1_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ dst1_r = alpha_r * res1_r; \
+ dst1_r -= alpha_i * res1_i; \
+ dst1_i = alpha_r * res1_i; \
+ dst1_i += alpha_i * res1_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ ILVRL_D2_DP(dst1_i, dst1_r, dst2, dst3); \
+ \
+ ST_DP4_INC(dst0, dst1, dst2, dst3, pc0, 2); \
+}
+
+/* TRMM epilogue for a 2x1 tile: overwrite variant of
+   ZGEMM_SCALE_2X1_MSA (no load/accumulate of C). */
+#define ZGEMM_TRMM_SCALE_2X1_MSA \
+{ \
+ dst0_r = alpha_r * res0_r; \
+ dst0_r -= alpha_i * res0_i; \
+ dst0_i = alpha_r * res0_i; \
+ dst0_i += alpha_i * res0_r; \
+ \
+ ILVRL_D2_DP(dst0_i, dst0_r, dst0, dst1); \
+ \
+ ST_DP2_INC(dst0, dst1, pc0, 2); \
+}
+
+/* Scalar TRMM epilogue: pc0 = (alphar + i*alphai) * (res0 + i*res1),
+   overwriting the existing C element. */
+#define ZGEMM_TRMM_SCALE_1X1 \
+{ \
+ pc0[0] = alphar * res0; \
+ pc0[0] -= alphai * res1; \
+ pc0[1] = alphar * res1; \
+ pc0[1] += alphai * res0; \
+}
+
+/* Double-complex GEMM/TRMM micro-kernel (MIPS MSA), 4x4 register tile.
+ *
+ * Computes C += alpha * op(A) * op(B) on packed panels (or C = ... when
+ * compiled with TRMMKERNEL).  Parameters:
+ *   m, n, k          - dimensions of the packed panels
+ *   alphar, alphai   - real and imaginary parts of alpha
+ *   A, B             - packed input panels (interleaved re/im pairs)
+ *   C, ldc           - output matrix and its leading dimension (in
+ *                      complex elements; byte stride is 2*ldc doubles)
+ *   offset           - TRMM only: diagonal offset used to limit the
+ *                      k-range per tile and to step pa0/pb0 past the
+ *                      triangular part
+ *
+ * The four sign arguments passed to ZGEMM_KERNEL_* select the
+ * conjugation variant (NN/NR/RN/RR families) at compile time; the
+ * first invocation per tile initializes the accumulators (assignment
+ * form), the loop body accumulates.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alphar, FLOAT alphai,
+ FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc
+#ifdef TRMMKERNEL
+ , BLASLONG offset
+#endif
+ )
+{
+ BLASLONG i, j, l, temp;
+#if defined(TRMMKERNEL)
+ BLASLONG off;
+#endif
+ FLOAT *pc0, *pc1, *pc2, *pc3, *pa0, *pb0;
+ FLOAT res0, res1, a0_r, a0_i, b0_r, b0_i;
+ v2f64 src_a0, src_a1, src_a2, src_a3, src_b0, src_b1, src_b2, src_b3;
+ v2f64 src_a0r, src_a0i, src_a1r, src_a1i, src_br, src_bi;
+ v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v2f64 dst0_r, dst0_i, dst1_r, dst1_i, alpha_r, alpha_i;
+ v2f64 res0_r, res0_i, res1_r, res1_i, res2_r, res2_i, res3_r, res3_i;
+ v2f64 res4_r, res4_i, res5_r, res5_i, res6_r, res6_i, res7_r, res7_i;
+
+ /* splat alpha into vector registers for the SIMD epilogues */
+ alpha_r = COPY_DOUBLE_TO_VECTOR(alphar);
+ alpha_i = COPY_DOUBLE_TO_VECTOR(alphai);
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off = -offset;
+#endif
+
+ /* process C four columns at a time */
+ for (j = (n >> 2); j--;)
+ {
+ pc0 = C;
+ pc1 = pc0 + 2 * ldc;
+ pc2 = pc1 + 2 * ldc;
+ pc3 = pc2 + 2 * ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 2); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X4_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X4_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_4X4_MSA
+#else
+ ZGEMM_SCALE_4X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X4_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X4_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_2X4_MSA
+#else
+ ZGEMM_SCALE_2X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 4;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 4; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X4_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X4_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X4_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X4_MSA(, -, , -, -);
+#endif
+
+ pa0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X4_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X4_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X4_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X4_MSA(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_1X4_MSA
+#else
+ ZGEMM_SCALE_1X4_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 4; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 4;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ pc2 += 2;
+ pc3 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 4; // number of values in B
+#endif
+
+ l = k << 3;
+ B = B + l;
+ i = ldc << 3;
+ C = C + i;
+ }
+
+ /* two remaining columns */
+ if (n & 2)
+ {
+ pc0 = C;
+ pc1 = pc0 + 2 * ldc;
+
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 2); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X2_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X2_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_4X2_MSA
+#else
+ ZGEMM_SCALE_4X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X2_MSA(, -, , -, -);
+#endif
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X2_MSA(+, -, -, -,);
+#endif
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_2X2_MSA
+#else
+ ZGEMM_SCALE_2X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 2; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X2_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X2_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X2_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X2_MSA(, -, , -, -);
+#endif
+
+ pa0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X2_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X2_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X2_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X2_MSA(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_1X2_MSA
+#else
+ ZGEMM_SCALE_1X2_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 2; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 2;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ pc1 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 2; // number of values in B
+#endif
+
+ l = k << 2;
+ B = B + l;
+ i = ldc << 2;
+ C = C + i;
+ }
+
+ /* one remaining column */
+ if (n & 1)
+ {
+ pc0 = C;
+ pa0 = A;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+ off = offset;
+#endif
+
+ for (i = (m >> 2); i--;)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 4;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 4; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X1_MSA(, -, , -, -);
+#endif
+
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_4X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_4X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_4X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_4X1_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_4X1_MSA
+#else
+ ZGEMM_SCALE_4X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 4; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 4;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 4; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 2)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 2;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 2; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X1_MSA(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X1_MSA(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X1_MSA(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X1_MSA(, -, , -, -);
+#endif
+
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_2X1_MSA(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_2X1_MSA(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_2X1_MSA(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_2X1_MSA(+, -, -, -,);
+#endif
+
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_2X1_MSA
+#else
+ ZGEMM_SCALE_2X1_MSA
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 2; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 2;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 2; // number of values in A
+#endif
+#endif
+ }
+
+ if (m & 1)
+ {
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ pb0 = B;
+#else
+ pa0 += off * 2 * 1;
+ pb0 = B + off * 2 * 1;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ temp = k - off;
+#elif defined(LEFT)
+ temp = off + 1; // number of values in A
+#else
+ temp = off + 1; // number of values in B
+#endif
+#else
+ pb0 = B;
+ temp = k;
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X1(, -, , +, +);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X1(, +, , +, -);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X1(, +, , -, +);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X1(, -, , -, -);
+#endif
+
+ pa0 += 2;
+ pb0 += 2;
+
+ for (l = (temp - 1); l--;)
+ {
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ ZGEMM_KERNEL_1X1(+, -, +, +,);
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+ ZGEMM_KERNEL_1X1(+, +, -, +,);
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+ ZGEMM_KERNEL_1X1(+, +, +, -,);
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ ZGEMM_KERNEL_1X1(+, -, -, -,);
+#endif
+
+ pa0 += 2;
+ pb0 += 2;
+ }
+
+#if defined(TRMMKERNEL)
+ ZGEMM_TRMM_SCALE_1X1
+#else
+ ZGEMM_SCALE_1X1
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ temp = k - off;
+#ifdef LEFT
+ temp -= 1; // number of values in A
+#else
+ temp -= 1; // number of values in B
+#endif
+ pa0 += temp * 2 * 1;
+ pb0 += temp * 2 * 1;
+#endif
+
+#ifdef LEFT
+ off += 1; // number of values in A
+#endif
+#endif
+
+ pc0 += 2;
+ }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ off += 1; // number of values in B
+#endif
+
+ l = k << 1;
+ B = B + l;
+ i = ldc << 1;
+ C = C + i;
+ }
+ return 0;
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Packs a double-complex matrix panel for the GEMM kernel (ncopy).
+ *
+ * src is column-major with leading dimension lda (in complex elements;
+ * internally scaled to doubles by lda *= 2).  Columns are taken four at
+ * a time; within a panel the four column elements of each row are
+ * written consecutively to dst, so the kernel can stream one row of
+ * the panel per load.  Column remainders of 2 and 1 are packed the
+ * same way with narrower interleaving; row remainders of 2 and 1 are
+ * handled after each vectorized m-loop.
+ */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst;
+ v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+ v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+
+ psrc0 = src;
+ pdst = dst;
+ lda *= 2; /* complex elements -> doubles */
+
+ /* panels of four columns */
+ for (j = (n >> 2); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc3 = psrc2 + lda;
+ psrc4 = psrc3 + lda;
+ psrc0 += 4 * lda;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+ LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+ LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
+
+ /* interleave: row i cols 0..3, row i+1 cols 0..3, ... */
+ ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2);
+ ST_DP8_INC(src2, src6, src10, src14, src3, src7, src11, src15,
+ pdst, 2);
+ }
+
+ if (m & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src4, src5);
+ LD_DP2_INC(psrc3, 2, src8, src9);
+ LD_DP2_INC(psrc4, 2, src12, src13);
+
+ ST_DP8_INC(src0, src4, src8, src12, src1, src5, src9, src13, pdst, 2);
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_DP(psrc1);
+ src4 = LD_DP(psrc2);
+ src8 = LD_DP(psrc3);
+ src12 = LD_DP(psrc4);
+ psrc1 += 2;
+ psrc2 += 2;
+ psrc3 += 2;
+ psrc4 += 2;
+
+ ST_DP4_INC(src0, src4, src8, src12, pdst, 2);
+ }
+ }
+
+ /* remaining panel of two columns */
+ if (n & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc0 += 2 * lda;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+
+ ST_DP8_INC(src0, src4, src1, src5, src2, src6, src3, src7, pdst, 2);
+ }
+
+ if (m & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src4, src5);
+
+ ST_DP4_INC(src0, src4, src1, src5, pdst, 2);
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_DP(psrc1);
+ src4 = LD_DP(psrc2);
+ psrc1 += 2;
+ psrc2 += 2;
+
+ ST_DP2_INC(src0, src4, pdst, 2);
+ }
+ }
+
+ /* remaining single column: straight copy */
+ if (n & 1)
+ {
+ psrc1 = psrc0;
+
+ for (i = (m >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ ST_DP4_INC(src0, src1, src2, src3, pdst, 2);
+ }
+
+ if (m & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ ST_DP2_INC(src0, src1, pdst, 2);
+ }
+
+ if (m & 1)
+ {
+ src0 = LD_DP(psrc1);
+ ST_DP(src0, pdst);
+ }
+ }
+
+ return 0;
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Pack an m x n tile of a double-complex matrix (stride `lda`, given in
+   complex elements) into the contiguous panel layout consumed by the MSA
+   complex GEMM kernel.  Presumably this is the new zgemm tcopy-4 routine
+   referenced by the Makefile hunk (ZGEMM_DEFAULT_UNROLL_N == 4) -- TODO
+   confirm against the file name in the full patch.  FLOAT is double here:
+   each v2f64 vector holds exactly one complex double (re, im).
+   Always returns 0 (OpenBLAS copy-routine convention). */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
+{
+ BLASLONG i, j;
+ /* psrc1..psrc4: the four lda-strided source lines of the current 4-line
+    block; psrc0 tracks where the next block starts. */
+ FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
+ /* pdst1 walks the full 4-column panels (pdst0 = within-panel offset of
+    the current line block); pdst2/pdst3 fill the 2-column and 1-column
+    tail regions whose bases are computed below. */
+ FLOAT *pdst0, *pdst1, *pdst2, *pdst3;
+ v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
+ v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
+
+ psrc0 = src;
+ pdst0 = dst;
+ /* Convert the stride from complex elements to FLOAT units (2 per cplx). */
+ lda *= 2;
+
+ /* Each output column occupies 2*m FLOATs, so the 2-wide tail panel
+    starts after the (n & ~3) full columns and the 1-wide tail after the
+    (n & ~1) columns. */
+ pdst2 = dst + 2 * m * (n & ~3);
+ pdst3 = dst + 2 * m * (n & ~1);
+
+ /* Main loop: process m in blocks of 4 lines. */
+ for (j = (m >> 2); j--;)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc3 = psrc2 + lda;
+ psrc4 = psrc3 + lda;
+ psrc0 += 4 * lda;
+
+ /* Advance the within-panel destination by one 4x4 complex block
+    (4 lines x 4 cols x 2 FLOATs = 32 FLOATs). */
+ pdst1 = pdst0;
+ pdst0 += 32;
+
+ for (i = (n >> 2); i--;)
+ {
+ /* Load a 4x4 block of complex doubles (stride 2 FLOATs = 1 cplx),
+    store its 32 FLOATs, then jump to the same block position in the
+    next 4-column panel, m*8 FLOATs away (m lines x 4 cols x 2). */
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+ LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
+ LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
+
+ ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+ ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
+ pdst1 + 16, 2);
+ pdst1 += m * 8;
+ }
+
+ /* Two leftover columns: 4 lines x 2 cols into the 2-wide tail panel. */
+ if (n & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+ LD_DP2_INC(psrc3, 2, src4, src5);
+ LD_DP2_INC(psrc4, 2, src6, src7);
+
+ ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
+ }
+
+ /* Last leftover column: 4 lines x 1 col into the 1-wide tail panel. */
+ if (n & 1)
+ {
+ src0 = LD_DP(psrc1);
+ src1 = LD_DP(psrc2);
+ src2 = LD_DP(psrc3);
+ src3 = LD_DP(psrc4);
+ psrc1 += 2;
+ psrc2 += 2;
+ psrc3 += 2;
+ psrc4 += 2;
+
+ ST_DP4_INC(src0, src1, src2, src3, pdst3, 2);
+ }
+ }
+
+ /* Leftover pair of lines: same panel walk with 2 lines per block
+    (within-panel advance is 2 x 4 x 2 = 16 FLOATs). */
+ if (m & 2)
+ {
+ psrc1 = psrc0;
+ psrc2 = psrc1 + lda;
+ psrc0 += 2 * lda;
+
+ pdst1 = pdst0;
+ pdst0 += 16;
+
+ for (i = (n >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
+
+ ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
+
+ pdst1 += m * 8;
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ LD_DP2_INC(psrc2, 2, src2, src3);
+
+ ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
+ }
+
+ if (n & 1)
+ {
+ src0 = LD_DP(psrc1);
+ src1 = LD_DP(psrc2);
+
+ ST_DP2_INC(src0, src1, pdst3, 2);
+
+ psrc1 += 2;
+ psrc2 += 2;
+ }
+ }
+
+ /* Final single line, if any. */
+ if (m & 1)
+ {
+ psrc1 = psrc0;
+ pdst1 = pdst0;
+
+ for (i = (n >> 2); i--;)
+ {
+ LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
+ ST_DP4(src0, src1, src2, src3, pdst1, 2);
+
+ pdst1 += m * 8;
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(psrc1, 2, src0, src1);
+ ST_DP2_INC(src0, src1, pdst2, 2);
+ }
+
+ if (n & 1)
+ {
+ src0 = LD_DP(psrc1);
+ /* End of data: no need to bump psrc1 past the last element. */
+ ST_DP(src0, pdst3);
+ }
+ }
+
+ return 0;
+}
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
-
-#define ZGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
-#define ZGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 128