From edb5980c1366f528645361b55b7e90758fa5949c Mon Sep 17 00:00:00 2001 From: Kaustubh Raste Date: Mon, 9 May 2016 15:15:26 +0530 Subject: [PATCH] DTRSM optimization for MIPS P5600 and I6400 using MSA Signed-off-by: Kaustubh Raste --- CONTRIBUTORS.md | 3 + kernel/mips/KERNEL.P5600 | 8 +- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 1416 +++++++++++++++++++++++++++++++++ kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 1397 ++++++++++++++++++++++++++++++++ kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 963 ++++++++++++++++++++++ kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 866 ++++++++++++++++++++ 6 files changed, 4649 insertions(+), 4 deletions(-) create mode 100644 kernel/mips/dtrsm_kernel_LN_8x4_msa.c create mode 100644 kernel/mips/dtrsm_kernel_LT_8x4_msa.c create mode 100644 kernel/mips/dtrsm_kernel_RN_8x4_msa.c create mode 100644 kernel/mips/dtrsm_kernel_RT_8x4_msa.c diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 999413b..a13308f 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -157,3 +157,6 @@ In chronological order: * Shivraj Patil * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA + +* Kaustubh Raste + * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index d215752..0ac30d7 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -118,10 +118,10 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff 
--git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c new file mode 100644 index 0000000..9f0eb95 --- /dev/null +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -0,0 +1,1416 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17; + v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33; + v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43; + v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52; + v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60; + v2f64 src_a61, src_a62, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk > 0) + { + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pbb, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + pba += 8; + pbb += 4; + + LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17); + LD_DP2(pbb, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) 
src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a8; + src_a1 = src_a9; + src_a2 = src_a16; + src_a3 = src_a17; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + } + + a -= 64; + b -= 32; + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); + res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8); + res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8); + res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9); + res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9); + res_c12 = (v2f64) 
__msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10); + res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10); + res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11); + res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11); + + src_a54 = __msa_cast_to_vector_double(*(a + 54)); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a62 = LD_DP(a + 62); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); + src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); + src_a60 = LD_DP(a + 60); + src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1); + src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0); + src_a52 = LD_DP(a + 52); + src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); + src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); + src_a44 = LD_DP(a + 44); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); + src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); + src_a36 = __msa_cast_to_vector_double(*(a + 36)); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + + res_c7 *= src_a63; + res_c6 -= res_c7 * src_a62; + res_c6 *= src_a54; + + res_c15 *= src_a63; + res_c14 -= res_c15 * src_a62; + res_c14 *= src_a54; + + ST_DP(res_c7, b + 28); + ST_DP(res_c6, b + 24); + ST_DP(res_c15, b + 30); + ST_DP(res_c14, b + 26); + src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14); + src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14); + ST_DP(src_c3, c + 6); + ST_DP(src_c7, c_nxt1line + 6); + ST_DP(src_c11, c_nxt2line + 6); + ST_DP(src_c15, c_nxt3line + 6); + + res_c5 -= res_c7 * src_a61; + res_c5 -= res_c6 * src_a53; + res_c5 *= src_a45; + + res_c4 -= res_c7 * src_a60; + res_c4 -= res_c6 * src_a52; + res_c4 -= res_c5 * src_a44; + res_c4 *= src_a36; + + res_c13 -= res_c15 * src_a61; + res_c13 -= res_c14 * src_a53; + res_c13 *= src_a45; + + res_c12 -= res_c15 * src_a60; + 
res_c12 -= res_c14 * src_a52; + res_c12 -= res_c13 * src_a44; + res_c12 *= src_a36; + + src_a56 = LD_DP(a + 56); + src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1); + src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0); + src_a58 = LD_DP(a + 58); + src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1); + src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0); + + ST_DP(res_c4, b + 16); + ST_DP(res_c5, b + 20); + ST_DP(res_c12, b + 18); + ST_DP(res_c13, b + 22); + + src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); + src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12); + src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12); + ST_DP(src_c2, c + 4); + ST_DP(src_c6, c_nxt1line + 4); + ST_DP(src_c10, c_nxt2line + 4); + ST_DP(src_c14, c_nxt3line + 4); + + src_a50 = LD_DP(a + 50); + src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1); + src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0); + src_a42 = LD_DP(a + 42); + src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1); + src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0); + src_a34 = LD_DP(a + 34); + src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); + src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); + src_a26 = LD_DP(a + 26); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); + src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); + src_a18 = __msa_cast_to_vector_double(*(a + 18)); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + + res_c3 -= res_c7 * src_a59; + res_c2 -= res_c7 * src_a58; + res_c1 -= res_c7 * src_a57; + res_c0 -= res_c7 * src_a56; + + res_c11 -= res_c15 * src_a59; + res_c10 -= res_c15 * src_a58; + res_c9 -= res_c15 * src_a57; + res_c8 -= res_c15 * src_a56; + + res_c3 -= res_c6 * src_a51; + res_c3 -= res_c5 * src_a43; + res_c3 -= res_c4 * src_a35; + res_c3 *= src_a27; + + res_c2 -= res_c6 * src_a50; + res_c2 -= res_c5 * src_a42; + res_c2 -= res_c4 * src_a34; + res_c2 -= 
res_c3 * src_a26; + res_c2 *= src_a18; + + res_c11 -= res_c14 * src_a51; + res_c11 -= res_c13 * src_a43; + res_c11 -= res_c12 * src_a35; + res_c11 *= src_a27; + + res_c10 -= res_c14 * src_a50; + res_c10 -= res_c13 * src_a42; + res_c10 -= res_c12 * src_a34; + res_c10 -= res_c11 * src_a26; + res_c10 *= src_a18; + + src_a48 = LD_DP(a + 48); + src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1); + src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0); + src_a40 = LD_DP(a + 40); + src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); + src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0); + + ST_DP(res_c2, b + 8); + ST_DP(res_c3, b + 12); + ST_DP(res_c10, b + 10); + ST_DP(res_c11, b + 14); + + src_a32 = LD_DP(a + 32); + src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1); + src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0); + src_a24 = LD_DP(a + 24); + src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1); + src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0); + + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10); + src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10); + ST_DP(src_c1, c + 2); + ST_DP(src_c5, c_nxt1line + 2); + ST_DP(src_c9, c_nxt2line + 2); + ST_DP(src_c13, c_nxt3line + 2); + + res_c1 -= res_c6 * src_a49; + res_c1 -= res_c5 * src_a41; + res_c1 -= res_c4 * src_a33; + res_c1 -= res_c3 * src_a25; + + res_c0 -= res_c6 * src_a48; + res_c0 -= res_c5 * src_a40; + res_c0 -= res_c4 * src_a32; + res_c0 -= res_c3 * src_a24; + + res_c9 -= res_c14 * src_a49; + res_c9 -= res_c13 * src_a41; + res_c9 -= res_c12 * src_a33; + res_c9 -= res_c11 * src_a25; + + res_c8 -= res_c14 * src_a48; + res_c8 -= res_c13 * src_a40; + res_c8 -= res_c12 * src_a32; + res_c8 -= res_c11 * src_a24; + + src_a16 = LD_DP(a + 16); + src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); + src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 
0); + src_a8 = LD_DP(a + 8); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); + src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); + src_a0 = __msa_cast_to_vector_double(*(a + 0)); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + + res_c1 -= res_c2 * src_a17; + res_c1 *= src_a9; + + res_c9 -= res_c10 * src_a17; + res_c9 *= src_a9; + + res_c0 -= res_c2 * src_a16; + res_c0 -= res_c1 * src_a8; + res_c0 *= src_a0; + + res_c8 -= res_c10 * src_a16; + res_c8 -= res_c9 * src_a8; + res_c8 *= src_a0; + + ST_DP(res_c0, b + 0); + ST_DP(res_c8, b + 2); + ST_DP(res_c1, b + 4); + ST_DP(res_c9, b + 6); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8); + src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8); + + ST_DP(src_c0, c); + ST_DP(src_c4, c_nxt1line); + ST_DP(src_c8, c_nxt2line); + ST_DP(src_c12, c_nxt3line); +} + +static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a8, src_a9, src_a16, src_a17; + v2f64 src_a18, src_a24, src_a25, src_a26, src_a27, src_a32, src_a33; + v2f64 src_a34, src_a35, src_a36, src_a40, src_a41, src_a42, src_a43; + v2f64 src_a44, src_a45, src_a48, src_a49, src_a50, src_a51, src_a52; + v2f64 src_a53, src_a54, src_a56, src_a57, src_a58, src_a59, src_a60; + v2f64 src_a61, src_a62, src_a63; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b0, src_b1; + + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pbb); + + for (i = bk - 1; i--;) + { + pba += 8; + pbb += 2; + + LD_DP4(pba, 2, src_a8, src_a9, src_a16, src_a17); + 
src_b1 = LD_DP(pbb); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_a0 = src_a8; + src_a1 = src_a9; + src_a2 = src_a16; + src_a3 = src_a17; + src_b0 = src_b1; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); + res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + + src_a56 = LD_DP(a - 8); + src_a57 = (v2f64) __msa_splati_d((v2i64) src_a56, 1); + src_a56 = (v2f64) __msa_splati_d((v2i64) src_a56, 0); + src_a58 = LD_DP(a - 6); + src_a59 = (v2f64) __msa_splati_d((v2i64) src_a58, 1); + src_a58 = (v2f64) __msa_splati_d((v2i64) src_a58, 0); + src_a60 = LD_DP(a - 4); + src_a61 = (v2f64) __msa_splati_d((v2i64) src_a60, 1); + src_a60 = (v2f64) __msa_splati_d((v2i64) src_a60, 0); + src_a62 = LD_DP(a - 2); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); + src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); + + res_c7 *= src_a63; + res_c6 -= res_c7 * src_a62; + res_c5 -= res_c7 * src_a61; 
+ res_c4 -= res_c7 * src_a60; + res_c3 -= res_c7 * src_a59; + res_c2 -= res_c7 * src_a58; + res_c1 -= res_c7 * src_a57; + res_c0 -= res_c7 * src_a56; + + src_a48 = LD_DP(a - 16); + src_a49 = (v2f64) __msa_splati_d((v2i64) src_a48, 1); + src_a48 = (v2f64) __msa_splati_d((v2i64) src_a48, 0); + src_a50 = LD_DP(a - 14); + src_a51 = (v2f64) __msa_splati_d((v2i64) src_a50, 1); + src_a50 = (v2f64) __msa_splati_d((v2i64) src_a50, 0); + src_a52 = LD_DP(a - 12); + src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); + src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); + src_a54 = __msa_cast_to_vector_double(*(a - 10)); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + + src_a40 = LD_DP(a - 24); + src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); + src_a40 = (v2f64) __msa_splati_d((v2i64) src_a40, 0); + src_a42 = LD_DP(a - 22); + src_a43 = (v2f64) __msa_splati_d((v2i64) src_a42, 1); + src_a42 = (v2f64) __msa_splati_d((v2i64) src_a42, 0); + src_a44 = LD_DP(a - 20); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); + src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); + + res_c6 *= src_a54; + res_c5 -= res_c6 * src_a53; + res_c4 -= res_c6 * src_a52; + res_c3 -= res_c6 * src_a51; + res_c2 -= res_c6 * src_a50; + res_c1 -= res_c6 * src_a49; + res_c0 -= res_c6 * src_a48; + + res_c5 *= src_a45; + res_c4 -= res_c5 * src_a44; + res_c3 -= res_c5 * src_a43; + res_c2 -= res_c5 * src_a42; + res_c1 -= res_c5 * src_a41; + res_c0 -= res_c5 * src_a40; + + ST_DP(res_c7, b - 2); + ST_DP(res_c6, b - 4); + ST_DP(res_c5, b - 6); + + src_a32 = LD_DP(a - 32); + src_a33 = (v2f64) __msa_splati_d((v2i64) src_a32, 1); + src_a32 = (v2f64) __msa_splati_d((v2i64) src_a32, 0); + src_a34 = LD_DP(a - 30); + src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); + src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); + src_a36 = __msa_cast_to_vector_double(*(a - 28)); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + + res_c4 *= src_a36; + res_c3 -= res_c4 * src_a35; + res_c2 -= 
res_c4 * src_a34; + res_c1 -= res_c4 * src_a33; + res_c0 -= res_c4 * src_a32; + + src_a24 = LD_DP(a - 40); + src_a25 = (v2f64) __msa_splati_d((v2i64) src_a24, 1); + src_a24 = (v2f64) __msa_splati_d((v2i64) src_a24, 0); + src_a26 = LD_DP(a - 38); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); + src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); + src_a16 = LD_DP(a - 48); + src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); + src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); + src_a18 = __msa_cast_to_vector_double(*(a - 46)); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a0 = __msa_cast_to_vector_double(*(a - 64)); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = LD_DP(a - 56); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); + src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); + + res_c3 *= src_a27; + res_c2 -= res_c3 * src_a26; + res_c1 -= res_c3 * src_a25; + res_c0 -= res_c3 * src_a24; + + res_c2 *= src_a18; + res_c1 -= res_c2 * src_a17; + res_c0 -= res_c2 * src_a16; + + res_c1 *= src_a9; + res_c0 -= res_c1 * src_a8; + + res_c0 *= src_a0; + + ST_DP(res_c4, b - 8); + ST_DP(res_c3, b - 10); + ST_DP(res_c2, b - 12); + ST_DP(res_c1, b - 14); + ST_DP(res_c0, b - 16); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); + src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); +} + +static void dsolve_8x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a8, a9, a16, a17, a18, a24, a25, a26, 
a27, a32, a33, a34, a35; + FLOAT a36, a40, a41, a42, a43, a44, a45, a48, a49, a50, a51, a52, a53; + FLOAT a54, a56, a57, a58, a59, a60, a61, a62, a63; + FLOAT c0, c1, c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + if (bk > 0) + { + int i; + FLOAT *aa = a, *bb = b; + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0; + + for (i = bk; i--; ) + { + a0 = aa[0]; + a1 = aa[1]; + a2 = aa[2]; + a3 = aa[3]; + a4 = aa[4]; + a5 = aa[5]; + a6 = aa[6]; + a7 = aa[7]; + + b0 = bb[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + c2 -= a2 * b0; + c3 -= a3 * b0; + c4 -= a4 * b0; + c5 -= a5 * b0; + c6 -= a6 * b0; + c7 -= a7 * b0; + + aa += 8; + bb += 1; + } + } + + a -= 64; + b -= 8; + + a0 = *(a + 0); + a8 = *(a + 8); + a9 = *(a + 9); + a16 = *(a + 16); + a17 = *(a + 17); + a18 = *(a + 18); + a24 = *(a + 24); + a25 = *(a + 25); + a26 = *(a + 26); + a27 = *(a + 27); + a32 = *(a + 32); + a33 = *(a + 33); + a34 = *(a + 34); + a35 = *(a + 35); + a36 = *(a + 36); + a40 = *(a + 40); + a41 = *(a + 41); + a42 = *(a + 42); + a43 = *(a + 43); + a44 = *(a + 44); + a45 = *(a + 45); + a48 = *(a + 48); + a49 = *(a + 49); + a50 = *(a + 50); + a51 = *(a + 51); + a52 = *(a + 52); + a53 = *(a + 53); + a54 = *(a + 54); + a56 = *(a + 56); + a57 = *(a + 57); + a58 = *(a + 58); + a59 = *(a + 59); + a60 = *(a + 60); + a61 = *(a + 61); + a62 = *(a + 62); + a63 = *(a + 63); + + c7 *= a63; + + c6 -= c7 * a62; + c6 *= a54; + + c5 -= c7 * a61; + c5 -= c6 * a53; + c5 *= a45; + + c4 -= c7 * a60; + c4 -= c6 * a52; + c4 -= c5 * a44; + c4 *= a36; + + c3 -= c7 * a59; + c3 -= c6 * a51; + c3 -= c5 * a43; + c3 -= c4 * a35; + c3 *= a27; + + c2 -= c7 * a58; + c2 -= c6 * a50; + c2 -= c5 * a42; + c2 -= c4 * a34; + c2 -= c3 * a26; + c2 *= a18; + + c1 -= c7 * a57; + c1 -= c6 * a49; + c1 -= c5 * a41; + c1 -= c4 * a33; + c1 -= c3 * a25; + c1 -= c2 * a17; + c1 *= a9; + + c0 -= c7 * a56; + c0 -= c6 * a48; + c0 -= c5 * a40; + c0 
-= c4 * a32; + c0 -= c3 * a24; + c0 -= c2 * a16; + c0 -= c1 * a8; + c0 *= a0; + + *(b + 7) = c7; + *(b + 6) = c6; + *(b + 5) = c5; + *(b + 4) = c4; + *(b + 3) = c3; + *(b + 2) = c2; + *(b + 1) = c1; + *(b + 0) = c0; + + *(c + 7) = c7; + *(c + 6) = c6; + *(c + 5) = c5; + *(c + 4) = c4; + *(c + 3) = c3; + *(c + 2) = c2; + *(c + 1) = c1; + *(c + 0) = c0; +} + +static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13; + v2f64 src_a14, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 16, *bb = b + 16; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + LD_DP2(bb, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + aa += 4; + bb += 4; + } + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4); + res_c6 = (v2f64) 
__msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5); + + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + + src_a9 = LD_DP(a + 9); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + + src_a8 = __msa_cast_to_vector_double(*(a + 8)); + src_a0 = __msa_cast_to_vector_double(*(a + 0)); + + src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + + res_c3 *= src_a15; + res_c7 *= src_a15; + + res_c2 -= res_c3 * src_a14; + res_c6 -= res_c7 * src_a14; + res_c2 *= src_a10; + res_c6 *= src_a10; + + res_c1 -= res_c3 * src_a13; + res_c5 -= res_c7 * src_a13; + res_c1 -= res_c2 * src_a9; + res_c5 -= res_c6 * src_a9; + res_c1 *= src_a5; + res_c5 *= src_a5; + + res_c0 -= res_c3 * src_a12; + res_c4 -= res_c7 * src_a12; + res_c0 -= res_c2 * src_a8; + res_c4 -= res_c6 * src_a8; + res_c0 -= res_c1 * src_a4; + res_c4 -= res_c5 * src_a4; + res_c0 *= src_a0; + res_c4 *= src_a0; + + ST_DP(res_c7, b + 14); + ST_DP(res_c3, b + 12); + ST_DP(res_c6, b + 10); + ST_DP(res_c2, b + 8); + ST_DP(res_c5, b + 6); + ST_DP(res_c1, b + 4); + ST_DP(res_c4, b + 2); + ST_DP(res_c0, b + 0); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c6 
= (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); + src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); +} + +static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v2f64 src_a0, src_a4, src_a5, src_a8, src_a9, src_a10, src_a12, src_a13; + v2f64 src_a14, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 16, *bb = b + 8; + v2f64 src_a0, src_a1, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + src_b0 = LD_DP(bb); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + aa += 4; + bb += 2; + } + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + + src_a9 = LD_DP(a + 9); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + + src_a8 = __msa_cast_to_vector_double(*(a + 8)); + src_a0 = __msa_cast_to_vector_double(*(a + 0)); + + src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + + src_a4 = LD_DP(a + 
4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + + res_c3 *= src_a15; + + res_c2 -= res_c3 * src_a14; + res_c2 *= src_a10; + + res_c1 -= res_c3 * src_a13; + res_c1 -= res_c2 * src_a9; + res_c1 *= src_a5; + + res_c0 -= res_c3 * src_a12; + res_c0 -= res_c2 * src_a8; + res_c0 -= res_c1 * src_a4; + res_c0 *= src_a0; + + ST_DP(res_c3, b + 6); + ST_DP(res_c2, b + 4); + ST_DP(res_c1, b + 2); + ST_DP(res_c0, b + 0); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); +} + +static void dsolve_4x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a4, a5, a8, a9, a10, a12, a13, a14, a15; + FLOAT c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 16, *bb = b + 4; + FLOAT a0, a1, a2, a3, b0; + + for (i = bk; i--;) + { + a0 = aa[0]; + a1 = aa[1]; + a2 = aa[2]; + a3 = aa[3]; + + b0 = bb[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + c2 -= a2 * b0; + c3 -= a3 * b0; + + aa += 4; + bb += 1; + } + } + + a0 = *(a + 0); + a4 = *(a + 4); + a5 = *(a + 5); + a8 = *(a + 8); + a9 = *(a + 9); + a10 = *(a + 10); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + + c3 *= a15; + + c2 -= c3 * a14; + c2 *= a10; + + c1 -= c3 * a13; + c1 -= c2 * a9; + c1 *= a5; + + c0 -= c3 * a12; + c0 -= c2 * a8; + c0 -= c1 * a4; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, 
c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 4, *bb = b + 8; + FLOAT a0, a1, b0, b1, b2, b3; + + for (i = bk; i--;) + { + a0 = aa[0]; + a1 = aa[1]; + + b0 = bb[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = bb[1]; + c0_nxt1 -= a0 * b1; + c1_nxt1 -= a1 * b1; + + b2 = bb[2]; + c0_nxt2 -= a0 * b2; + c1_nxt2 -= a1 * b2; + + b3 = bb[3]; + c0_nxt3 -= a0 * b3; + c1_nxt3 -= a1 * b3; + + aa += 2; + bb += 4; + } + } + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c0 -= c1 * a2; + c0 *= a0; + + c1_nxt1 *= a3; + c0_nxt1 -= c1_nxt1 * a2; + c0_nxt1 *= a0; + + c1_nxt2 *= a3; + c0_nxt2 -= c1_nxt2 * a2; + c0_nxt2 *= a0; + + c1_nxt3 *= a3; + c0_nxt3 -= c1_nxt3 * a2; + c0_nxt3 *= a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 4, *bb = b + 4; + FLOAT a0, a1, b0, b1; + + for (i = bk; i--;) + { + a0 = aa[0]; + a1 = aa[1]; + + b0 = bb[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = bb[1]; + c0_nxt -= a0 * b1; + c1_nxt -= a1 * b1; + + aa += 2; + bb += 2; + } + } + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + + c0 -= c1 * a2; + c0 *= a0; + + c1_nxt *= a3; + + c0_nxt -= c1_nxt * a2; + c0_nxt *= 
a0; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a2, a3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0; + FLOAT *aa = a + 4, *bb = b + 2; + + for (i = bk; i--;) + { + a0 = aa[0]; + a1 = aa[1]; + + b0 = bb[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + aa += 2; + bb += 1; + } + } + + a0 = *(a + 0); + a2 = *(a + 2); + a3 = *(a + 3); + + c1 *= a3; + c0 -= c1 * a2; + c0 *= a0; + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + + a0 = *a; + c0 = *(c + 0); + c0_nxt1 = *(c + 1 * ldc); + c0_nxt2 = *(c + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 1, *bb = b + 4; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + + aa += 1; + bb += 4; + } + } + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c0_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 3 * ldc) = c0_nxt3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; +} + +static void dsolve_1x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) +{ + *c *= *a; + *(c + ldc) = *a * *(c + ldc); + + *b = *c; + *(b + 1) = *(c + ldc); +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG kk, i, j; + FLOAT *aa, *bb, *cc; + + for (j = (n >> 2); j--;) + { + kk = m; + + if (m & 7) + { + if (m & 1) + { + aa = a + (m - 1) * k + kk; + bb = b + 4 * kk; + cc = c + (m - 1); + + dsolve_1x4_ln_msa(aa - 1, bb - 4, cc, 
ldc, k - kk); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & -2) - 2) * k + 2 * kk; + bb = b + 4 * kk; + cc = c + ((m & -2) - 2); + + dsolve_2x4_ln_msa(aa - 4, bb - 8, cc, ldc, k - kk); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & -4) - 4) * k + 4 * kk; + bb = b + 4 * kk; + cc = c + ((m & -4) - 4); + + dsolve_4x4_ln_msa(aa - 16, bb - 16, cc, ldc, k - kk); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & -8) - 8) * k; + cc = c + ((m & -8) - 8); + + do + { + dsolve_8x4_ln_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, k - kk); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 4 * k; + c += 4 * ldc; + } + + if (n & 3) + { + if (n & 2) + { + kk = m; + + if (m & 7) + { + if (m & 1) + { + aa = a + ((m & -1) - 1) * k; + cc = c + ((m & -1) - 1); + + dsolve_1x2_ln_msa(aa + kk - 1, b + kk * 2 - 2, cc, ldc); + + kk -= 1; + } + + if (m & 2) + { + aa = a + ((m & -2) - 2) * k; + cc = c + ((m & -2) - 2); + + dsolve_2x2_ln_msa(aa + kk * 2 - 4, b + kk * 2 - 4, cc, ldc, k - kk); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & -4) - 4) * k; + cc = c + ((m & -4) - 4); + + dsolve_4x2_ln_msa(aa + kk * 4 - 16, b + kk * 2 - 8, cc, ldc, k - kk); + + kk -= 4; + } + } + + i = (m >> 3); + if (i > 0) + { + aa = a + ((m & -8) - 8) * k; + cc = c + ((m & -8) - 8); + + do + { + dsolve_8x2_ln_msa(aa + kk * 8, b + kk * 2, cc, ldc, k - kk); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + + b += 2 * k; + c += 2 * ldc; + } + + if (n & 1) + { + kk = m; + + if (m & 7) + { + if (m & 1) + { + kk -= 1; + aa = a + ((m & -1) - 1) * k + kk; + cc = c + ((m & -1) - 1); + + *cc *= *aa; + *(b + kk) = *cc; + } + + if (m & 2) + { + aa = a + ((m & -2) - 2) * k + kk * 2; + cc = c + ((m & -2) - 2); + + dsolve_2x1_ln_msa(aa - 4, b + kk - 2, cc, k - kk); + + kk -= 2; + } + + if (m & 4) + { + aa = a + ((m & -4) - 4) * k; + cc = c + ((m & -4) - 4); + + dsolve_4x1_ln_msa(aa + 4 * kk - 16, b + kk - 4, cc, k - kk); + + kk -= 4; + } + } + + 
i = (m >> 3); + if (i > 0) + { + aa = a + ((m & -8) - 8) * k; + cc = c + ((m & -8) - 8); + + do + { + dsolve_8x1_ln_msa(aa + 8 * kk, b + kk, cc, k - kk); + + aa -= 8 * k; + cc -= 8; + kk -= 8; + i --; + } while (i > 0); + } + } + } + + return 0; +} diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c new file mode 100644 index 0000000..da35aa8 --- /dev/null +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -0,0 +1,1397 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(b, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + a += 8; + b += 4; + + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + 
LD_DP2(b, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + a += 8; + b += 4; + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) 
src_c2); + res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + res_c8 = (v2f64) __msa_ilvr_d((v2i64) src_c12, (v2i64) src_c8); + res_c9 = (v2f64) __msa_ilvl_d((v2i64) src_c12, (v2i64) src_c8); + res_c10 = (v2f64) __msa_ilvr_d((v2i64) src_c13, (v2i64) src_c9); + res_c11 = (v2f64) __msa_ilvl_d((v2i64) src_c13, (v2i64) src_c9); + res_c12 = (v2f64) __msa_ilvr_d((v2i64) src_c14, (v2i64) src_c10); + res_c13 = (v2f64) __msa_ilvl_d((v2i64) src_c14, (v2i64) src_c10); + res_c14 = (v2f64) __msa_ilvr_d((v2i64) src_c15, (v2i64) src_c11); + res_c15 = (v2f64) __msa_ilvl_d((v2i64) src_c15, (v2i64) src_c11); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c7 -= res_c0 * src_a7; + + res_c8 *= src_a0; + res_c9 -= res_c8 * src_a1; + res_c10 -= res_c8 * src_a2; + res_c11 -= res_c8 * src_a3; + res_c12 -= res_c8 * src_a4; + res_c13 -= res_c8 * src_a5; + res_c14 -= res_c8 * src_a6; + res_c15 -= res_c8 * src_a7; + + src_a9 = __msa_cast_to_vector_double(*(a + 9)); + src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + 
src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + res_c1 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c7 -= res_c1 * src_a15; + + res_c9 *= src_a9; + res_c10 -= res_c9 * src_a10; + res_c11 -= res_c9 * src_a11; + res_c12 -= res_c9 * src_a12; + res_c13 -= res_c9 * src_a13; + res_c14 -= res_c9 * src_a14; + res_c15 -= res_c9 * src_a15; + + ST_DP(res_c0, b + 0); + ST_DP(res_c8, b + 2); + ST_DP(res_c1, b + 4); + ST_DP(res_c9, b + 6); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c8 = (v2f64) __msa_ilvr_d((v2i64) res_c9, (v2i64) res_c8); + src_c12 = (v2f64) __msa_ilvl_d((v2i64) res_c9, (v2i64) res_c8); + + ST_DP(src_c0, c); + ST_DP(src_c4, c_nxt1line); + ST_DP(src_c8, c_nxt2line); + ST_DP(src_c12, c_nxt3line); + + src_a18 = LD_DP(a + 18); + src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a20 = LD_DP(a + 20); + src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1); + src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0); + src_a22 = LD_DP(a + 22); + src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1); + src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0); + + res_c2 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c7 -= res_c2 * src_a23; + + res_c10 *= src_a18; + res_c11 -= res_c10 * src_a19; + res_c12 -= res_c10 * src_a20; + res_c13 -= res_c10 * src_a21; + res_c14 -= res_c10 * src_a22; + res_c15 -= res_c10 * src_a23; + + src_a27 = __msa_cast_to_vector_double(*(a + 27)); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a28 = LD_DP(a + 28); + src_a29 = 
(v2f64) __msa_splati_d((v2i64) src_a28, 1); + src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); + src_a30 = LD_DP(a + 30); + src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1); + src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0); + + res_c3 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c7 -= res_c3 * src_a31; + + res_c11 *= src_a27; + res_c12 -= res_c11 * src_a28; + res_c13 -= res_c11 * src_a29; + res_c14 -= res_c11 * src_a30; + res_c15 -= res_c11 * src_a31; + + ST_DP(res_c2, b + 8); + ST_DP(res_c10, b + 10); + ST_DP(res_c3, b + 12); + ST_DP(res_c11, b + 14); + + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + src_c9 = (v2f64) __msa_ilvr_d((v2i64) res_c11, (v2i64) res_c10); + src_c13 = (v2f64) __msa_ilvl_d((v2i64) res_c11, (v2i64) res_c10); + + src_a36 = LD_DP(a + 36); + src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a38 = LD_DP(a + 38); + src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1); + src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0); + + res_c4 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c7 -= res_c4 * src_a39; + + res_c12 *= src_a36; + res_c13 -= res_c12 * src_a37; + res_c14 -= res_c12 * src_a38; + res_c15 -= res_c12 * src_a39; + + src_a45 = __msa_cast_to_vector_double(*(a + 45)); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a46 = LD_DP(a + 46); + src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); + src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); + + res_c5 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c7 -= res_c5 * src_a47; + + res_c13 *= src_a45; + res_c14 -= res_c13 * src_a46; + res_c15 -= res_c13 * src_a47; + + ST_DP(src_c1, c + 2); + ST_DP(src_c5, c_nxt1line + 2); + ST_DP(src_c9, c_nxt2line + 2); + ST_DP(src_c13, c_nxt3line + 2); + + ST_DP(res_c4, b + 16); + 
ST_DP(res_c12, b + 18); + ST_DP(res_c5, b + 20); + ST_DP(res_c13, b + 22); + + src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); + src_c10 = (v2f64) __msa_ilvr_d((v2i64) res_c13, (v2i64) res_c12); + src_c14 = (v2f64) __msa_ilvl_d((v2i64) res_c13, (v2i64) res_c12); + + src_a63 = __msa_cast_to_vector_double(*(a + 63)); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a54 = LD_DP(a + 54); + src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + + res_c6 *= src_a54; + res_c7 -= res_c6 * src_a55; + + res_c14 *= src_a54; + res_c15 -= res_c14 * src_a55; + + res_c7 *= src_a63; + res_c15 *= src_a63; + + ST_DP(src_c2, c + 4); + ST_DP(src_c6, c_nxt1line + 4); + ST_DP(src_c10, c_nxt2line + 4); + ST_DP(src_c14, c_nxt3line + 4); + + ST_DP(res_c6, b + 24); + ST_DP(res_c14, b + 26); + ST_DP(res_c7, b + 28); + ST_DP(res_c15, b + 30); + + src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + src_c11 = (v2f64) __msa_ilvr_d((v2i64) res_c15, (v2i64) res_c14); + src_c15 = (v2f64) __msa_ilvl_d((v2i64) res_c15, (v2i64) res_c14); + + ST_DP(src_c3, c + 6); + ST_DP(src_c7, c_nxt1line + 6); + ST_DP(src_c11, c_nxt2line + 6); + ST_DP(src_c15, c_nxt3line + 6); +} + +static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18; + v2f64 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28; + v2f64 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39; + v2f64 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63; + + LD_DP4(c, 2, src_c0, src_c1, 
src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + a += 8; + b += 2; + } + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c4, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c4, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c5, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c5, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c2); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c2); + res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c3); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c3); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + src_a4 = LD_DP(a + 4); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); + src_a4 = (v2f64) __msa_splati_d((v2i64) src_a4, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + res_c4 -= res_c0 * src_a4; + res_c5 -= res_c0 * src_a5; + res_c6 -= res_c0 * src_a6; + res_c7 -= res_c0 * src_a7; + + src_a9 = __msa_cast_to_vector_double(*(a + 9)); + src_a9 = (v2f64) 
__msa_splati_d((v2i64) src_a9, 0); + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a12 = LD_DP(a + 12); + src_a13 = (v2f64) __msa_splati_d((v2i64) src_a12, 1); + src_a12 = (v2f64) __msa_splati_d((v2i64) src_a12, 0); + src_a14 = LD_DP(a + 14); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a14, 1); + src_a14 = (v2f64) __msa_splati_d((v2i64) src_a14, 0); + + res_c1 *= src_a9; + res_c2 -= res_c1 * src_a10; + res_c3 -= res_c1 * src_a11; + res_c4 -= res_c1 * src_a12; + res_c5 -= res_c1 * src_a13; + res_c6 -= res_c1 * src_a14; + res_c7 -= res_c1 * src_a15; + + src_a18 = LD_DP(a + 18); + src_a19 = (v2f64) __msa_splati_d((v2i64) src_a18, 1); + src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a20 = LD_DP(a + 20); + src_a21 = (v2f64) __msa_splati_d((v2i64) src_a20, 1); + src_a20 = (v2f64) __msa_splati_d((v2i64) src_a20, 0); + src_a22 = LD_DP(a + 22); + src_a23 = (v2f64) __msa_splati_d((v2i64) src_a22, 1); + src_a22 = (v2f64) __msa_splati_d((v2i64) src_a22, 0); + + res_c2 *= src_a18; + res_c3 -= res_c2 * src_a19; + res_c4 -= res_c2 * src_a20; + res_c5 -= res_c2 * src_a21; + res_c6 -= res_c2 * src_a22; + res_c7 -= res_c2 * src_a23; + + src_a27 = __msa_cast_to_vector_double(*(a + 27)); + src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a28 = LD_DP(a + 28); + src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); + src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); + src_a30 = LD_DP(a + 30); + src_a31 = (v2f64) __msa_splati_d((v2i64) src_a30, 1); + src_a30 = (v2f64) __msa_splati_d((v2i64) src_a30, 0); + + res_c3 *= src_a27; + res_c4 -= res_c3 * src_a28; + res_c5 -= res_c3 * src_a29; + res_c6 -= res_c3 * src_a30; + res_c7 -= res_c3 * src_a31; + + ST_DP(res_c0, b + 0); + ST_DP(res_c1, b + 2); + ST_DP(res_c2, b + 4); + ST_DP(res_c3, b + 6); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c4 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) 
res_c0); + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c5 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c4, src_c5, c + ldc, 2); + + src_a36 = LD_DP(a + 36); + src_a37 = (v2f64) __msa_splati_d((v2i64) src_a36, 1); + src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a38 = LD_DP(a + 38); + src_a39 = (v2f64) __msa_splati_d((v2i64) src_a38, 1); + src_a38 = (v2f64) __msa_splati_d((v2i64) src_a38, 0); + + res_c4 *= src_a36; + res_c5 -= res_c4 * src_a37; + res_c6 -= res_c4 * src_a38; + res_c7 -= res_c4 * src_a39; + + src_a45 = __msa_cast_to_vector_double(*(a + 45)); + src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a46 = LD_DP(a + 46); + src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); + src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); + + res_c5 *= src_a45; + res_c6 -= res_c5 * src_a46; + res_c7 -= res_c5 * src_a47; + + src_a63 = __msa_cast_to_vector_double(*(a + 63)); + src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a54 = LD_DP(a + 54); + src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); + src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + + res_c6 *= src_a54; + res_c7 -= res_c6 * src_a55; + + res_c7 *= src_a63; + + ST_DP(res_c4, b + 8); + ST_DP(res_c5, b + 10); + ST_DP(res_c6, b + 12); + ST_DP(res_c7, b + 14); + + src_c2 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); + src_c3 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + + ST_DP2(src_c2, src_c3, c + 4, 2); + ST_DP2(src_c6, src_c7, c + 4 + ldc, 2); +} + +static void dsolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18; + FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39; + FLOAT a45, a46, a47, a54, a55, a63; + FLOAT c0, c1, 
c2, c3, c4, c5, c6, c7; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + c4 = *(c + 4); + c5 = *(c + 5); + c6 = *(c + 6); + c7 = *(c + 7); + + if (bk > 0) + { + int i; + FLOAT a0, a1, a2, a3, a4, a5, a6, a7, b0; + + for (i = bk; i--; ) + { + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + c2 -= a2 * b0; + c3 -= a3 * b0; + c4 -= a4 * b0; + c5 -= a5 * b0; + c6 -= a6 * b0; + c7 -= a7 * b0; + + a += 8; + b += 1; + } + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a4 = *(a + 4); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a9 = *(a + 9); + a10 = *(a + 10); + a11 = *(a + 11); + a12 = *(a + 12); + a13 = *(a + 13); + a14 = *(a + 14); + a15 = *(a + 15); + a18 = *(a + 18); + a19 = *(a + 19); + a20 = *(a + 20); + a21 = *(a + 21); + a22 = *(a + 22); + a23 = *(a + 23); + a27 = *(a + 27); + a28 = *(a + 28); + a29 = *(a + 29); + a30 = *(a + 30); + a31 = *(a + 31); + a36 = *(a + 36); + a37 = *(a + 37); + a38 = *(a + 38); + a39 = *(a + 39); + a45 = *(a + 45); + a46 = *(a + 46); + a47 = *(a + 47); + a54 = *(a + 54); + a55 = *(a + 55); + a63 = *(a + 63); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a9; + + c2 -= c0 * a2; + c2 -= c1 * a10; + c2 *= a18; + + c3 -= c0 * a3; + c3 -= c1 * a11; + c3 -= c2 * a19; + c3 *= a27; + + c4 -= c0 * a4; + c4 -= c1 * a12; + c4 -= c2 * a20; + c4 -= c3 * a28; + c4 *= a36; + + c5 -= c0 * a5; + c5 -= c1 * a13; + c5 -= c2 * a21; + c5 -= c3 * a29; + c5 -= c4 * a37; + c5 *= a45; + + c6 -= c0 * a6; + c6 -= c1 * a14; + c6 -= c2 * a22; + c6 -= c3 * a30; + c6 -= c4 * a38; + c6 -= c5 * a46; + c6 *= a54; + + c7 -= c0 * a7; + c7 -= c1 * a15; + c7 -= c2 * a23; + c7 -= c3 * a31; + c7 -= c4 * a39; + c7 -= c5 * a47; + c7 -= c6 * a55; + c7 *= a63; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; + *(c + 4) = c4; + *(c + 5) = c5; + *(c + 6) = c6; + *(c + 7) = c7; + + *(b + 0) = c0; + *(b + 1) 
= c1; + *(b + 2) = c2; + *(b + 3) = c3; + *(b + 4) = c4; + *(b + 5) = c5; + *(b + 6) = c6; + *(b + 7) = c7; +} + +static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v2f64 src_a10, src_a11, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + LD_DP2(b, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + a += 4; + b += 4; + } + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + res_c4 = (v2f64) __msa_ilvr_d((v2i64) src_c6, (v2i64) src_c4); + res_c5 = (v2f64) __msa_ilvl_d((v2i64) src_c6, (v2i64) src_c4); + res_c6 = (v2f64) __msa_ilvr_d((v2i64) src_c7, (v2i64) src_c5); + res_c7 = (v2f64) __msa_ilvl_d((v2i64) src_c7, (v2i64) src_c5); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) 
__msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + + res_c4 *= src_a0; + res_c5 -= res_c4 * src_a1; + res_c6 -= res_c4 * src_a2; + res_c7 -= res_c4 * src_a3; + + src_a5 = __msa_cast_to_vector_double(*(a + 5)); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c1 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c3 -= res_c1 * src_a7; + + res_c5 *= src_a5; + res_c6 -= res_c5 * src_a6; + res_c7 -= res_c5 * src_a7; + + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a15 = __msa_cast_to_vector_double(*(a + 15)); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + + res_c2 *= src_a10; + res_c3 -= res_c2 * src_a11; + res_c3 *= src_a15; + + res_c6 *= src_a10; + res_c7 -= res_c6 * src_a11; + res_c7 *= src_a15; + + ST_DP(res_c0, b + 0); + ST_DP(res_c4, b + 2); + ST_DP(res_c1, b + 4); + ST_DP(res_c5, b + 6); + ST_DP(res_c2, b + 8); + ST_DP(res_c6, b + 10); + ST_DP(res_c3, b + 12); + ST_DP(res_c7, b + 14); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c4 = (v2f64) __msa_ilvr_d((v2i64) res_c5, (v2i64) res_c4); + src_c6 = (v2f64) __msa_ilvl_d((v2i64) res_c5, (v2i64) res_c4); + + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + src_c5 = (v2f64) __msa_ilvr_d((v2i64) res_c7, (v2i64) res_c6); + src_c7 = (v2f64) __msa_ilvl_d((v2i64) res_c7, (v2i64) res_c6); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); +} + +static void 
dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7; + v2f64 src_a10, src_a11, src_a15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + a += 4; + b += 2; + } + } + + res_c0 = (v2f64) __msa_ilvr_d((v2i64) src_c2, (v2i64) src_c0); + res_c1 = (v2f64) __msa_ilvl_d((v2i64) src_c2, (v2i64) src_c0); + res_c2 = (v2f64) __msa_ilvr_d((v2i64) src_c3, (v2i64) src_c1); + res_c3 = (v2f64) __msa_ilvl_d((v2i64) src_c3, (v2i64) src_c1); + + src_a0 = LD_DP(a + 0); + src_a1 = (v2f64) __msa_splati_d((v2i64) src_a0, 1); + src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a2 = LD_DP(a + 2); + src_a3 = (v2f64) __msa_splati_d((v2i64) src_a2, 1); + src_a2 = (v2f64) __msa_splati_d((v2i64) src_a2, 0); + + res_c0 *= src_a0; + res_c1 -= res_c0 * src_a1; + res_c2 -= res_c0 * src_a2; + res_c3 -= res_c0 * src_a3; + + src_a5 = __msa_cast_to_vector_double(*(a + 5)); + src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a6 = LD_DP(a + 6); + src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); + src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); + + res_c1 *= src_a5; + res_c2 -= res_c1 * src_a6; + res_c3 -= res_c1 * src_a7; + + src_a10 = LD_DP(a + 10); + src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); + src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); + src_a15 = __msa_cast_to_vector_double(*(a + 15)); + src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + + res_c2 *= src_a10; + res_c3 -= res_c2 * 
src_a11; + res_c3 *= src_a15; + + ST_DP(res_c0, b + 0); + ST_DP(res_c1, b + 2); + ST_DP(res_c2, b + 4); + ST_DP(res_c3, b + 6); + + src_c0 = (v2f64) __msa_ilvr_d((v2i64) res_c1, (v2i64) res_c0); + src_c1 = (v2f64) __msa_ilvr_d((v2i64) res_c3, (v2i64) res_c2); + src_c2 = (v2f64) __msa_ilvl_d((v2i64) res_c1, (v2i64) res_c0); + src_c3 = (v2f64) __msa_ilvl_d((v2i64) res_c3, (v2i64) res_c2); + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); +} + +static void dsolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT c0, c1, c2, c3; + FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, a2, a3, b0; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + c2 -= a2 * b0; + c3 -= a3 * b0; + + a += 4; + b += 1; + } + } + + a0 = *(a + 0); + a1 = *(a + 1); + a2 = *(a + 2); + a3 = *(a + 3); + a5 = *(a + 5); + a6 = *(a + 6); + a7 = *(a + 7); + a10 = *(a + 10); + a11 = *(a + 11); + a15 = *(a + 15); + + c0 *= a0; + + c1 -= c0 * a1; + c1 *= a5; + + c2 -= c0 * a2; + c2 -= c1 * a6; + c2 *= a10; + + c3 -= c0 * a3; + c3 -= c1 * a7; + c3 -= c2 * a11; + c3 *= a15; + + *(b + 0) = c0; + *(b + 1) = c1; + *(b + 2) = c2; + *(b + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3; + FLOAT c0, c1, c0_nxt1, c1_nxt1; + FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + ldc); + c1_nxt1 = *(c + 1 + ldc); + c0_nxt2 = *(c + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0, b1, b2, b3; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 
= b[1]; + c0_nxt1 -= a0 * b1; + c1_nxt1 -= a1 * b1; + + b2 = b[2]; + c0_nxt2 -= a0 * b2; + c1_nxt2 -= a1 * b2; + + b3 = b[3]; + c0_nxt3 -= a0 * b3; + c1_nxt3 -= a1 * b3; + + a += 2; + b += 4; + } + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c1 -= c0 * a1; + c1 *= a3; + + c0_nxt1 *= a0; + c1_nxt1 -= c0_nxt1 * a1; + c1_nxt1 *= a3; + + c0_nxt2 *= a0; + c1_nxt2 -= c0_nxt2 * a1; + c1_nxt2 *= a3; + + c0_nxt3 *= a0; + c1_nxt3 -= c0_nxt3 * a1; + c1_nxt3 *= a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; + *(b + 4) = c1; + *(b + 5) = c1_nxt1; + *(b + 6) = c1_nxt2; + *(b + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + ldc) = c0_nxt1; + *(c + 1 + ldc) = c1_nxt1; + + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0, a1, a3; + FLOAT c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + + c0_nxt = *(c + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0, b1; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = b[1]; + c0_nxt -= a0 * b1; + c1_nxt -= a1 * b1; + + a += 2; + b += 2; + } + } + + a0 = *a; + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c1 -= c0 * a1; + c1 *= a3; + + c0_nxt *= a0; + c1_nxt -= c0_nxt * a1; + c1_nxt *= a3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + *(b + 2) = c1; + *(b + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT a0, a1, a3, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + a += 2; + b += 1; 
+ } + } + + a0 = *(a + 0); + a1 = *(a + 1); + a3 = *(a + 3); + + c0 *= a0; + c1 -= c0 * a1; + c1 *= a3; + + *(b + 0) = c0; + *(b + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT a0; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + + c0 = *(c + 0); + c0_nxt1 = *(c + 1 * ldc); + c0_nxt2 = *(c + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c0_nxt3 -= a[0] * b[3]; + + a += 1; + b += 4; + } + } + + a0 = *a; + + c0 *= a0; + c0_nxt1 *= a0; + c0_nxt2 *= a0; + c0_nxt3 *= a0; + + *(c + 0 * ldc) = c0; + *(c + 1 * ldc) = c0_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 3 * ldc) = c0_nxt3; + + *(b + 0) = c0; + *(b + 1) = c0_nxt1; + *(b + 2) = c0_nxt2; + *(b + 3) = c0_nxt3; +} + +static void dsolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT c0, c0_nxt; + + c0 = *c; + c0_nxt = *(c + ldc); + + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= *a * b[0]; + c0_nxt -= *a * b[1]; + + a += 1; + b += 2; + } + } + + c0 *= *a; + c0_nxt *= *a; + + *(b + 0) = c0; + *(b + 1) = c0_nxt; + + *(c + 0) = c0; + *(c + ldc) = c0_nxt; +} + +static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + *c -= *a * *b; + + a += 1; + b += 1; + } + } + + *c *= *a; + *b = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i, j, kk; + FLOAT *aa, *cc; + + for (j = (n >> 2); j--;) + { + kk = 0; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + 
dsolve_2x4_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + dsolve_1x4_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 4 * k; + c += 4 * ldc; + } + + if (n & 3) + { + if (n & 2) + { + kk = 0; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + dsolve_2x2_lt_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + dsolve_1x2_lt_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + } + + if (n & 1) + { + kk = 0; + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x1_lt_msa(aa, b, cc, kk); + + aa += 8 * k; + cc += 8; + kk += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x1_lt_msa(aa, b, cc, kk); + + aa += 4 * k; + cc += 4; + kk += 4; + } + + if (m & 2) + { + dsolve_2x1_lt_msa(aa, b, cc, kk); + + aa += 2 * k; + cc += 2; + kk += 2; + } + + if (m & 1) + { + dgmm_dsolve_1x1_msa(aa, b, cc, kk); + + aa += k; + cc += 1; + kk += 1; + } + } + + b += k; + c += ldc; + } + } + + return 0; +} diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c new file mode 100644 index 0000000..659f772 --- /dev/null +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -0,0 +1,963 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; + v2f64 src_b10, src_b11, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + v2f64 src_b; + + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(b, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + a += 8; + b += 4; + + LD_DP4(a, 2, src_a4, src_a5, src_a6, src_a7); + LD_DP2(b, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) 
__msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + a += 8; + b += 4; + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + src_b5 = __msa_cast_to_vector_double(*(b + 5)); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b6 = LD_DP(b + 6); + src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); + src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); + src_b10 = LD_DP(b + 10); + src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b15 = __msa_cast_to_vector_double(*(b + 15)); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + src_c4 -= src_c0 * src_b1; + src_c5 -= src_c1 * src_b1; + src_c6 -= src_c2 * src_b1; + src_c7 -= src_c3 * src_b1; + + src_c4 *= src_b5; + src_c5 *= src_b5; + src_c6 *= src_b5; + src_c7 *= src_b5; + + src_c8 -= src_c0 * src_b2; + src_c9 -= src_c1 * src_b2; + src_c10 -= src_c2 * src_b2; + src_c11 -= src_c3 * src_b2; + + src_c8 -= src_c4 * src_b6; + src_c9 -= src_c5 * src_b6; + src_c10 -= src_c6 * src_b6; + src_c11 -= src_c7 * src_b6; + 
+ src_c8 *= src_b10; + src_c9 *= src_b10; + src_c10 *= src_b10; + src_c11 *= src_b10; + + src_c12 -= src_c0 * src_b3; + src_c13 -= src_c1 * src_b3; + src_c14 -= src_c2 * src_b3; + src_c15 -= src_c3 * src_b3; + + src_c12 -= src_c4 * src_b7; + src_c13 -= src_c5 * src_b7; + src_c14 -= src_c6 * src_b7; + src_c15 -= src_c7 * src_b7; + + src_c12 -= src_c8 * src_b11; + src_c13 -= src_c9 * src_b11; + src_c14 -= src_c10 * src_b11; + src_c15 -= src_c11 * src_b11; + + src_c12 *= src_b15; + src_c13 *= src_b15; + src_c14 *= src_b15; + src_c15 *= src_b15; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2); + ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2); + ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2); +} + +static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b1, src_b3; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + a += 8; + b += 2; + } + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + 
src_b3 = __msa_cast_to_vector_double(*(b + 3)); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + src_c4 -= src_c0 * src_b1; + src_c5 -= src_c1 * src_b1; + src_c6 -= src_c2 * src_b1; + src_c7 -= src_c3 * src_b1; + + src_c4 *= src_b3; + src_c5 *= src_b3; + src_c6 *= src_b3; + src_c7 *= src_b3; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); +} + +static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3; + v2f64 src_b0; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b; + + for (i = bk; i--;) + { + LD_DP4(a, 2, src_a0, src_a1, src_a2, src_a3); + src_b = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b, (v2i64) src_b); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + a += 8; + b += 1; + } + } + + src_b0 = __msa_cast_to_vector_double(*b); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7; + v2f64 src_b10, src_b11, src_b15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, 
src_a1); + LD_DP2(b, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + a += 4; + b += 4; + } + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + src_b5 = __msa_cast_to_vector_double(*(b + 5)); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b6 = LD_DP(b + 6); + src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); + src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); + src_b10 = LD_DP(b + 10); + src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b15 = __msa_cast_to_vector_double(*(b + 15)); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + + src_c2 *= src_b5; + src_c3 *= src_b5; + + src_c4 -= src_c0 * src_b2; + src_c5 -= src_c1 * src_b2; + + src_c4 -= src_c2 * src_b6; + src_c5 -= src_c3 * src_b6; + + src_c4 *= src_b10; + src_c5 *= src_b10; + + src_c6 -= src_c0 * src_b3; + src_c7 -= src_c1 * src_b3; + + src_c6 -= src_c2 * src_b7; + src_c7 -= src_c3 * src_b7; + + src_c6 -= src_c4 * src_b11; + src_c7 -= src_c5 * src_b11; + + src_c6 *= src_b15; + src_c7 *= src_b15; + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c6, src_c7, c + 3 * ldc, 
2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); +} + +static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk > 0) + { + BLASLONG i; + v2f64 src_a0, src_a1, src_b, src_b0; + + for (i = bk; i--;) + { + LD_DP2(a, 2, src_a0, src_a1); + src_b0 = LD_DP(b); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + a += 4; + b += 2; + } + } + + src_b0 = LD_DP(b + 0); + src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b3 = __msa_cast_to_vector_double(*(b + 3)); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + + src_c2 -= src_c0 * src_b1; + src_c3 -= src_c1 * src_b1; + + src_c2 *= src_b3; + src_c3 *= src_b3; + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1, c2, c3; + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, a2, a3; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + c2 -= a2 * b0; + c3 -= a3 * b0; + + a += 4; + b += 1; + } + } + + b0 = *b; + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ 
+ FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + FLOAT c1, c1_nxt1, c1_nxt2, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0, b1, b2, b3; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = b[1]; + c0_nxt1 -= a0 * b1; + c1_nxt1 -= a1 * b1; + + b2 = b[2]; + c0_nxt2 -= a0 * b2; + c1_nxt2 -= a1 * b2; + + b3 = b[3]; + c0_nxt3 -= a0 * b3; + c1_nxt3 -= a1 * b3; + + a += 2; + b += 4; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + c1 *= b0; + + c0_nxt1 -= c0 * b1; + c1_nxt1 -= c1 * b1; + c0_nxt1 *= b5; + c1_nxt1 *= b5; + + c0_nxt2 -= c0 * b2; + c1_nxt2 -= c1 * b2; + c0_nxt2 -= c0_nxt1 * b6; + c1_nxt2 -= c1_nxt1 * b6; + c0_nxt2 *= b10; + c1_nxt2 *= b10; + + c0_nxt3 -= c0 * b3; + c1_nxt3 -= c1 * b3; + c0_nxt3 -= c0_nxt1 * b7; + c1_nxt3 -= c1_nxt1 * b7; + c0_nxt3 -= c0_nxt2 * b11; + c1_nxt3 -= c1_nxt2 * b11; + c0_nxt3 *= b15; + c1_nxt3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1 * ldc) = c0_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 3 * ldc) = c0_nxt3; + + *(c + 1) = c1; + *(c + 1 + 1 * ldc) = c1_nxt1; + *(c + 1 + 2 * ldc) = c1_nxt2; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + BLASLONG i; + 
FLOAT a0, a1, b0, b1; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = b[1]; + c0_nxt -= a0 * b1; + c1_nxt -= a1 * b1; + + a += 2; + b += 2; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + c1 *= b0; + + c0_nxt -= c0 * b1; + c1_nxt -= c1 * b1; + + c0_nxt *= b3; + c1_nxt *= b3; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + if (bk > 0) + { + BLASLONG i; + FLOAT a0, a1, b0; + + for (i = bk; i--;) + { + a0 = a[0]; + a1 = a[1]; + + b0 = b[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + a += 2; + b += 1; + } + } + + b0 = *b; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + + c0 = *(c + 0); + c0_nxt1 = *(c + 1 * ldc); + c0_nxt2 = *(c + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= a[0] * b[0]; + c0_nxt1 -= a[0] * b[1]; + c0_nxt2 -= a[0] * b[2]; + c0_nxt3 -= a[0] * b[3]; + + a += 1; + b += 4; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b2 = *(b + 2); + b3 = *(b + 3); + b5 = *(b + 5); + b6 = *(b + 6); + b7 = *(b + 7); + b10 = *(b + 10); + b11 = *(b + 11); + b15 = *(b + 15); + + c0 *= b0; + + c0_nxt1 -= c0 * b1; + c0_nxt1 *= b5; + + c0_nxt2 -= c0 * b2; + c0_nxt2 -= c0_nxt1 * b6; + c0_nxt2 *= b10; + + c0_nxt3 -= c0 * b3; + c0_nxt3 -= c0_nxt1 * b7; + c0_nxt3 -= c0_nxt2 * b11; + c0_nxt3 *= b15; + + *(a + 0) = c0; + *(a + 1) = c0_nxt1; + *(a + 2) = c0_nxt2; + *(a + 3) = c0_nxt3; + + *(c + 0) = c0; + *(c + 1 * ldc) = c0_nxt1; + *(c + 2 * ldc) = 
c0_nxt2; + *(c + 3 * ldc) = c0_nxt3; +} + +static void dsolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + FLOAT b0, b1, b3, c0, c0_nxt; + + c0 = *c; + c0_nxt = *(c + ldc); + + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + c0 -= *a * b[0]; + c0_nxt -= *a * b[1]; + + a += 1; + b += 2; + } + } + + b0 = *(b + 0); + b1 = *(b + 1); + b3 = *(b + 3); + + c0 *= b0; + + c0_nxt -= c0 * b1; + c0_nxt *= b3; + + *(a + 0) = c0; + *(a + 1) = c0_nxt; + + *(c + 0) = c0; + *(c + ldc) = c0_nxt; +} + +static void dgmm_dsolve_1x1_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) +{ + if (bk > 0) + { + BLASLONG i; + + for (i = bk; i--;) + { + *c -= *a * *b; + + a += 1; + b += 1; + } + } + + *c *= *a; + *b = *c; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) +{ + BLASLONG i, j, kk; + FLOAT *aa, *cc; + + kk = 0; + + for (j = (n >> 2); j--;) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x4_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x4_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + kk += 4; + b += 4 * k; + c += 4 * ldc; + } + + if (n & 3) + { + if (n & 2) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x2_rn_msa(aa, b, cc, ldc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x2_rn_msa(aa, b, cc, ldc, kk); + + aa += k; + cc += 1; + } + } + + b += 2 * k; + c += 2 * ldc; + kk += 2; + } + + if (n & 1) + { + aa = a; + cc = c; + + for (i = (m >> 3); i--;) + { + 
dsolve_8x1_rn_msa(aa, b, cc, kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x1_rn_msa(aa, b, cc, kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x1_rn_msa(aa, b, cc, kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dgmm_dsolve_1x1_msa(b, aa, cc, kk); + + aa += k; + cc += 1; + } + } + + b += k; + c += ldc; + kk += 1; + } + } + + return 0; +} diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c new file mode 100644 index 0000000..a90d5fe --- /dev/null +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -0,0 +1,866 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +static void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15; + v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; + v2f64 src_b14, src_b15; + FLOAT *c_nxt1line = c + ldc; + FLOAT *c_nxt2line = c + 2 * ldc; + FLOAT *c_nxt3line = c + 3 * ldc; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c_nxt1line, 2, src_c4, src_c5, src_c6, src_c7); + LD_DP4(c_nxt2line, 2, src_c8, src_c9, src_c10, src_c11); + LD_DP4(c_nxt3line, 2, src_c12, src_c13, src_c14, src_c15); + + if (bk > 0) + { + BLASLONG i; + FLOAT *pba = a, *pbb = b; + v2f64 src_b, src_b0, src_b1, src_b2, src_b3; + v2f64 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7; + + LD_DP4(pba, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pbb, 2, src_b0, src_b1); + + for (i = (bk - 1); i--;) + { + pba += 8; + pbb += 4; + + LD_DP4(pba, 2, src_a4, src_a5, src_a6, src_a7); + LD_DP2(pbb, 2, src_b2, src_b3); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * 
src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + + src_a0 = src_a4; + src_a1 = src_a5; + src_a2 = src_a6; + src_a3 = src_a7; + src_b0 = src_b2; + src_b1 = src_b3; + } + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c8 -= src_a0 * src_b; + src_c9 -= src_a1 * src_b; + src_c10 -= src_a2 * src_b; + src_c11 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c12 -= src_a0 * src_b; + src_c13 -= src_a1 * src_b; + src_c14 -= src_a2 * src_b; + src_c15 -= src_a3 * src_b; + } + + a -= 32; + b -= 16; + + src_b12 = LD_DP(b + 12); + src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); + src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); + src_b14 = LD_DP(b + 14); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1); + src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0); + + src_b8 = LD_DP(b + 8); + src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); + src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); + src_b10 = __msa_cast_to_vector_double(*(b + 10)); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b4 = LD_DP(b + 4); + src_b5 = (v2f64) 
__msa_splati_d((v2i64) src_b4, 1); + src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); + + src_c12 *= src_b15; /* back substitution: rightmost column block first; packed B's diagonal entries are multiplied, not divided by, so they appear to be stored pre-inverted (OpenBLAS trsm packing convention -- confirm) */ + src_c13 *= src_b15; + src_c14 *= src_b15; + src_c15 *= src_b15; + + src_c8 -= src_c12 * src_b14; + src_c9 -= src_c13 * src_b14; + src_c10 -= src_c14 * src_b14; + src_c11 -= src_c15 * src_b14; + + src_c8 *= src_b10; + src_c9 *= src_b10; + src_c10 *= src_b10; + src_c11 *= src_b10; + + src_c4 -= src_c12 * src_b13; + src_c5 -= src_c13 * src_b13; + src_c6 -= src_c14 * src_b13; + src_c7 -= src_c15 * src_b13; + + src_c4 -= src_c8 * src_b9; + src_c5 -= src_c9 * src_b9; + src_c6 -= src_c10 * src_b9; + src_c7 -= src_c11 * src_b9; + + src_c4 *= src_b5; + src_c5 *= src_b5; + src_c6 *= src_b5; + src_c7 *= src_b5; + + src_c0 -= src_c12 * src_b12; + src_c1 -= src_c13 * src_b12; + src_c2 -= src_c14 * src_b12; + src_c3 -= src_c15 * src_b12; + + src_c0 -= src_c8 * src_b8; + src_c1 -= src_c9 * src_b8; + src_c2 -= src_c10 * src_b8; + src_c3 -= src_c11 * src_b8; + + src_c0 -= src_c4 * src_b4; + src_c1 -= src_c5 * src_b4; + src_c2 -= src_c6 * src_b4; + src_c3 -= src_c7 * src_b4; + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c12, src_c13, src_c14, src_c15, c_nxt3line, 2); /* store the solved tile to C and mirror it into the packed A buffer */ + ST_DP4(src_c12, src_c13, src_c14, src_c15, a + 24, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, c_nxt2line, 2); + ST_DP4(src_c8, src_c9, src_c10, src_c11, a + 16, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c_nxt1line, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) /* solve an 8-row x 2-column tile; a single (unlooped) update step handles bk <= 1 -- NOTE(review): the RT driver's n&2 path passes k - kk, which is at most 1 when k == n; confirm */ +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b2, src_b3; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + LD_DP4(c + ldc, 2, src_c4, src_c5, src_c6, src_c7); + + if (bk > 0) + { + v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0; + + LD_DP4(a + 
16, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(b + 4); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + src_c2 -= src_a2 * src_b; + src_c3 -= src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + src_c6 -= src_a2 * src_b; + src_c7 -= src_a3 * src_b; + } + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + + src_c4 *= src_b3; /* solve the second column first (b[3] is its diagonal factor), then eliminate its contribution from column 0 */ + src_c5 *= src_b3; + src_c6 *= src_b3; + src_c7 *= src_b3; + + src_c0 -= src_c4 * src_b2; + src_c1 -= src_c5 * src_b2; + src_c2 -= src_c6 * src_b2; + src_c3 -= src_c7 * src_b2; + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); +} + +static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) /* 8x1 tile: scale the single column by b[0] and mirror it into the packed a buffer */ +{ + v2f64 src_c0, src_c1, src_c2, src_c3; + v2f64 src_b0; + + LD_DP4(c, 2, src_c0, src_c1, src_c2, src_c3); + + src_b0 = __msa_cast_to_vector_double(*b); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + + src_c0 *= src_b0; + src_c1 *= src_b0; + src_c2 *= src_b0; + src_c3 *= src_b0; + + ST_DP4(src_c0, src_c1, src_c2, src_c3, c, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* 4x4 tile: bk GEMM-style update iterations over already-solved columns, then 4-column back substitution */ +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7; + v2f64 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13; + v2f64 src_b14, src_b15; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + LD_DP2(c + 2 * ldc, 2, src_c4, src_c5); + 
LD_DP2(c + 3 * ldc, 2, src_c6, src_c7); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 16, *bb = b + 16; + v2f64 src_a0, src_a1, src_b, src_b0, src_b1; + + for (i = bk; i--;) + { + LD_DP2(aa, 2, src_a0, src_a1); + LD_DP2(bb, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + src_c4 -= src_a0 * src_b; + src_c5 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + src_c6 -= src_a0 * src_b; + src_c7 -= src_a1 * src_b; + + aa += 4; + bb += 4; + } + } + + src_b12 = LD_DP(b + 12); /* broadcast the needed entries of the packed 4x4 B block into both lanes of v2f64 registers */ + src_b13 = (v2f64) __msa_splati_d((v2i64) src_b12, 1); + src_b12 = (v2f64) __msa_splati_d((v2i64) src_b12, 0); + src_b14 = LD_DP(b + 14); + src_b15 = (v2f64) __msa_splati_d((v2i64) src_b14, 1); + src_b14 = (v2f64) __msa_splati_d((v2i64) src_b14, 0); + + src_b8 = LD_DP(b + 8); + src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); + src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); + src_b10 = __msa_cast_to_vector_double(*(b + 10)); + src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b4 = LD_DP(b + 4); + src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); + src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); + + src_c6 *= src_b15; /* back substitution: column 3, then 2, 1, 0 */ + src_c7 *= src_b15; + + src_c4 -= src_c6 * src_b14; + src_c5 -= src_c7 * src_b14; + + src_c4 *= src_b10; + src_c5 *= src_b10; + + src_c2 -= src_c6 * src_b13; + src_c3 -= src_c7 * src_b13; + + src_c2 -= src_c4 * src_b9; + src_c3 -= src_c5 * src_b9; + + src_c2 *= src_b5; + src_c3 *= src_b5; + + src_c0 -= src_c6 * src_b12; + src_c1 -= src_c7 * src_b12; + + src_c0 -= src_c4 * src_b8; + src_c1 -= src_c5 * src_b8; + + src_c0 -= src_c2 * src_b4; 
+ src_c1 -= src_c3 * src_b4; + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_DP2(src_c6, src_c7, c + 3 * ldc, 2); + ST_DP2(src_c4, src_c5, c + 2 * ldc, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + ST_DP2(src_c0, src_c1, c, 2); + + ST_DP4(src_c4, src_c5, src_c6, src_c7, a + 8, 2); + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) /* 4x2 tile: a single (unlooped) update step handles bk <= 1, then 2-column back substitution */ +{ + v2f64 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3; + + LD_DP2(c, 2, src_c0, src_c1); + LD_DP2(c + ldc, 2, src_c2, src_c3); + + if (bk > 0) + { + v2f64 src_a0, src_a1, src_b, src_b0; + + LD_DP2(a + 8, 2, src_a0, src_a1); + src_b0 = LD_DP(b + 4); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + src_c0 -= src_a0 * src_b; + src_c1 -= src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + src_c2 -= src_a0 * src_b; + src_c3 -= src_a1 * src_b; + } + + src_b0 = __msa_cast_to_vector_double(*(b + 0)); + src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b2 = LD_DP(b + 2); + src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); + src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); + + src_c2 *= src_b3; + src_c3 *= src_b3; + + src_c0 -= src_c2 * src_b2; + src_c1 -= src_c3 * src_b2; + + src_c0 *= src_b0; + src_c1 *= src_b0; + + ST_DP2(src_c0, src_c1, c, 2); + ST_DP2(src_c2, src_c3, c + ldc, 2); + + ST_DP4(src_c0, src_c1, src_c2, src_c3, a, 2); +} + +static void dsolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) /* 4x1 tile: plain scalar scaling of four c values by b[0], mirrored into packed a */ +{ + FLOAT b0, c0, c1, c2, c3; + + b0 = *(b + 0); + + c0 = *(c + 0); + c1 = *(c + 1); + c2 = *(c + 2); + c3 = *(c + 3); + + c0 *= b0; + c1 *= b0; + c2 *= b0; + c3 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c2; + *(a + 3) = c3; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 2) = c2; + *(c + 3) = c3; +} + +static void dsolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* 2x4 tile: scalar update loop over bk, then 4-column back substitution for two rows */ +{ + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; + FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, 
c1_nxt2, c0_nxt3, c1_nxt3; + + c0 = *(c + 0); + c1 = *(c + 1); + c0_nxt1 = *(c + 0 + 1 * ldc); + c1_nxt1 = *(c + 1 + 1 * ldc); + c0_nxt2 = *(c + 0 + 2 * ldc); + c1_nxt2 = *(c + 1 + 2 * ldc); + c0_nxt3 = *(c + 0 + 3 * ldc); + c1_nxt3 = *(c + 1 + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 8, *bb = b + 16; + FLOAT a0, a1, b0, b1, b2, b3; + + for (i = bk; i--;) + { + a0 = aa[0]; + a1 = aa[1]; + + b0 = bb[0]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = bb[1]; + c0_nxt1 -= a0 * b1; + c1_nxt1 -= a1 * b1; + + b2 = bb[2]; + c0_nxt2 -= a0 * b2; + c1_nxt2 -= a1 * b2; + + b3 = bb[3]; + c0_nxt3 -= a0 * b3; + c1_nxt3 -= a1 * b3; + + aa += 2; + bb += 4; + } + } + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + + c0_nxt3 *= b15; /* back substitution: column 3 down to column 0 */ + c1_nxt3 *= b15; + + c0_nxt2 -= c0_nxt3 * b14; + c1_nxt2 -= c1_nxt3 * b14; + c0_nxt2 *= b10; + c1_nxt2 *= b10; + + c0_nxt1 -= c0_nxt3 * b13; + c1_nxt1 -= c1_nxt3 * b13; + c0_nxt1 -= c0_nxt2 * b9; + c1_nxt1 -= c1_nxt2 * b9; + c0_nxt1 *= b5; + c1_nxt1 *= b5; + + c0 -= c0_nxt3 * b12; + c1 -= c1_nxt3 * b12; + c0 -= c0_nxt2 * b8; + c1 -= c1_nxt2 * b8; + c0 -= c0_nxt1 * b4; + c1 -= c1_nxt1 * b4; + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt1; + *(a + 3) = c1_nxt1; + *(a + 4) = c0_nxt2; + *(a + 5) = c1_nxt2; + *(a + 6) = c0_nxt3; + *(a + 7) = c1_nxt3; + + *(c + 0) = c0; + *(c + 1) = c1; + + *(c + 0 + 1 * ldc) = c0_nxt1; + *(c + 1 + 1 * ldc) = c1_nxt1; + + *(c + 0 + 2 * ldc) = c0_nxt2; + *(c + 1 + 2 * ldc) = c1_nxt2; + + *(c + 0 + 3 * ldc) = c0_nxt3; + *(c + 1 + 3 * ldc) = c1_nxt3; +} + +static void dsolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, int bk) /* 2x2 tile: a single (unlooped) update step handles bk <= 1, then 2-column back substitution */ +{ + FLOAT b0, b2, b3; + FLOAT c0, c1, c0_nxt, c1_nxt; + + c0 = *(c + 0); + c1 = *(c + 1); + + c0_nxt = *(c + 0 + ldc); + c1_nxt = *(c + 1 + ldc); + + if (bk > 0) + { + FLOAT a0, a1, b0, b1; + + a0 = a[4]; + a1 = a[5]; + + 
b0 = b[4]; + c0 -= a0 * b0; + c1 -= a1 * b0; + + b1 = b[5]; + c0_nxt -= a0 * b1; + c1_nxt -= a1 * b1; + } + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c0_nxt *= b3; + c1_nxt *= b3; + + c0 -= c0_nxt * b2; + c0 *= b0; + + c1 -= c1_nxt * b2; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + *(a + 2) = c0_nxt; + *(a + 3) = c1_nxt; + + *(c + 0) = c0; + *(c + 1) = c1; + *(c + 0 + ldc) = c0_nxt; + *(c + 1 + ldc) = c1_nxt; +} + +static void dsolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c) /* 2x1 tile: scale two c values by b[0] and mirror into packed a */ +{ + FLOAT b0, c0, c1; + + c0 = *(c + 0); + c1 = *(c + 1); + + b0 = *b; + + c0 *= b0; + c1 *= b0; + + *(a + 0) = c0; + *(a + 1) = c1; + + *(c + 0) = c0; + *(c + 1) = c1; +} + +static void dsolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* 1x4 tile: scalar update loop over bk, then 4-column back substitution for a single row */ +{ + FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15; + FLOAT c0, c0_nxt1, c0_nxt2, c0_nxt3; + + c0 = *(c + 0); + c0_nxt1 = *(c + 1 * ldc); + c0_nxt2 = *(c + 2 * ldc); + c0_nxt3 = *(c + 3 * ldc); + + if (bk > 0) + { + BLASLONG i; + FLOAT *aa = a + 4, *bb = b + 16; + + for (i = bk; i--;) + { + c0 -= aa[0] * bb[0]; + c0_nxt1 -= aa[0] * bb[1]; + c0_nxt2 -= aa[0] * bb[2]; + c0_nxt3 -= aa[0] * bb[3]; + + aa += 1; + bb += 4; + } + } + + b0 = *b; + b4 = *(b + 4); + b5 = *(b + 5); + b8 = *(b + 8); + b9 = *(b + 9); + b10 = *(b + 10); + b12 = *(b + 12); + b13 = *(b + 13); + b14 = *(b + 14); + b15 = *(b + 15); + + c0_nxt3 *= b15; + + c0_nxt2 -= c0_nxt3 * b14; + c0_nxt2 *= b10; + + c0_nxt1 -= c0_nxt3 * b13; + c0_nxt1 -= c0_nxt2 * b9; + c0_nxt1 *= b5; + + c0 -= c0_nxt3 * b12; + c0 -= c0_nxt2 * b8; + c0 -= c0_nxt1 * b4; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c0_nxt1; + *(a + 2) = c0_nxt2; + *(a + 3) = c0_nxt3; + + *(c) = c0; + *(c + 1 * ldc) = c0_nxt1; + *(c + 2 * ldc) = c0_nxt2; + *(c + 3 * ldc) = c0_nxt3; +} + +static void dsolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) /* 1x2 tile: a single (unlooped) update step handles bk <= 1 */ +{ + FLOAT b0, b2, b3, c0, c0_nxt; + + c0 = *(c + 0); + c0_nxt = *(c + ldc); + + if (bk > 0) + { + c0 -= a[2] * b[4]; + c0_nxt -= a[2] * 
b[5]; + } + + b3 = *(b + 3); + b2 = *(b + 2); + b0 = *b; + + c0_nxt *= b3; + + c0 -= c0_nxt * b2; + c0 *= b0; + + *(a + 0) = c0; + *(a + 1) = c0_nxt; + + *(c + 0) = c0; + *(c + ldc) = c0_nxt; +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b, + FLOAT *c, BLASLONG ldc, BLASLONG offset) /* RT driver: starts past the last column of B and solves right-to-left; the n&1 and n&2 remainder columns are handled first, then full 4-column blocks; kk tracks the current column boundary and k - kk is the update depth (bk) passed to the sub-solvers */ +{ + BLASLONG i, j, kk; + FLOAT *aa, *cc, *bb; + + kk = n; /* no columns solved yet; c and b are advanced past the last column */ + c += n * ldc; + b += n * k; + + if (n & 3) + { + if (n & 1) + { + aa = a; + c -= ldc; + b -= k; + bb = b + (kk - 1); + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x1_rt_msa(aa + 8 * kk - 8, bb, cc); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x1_rt_msa(aa + 4 * kk - 4, bb, cc); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x1_rt_msa(aa + 2 * kk - 2, bb, cc); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + *cc *= *bb; + *(aa + kk - 1) = *cc; + + aa += k; + cc += 1; + } + + } + + kk -= 1; + } + + if (n & 2) + { + aa = a; + c -= 2 * ldc; + b -= 2 * k; + bb = b + 2 * kk; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x2_rt_msa(aa + 8 * kk - 16, bb - 4, cc, ldc, k - kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x2_rt_msa(aa + 4 * kk - 8, bb - 4, cc, ldc, k - kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x2_rt_msa(aa + 2 * kk - 4, bb - 4, cc, ldc, k - kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + dsolve_1x2_rt_msa(aa + kk - 2, bb - 4, cc, ldc, k - kk); + } + } + + kk -= 2; + } + } + + for (j = (n >> 2); j--;) + { + aa = a; + b -= 4 * k; + bb = b + 4 * kk; + c -= 4 * ldc; + cc = c; + + for (i = (m >> 3); i--;) + { + dsolve_8x4_rt_msa(aa + kk * 8, bb, cc, ldc, k - kk); + + aa += 8 * k; + cc += 8; + } + + if (m & 7) + { + if (m & 4) + { + dsolve_4x4_rt_msa(aa + kk * 4 - 16, bb - 16, cc, ldc, k - kk); + + aa += 4 * k; + cc += 4; + } + + if (m & 2) + { + dsolve_2x4_rt_msa(aa + kk * 2 - 8, bb - 16, cc, ldc, k - kk); + + aa += 2 * k; + cc += 2; + } + + if (m & 1) + { + 
dsolve_1x4_rt_msa(aa + kk - 4, bb - 16, cc, ldc, k - kk); + + aa += k; + cc += 1; + } + } + + kk -= 4; + } + + return 0; +} -- 2.7.4