From: kaustubh
Date: Mon, 9 Jan 2017 12:52:09 +0000 (+0530)
Subject: Add msa optimization for AXPY, COPY, SCALE, SWAP
X-Git-Tag: upstream/0.2.20^2~79^2~1
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=88afb3bc94ccabb28fe06174bbd791d3bccbff97;p=platform%2Fupstream%2Fopenblas.git

Add msa optimization for AXPY, COPY, SCALE, SWAP

Signed-off-by: kaustubh
---

diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600
index 6835792..50751bc 100644
--- a/kernel/mips/KERNEL.P5600
+++ b/kernel/mips/KERNEL.P5600
@@ -42,15 +42,29 @@ CASUMKERNEL = ../mips/asum.c
 ZASUMKERNEL = ../mips/asum.c
 endif
 
+ifdef HAVE_MSA
+SAXPYKERNEL = ../mips/saxpy_msa.c
+DAXPYKERNEL = ../mips/daxpy_msa.c
+CAXPYKERNEL = ../mips/caxpy_msa.c
+ZAXPYKERNEL = ../mips/zaxpy_msa.c
+else
 SAXPYKERNEL = ../mips/axpy.c
 DAXPYKERNEL = ../mips/axpy.c
 CAXPYKERNEL = ../mips/zaxpy.c
 ZAXPYKERNEL = ../mips/zaxpy.c
+endif
 
+ifdef HAVE_MSA
+SCOPYKERNEL = ../mips/scopy_msa.c
+DCOPYKERNEL = ../mips/dcopy_msa.c
+CCOPYKERNEL = ../mips/ccopy_msa.c
+ZCOPYKERNEL = ../mips/zcopy_msa.c
+else
 SCOPYKERNEL = ../mips/copy.c
 DCOPYKERNEL = ../mips/copy.c
 CCOPYKERNEL = ../mips/zcopy.c
 ZCOPYKERNEL = ../mips/zcopy.c
+endif
 
 ifdef HAVE_MSA
 SDOTKERNEL = ../mips/sdot_msa.c
@@ -74,15 +88,29 @@ DROTKERNEL = ../mips/rot.c
 CROTKERNEL = ../mips/zrot.c
 ZROTKERNEL = ../mips/zrot.c
 
+ifdef HAVE_MSA
+SSCALKERNEL = ../mips/sscal_msa.c
+DSCALKERNEL = ../mips/dscal_msa.c
+CSCALKERNEL = ../mips/cscal_msa.c
+ZSCALKERNEL = ../mips/zscal_msa.c
+else
 SSCALKERNEL = ../mips/scal.c
 DSCALKERNEL = ../mips/scal.c
 CSCALKERNEL = ../mips/zscal.c
 ZSCALKERNEL = ../mips/zscal.c
+endif
 
+ifdef HAVE_MSA
+SSWAPKERNEL = ../mips/sswap_msa.c
+DSWAPKERNEL = ../mips/dswap_msa.c
+CSWAPKERNEL = ../mips/cswap_msa.c
+ZSWAPKERNEL = ../mips/zswap_msa.c
+else
 SSWAPKERNEL = ../mips/swap.c
 DSWAPKERNEL = ../mips/swap.c
 CSWAPKERNEL = ../mips/zswap.c
 ZSWAPKERNEL = ../mips/zswap.c
+endif
 
 ifdef HAVE_MSA
 SGEMVNKERNEL = ../mips/sgemv_n_msa.c
diff --git a/kernel/mips/caxpy_msa.c
b/kernel/mips/caxpy_msa.c new file mode 100644 index 0000000..75b835c --- /dev/null +++ b/kernel/mips/caxpy_msa.c @@ -0,0 +1,471 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

/* Sign selectors for the four partial products of a complex multiply-add.
 * Without CONJ:  y += da * x        -> yr += dar*xr - dai*xi, yi += dar*xi + dai*xr
 * With CONJ:     y += da * conj(x)  -> yr += dar*xr + dai*xi, yi += dai*xr - dar*xi
 */
#if !defined(CONJ)
    #define OP0  +=
    #define OP1  -=
    #define OP2  +=
#else
    #define OP0  -=
    #define OP1  +=
    #define OP2  -=
#endif

/* CAXPY kernel (MIPS MSA): y := y + (da_r + i*da_i) * x for n single-precision
 * complex elements (conjugated x when CONJ is defined, via the OP* macros).
 *
 * Parameters follow the standard OpenBLAS level-1 kernel ABI:
 *   n            number of complex elements
 *   dummy0..2,
 *   dummy        unused slots kept for the common kernel signature
 *   da_r, da_i   real/imaginary part of the scalar
 *   x, inc_x     source vector and its stride (in complex elements)
 *   y, inc_y     destination vector and its stride (in complex elements)
 * Returns 0 always.
 *
 * Layout note: vectors hold interleaved (re,im) pairs, 2 complex numbers per
 * v4f32.  PCKEVOD_W2_SP splits two such vectors into a real-parts vector and
 * an imaginary-parts vector; ILVRL_W2_SP re-interleaves them for the store
 * (see macros_msa.h).  FMADD* accumulates c += b * a, as the scalar tail
 * code (yd0 += da_r * xd0) mirrors.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2)
{
    BLASLONG i, inc_x2, inc_y2;
    FLOAT *py;
    v4f32 x0, x1, x2, x3, x4, x5, x6, x7;
    v4f32 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
    v4f32 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
    v4f32 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
    FLOAT xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7;
    FLOAT yd0, yd1, yd2, yd3, yd4, yd5, yd6, yd7;

    /* NOTE(review): n == 0 is not rejected here, but every loop below
     * executes zero iterations in that case, so the early-outs only cover
     * negative n and a zero scalar (y unchanged in both cases). */
    if (n < 0) return(0);
    if ((da_r == 0.0) && (da_i == 0.0)) return(0);

    /* py reads y while the stores advance y itself. */
    py = y;

    if ((1 == inc_x) && (1 == inc_y))
    {
        /* Contiguous case: 16 complex elements (8 vectors) per iteration,
         * with software prefetch of both streams. */
        FLOAT *x_pref, *y_pref;
        BLASLONG pref_offset;

        /* Align the prefetch pointers to the next L1 line boundary,
         * then run 64 floats ahead of the working pointers. */
        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
        if (pref_offset > 0)
        {
            pref_offset = L1_DATA_LINESIZE - pref_offset;
            pref_offset = pref_offset / sizeof(FLOAT);
        }
        x_pref = x + pref_offset + 64;

        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
        if (pref_offset > 0)
        {
            pref_offset = L1_DATA_LINESIZE - pref_offset;
            pref_offset = pref_offset / sizeof(FLOAT);
        }
        y_pref = y + pref_offset + 64;

        /* Broadcast the scalar parts across vector lanes. */
        dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
        dai_vec = COPY_FLOAT_TO_VECTOR(da_i);

        for (i = (n >> 4); i--;)
        {
            PREF_OFFSET(x_pref, 0);
            PREF_OFFSET(x_pref, 32);
            PREF_OFFSET(x_pref, 64);
            PREF_OFFSET(x_pref, 96);
            PREF_OFFSET(y_pref, 0);
            PREF_OFFSET(y_pref, 32);
            PREF_OFFSET(y_pref, 64);
            PREF_OFFSET(y_pref, 96);
            x_pref += 32;
            y_pref += 32;

            /* Load 16 complex from each stream and deinterleave into
             * real/imag planes. */
            LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
            LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
            PCKEVOD_W2_SP(x1, x0, x0r, x0i);
            PCKEVOD_W2_SP(y1, y0, y0r, y0i);
            PCKEVOD_W2_SP(x3, x2, x1r, x1i);
            PCKEVOD_W2_SP(y3, y2, y1r, y1i);
            PCKEVOD_W2_SP(x5, x4, x2r, x2i);
            PCKEVOD_W2_SP(y5, y4, y2r, y2i);
            PCKEVOD_W2_SP(x7, x6, x3r, x3i);
            PCKEVOD_W2_SP(y7, y6, y3r, y3i);

            /* yr += dar*xr;  yi (+/-)= dar*xi;  yr (-/+)= dai*xi;
             * yi (+/-)= dai*xr  -- signs per OP0/OP1/OP2 (CONJ). */
            FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
            y0i OP0 dar_vec * x0i;
            y1i OP0 dar_vec * x1i;
            y2i OP0 dar_vec * x2i;
            y3i OP0 dar_vec * x3i;
            y0r OP1 dai_vec * x0i;
            y1r OP1 dai_vec * x1i;
            y2r OP1 dai_vec * x2i;
            y3r OP1 dai_vec * x3i;
            y0i OP2 dai_vec * x0r;
            y1i OP2 dai_vec * x1r;
            y2i OP2 dai_vec * x2r;
            y3i OP2 dai_vec * x3r;

            /* Re-interleave (re,im) and store. */
            ILVRL_W2_SP(y0i, y0r, y0, y1);
            ILVRL_W2_SP(y1i, y1r, y2, y3);
            ILVRL_W2_SP(y2i, y2r, y4, y5);
            ILVRL_W2_SP(y3i, y3r, y6, y7);
            ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
        }

        /* Remainder: 8, 4, 2, 1 complex elements, same recipe at
         * decreasing widths (scalar for the last <= 3). */
        if (n & 15)
        {
            if (n & 8)
            {
                LD_SP4_INC(x, 4, x0, x1, x2, x3);
                LD_SP4_INC(py, 4, y0, y1, y2, y3);
                PCKEVOD_W2_SP(x1, x0, x0r, x0i);
                PCKEVOD_W2_SP(y1, y0, y0r, y0i);
                PCKEVOD_W2_SP(x3, x2, x1r, x1i);
                PCKEVOD_W2_SP(y3, y2, y1r, y1i);

                FMADD2(x0r, x1r, dar_vec, y0r, y1r);
                y0i OP0 dar_vec * x0i;
                y1i OP0 dar_vec * x1i;
                y0r OP1 dai_vec * x0i;
                y1r OP1 dai_vec * x1i;
                y0i OP2 dai_vec * x0r;
                y1i OP2 dai_vec * x1r;

                ILVRL_W2_SP(y0i, y0r, y0, y1);
                ILVRL_W2_SP(y1i, y1r, y2, y3);
                ST_SP4_INC(y0, y1, y2, y3, y, 4);
            }

            if (n & 4)
            {
                LD_SP2_INC(x, 4, x0, x1);
                LD_SP2_INC(py, 4, y0, y1);
                PCKEVOD_W2_SP(x1, x0, x0r, x0i);
                PCKEVOD_W2_SP(y1, y0, y0r, y0i);

                y0r += dar_vec * x0r;
                y0i OP0 dar_vec * x0i;
                y0r OP1 dai_vec * x0i;
                y0i OP2 dai_vec * x0r;

                ILVRL_W2_SP(y0i, y0r, y0, y1);
                ST_SP2_INC(y0, y1, y, 4);
            }

            if (n & 2)
            {
                LD_GP4_INC(x, 1, xd0, xd1, xd2, xd3);
                LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);

                FMADD2(xd0, xd2, da_r, yd0, yd2);
                yd1 OP0 da_r * xd1;
                yd3 OP0 da_r * xd3;
                yd0 OP1 da_i * xd1;
                yd2 OP1 da_i * xd3;
                yd1 OP2 da_i * xd0;
                yd3 OP2 da_i * xd2;

                ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
            }

            if (n & 1)
            {
                LD_GP2_INC(x, 1, xd0, xd1);
                LD_GP2_INC(py, 1, yd0, yd1);

                yd0 += da_r * xd0;
                yd1 OP0 da_r * xd1;
                yd0 OP1 da_i * xd1;
                yd1 OP2 da_i * xd0;

                ST_GP2_INC(yd0, yd1, y, 1);
            }
        }
    }
    else if (1 == inc_y)
    {
        /* Strided x, contiguous y: gather 16 strided complex elements into
         * 8 packed vectors per iteration, prefetching only the y stream. */
        FLOAT *y_pref;
        BLASLONG pref_offset;
        v4f32 x8, x9, x10, x11, x12, x13, x14;

        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
        if (pref_offset > 0)
        {
            pref_offset = L1_DATA_LINESIZE - pref_offset;
            pref_offset = pref_offset / sizeof(FLOAT);
        }
        y_pref = y + pref_offset + 64;

        /* Stride in floats: one complex = 2 floats. */
        inc_x2 = 2 * inc_x;

        dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
        dai_vec = COPY_FLOAT_TO_VECTOR(da_i);

        for (i = (n >> 4); i--;)
        {
            PREF_OFFSET(y_pref, 0);
            PREF_OFFSET(y_pref, 32);
            PREF_OFFSET(y_pref, 64);
            PREF_OFFSET(y_pref, 96);
            y_pref += 32;

            /* 15 strided vector loads: only the low 64 bits (one complex)
             * of each are valid; PCKEV_D packs those low halves pairwise.
             * The 16th element is inserted word-by-word from scalar loads. */
            LD_SP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x14);
            LD_SP7_INC(x, inc_x2, x8, x9, x10, x11, x12, x13, x7);
            PCKEV_D2_SP(x1, x0, x3, x2, x0, x1);
            PCKEV_D2_SP(x5, x4, x14, x6, x2, x3);
            PCKEV_D2_SP(x9, x8, x11, x10, x4, x5);
            x6 = (v4f32) __msa_pckev_d((v2i64) x13, (v2i64) x12);
            x7 = (v4f32) __msa_insert_w((v4i32) x7, 2, *((int *) x));
            x7 = (v4f32) __msa_insert_w((v4i32) x7, 3, *((int *) (x + 1)));
            x += inc_x2;

            LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
            PCKEVOD_W2_SP(x1, x0, x0r, x0i);
            PCKEVOD_W2_SP(y1, y0, y0r, y0i);
            PCKEVOD_W2_SP(x3, x2, x1r, x1i);
            PCKEVOD_W2_SP(y3, y2, y1r, y1i);
            PCKEVOD_W2_SP(x5, x4, x2r, x2i);
            PCKEVOD_W2_SP(y5, y4, y2r, y2i);
            PCKEVOD_W2_SP(x7, x6, x3r, x3i);
            PCKEVOD_W2_SP(y7, y6, y3r, y3i);

            FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
            y0i OP0 dar_vec * x0i;
            y1i OP0 dar_vec * x1i;
            y2i OP0 dar_vec * x2i;
            y3i OP0 dar_vec * x3i;
            y0r OP1 dai_vec * x0i;
            y1r OP1 dai_vec * x1i;
            y2r OP1 dai_vec * x2i;
            y3r OP1 dai_vec * x3i;
            y0i OP2 dai_vec * x0r;
            y1i OP2 dai_vec * x1r;
            y2i OP2 dai_vec * x2r;
            y3i OP2 dai_vec * x3r;

            ILVRL_W2_SP(y0i, y0r, y0, y1);
            ILVRL_W2_SP(y1i, y1r, y2, y3);
            ILVRL_W2_SP(y2i, y2r, y4, y5);
            ILVRL_W2_SP(y3i, y3r, y6, y7);

            ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
        }

        if (n & 15)
        {
            if (n & 8)
            {
                /* Gather 8 strided complex elements (7 vector loads + 2
                 * scalar inserts for the last one). */
                LD_SP7_INC(x, inc_x2, x0, x1, x2, x6, x4, x5, x3);
                PCKEV_D2_SP(x1, x0, x6, x2, x0, x1);

                x2 = (v4f32) __msa_pckev_d((v2i64) x5, (v2i64) x4);
                x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
                x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) (x + 1)));
                x += inc_x2;

                LD_SP4_INC(py, 4, y0, y1, y2, y3);
                PCKEVOD_W2_SP(x1, x0, x0r, x0i);
                PCKEVOD_W2_SP(y1, y0, y0r, y0i);
                PCKEVOD_W2_SP(x3, x2, x1r, x1i);
                PCKEVOD_W2_SP(y3, y2, y1r, y1i);

                FMADD2(x0r, x1r, dar_vec, y0r, y1r);
                y0i OP0 dar_vec * x0i;
                y1i OP0 dar_vec * x1i;
                y0r OP1 dai_vec * x0i;
                y1r OP1 dai_vec * x1i;
                y0i OP2 dai_vec * x0r;
                y1i OP2 dai_vec * x1r;

                ILVRL_W2_SP(y0i, y0r, y0, y1);
                ILVRL_W2_SP(y1i, y1r, y2, y3);
                ST_SP4_INC(y0, y1, y2, y3, y, 4);
            }

            if (n & 4)
            {
                LD_SP3_INC(x, inc_x2, x0, x2, x1);

                x0 = (v4f32) __msa_pckev_d((v2i64) x2, (v2i64) x0);
                x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
                x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) (x + 1)));
                x += inc_x2;

                LD_SP2_INC(py, 4, y0, y1);
                PCKEVOD_W2_SP(x1, x0, x0r, x0i);
                PCKEVOD_W2_SP(y1, y0, y0r, y0i);

                y0r += dar_vec * x0r;
                y0i OP0 dar_vec * x0i;
                y0r OP1 dai_vec * x0i;
                y0i OP2 dai_vec * x0r;

                ILVRL_W2_SP(y0i, y0r, y0, y1);
                ST_SP2_INC(y0, y1, y, 4);
            }

            if (n & 2)
            {
                xd0 = x[0];
                xd1 = x[1];
                x += inc_x2;
                xd2 = x[0];
                xd3 = x[1];
                x += inc_x2;

                LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);

                FMADD2(xd0, xd2, da_r, yd0, yd2);
                yd1 OP0 da_r * xd1;
                yd3 OP0 da_r * xd3;
                yd0 OP1 da_i * xd1;
                yd2 OP1 da_i * xd3;
                yd1 OP2 da_i * xd0;
                yd3 OP2 da_i * xd2;

                ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
            }

            if (n & 1)
            {
                LD_GP2_INC(x, 1, xd0, xd1);
                LD_GP2_INC(py, 1, yd0, yd1);

                yd0 += da_r * xd0;
                yd1 OP0 da_r * xd1;
                yd0 OP1 da_i * xd1;
                yd1 OP2 da_i * xd0;

                ST_GP2_INC(yd0, yd1, y, 1);
            }
        }
    }
    else
    {
        /* General strides (inc_y != 1): pure scalar code, 4 complex
         * elements per iteration. */
        inc_x2 = 2 * inc_x;
        inc_y2 = 2 * inc_y;

        for (i = (n >> 2); i--;)
        {
            xd0 = x[0];
            xd1 = x[1];
            x += inc_x2;
            xd2 = x[0];
            xd3 = x[1];
            x += inc_x2;
            xd4 = x[0];
            xd5 = x[1];
            x += inc_x2;
            xd6 = x[0];
            xd7 = x[1];
            x += inc_x2;

            yd0 = py[0];
            yd1 = py[1];
            py += inc_y2;
            yd2 = py[0];
            yd3 = py[1];
            py += inc_y2;
            yd4 = py[0];
            yd5 = py[1];
            py += inc_y2;
            yd6 = py[0];
            yd7 = py[1];
            py += inc_y2;

            FMADD4(xd0, xd2, xd4, xd6, da_r, yd0, yd2, yd4, yd6);
            yd1 OP0 da_r * xd1;
            yd3 OP0 da_r * xd3;
            yd5 OP0 da_r * xd5;
            yd7 OP0 da_r * xd7;
            yd0 OP1 da_i * xd1;
            yd2 OP1 da_i * xd3;
            yd4 OP1 da_i * xd5;
            yd6 OP1 da_i * xd7;
            yd1 OP2 da_i * xd0;
            yd3 OP2 da_i * xd2;
            yd5 OP2 da_i * xd4;
            yd7 OP2 da_i * xd6;

            y[0] = yd0;
            y[1] = yd1;
            y += inc_y2;
            y[0] = yd2;
            y[1] = yd3;
            y += inc_y2;
            y[0] = yd4;
            y[1] = yd5;
            y += inc_y2;
            y[0] = yd6;
            y[1] = yd7;
            y += inc_y2;
        }

        if (n & 3)
        {
            if (n & 2)
            {
                xd0 = x[0];
                xd1 = x[1];
                x += inc_x2;
                xd2 = x[0];
                xd3 = x[1];
                x += inc_x2;

                yd0 = py[0];
                yd1 = py[1];
                py += inc_y2;
                yd2 = py[0];
                yd3 = py[1];
                py += inc_y2;

                FMADD2(xd0, xd2, da_r, yd0, yd2);
                yd1 OP0 da_r * xd1;
                yd3 OP0 da_r * xd3;
                yd0 OP1 da_i * xd1;
                yd2 OP1 da_i * xd3;
                yd1 OP2 da_i * xd0;
                yd3 OP2 da_i * xd2;

                y[0] = yd0;
                y[1] = yd1;
                y += inc_y2;
                y[0] = yd2;
                y[1] = yd3;
                y += inc_y2;
            }

            if (n & 1)
            {
                xd0 = x[0];
                xd1 = x[1];

                yd0 = y[0];
                yd1 = y[1];

                yd0 += da_r * xd0;
                yd1 OP0 da_r * xd1;
                yd0 OP1 da_i * xd1;
                yd1 OP2 da_i * xd0;

                y[0] = yd0;
                y[1] = yd1;
            }
        }
    }

    return (0);
}
diff --git a/kernel/mips/ccopy_msa.c b/kernel/mips/ccopy_msa.c
new file mode 100644
index 0000000..cb851f7
--- /dev/null
+++ b/kernel/mips/ccopy_msa.c
@@ -0,0 +1,201 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

/* CCOPY kernel (MIPS MSA): copy n single-precision complex elements from x
 * to y.
 *
 *   n            number of complex elements (each = 2 floats)
 *   x, inc_x     source vector and its stride (in complex elements)
 *   y, inc_y     destination vector and its stride (in complex elements)
 * Returns 0 always.
 *
 * NOTE(review): n == 0 is not rejected by the early-out, but all loops run
 * zero iterations for it, so the copy is correctly a no-op.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i, inc_x2, inc_y2;
    v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
    FLOAT f0, f1, f2, f3, f4, f5, f6, f7;

    if (n < 0) return (0);

    if ((1 == inc_x) && (1 == inc_y))
    {
        /* Contiguous case: software-pipelined loop moving 32 complex
         * elements (16 vectors) per iteration, alternating loads and
         * stores through a rotating set of 16 vector registers. */
        if (n > 31)
        {
            FLOAT *x_pref;
            BLASLONG pref_offset;

            /* Align the prefetch pointer to an L1 line and run well ahead
             * of the load pointer. */
            pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
            if (pref_offset > 0)
            {
                pref_offset = L1_DATA_LINESIZE - pref_offset;
                pref_offset = pref_offset / sizeof(FLOAT);
            }
            x_pref = x + pref_offset + 128 + 32;

            /* Prologue: prime x0..x7; the loop body stores the previous
             * batch while loading the next, so it runs (n>>5)-1 times and
             * the epilogue drains the pipeline. */
            LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
            for (i = (n >> 5) - 1; i--;)
            {
                PREF_OFFSET(x_pref, 0);
                PREF_OFFSET(x_pref, 32);
                PREF_OFFSET(x_pref, 64);
                PREF_OFFSET(x_pref, 96);
                PREF_OFFSET(x_pref, 128);
                PREF_OFFSET(x_pref, 160);
                PREF_OFFSET(x_pref, 192);
                PREF_OFFSET(x_pref, 224);
                x_pref += 64;

                x8 = LD_SP(x); x += 4;
                ST_SP(x0, y); y += 4;
                x9 = LD_SP(x); x += 4;
                ST_SP(x1, y); y += 4;
                x10 = LD_SP(x); x += 4;
                ST_SP(x2, y); y += 4;
                x11 = LD_SP(x); x += 4;
                ST_SP(x3, y); y += 4;
                x12 = LD_SP(x); x += 4;
                ST_SP(x4, y); y += 4;
                x13 = LD_SP(x); x += 4;
                ST_SP(x5, y); y += 4;
                x14 = LD_SP(x); x += 4;
                ST_SP(x6, y); y += 4;
                x15 = LD_SP(x); x += 4;
                ST_SP(x7, y); y += 4;
                x0 = LD_SP(x); x += 4;
                ST_SP(x8, y); y += 4;
                x1 = LD_SP(x); x += 4;
                ST_SP(x9, y); y += 4;
                x2 = LD_SP(x); x += 4;
                ST_SP(x10, y); y += 4;
                x3 = LD_SP(x); x += 4;
                ST_SP(x11, y); y += 4;
                x4 = LD_SP(x); x += 4;
                ST_SP(x12, y); y += 4;
                x5 = LD_SP(x); x += 4;
                ST_SP(x13, y); y += 4;
                x6 = LD_SP(x); x += 4;
                ST_SP(x14, y); y += 4;
                x7 = LD_SP(x); x += 4;
                ST_SP(x15, y); y += 4;
            }

            /* Epilogue: load the final 8 vectors, flush the 8 still held
             * in x0..x7, then store the last batch. */
            x8 = LD_SP(x); x += 4;
            x9 = LD_SP(x); x += 4;
            ST_SP(x0, y); y += 4;
            x10 = LD_SP(x); x += 4;
            ST_SP(x1, y); y += 4;
            x11 = LD_SP(x); x += 4;
            ST_SP(x2, y); y += 4;
            x12 = LD_SP(x); x += 4;
            ST_SP(x3, y); y += 4;
            x13 = LD_SP(x); x += 4;
            ST_SP(x4, y); y += 4;
            x14 = LD_SP(x); x += 4;
            ST_SP(x5, y); y += 4;
            x15 = LD_SP(x); x += 4;
            ST_SP(x6, y); y += 4;
            ST_SP(x7, y); y += 4;

            ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
        }

        /* Remainder: 16, 8, 4, 2, 1 complex elements. */
        if (n & 31)
        {
            if (n & 16)
            {
                LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
                ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
            }

            if (n & 8)
            {
                LD_SP4_INC(x, 4, x0, x1, x2, x3);
                ST_SP4_INC(x0, x1, x2, x3, y, 4);
            }

            if (n & 4)
            {
                LD_SP2_INC(x, 4, x0, x1);
                ST_SP2_INC(x0, x1, y, 4);
            }

            if (n & 2)
            {
                LD_GP4_INC(x, 1, f0, f1, f2, f3);
                ST_GP4_INC(f0, f1, f2, f3, y, 1);
            }

            if (n & 1)
            {
                LD_GP2_INC(x, 1, f0, f1);
                ST_GP2_INC(f0, f1, y, 1);
            }
        }
    }
    else
    {
        /* Strided case: scalar copy, 4 complex elements per iteration.
         * Strides are converted to float units (2 floats per complex). */
        inc_x2 = 2 * inc_x;
        inc_y2 = 2 * inc_y;

        for (i = (n >> 2); i--;)
        {
            f0 = *x;
            f1 = *(x+1); x += inc_x2;
            f2 = *x;
            f3 = *(x+1); x += inc_x2;
            f4 = *x;
            f5 = *(x+1); x += inc_x2;
            f6 = *x;
            f7 = *(x+1); x += inc_x2;

            *y = f0;
            *(y+1) = f1; y += inc_y2;
            *y = f2;
            *(y+1) = f3; y += inc_y2;
            *y = f4;
            *(y+1) = f5; y += inc_y2;
            *y = f6;
            *(y+1) = f7; y += inc_y2;
        }

        if (n & 2)
        {
            f0 = *x;
            f1 = *(x+1); x += inc_x2;
            f2 = *x;
            f3 = *(x+1); x += inc_x2;

            *y = f0;
            *(y+1) = f1; y += inc_y2;
            *y = f2;
            *(y+1) = f3; y += inc_y2;
        }

        if (n & 1)
        {
            LD_GP2_INC(x, 1, f0, f1);
            ST_GP2_INC(f0, f1, y, 1);
        }
    }

    return (0);
}
diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c
new file mode 100644
index 0000000..11a1450
--- /dev/null
+++ b/kernel/mips/cscal_msa.c
@@ -0,0 +1,1012 @@
/*******************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +/* This will shuffle the elements in 'in' vector as (mask needed :: 10 11 00 01) + 0 1 2 3 => 1 0 3 2 */ +#define SHF_177 177 + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) +{ + BLASLONG i, inc_x2; + FLOAT *px; + FLOAT tp0, tp1, tp2, tp3, f0, f1, f2, f3; + v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + v4f32 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15; + v4f32 da_i_vec, da_i_vec_neg, da_r_vec; + + px = x; + + if (1 == inc_x) + { + if ((0.0 == da_r) && (0.0 == da_i)) + { + v4f32 zero_v = __msa_cast_to_vector_float(0); + zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); + zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); + zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); + zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + + for (i = (n >> 5); i--;) + { + ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, + zero_v, zero_v, x, 4); + ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, + zero_v, zero_v, x, 4); + } + + if (n & 31) + { + if (n & 16) + { + ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v, + zero_v, zero_v, x, 4); + } + + if (n & 8) + { + ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4); + } + + if (n & 4) + { + ST_SP2_INC(zero_v, zero_v, x, 4); + } + + if (n & 2) + { + ST_SP(zero_v, x); x += 4; + } + + if (n & 1) + { + *x = 0; x += 1; + *x = 0; + } + } + } + else if (0.0 == da_r) + { + da_i_vec = COPY_FLOAT_TO_VECTOR(da_i); + da_i_vec_neg = -da_i_vec; + da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec); + + if (n > 31) + { + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = 
L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 64 + 32; + + LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); + for (i = (n >> 5)- 1; i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(x_pref, 128); + PREF_OFFSET(x_pref, 160); + PREF_OFFSET(x_pref, 192); + PREF_OFFSET(x_pref, 224); + x_pref += 64; + + x8 = LD_SP(px); px += 4; + x0 *= da_i_vec; + x9 = LD_SP(px); px += 4; + x1 *= da_i_vec; + x10 = LD_SP(px); px += 4; + x2 *= da_i_vec; + x11 = LD_SP(px); px += 4; + x3 *= da_i_vec; + x12 = LD_SP(px); px += 4; + x4 *= da_i_vec; + x13 = LD_SP(px); px += 4; + x5 *= da_i_vec; + x0 = (v4f32) __msa_shf_w((v4i32) x0, SHF_177); + x14 = LD_SP(px); px += 4; + x6 *= da_i_vec; + x1 = (v4f32) __msa_shf_w((v4i32) x1, SHF_177); + x15 = LD_SP(px); px += 4; + x7 *= da_i_vec; + x2 = (v4f32) __msa_shf_w((v4i32) x2, SHF_177); + x8 *= da_i_vec; + x3 = (v4f32) __msa_shf_w((v4i32) x3, SHF_177); + ST_SP(x0, x); x += 4; + x9 *= da_i_vec; + x4 = (v4f32) __msa_shf_w((v4i32) x4, SHF_177); + ST_SP(x1, x); x += 4; + x10 *= da_i_vec; + x5 = (v4f32) __msa_shf_w((v4i32) x5, SHF_177); + ST_SP(x2, x); x += 4; + x11 *= da_i_vec; + x6 = (v4f32) __msa_shf_w((v4i32) x6, SHF_177); + ST_SP(x3, x); x += 4; + x12 *= da_i_vec; + x7 = (v4f32) __msa_shf_w((v4i32) x7, SHF_177); + ST_SP(x4, x); x += 4; + x13 *= da_i_vec; + x8 = (v4f32) __msa_shf_w((v4i32) x8, SHF_177); + ST_SP(x5, x); x += 4; + x14 *= da_i_vec; + x9 = (v4f32) __msa_shf_w((v4i32) x9, SHF_177); + ST_SP(x6, x); x += 4; + x15 *= da_i_vec; + x10 = (v4f32) __msa_shf_w((v4i32) x10, SHF_177); + ST_SP(x7, x); x += 4; + x11 = (v4f32) __msa_shf_w((v4i32) x11, SHF_177); + ST_SP(x8, x); x += 4; + x0 = LD_SP(px); px += 4; + x12 = (v4f32) __msa_shf_w((v4i32) x12, SHF_177); + ST_SP(x9, x); x += 4; + x1 = LD_SP(px); px += 4; + x13 = (v4f32) __msa_shf_w((v4i32) x13, SHF_177); + ST_SP(x10, x); x += 4; + x2 = LD_SP(px); px += 
4; + x14 = (v4f32) __msa_shf_w((v4i32) x14, SHF_177); + ST_SP(x11, x); x += 4; + x3 = LD_SP(px); px += 4; + x15 = (v4f32) __msa_shf_w((v4i32) x15, SHF_177); + ST_SP(x12, x); x += 4; + x4 = LD_SP(px); px += 4; + ST_SP(x13, x); x += 4; + x5 = LD_SP(px); px += 4; + ST_SP(x14, x); x += 4; + x6 = LD_SP(px); px += 4; + ST_SP(x15, x); x += 4; + x7 = LD_SP(px); px += 4; + } + + LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15); + MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, + x0, x1, x2, x3); + MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, + x4, x5, x6, x7); + MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, + x8, x9, x10, x11); + MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, + x12, x13, x14, x15); + SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); + SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177); + SHF_W4_SP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_177); + SHF_W4_SP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_177); + ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, + x12, x13, x14, x15, x, 4); + } + + if (n & 31) + { + if (n & 16) + { + LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); + MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, + x0, x1, x2, x3); + MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, + x4, x5, x6, x7); + SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); + SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177); + ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); + } + + if (n & 8) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, + x0, x1, x2, x3); + SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); + ST_SP4_INC(x0, x1, x2, x3, x, 4); + } + + if (n & 4) + { + LD_SP2_INC(px, 4, x0, x1); + MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1); + SHF_W2_SP(x0, x1, x0, x1, SHF_177); + ST_SP2_INC(x0, x1, x, 4); + } + + if (n & 2) + { + LD_GP4_INC(px, 1, f0, f1, f2, f3); + MUL4(f0, da_i, f1, -da_i, f2, 
da_i, f3, -da_i, + f0, f1, f2, f3); + ST_GP4_INC(f1, f0, f3, f2, x, 1); + } + + if (n & 1) + { + LD_GP2_INC(px, 1, f0, f1); + MUL2(f0, da_i, f1, -da_i, f0, f1); + ST_GP2_INC(f1, f0, x, 1); + } + } + } + else if (0.0 == da_i) + { + da_r_vec = COPY_FLOAT_TO_VECTOR(da_r); + + if (n > 31) + { + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 64 + 32; + + LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); + for (i = (n >> 5)- 1; i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(x_pref, 128); + PREF_OFFSET(x_pref, 160); + PREF_OFFSET(x_pref, 192); + PREF_OFFSET(x_pref, 224); + x_pref += 64; + + x8 = LD_SP(px); px += 4; + x0 *= da_r_vec; + x9 = LD_SP(px); px += 4; + x1 *= da_r_vec; + x10 = LD_SP(px); px += 4; + x2 *= da_r_vec; + x11 = LD_SP(px); px += 4; + x3 *= da_r_vec; + x12 = LD_SP(px); px += 4; + x4 *= da_r_vec; + x13 = LD_SP(px); px += 4; + x5 *= da_r_vec; + ST_SP(x0, x); x += 4; + x14 = LD_SP(px); px += 4; + x6 *= da_r_vec; + ST_SP(x1, x); x += 4; + x15 = LD_SP(px); px += 4; + x7 *= da_r_vec; + ST_SP(x2, x); x += 4; + x8 *= da_r_vec; + ST_SP(x3, x); x += 4; + x9 *= da_r_vec; + ST_SP(x4, x); x += 4; + x10 *= da_r_vec; + ST_SP(x5, x); x += 4; + x11 *= da_r_vec; + ST_SP(x6, x); x += 4; + x12 *= da_r_vec; + ST_SP(x7, x); x += 4; + x13 *= da_r_vec; + ST_SP(x8, x); x += 4; + x0 = LD_SP(px); px += 4; + x14 *= da_r_vec; + ST_SP(x9, x); x += 4; + x1 = LD_SP(px); px += 4; + x15 *= da_r_vec; + ST_SP(x10, x); x += 4; + x2 = LD_SP(px); px += 4; + ST_SP(x11, x); x += 4; + x3 = LD_SP(px); px += 4; + ST_SP(x12, x); x += 4; + x4 = LD_SP(px); px += 4; + ST_SP(x13, x); x += 4; + x5 = LD_SP(px); px += 4; + ST_SP(x14, x); x += 4; + x6 = LD_SP(px); px += 4; + ST_SP(x15, x); x += 4; + x7 = LD_SP(px); px += 4; + 
} + + LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15); + MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, + x0, x1, x2, x3); + MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, + x4, x5, x6, x7); + MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec, + x8, x9, x10, x11); + MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec, + x12, x13, x14, x15); + ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, + x12, x13, x14, x15, x, 4); + } + + if (n & 31) + { + if (n & 16) + { + LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); + MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, + x0, x1, x2, x3); + MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec, + x4, x5, x6, x7); + ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); + } + + if (n & 8) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec, + x0, x1, x2, x3); + ST_SP4_INC(x0, x1, x2, x3, x, 4); + } + + if (n & 4) + { + LD_SP2_INC(px, 4, x0, x1); + MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1); + ST_SP2_INC(x0, x1, x, 4); + } + + if (n & 2) + { + LD_GP4_INC(px, 1, f0, f1, f2, f3); + MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3); + ST_GP4_INC(f0, f1, f2, f3, x, 1); + } + + if (n & 1) + { + LD_GP2_INC(px, 1, f0, f1); + MUL2(f0, da_r, f1, da_r, f0, f1); + ST_GP2_INC(f0, f1, x, 1); + } + } + } + else + { + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 64; + + da_i_vec = COPY_FLOAT_TO_VECTOR(da_i); + da_i_vec_neg = -da_i_vec; + da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec); + + da_r_vec = COPY_FLOAT_TO_VECTOR(da_r); + + for (i = (n >> 5); i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + 
PREF_OFFSET(x_pref, 128); + PREF_OFFSET(x_pref, 160); + PREF_OFFSET(x_pref, 192); + PREF_OFFSET(x_pref, 224); + x_pref += 64; + + LD_SP16_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, + x11, x12, x13, x14, x15); + MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, + d0, d1, d2, d3); + MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, + d4, d5, d6, d7); + MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec, + d8, d9, d10, d11); + MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec, + d12, d13, d14, d15); + SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177); + SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177); + SHF_W4_SP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_177); + SHF_W4_SP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_177); + FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); + FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); + FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11); + FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15); + ST_SP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, + d12, d13, d14, d15, x, 4); + } + + if (n & 31) + { + if (n & 16) + { + LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); + MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, + d0, d1, d2, d3); + MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec, + d4, d5, d6, d7); + SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177); + SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177); + FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); + FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7); + ST_SP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 4); + } + + if (n & 8) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec, + d0, d1, d2, d3); + SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177); + FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3); + ST_SP4_INC(d0, d1, d2, d3, x, 4); + } + + if (n & 4) + { + LD_SP2_INC(px, 4, x0, x1); + MUL2(x0, da_i_vec, x1, da_i_vec, 
d0, d1); + SHF_W2_SP(d0, d1, d0, d1, SHF_177); + FMADD2(x0, x1, da_r_vec, d0, d1); + ST_SP2_INC(d0, d1, x, 4); + } + + if (n & 2) + { + LD_GP4_INC(px, 1, f0, f1, f2, f3); + + tp0 = da_r * f0; + tp0 -= da_i * f1; + tp1 = da_r * f1; + tp1 += da_i * f0; + tp2 = da_r * f2; + tp2 -= da_i * f3; + tp3 = da_r * f3; + tp3 += da_i * f2; + + ST_GP4_INC(tp0, tp1, tp2, tp3, x, 1); + } + + if (n & 1) + { + LD_GP2_INC(px, 1, f0, f1); + + tp0 = da_r * f0; + tp0 -= da_i * f1; + tp1 = da_r * f1; + tp1 += da_i * f0; + + ST_GP2_INC(tp0, tp1, x, 1); + } + } + } + } + else + { + inc_x2 = 2 * inc_x; + + if ((0.0 == da_r) && (0.0 == da_i)) + { + for (i = n; i--;) + { + *x = 0; + *(x + 1) = 0; + + x += inc_x2; + } + } + else if (0.0 == da_r) + { + da_i_vec = COPY_FLOAT_TO_VECTOR(da_i); + da_i_vec_neg = -da_i_vec; + da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec); + + for (i = (n >> 4); i--;) + { + LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, + x10, x11, x12, x13, x14, x15); + PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); + PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7); + MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec, + d0, d1, d2, d3); + MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec, + d4, d5, d6, d7); + + *x = d0[1]; + *(x + 1) = d0[0]; + x += inc_x2; + *x = d0[3]; + *(x + 1) = d0[2]; + x += inc_x2; + *x = d1[1]; + *(x + 1) = d1[0]; + x += inc_x2; + *x = d1[3]; + *(x + 1) = d1[2]; + x += inc_x2; + *x = d2[1]; + *(x + 1) = d2[0]; + x += inc_x2; + *x = d2[3]; + *(x + 1) = d2[2]; + x += inc_x2; + *x = d3[1]; + *(x + 1) = d3[0]; + x += inc_x2; + *x = d3[3]; + *(x + 1) = d3[2]; + x += inc_x2; + *x = d4[1]; + *(x + 1) = d4[0]; + x += inc_x2; + *x = d4[3]; + *(x + 1) = d4[2]; + x += inc_x2; + *x = d5[1]; + *(x + 1) = d5[0]; + x += inc_x2; + *x = d5[3]; + *(x + 1) = d5[2]; + x += inc_x2; + *x = d6[1]; + *(x + 1) = d6[0]; + x += inc_x2; + *x = d6[3]; + *(x + 1) = d6[2]; + x += inc_x2; + *x = 
d7[1]; + *(x + 1) = d7[0]; + x += inc_x2; + *x = d7[3]; + *(x + 1) = d7[2]; + x += inc_x2; + } + + if (n & 15) + { + if (n & 8) + { + LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); + PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); + MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec, + d0, d1, d2, d3); + + *x = d0[1]; + *(x + 1) = d0[0]; + x += inc_x2; + *x = d0[3]; + *(x + 1) = d0[2]; + x += inc_x2; + *x = d1[1]; + *(x + 1) = d1[0]; + x += inc_x2; + *x = d1[3]; + *(x + 1) = d1[2]; + x += inc_x2; + *x = d2[1]; + *(x + 1) = d2[0]; + x += inc_x2; + *x = d2[3]; + *(x + 1) = d2[2]; + x += inc_x2; + *x = d3[1]; + *(x + 1) = d3[0]; + x += inc_x2; + *x = d3[3]; + *(x + 1) = d3[2]; + x += inc_x2; + } + + if (n & 4) + { + LD_SP4_INC(px, inc_x2, x0, x1, x2, x3); + PCKEV_D2_SP(x1, x0, x3, x2, d0, d1); + MUL2(d0, da_i_vec, d1, da_i_vec, d0, d1); + + *x = d0[1]; + *(x + 1) = d0[0]; + x += inc_x2; + *x = d0[3]; + *(x + 1) = d0[2]; + x += inc_x2; + *x = d1[1]; + *(x + 1) = d1[0]; + x += inc_x2; + *x = d1[3]; + *(x + 1) = d1[2]; + x += inc_x2; + } + + if (n & 2) + { + f0 = *px; + f1 = *(px + 1); + px += inc_x2; + f2 = *px; + f3 = *(px + 1); + px += inc_x2; + + MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i, f0, f1, f2, f3); + + *x = f1; + *(x + 1) = f0; + x += inc_x2; + *x = f3; + *(x + 1) = f2; + x += inc_x2; + } + + if (n & 1) + { + f0 = *x; + f1 = *(x + 1); + + MUL2(f0, da_i, f1, -da_i, f0, f1); + + *x = f1; + *(x + 1) = f0; + } + } + } + else if (0.0 == da_i) + { + da_r_vec = COPY_FLOAT_TO_VECTOR(da_r); + + for (i = (n >> 4); i--;) + { + LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, + x10, x11, x12, x13, x14, x15); + PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); + PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7); + MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec, + d0, d1, d2, d3); + MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec, + d4, d5, d6, d7); + + *x = d0[0]; + *(x + 
1) = d0[1]; + x += inc_x2; + *x = d0[2]; + *(x + 1) = d0[3]; + x += inc_x2; + *x = d1[0]; + *(x + 1) = d1[1]; + x += inc_x2; + *x = d1[2]; + *(x + 1) = d1[3]; + x += inc_x2; + *x = d2[0]; + *(x + 1) = d2[1]; + x += inc_x2; + *x = d2[2]; + *(x + 1) = d2[3]; + x += inc_x2; + *x = d3[0]; + *(x + 1) = d3[1]; + x += inc_x2; + *x = d3[2]; + *(x + 1) = d3[3]; + x += inc_x2; + *x = d4[0]; + *(x + 1) = d4[1]; + x += inc_x2; + *x = d4[2]; + *(x + 1) = d4[3]; + x += inc_x2; + *x = d5[0]; + *(x + 1) = d5[1]; + x += inc_x2; + *x = d5[2]; + *(x + 1) = d5[3]; + x += inc_x2; + *x = d6[0]; + *(x + 1) = d6[1]; + x += inc_x2; + *x = d6[2]; + *(x + 1) = d6[3]; + x += inc_x2; + *x = d7[0]; + *(x + 1) = d7[1]; + x += inc_x2; + *x = d7[2]; + *(x + 1) = d7[3]; + x += inc_x2; + } + + if (n & 15) + { + if (n & 8) + { + LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); + PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); + MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec, + d0, d1, d2, d3); + + *x = d0[0]; + *(x + 1) = d0[1]; + x += inc_x2; + *x = d0[2]; + *(x + 1) = d0[3]; + x += inc_x2; + *x = d1[0]; + *(x + 1) = d1[1]; + x += inc_x2; + *x = d1[2]; + *(x + 1) = d1[3]; + x += inc_x2; + *x = d2[0]; + *(x + 1) = d2[1]; + x += inc_x2; + *x = d2[2]; + *(x + 1) = d2[3]; + x += inc_x2; + *x = d3[0]; + *(x + 1) = d3[1]; + x += inc_x2; + *x = d3[2]; + *(x + 1) = d3[3]; + x += inc_x2; + } + + if (n & 4) + { + LD_SP4_INC(px, inc_x2, x0, x1, x2, x3); + PCKEV_D2_SP(x1, x0, x3, x2, d0, d1); + MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1); + + *x = d0[0]; + *(x + 1) = d0[1]; + x += inc_x2; + *x = d0[2]; + *(x + 1) = d0[3]; + x += inc_x2; + *x = d1[0]; + *(x + 1) = d1[1]; + x += inc_x2; + *x = d1[2]; + *(x + 1) = d1[3]; + x += inc_x2; + } + + if (n & 2) + { + f0 = *px; + f1 = *(px + 1); + px += inc_x2; + f2 = *px; + f3 = *(px + 1); + px += inc_x2; + + MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3); + + *x = f0; + *(x + 1) = f1; + x += inc_x2; + *x = f2; + *(x + 1) = f3; 
+ x += inc_x2; + } + + if (n & 1) + { + f0 = *x; + f1 = *(x + 1); + + MUL2(f0, da_r, f1, da_r, f0, f1); + + *x = f0; + *(x + 1) = f1; + } + } + } + else + { + da_i_vec = COPY_FLOAT_TO_VECTOR(da_i); + da_i_vec_neg = -da_i_vec; + da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec); + + da_r_vec = COPY_FLOAT_TO_VECTOR(da_r); + + for (i = (n >> 4); i--;) + { + LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, + x10, x11, x12, x13, x14, x15); + PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); + PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7); + MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec, + x0, x1, x2, x3); + MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec, + x4, x5, x6, x7); + MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec, + d0, d1, d2, d3); + MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec, + d4, d5, d6, d7); + SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); + SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177); + ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3); + ADD4(d4, x4, d5, x5, d6, x6, d7, x7, d4, d5, d6, d7); + + *x = d0[0]; + *(x + 1) = d0[1]; + x += inc_x2; + *x = d0[2]; + *(x + 1) = d0[3]; + x += inc_x2; + *x = d1[0]; + *(x + 1) = d1[1]; + x += inc_x2; + *x = d1[2]; + *(x + 1) = d1[3]; + x += inc_x2; + *x = d2[0]; + *(x + 1) = d2[1]; + x += inc_x2; + *x = d2[2]; + *(x + 1) = d2[3]; + x += inc_x2; + *x = d3[0]; + *(x + 1) = d3[1]; + x += inc_x2; + *x = d3[2]; + *(x + 1) = d3[3]; + x += inc_x2; + *x = d4[0]; + *(x + 1) = d4[1]; + x += inc_x2; + *x = d4[2]; + *(x + 1) = d4[3]; + x += inc_x2; + *x = d5[0]; + *(x + 1) = d5[1]; + x += inc_x2; + *x = d5[2]; + *(x + 1) = d5[3]; + x += inc_x2; + *x = d6[0]; + *(x + 1) = d6[1]; + x += inc_x2; + *x = d6[2]; + *(x + 1) = d6[3]; + x += inc_x2; + *x = d7[0]; + *(x + 1) = d7[1]; + x += inc_x2; + *x = d7[2]; + *(x + 1) = d7[3]; + x += inc_x2; + } + + if (n & 15) + { + if (n & 8) + { + 
LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); + PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3); + MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec, + x0, x1, x2, x3); + MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec, + d0, d1, d2, d3); + SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177); + ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3); + + *x = d0[0]; + *(x + 1) = d0[1]; + x += inc_x2; + *x = d0[2]; + *(x + 1) = d0[3]; + x += inc_x2; + *x = d1[0]; + *(x + 1) = d1[1]; + x += inc_x2; + *x = d1[2]; + *(x + 1) = d1[3]; + x += inc_x2; + *x = d2[0]; + *(x + 1) = d2[1]; + x += inc_x2; + *x = d2[2]; + *(x + 1) = d2[3]; + x += inc_x2; + *x = d3[0]; + *(x + 1) = d3[1]; + x += inc_x2; + *x = d3[2]; + *(x + 1) = d3[3]; + x += inc_x2; + } + + if (n & 4) + { + LD_SP4_INC(px, inc_x2, x0, x1, x2, x3); + PCKEV_D2_SP(x1, x0, x3, x2, d0, d1); + MUL2(d0, da_i_vec, d1, da_i_vec, x0, x1); + MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1); + SHF_W2_SP(x0, x1, x0, x1, SHF_177); + ADD2(d0, x0, d1, x1, d0, d1); + + *x = d0[0]; + *(x + 1) = d0[1]; + x += inc_x2; + *x = d0[2]; + *(x + 1) = d0[3]; + x += inc_x2; + *x = d1[0]; + *(x + 1) = d1[1]; + x += inc_x2; + *x = d1[2]; + *(x + 1) = d1[3]; + x += inc_x2; + } + + if (n & 2) + { + f0 = *px;; + f1 = *(px + 1); + px += inc_x2; + f2 = *px; + f3 = *(px + 1); + px += inc_x2; + + tp0 = da_r * f0; + tp0 -= da_i * f1; + tp1 = da_r * f1; + tp1 += da_i * f0; + tp2 = da_r * f2; + tp2 -= da_i * f3; + tp3 = da_r * f3; + tp3 += da_i * f2; + + *x = tp0; + *(x + 1) = tp1; + x += inc_x2; + *x = tp2; + *(x + 1) = tp3; + x += inc_x2; + } + + if (n & 1) + { + f0 = *px; px += 1; + f1 = *px; + + tp0 = da_r * f0; + tp0 -= da_i * f1; + tp1 = da_r * f1; + tp1 += da_i * f0; + + *x = tp0; x += 1; + *x = tp1; + } + } + } + } + + return (0); +} diff --git a/kernel/mips/cswap_msa.c b/kernel/mips/cswap_msa.c new file mode 100644 index 0000000..632726a --- /dev/null +++ b/kernel/mips/cswap_msa.c @@ -0,0 +1,281 @@ 
+/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, + BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, pref_offsetx, pref_offsety; + FLOAT *px, *py; + BLASLONG inc_x2, inc_y2; + FLOAT x0, x1, x2, x3, x4, x5, x6, x7; + FLOAT y0, y1, y2, y3, y4, y5, y6, y7; + v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; + v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; + + if (n < 0) return (0); + + pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + px = srcx; + py = srcy; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ((1 == inc_x) && (1 == inc_y)) + { + if (n >> 4) + { + LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); + + for (i = (n >> 4) - 1; i--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + yv0 = LD_SP(py); py += 4; + ST_SP(xv0, srcy); srcy += 4; + yv1 = LD_SP(py); py += 4; + ST_SP(xv1, srcy); srcy += 4; + yv2 = LD_SP(py); py += 4; + ST_SP(xv2, srcy); srcy += 4; + yv3 = LD_SP(py); py += 4; + ST_SP(xv3, srcy); srcy += 4; + yv4 = LD_SP(py); py += 4; + ST_SP(xv4, srcy); srcy += 4; + yv5 = LD_SP(py); py += 4; + ST_SP(xv5, srcy); srcy += 4; + yv6 = LD_SP(py); py += 4; + ST_SP(xv6, srcy); srcy += 4; + yv7 = LD_SP(py); py += 4; + ST_SP(xv7, srcy); srcy += 4; + + xv0 = 
LD_SP(px); px += 4; + ST_SP(yv0, srcx); srcx += 4; + xv1 = LD_SP(px); px += 4; + ST_SP(yv1, srcx); srcx += 4; + xv2 = LD_SP(px); px += 4; + ST_SP(yv2, srcx); srcx += 4; + xv3 = LD_SP(px); px += 4; + ST_SP(yv3, srcx); srcx += 4; + xv4 = LD_SP(px); px += 4; + ST_SP(yv4, srcx); srcx += 4; + xv5 = LD_SP(px); px += 4; + ST_SP(yv5, srcx); srcx += 4; + xv6 = LD_SP(px); px += 4; + ST_SP(yv6, srcx); srcx += 4; + xv7 = LD_SP(px); px += 4; + ST_SP(yv7, srcx); srcx += 4; + } + + LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); + ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4); + ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4); + } + + if (n & 15) + { + if ((n & 8) && (n & 4) && (n & 2)) + { + LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6); + LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6); + ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4); + ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4); + } + else if ((n & 8) && (n & 4)) + { + LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5); + LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5); + ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4); + ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4); + } + else if ((n & 8) && (n & 2)) + { + LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4); + LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4); + ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4); + ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4); + } + else if ((n & 4) && (n & 2)) + { + LD_SP3_INC(px, 4, xv0, xv1, xv2); + LD_SP3_INC(py, 4, yv0, yv1, yv2); + ST_SP3_INC(xv0, xv1, xv2, srcy, 4); + ST_SP3_INC(yv0, yv1, yv2, srcx, 4); + } + else if (n & 8) + { + LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3); + LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3); + ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4); + ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4); + } + else if (n & 4) + { + LD_SP2_INC(px, 4, xv0, xv1); + LD_SP2_INC(py, 4, yv0, yv1); + ST_SP2_INC(xv0, xv1, srcy, 4); + ST_SP2_INC(yv0, yv1, srcx, 4); + } + else if (n & 2) + { + xv0 = 
LD_SP(px); + yv0 = LD_SP(py); + + px += 4; + py += 4; + + ST_SP(xv0, srcy); + ST_SP(yv0, srcx); + + srcx += 4; + srcy += 4; + } + + if (n & 1) + { + LD_GP2_INC(px, 1, x0, x1); + LD_GP2_INC(py, 1, y0, y1); + ST_GP2_INC(x0, x1, srcy, 1); + ST_GP2_INC(y0, y1, srcx, 1); + } + } + } + else + { + for (i = (n >> 2); i--;) + { + x0 = srcx[0 * inc_x2]; + x1 = srcx[0 * inc_x2 + 1]; + x2 = srcx[1 * inc_x2]; + x3 = srcx[1 * inc_x2 + 1]; + x4 = srcx[2 * inc_x2]; + x5 = srcx[2 * inc_x2 + 1]; + x6 = srcx[3 * inc_x2]; + x7 = srcx[3 * inc_x2 + 1]; + + y0 = srcy[0 * inc_y2]; + y1 = srcy[0 * inc_y2 + 1]; + y2 = srcy[1 * inc_y2]; + y3 = srcy[1 * inc_y2 + 1]; + y4 = srcy[2 * inc_y2]; + y5 = srcy[2 * inc_y2 + 1]; + y6 = srcy[3 * inc_y2]; + y7 = srcy[3 * inc_y2 + 1]; + + srcx[0 * inc_x2] = y0; + srcx[0 * inc_x2 + 1] = y1; + srcx[1 * inc_x2] = y2; + srcx[1 * inc_x2 + 1] = y3; + srcx[2 * inc_x2] = y4; + srcx[2 * inc_x2 + 1] = y5; + srcx[3 * inc_x2] = y6; + srcx[3 * inc_x2 + 1] = y7; + + srcy[0 * inc_y2] = x0; + srcy[0 * inc_y2 + 1] = x1; + srcy[1 * inc_y2] = x2; + srcy[1 * inc_y2 + 1] = x3; + srcy[2 * inc_y2] = x4; + srcy[2 * inc_y2 + 1] = x5; + srcy[3 * inc_y2] = x6; + srcy[3 * inc_y2 + 1] = x7; + + srcx += 4 * inc_x2; + srcy += 4 * inc_y2; + } + + if (n & 2) + { + x0 = srcx[0 * inc_x2]; + x1 = srcx[0 * inc_x2 + 1]; + x2 = srcx[1 * inc_x2]; + x3 = srcx[1 * inc_x2 + 1]; + + y0 = srcy[0 * inc_y2]; + y1 = srcy[0 * inc_y2 + 1]; + y2 = srcy[1 * inc_y2]; + y3 = srcy[1 * inc_y2 + 1]; + + srcx[0 * inc_x2] = y0; + srcx[0 * inc_x2 + 1] = y1; + srcx[1 * inc_x2] = y2; + srcx[1 * inc_x2 + 1] = y3; + + srcy[0 * inc_y2] = x0; + srcy[0 * inc_y2 + 1] = x1; + srcy[1 * inc_y2] = x2; + srcy[1 * inc_y2 + 1] = x3; + + srcx += 2 * inc_x2; + srcy += 2 * inc_y2; + } + + if (n & 1) + { + x0 = srcx[0 * inc_x2]; + x1 = srcx[0 * inc_x2 + 1]; + + y0 = srcy[0 * inc_y2]; + y1 = srcy[0 * inc_y2 + 1]; + + srcx[0 * inc_x2] = y0; + srcx[0 * inc_x2 + 1] = y1; + + srcy[0 * inc_y2] = x0; + srcy[0 * inc_y2 + 1] = x1; + + srcx 
+= inc_x2; + srcy += inc_y2; + } + } + + return (0); +} diff --git a/kernel/mips/daxpy_msa.c b/kernel/mips/daxpy_msa.c new file mode 100644 index 0000000..789b78c --- /dev/null +++ b/kernel/mips/daxpy_msa.c @@ -0,0 +1,246 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#if !defined(CONJ) + #define OP0 += + #define OP1 -= + #define OP2 += +#else + #define OP0 -= + #define OP1 += + #define OP2 -= +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) +{ + BLASLONG i; + FLOAT *py; + v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; + v2f64 da_vec, zero_v = {0}; + + if ((n < 0) || (da == 0.0)) return(0); + + py = y; + + if ((1 == inc_x) && (1 == inc_y)) + { + FLOAT *x_pref, *y_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 32; + + pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + y_pref = y + pref_offset + 32; + + da_vec = COPY_DOUBLE_TO_VECTOR(da); + + for (i = (n >> 4); i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(y_pref, 0); + PREF_OFFSET(y_pref, 32); + PREF_OFFSET(y_pref, 64); + PREF_OFFSET(y_pref, 96); + x_pref += 16; + y_pref += 16; + + LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); + LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); + FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); + FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); + ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); + } + + if (n & 15) + { + if (n & 8) + { + LD_DP4_INC(x, 2, x0, x1, x2, x3); + LD_DP4_INC(py, 2, y0, y1, y2, y3); + FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); + ST_DP4_INC(y0, y1, y2, y3, y, 2); + } + + if (n & 4) + { + LD_DP2_INC(x, 2, x0, x1); + LD_DP2_INC(py, 2, y0, y1); + 
FMADD2(x0, x1, da_vec, y0, y1); + ST_DP2_INC(y0, y1, y, 2); + } + + if (n & 2) + { + x0 = LD_DP(x); x += 2; + y0 = LD_DP(py); py += 2; + y0 += da_vec * x0; + ST_DP(y0, y); y += 2; + } + + if (n & 1) + { + y[0] += da * x[0]; + } + } + } + else if (1 == inc_y) + { + FLOAT *y_pref; + BLASLONG pref_offset; + v2f64 x8, x9, x10, x11, x12, x13, x14; + + pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + y_pref = y + pref_offset + 32; + + da_vec = COPY_DOUBLE_TO_VECTOR(da); + + for (i = (n >> 4); i--;) + { + PREF_OFFSET(y_pref, 0); + PREF_OFFSET(y_pref, 32); + PREF_OFFSET(y_pref, 64); + PREF_OFFSET(y_pref, 96); + y_pref += 16; + + LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x14); + LD_DP7_INC(x, inc_x, x8, x9, x10, x11, x12, x13, x7); + + PCKEV_D2_SD(x1, x0, x3, x2, x0, x1); + PCKEV_D2_SD(x5, x4, x14, x6, x2, x3); + PCKEV_D2_SD(x9, x8, x11, x10, x4, x5); + x6 = (v2f64) __msa_pckev_d((v2i64) x13, (v2i64) x12); + x7 = (v2f64) __msa_insert_d((v2i64) x7, 1, *((BLASLONG *) x)); + x += inc_x; + + LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); + FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); + FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7); + ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); + } + + if (n & 15) + { + if (n & 8) + { + LD_DP7_INC(x, inc_x, x0, x1, x2, x6, x4, x5, x3); + + PCKEV_D2_SD(x1, x0, x6, x2, x0, x1); + x2 = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4); + x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((BLASLONG *) x)); + x += inc_x; + + LD_DP4_INC(py, 2, y0, y1, y2, y3); + FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3); + ST_DP4_INC(y0, y1, y2, y3, y, 2); + } + + if (n & 4) + { + LD_DP3_INC(x, inc_x, x0, x2, x1); + + x0 = (v2f64) __msa_pckev_d((v2i64) x2, (v2i64) x0); + x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((BLASLONG *) x)); + x += inc_x; + + LD_DP2_INC(py, 2, y0, y1); + FMADD2(x0, x1, da_vec, y0, y1); + 
ST_DP2_INC(y0, y1, y, 2); + } + + if (n & 2) + { + x0 = (v2f64) __msa_insert_d((v2i64) zero_v, 0, *((BLASLONG *) x)); + x += inc_x; + x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((BLASLONG *) x)); + x += inc_x; + + y0 = LD_DP(py); py += 2; + y0 += da_vec * x0; + ST_DP(y0, y); y += 2; + } + + if (n & 1) + { + y[0] += da * x[0]; + } + } + } + else + { + FLOAT x0, x1, x2, x3, y0, y1, y2, y3; + + for (i = (n >> 2); i--;) + { + LD_GP4_INC(x, inc_x, x0, x1, x2, x3); + LD_GP4_INC(py, inc_y, y0, y1, y2, y3); + FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3); + ST_GP4_INC(y0, y1, y2, y3, y, inc_y); + } + + if (n & 3) + { + if (n & 2) + { + LD_GP2_INC(x, inc_x, x0, x1); + LD_GP2_INC(py, inc_y, y0, y1); + FMADD2(x0, x1, da, y0, y1); + ST_GP2_INC(y0, y1, y, inc_y); + } + + if (n & 1) + { + *y += da * *x; + } + } + } + + return (0); +} diff --git a/kernel/mips/dcopy_msa.c b/kernel/mips/dcopy_msa.c new file mode 100644 index 0000000..e73bf34 --- /dev/null +++ b/kernel/mips/dcopy_msa.c @@ -0,0 +1,180 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + + if (n < 0) return (0); + + if ((1 == inc_x) && (1 == inc_y)) + { + if (n > 31) + { + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 64 + 16; + + LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); + for (i = (n >> 5) - 1; i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(x_pref, 128); + PREF_OFFSET(x_pref, 160); + PREF_OFFSET(x_pref, 192); + PREF_OFFSET(x_pref, 224); + x_pref += 32; + + x8 = LD_DP(x); x += 2; + ST_DP(x0, y); y += 2; + x9 = LD_DP(x); x += 2; + ST_DP(x1, y); y += 2; + x10 = LD_DP(x); x += 2; + ST_DP(x2, y); y += 2; + x11 = LD_DP(x); x += 2; + ST_DP(x3, y); y += 2; + x12 = LD_DP(x); x 
+= 2; + ST_DP(x4, y); y += 2; + x13 = LD_DP(x); x += 2; + ST_DP(x5, y); y += 2; + x14 = LD_DP(x); x += 2; + ST_DP(x6, y); y += 2; + x15 = LD_DP(x); x += 2; + ST_DP(x7, y); y += 2; + x0 = LD_DP(x); x += 2; + ST_DP(x8, y); y += 2; + x1 = LD_DP(x); x += 2; + ST_DP(x9, y); y += 2; + x2 = LD_DP(x); x += 2; + ST_DP(x10, y); y += 2; + x3 = LD_DP(x); x += 2; + ST_DP(x11, y); y += 2; + x4 = LD_DP(x); x += 2; + ST_DP(x12, y); y += 2; + x5 = LD_DP(x); x += 2; + ST_DP(x13, y); y += 2; + x6 = LD_DP(x); x += 2; + ST_DP(x14, y); y += 2; + x7 = LD_DP(x); x += 2; + ST_DP(x15, y); y += 2; + } + + x8 = LD_DP(x); x += 2; + x9 = LD_DP(x); x += 2; + ST_DP(x0, y); y += 2; + x10 = LD_DP(x); x += 2; + ST_DP(x1, y); y += 2; + x11 = LD_DP(x); x += 2; + ST_DP(x2, y); y += 2; + x12 = LD_DP(x); x += 2; + ST_DP(x3, y); y += 2; + x13 = LD_DP(x); x += 2; + ST_DP(x4, y); y += 2; + x14 = LD_DP(x); x += 2; + ST_DP(x5, y); y += 2; + x15 = LD_DP(x); x += 2; + ST_DP(x6, y); y += 2; + ST_DP(x7, y); y += 2; + + ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2); + } + + if (n & 31) + { + if (n & 16) + { + LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); + ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2); + } + + if (n & 8) + { + LD_DP4_INC(x, 2, x0, x1, x2, x3); + ST_DP4_INC(x0, x1, x2, x3, y, 2); + } + + if (n & 4) + { + LD_GP4_INC(x, 1, f0, f1, f2, f3); + ST_GP4_INC(f0, f1, f2, f3, y, 1); + } + + if (n & 2) + { + LD_GP2_INC(x, 1, f0, f1); + ST_GP2_INC(f0, f1, y, 1); + } + + if (n & 1) + { + *y = *x; + } + } + } + else + { + for (i = (n >> 3); i--;) + { + LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7); + ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y); + } + + if (n & 4) + { + LD_GP4_INC(x, inc_x, f0, f1, f2, f3); + ST_GP4_INC(f0, f1, f2, f3, y, inc_y); + } + + if (n & 2) + { + LD_GP2_INC(x, inc_x, f0, f1); + ST_GP2_INC(f0, f1, y, inc_y); + } + + if (n & 1) + { + *y = *x; + } + } + + return (0); +} diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c new file mode 100644 index 
0000000..6ce0375 --- /dev/null +++ b/kernel/mips/dscal_msa.c @@ -0,0 +1,368 @@ +/******************************************************************************* +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* DSCAL kernel for MIPS with MSA: scales a double-precision vector in
 * place, x[i] *= da for i = 0 .. n-1 with stride inc_x.
 *
 * The signature follows the standard OpenBLAS SCAL kernel convention;
 * y, inc_y and all dummy* parameters are unused.  Always returns 0.
 *
 * Strategy:
 *   - inc_x == 1, da == 0.0: store a zero vector without reading x
 *     (NOTE(review): NaN/Inf in x are therefore not propagated in this
 *     case -- confirm this matches the generic ../mips/scal.c kernel).
 *   - inc_x == 1, da != 0.0: software-pipelined MSA loop handling
 *     32 doubles per iteration with explicit L1 prefetching.
 *   - inc_x != 1: scalar software-pipelined loop, 16 elements/iteration.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
          BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2)
{
    BLASLONG i;
    FLOAT *px;   /* read cursor; x itself is reused as the write cursor */
    FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
    v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
    v2f64 da_vec;

    px = x;

    if (1 == inc_x)
    {
        if (0.0 == da)
        {
            /* Build an all-zero vector (0.0 converts to integer 0 for the
               insert intrinsic), then stream it out. */
            v2f64 zero_v = __msa_cast_to_vector_double(0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);

            /* 32 doubles (16 vectors) per iteration. */
            for (i = (n >> 5); i--;)
            {
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 2);
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 2);
            }

            if (n & 31)   /* remaining 0..31 elements */
            {
                if (n & 16)
                {
                    ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                               zero_v, zero_v, x, 2);
                }

                if (n & 8)
                {
                    ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
                }

                if (n & 4)
                {
                    ST_DP2_INC(zero_v, zero_v, x, 2);
                }

                if (n & 2)
                {
                    *x = 0; x += 1;
                    *x = 0; x += 1;
                }

                if (n & 1)
                {
                    *x = 0;
                }
            }
        }
        else
        {
            da_vec = COPY_DOUBLE_TO_VECTOR(da);

            if (n > 31)
            {
                FLOAT *x_pref;
                BLASLONG pref_offset;

                /* Align the prefetch pointer to the next cache-line
                   boundary, then keep it 48 doubles ahead of the loads. */
                pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
                if (pref_offset > 0)
                {
                    pref_offset = L1_DATA_LINESIZE - pref_offset;
                    pref_offset = pref_offset / sizeof(FLOAT);
                }
                x_pref = x + pref_offset + 32 + 16;

                /* Software pipelining: loads of the next 16 doubles are
                   interleaved with multiplies and stores of the previous
                   16, so x0..x7 always enter an iteration pre-loaded. */
                LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                for (i = 0; i < (n >> 5) - 1; i++)
                {
                    PREF_OFFSET(x_pref, 0);
                    PREF_OFFSET(x_pref, 32);
                    PREF_OFFSET(x_pref, 64);
                    PREF_OFFSET(x_pref, 96);
                    PREF_OFFSET(x_pref, 128);
                    PREF_OFFSET(x_pref, 160);
                    PREF_OFFSET(x_pref, 192);
                    PREF_OFFSET(x_pref, 224);
                    x_pref += 32;

                    x8 = LD_DP(px); px += 2;
                    x0 *= da_vec;
                    x9 = LD_DP(px); px += 2;
                    x1 *= da_vec;
                    x10 = LD_DP(px); px += 2;
                    x2 *= da_vec;
                    x11 = LD_DP(px); px += 2;
                    x3 *= da_vec;
                    x12 = LD_DP(px); px += 2;
                    x4 *= da_vec;
                    x13 = LD_DP(px); px += 2;
                    x5 *= da_vec;
                    x14 = LD_DP(px); px += 2;
                    x6 *= da_vec;
                    x15 = LD_DP(px); px += 2;
                    x7 *= da_vec;
                    x8 *= da_vec;
                    ST_DP(x0, x); x += 2;
                    x9 *= da_vec;
                    ST_DP(x1, x); x += 2;
                    x10 *= da_vec;
                    ST_DP(x2, x); x += 2;
                    x11 *= da_vec;
                    ST_DP(x3, x); x += 2;
                    x12 *= da_vec;
                    ST_DP(x4, x); x += 2;
                    x13 *= da_vec;
                    ST_DP(x5, x); x += 2;
                    x14 *= da_vec;
                    ST_DP(x6, x); x += 2;
                    x15 *= da_vec;
                    ST_DP(x7, x); x += 2;
                    ST_DP(x8, x); x += 2;
                    x0 = LD_DP(px); px += 2;
                    ST_DP(x9, x); x += 2;
                    x1 = LD_DP(px); px += 2;
                    ST_DP(x10, x); x += 2;
                    x2 = LD_DP(px); px += 2;
                    ST_DP(x11, x); x += 2;
                    x3 = LD_DP(px); px += 2;
                    ST_DP(x12, x); x += 2;
                    x4 = LD_DP(px); px += 2;
                    ST_DP(x13, x); x += 2;
                    x5 = LD_DP(px); px += 2;
                    ST_DP(x14, x); x += 2;
                    x6 = LD_DP(px); px += 2;
                    ST_DP(x15, x); x += 2;
                    x7 = LD_DP(px); px += 2;
                }

                /* Drain the pipeline: the last 32 doubles were already
                   loaded into x0..x7 (and x8..x15 below).  Note: x15 and
                   x14 are multiplied in swapped order here; harmless,
                   since both finish before the final block store. */
                x8 = LD_DP(px); px += 2;
                x0 *= da_vec;
                x9 = LD_DP(px); px += 2;
                x1 *= da_vec;
                x10 = LD_DP(px); px += 2;
                x2 *= da_vec;
                x11 = LD_DP(px); px += 2;
                x3 *= da_vec;
                x12 = LD_DP(px); px += 2;
                x4 *= da_vec;
                x13 = LD_DP(px); px += 2;
                x5 *= da_vec;
                x14 = LD_DP(px); px += 2;
                x6 *= da_vec;
                x15 = LD_DP(px); px += 2;
                x7 *= da_vec;
                x8 *= da_vec;
                ST_DP(x0, x); x += 2;
                x9 *= da_vec;
                ST_DP(x1, x); x += 2;
                x10 *= da_vec;
                ST_DP(x2, x); x += 2;
                x11 *= da_vec;
                ST_DP(x3, x); x += 2;
                x12 *= da_vec;
                ST_DP(x4, x); x += 2;
                x13 *= da_vec;
                ST_DP(x5, x); x += 2;
                x15 *= da_vec;
                ST_DP(x6, x); x += 2;
                x14 *= da_vec;
                ST_DP(x7, x); x += 2;

                ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 2);
            }

            if (n & 31)   /* remaining 0..31 elements */
            {
                if (n & 16)
                {
                    LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
                    MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
                }

                if (n & 8)
                {
                    LD_DP4_INC(px, 2, x0, x1, x2, x3);
                    MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
                    ST_DP4_INC(x0, x1, x2, x3, x, 2);
                }

                if (n & 4)
                {
                    LD_DP2_INC(px, 2, x0, x1);
                    MUL2(x0, da_vec, x1, da_vec, x0, x1);
                    ST_DP2_INC(x0, x1, x, 2);
                }

                if (n & 2)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da, f1, da, f0, f1);
                    ST_GP2_INC(f0, f1, x, 1);
                }

                if (n & 1)
                {
                    *x *= da;
                }
            }
        }
    }
    else   /* non-unit stride */
    {
        if (da == 0.0)
        {
            for (i = n; i--;)
            {
                *x = 0.0;

                x += inc_x;
            }
        }
        else
        {
            if (n > 15)
            {
                /* Scalar software pipeline: 16 elements per iteration,
                   reloads for the next iteration interleaved with the
                   trailing stores of the current one. */
                LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
                for (i = 0; i < (n >> 4) - 1; i++)
                {
                    LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

                    f4 *= da;
                    f5 *= da;
                    *x = f0; x += inc_x;
                    f6 *= da;
                    *x = f1; x += inc_x;
                    f7 *= da;
                    *x = f2; x += inc_x;
                    f8 *= da;
                    *x = f3; x += inc_x;
                    f9 *= da;
                    *x = f4; x += inc_x;
                    f10 *= da;
                    *x = f5; x += inc_x;
                    f11 *= da;
                    *x = f6; x += inc_x;
                    f12 *= da;
                    *x = f7; x += inc_x;
                    f13 *= da;
                    *x = f8; x += inc_x;
                    f14 *= da;
                    *x = f9; x += inc_x;
                    f15 *= da;
                    *x = f10; x += inc_x;
                    *x = f11; x += inc_x;
                    f0 = *px; px += inc_x;
                    *x = f12; x += inc_x;
                    f1 = *px; px += inc_x;
                    *x = f13; x += inc_x;
                    f2 = *px; px += inc_x;
                    *x = f14; x += inc_x;
                    f3 = *px; px += inc_x;
                    *x = f15; x += inc_x;
                    f4 = *px; px += inc_x;
                    f5 = *px; px += inc_x;
                    f6 = *px; px += inc_x;
                    f7 = *px; px += inc_x;
                }

                /* Drain: final 16 elements, no further reloads. */
                LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
                MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

                f4 *= da;
                f5 *= da;
                *x = f0; x += inc_x;
                f6 *= da;
                *x = f1; x += inc_x;
                f7 *= da;
                *x = f2; x += inc_x;
                f8 *= da;
                *x = f3; x += inc_x;
                f9 *= da;
                *x = f4; x += inc_x;
                f10 *= da;
                *x = f5; x += inc_x;
                f11 *= da;
                *x = f6; x += inc_x;
                f12 *= da;
                *x = f7; x += inc_x;
                f13 *= da;
                *x = f8; x += inc_x;
                f14 *= da;
                *x = f9; x += inc_x;
                f15 *= da;
                *x = f10; x += inc_x;
                *x = f11; x += inc_x;
                *x = f12; x += inc_x;
                *x = f13; x += inc_x;
                *x = f14; x += inc_x;
                *x = f15; x += inc_x;
            }

            if (n & 15)   /* remaining 0..15 elements */
            {
                if (n & 8)
                {
                    LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
                    MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
                    ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
                }

                if (n & 4)
                {
                    LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
                    ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
                }

                if (n & 2)
                {
                    LD_GP2_INC(px, inc_x, f0, f1);
                    MUL2(f0, da, f1, da, f0, f1);
                    ST_GP2_INC(f0, f1, x, inc_x);
                }

                if (n & 1)
                {
                    *x *= da;
                }
            }
        }
    }

    return 0;
}
/* DSWAP kernel for MIPS with MSA: exchanges the contents of two
 * double-precision vectors, srcx[i*inc_x] <-> srcy[i*inc_y] for
 * i = 0 .. n-1.
 *
 * The signature follows the standard OpenBLAS SWAP kernel convention;
 * dummy* parameters are unused.  Always returns 0.
 *
 * Unit-stride case uses a software-pipelined MSA loop moving 16 doubles
 * per iteration with cache-line prefetching; the tail dispatches on the
 * exact bit pattern of (n & 15) so each residue count is handled with a
 * single pair of multi-vector load/store macros.  Non-unit strides use
 * scalar 8-per-iteration loops with analogous tail dispatch.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
          FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
          FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG i = 0, pref_offsetx, pref_offsety;
    FLOAT *px, *py;   /* read cursors; srcx/srcy are reused as write cursors */
    FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
    FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
    v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
    v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;

    if (n < 0) return (0);

    /* Distance (in elements) from each base pointer to the next
       cache-line boundary, used to align the prefetch addresses. */
    pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
    if (pref_offsetx > 0)
    {
        pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
        pref_offsetx = pref_offsetx / sizeof(FLOAT);
    }

    pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
    if (pref_offsety > 0)
    {
        pref_offsety = L1_DATA_LINESIZE - pref_offsety;
        pref_offsety = pref_offsety / sizeof(FLOAT);
    }

    px = srcx;
    py = srcy;

    if ((1 == inc_x) && (1 == inc_y))
    {
        if (n >> 4)
        {
            /* Software pipeline: xv0..xv7 (16 doubles of x) enter each
               iteration pre-loaded; y loads are interleaved with x
               stores and vice versa. */
            LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);

            for (i = (n >> 4) - 1; i--;)
            {
                PREFETCH(px + pref_offsetx + 16);
                PREFETCH(px + pref_offsetx + 20);
                PREFETCH(px + pref_offsetx + 24);
                PREFETCH(px + pref_offsetx + 28);

                PREFETCH(py + pref_offsety + 16);
                PREFETCH(py + pref_offsety + 20);
                PREFETCH(py + pref_offsety + 24);
                PREFETCH(py + pref_offsety + 28);

                yv0 = LD_DP(py); py += 2;
                ST_DP(xv0, srcy); srcy += 2;
                yv1 = LD_DP(py); py += 2;
                ST_DP(xv1, srcy); srcy += 2;
                yv2 = LD_DP(py); py += 2;
                ST_DP(xv2, srcy); srcy += 2;
                yv3 = LD_DP(py); py += 2;
                ST_DP(xv3, srcy); srcy += 2;
                yv4 = LD_DP(py); py += 2;
                ST_DP(xv4, srcy); srcy += 2;
                yv5 = LD_DP(py); py += 2;
                ST_DP(xv5, srcy); srcy += 2;
                yv6 = LD_DP(py); py += 2;
                ST_DP(xv6, srcy); srcy += 2;
                yv7 = LD_DP(py); py += 2;
                ST_DP(xv7, srcy); srcy += 2;

                xv0 = LD_DP(px); px += 2;
                ST_DP(yv0, srcx); srcx += 2;
                xv1 = LD_DP(px); px += 2;
                ST_DP(yv1, srcx); srcx += 2;
                xv2 = LD_DP(px); px += 2;
                ST_DP(yv2, srcx); srcx += 2;
                xv3 = LD_DP(px); px += 2;
                ST_DP(yv3, srcx); srcx += 2;
                xv4 = LD_DP(px); px += 2;
                ST_DP(yv4, srcx); srcx += 2;
                xv5 = LD_DP(px); px += 2;
                ST_DP(yv5, srcx); srcx += 2;
                xv6 = LD_DP(px); px += 2;
                ST_DP(yv6, srcx); srcx += 2;
                xv7 = LD_DP(px); px += 2;
                ST_DP(yv7, srcx); srcx += 2;
            }

            /* Drain: the final 16 x doubles are already in xv0..xv7. */
            LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
            ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2);
            ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2);
        }

        if (n & 15)   /* remaining 0..15 elements, dispatched per bit combo */
        {
            if ((n & 8) && (n & 4) && (n & 2))        /* 14 elements */
            {
                LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
                LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
                ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2);
                ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2);
            }
            else if ((n & 8) && (n & 4))              /* 12 elements */
            {
                LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5);
                LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5);
                ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2);
                ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2);
            }
            else if ((n & 8) && (n & 2))              /* 10 elements */
            {
                LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4);
                LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4);
                ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2);
                ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2);
            }
            else if ((n & 4) && (n & 2))              /* 6 elements */
            {
                LD_DP3_INC(px, 2, xv0, xv1, xv2);
                LD_DP3_INC(py, 2, yv0, yv1, yv2);
                ST_DP3_INC(xv0, xv1, xv2, srcy, 2);
                ST_DP3_INC(yv0, yv1, yv2, srcx, 2);
            }
            else if (n & 8)                           /* 8 elements */
            {
                LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3);
                LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3);
                ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2);
                ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2);
            }
            else if (n & 4)                           /* 4 elements */
            {
                LD_DP2_INC(px, 2, xv0, xv1);
                LD_DP2_INC(py, 2, yv0, yv1);
                ST_DP2_INC(xv0, xv1, srcy, 2);
                ST_DP2_INC(yv0, yv1, srcx, 2);
            }
            else if (n & 2)                           /* 2 elements */
            {
                xv0 = LD_DP(px);
                yv0 = LD_DP(py);

                px += 2;
                py += 2;

                ST_DP(xv0, srcy);
                ST_DP(yv0, srcx);

                srcx += 2;
                srcy += 2;
            }

            if (n & 1)                                /* final odd element */
            {
                x0 = px[0];
                y0 = py[0];
                srcx[0] = y0;
                srcy[0] = x0;
            }
        }
    }
    else   /* at least one non-unit stride: scalar path */
    {
        for (i = (n >> 3); i--;)
        {
            LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
            LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
            ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
            ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
        }

        if (n & 7)   /* remaining 0..7 elements, dispatched per bit combo */
        {
            if ((n & 4) && (n & 2) && (n & 1))        /* 7 elements */
            {
                LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
                LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
                ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
                ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
            }
            else if ((n & 4) && (n & 2))              /* 6 elements */
            {
                LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
                LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
                ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
                ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
            }
            else if ((n & 4) && (n & 1))              /* 5 elements */
            {
                LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
                LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
                ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
                ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
            }
            else if ((n & 2) && (n & 1))              /* 3 elements */
            {
                LD_GP3_INC(px, inc_x, x0, x1, x2);
                LD_GP3_INC(py, inc_y, y0, y1, y2);
                ST_GP3_INC(x0, x1, x2, srcy, inc_y);
                ST_GP3_INC(y0, y1, y2, srcx, inc_x);
            }
            else if (n & 4)                           /* 4 elements */
            {
                LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
                LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
                ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
                ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
            }
            else if (n & 2)                           /* 2 elements */
            {
                LD_GP2_INC(px, inc_x, x0, x1);
                LD_GP2_INC(py, inc_y, y0, y1);
                ST_GP2_INC(x0, x1, srcy, inc_y);
                ST_GP2_INC(y0, y1, srcx, inc_x);
            }
            else if (n & 1)                           /* single element */
            {
                x0 = *srcx;
                y0 = *srcy;

                *srcx = y0;
                *srcy = x0;
            }
        }
    }

    return (0);
}
/* Description : Fused multiply-accumulate on pairs/triples/quads of operands
   Arguments   : Inputs  - in0, in1 (, in2, in3), vec
                 In/Outs - inout0, inout1 (, inout2, inout3)
   Details     : Each 'inN' operand is multiplied by 'vec' and the product
                 is accumulated into the matching 'inoutN' operand, i.e.
                 inoutN += inN * vec.  Works for MSA vector types as well
                 as plain scalars.
   Note        : All arguments are parenthesized in the expansion so that
                 expression arguments (e.g. x[0] + t) cannot misexpand
                 due to operator precedence.
*/
#define FMADD2(in0, in1, vec, inout0, inout1)  \
{                                              \
    (inout0) += (in0) * (vec);                 \
    (inout1) += (in1) * (vec);                 \
}
#define FMADD3(in0, in1, in2, vec,             \
               inout0, inout1, inout2)         \
{                                              \
    (inout0) += (in0) * (vec);                 \
    (inout1) += (in1) * (vec);                 \
    (inout2) += (in2) * (vec);                 \
}
#define FMADD4(in0, in1, in2, in3, vec,        \
               inout0, inout1, inout2, inout3) \
{                                              \
    FMADD2(in0, in1, vec, inout0, inout1);     \
    FMADD2(in2, in3, vec, inout2, inout3);     \
}
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* NOTE(review): these conjugation operators appear to be copied from the
 * complex AXPY template; they are not referenced anywhere in this real
 * single-precision kernel -- candidates for removal, confirm upstream. */
#if !defined(CONJ)
    #define OP0  +=
    #define OP1  -=
    #define OP2  +=
#else
    #define OP0  -=
    #define OP1  +=
    #define OP2  -=
#endif

/* SAXPY kernel for MIPS with MSA: y[i*inc_y] += da * x[i*inc_x] for
 * i = 0 .. n-1.  Signature follows the standard OpenBLAS AXPY kernel
 * convention; dummy* parameters are unused.  Always returns 0.
 *
 * Paths:
 *   - inc_x == 1 && inc_y == 1: vectorized, 32 floats/iteration with
 *     prefetching on both streams.
 *   - inc_y == 1 only: x is gathered element-by-element into MSA
 *     registers, y is processed vectorized, 16 floats/iteration.
 *   - otherwise: scalar loop, 4 elements/iteration.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
          BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG i;
    FLOAT *py;   /* read cursor for y; y itself is the write cursor */
    v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
    v4f32 da_vec, zero_v = {0};

    /* da == 0 makes the operation a no-op, so bail out early. */
    if ((n < 0) || (da == 0.0)) return(0);

    py = y;

    if ((1 == inc_x) && (1 == inc_y))
    {
        FLOAT *x_pref, *y_pref;
        BLASLONG pref_offset;

        /* Align both prefetch pointers to the next cache-line boundary
           and run them 64 floats ahead of the current position. */
        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
        if (pref_offset > 0)
        {
            pref_offset = L1_DATA_LINESIZE - pref_offset;
            pref_offset = pref_offset / sizeof(FLOAT);
        }
        x_pref = x + pref_offset + 64;

        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
        if (pref_offset > 0)
        {
            pref_offset = L1_DATA_LINESIZE - pref_offset;
            pref_offset = pref_offset / sizeof(FLOAT);
        }
        y_pref = y + pref_offset + 64;

        da_vec = COPY_FLOAT_TO_VECTOR(da);

        /* Main loop: 32 floats (8 vectors) per iteration. */
        for (i = (n >> 5); i--;)
        {
            PREF_OFFSET(x_pref, 0);
            PREF_OFFSET(x_pref, 32);
            PREF_OFFSET(x_pref, 64);
            PREF_OFFSET(x_pref, 96);
            PREF_OFFSET(y_pref, 0);
            PREF_OFFSET(y_pref, 32);
            PREF_OFFSET(y_pref, 64);
            PREF_OFFSET(y_pref, 96);
            x_pref += 32;
            y_pref += 32;

            LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
            LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
            FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
            FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
            ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
        }

        if (n & 31)   /* remaining 0..31 elements */
        {
            if (n & 16)
            {
                LD_SP4_INC(x, 4, x0, x1, x2, x3);
                LD_SP4_INC(py, 4, y0, y1, y2, y3);
                FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
                ST_SP4_INC(y0, y1, y2, y3, y, 4);
            }

            if (n & 8)
            {
                LD_SP2_INC(x, 4, x0, x1);
                LD_SP2_INC(py, 4, y0, y1);
                FMADD2(x0, x1, da_vec, y0, y1);
                ST_SP2_INC(y0, y1, y, 4);
            }

            if (n & 4)
            {
                x0 = LD_SP(x); x += 4;
                y0 = LD_SP(py); py += 4;
                y0 += da_vec * x0;
                ST_SP(y0, y); y += 4;
            }

            if (n & 2)
            {
                FMADD2(x[0], x[1], da, y[0], y[1]);
                x += 2;
                y += 2;
            }

            if (n & 1)
            {
                y[0] += da * x[0];
            }
        }
    }
    else if (1 == inc_y)
    {
        da_vec = COPY_FLOAT_TO_VECTOR(da);

        /* Strided x: gather 4 floats at a time into an MSA register via
           __msa_insert_w.  The *((int *) x) reads the raw 32-bit pattern
           of the float (NOTE(review): type-pun through int pointer --
           relies on the compiler tolerating this aliasing). */
        for (i = (n >> 4); i--;)
        {
            x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
            x += inc_x;
            x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
            x += inc_x;
            x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
            x += inc_x;
            x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
            x += inc_x;
            x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
            x += inc_x;
            x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
            x += inc_x;
            x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
            x += inc_x;
            x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
            x += inc_x;
            x2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
            x += inc_x;
            x2 = (v4f32) __msa_insert_w((v4i32) x2, 1, *((int *) x));
            x += inc_x;
            x2 = (v4f32) __msa_insert_w((v4i32) x2, 2, *((int *) x));
            x += inc_x;
            x2 = (v4f32) __msa_insert_w((v4i32) x2, 3, *((int *) x));
            x += inc_x;
            x3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
            x += inc_x;
            x3 = (v4f32) __msa_insert_w((v4i32) x3, 1, *((int *) x));
            x += inc_x;
            x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
            x += inc_x;
            x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) x));
            x += inc_x;

            LD_SP4_INC(py, 4, y0, y1, y2, y3);
            FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
            ST_SP4_INC(y0, y1, y2, y3, y, 4);
        }

        if (n & 15)   /* remaining 0..15 elements */
        {
            if (n & 8)
            {
                x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
                x += inc_x;
                x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
                x += inc_x;
                x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
                x += inc_x;
                x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
                x += inc_x;
                x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
                x += inc_x;
                x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
                x += inc_x;
                x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
                x += inc_x;
                x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
                x += inc_x;

                LD_SP2_INC(py, 4, y0, y1);
                FMADD2(x0, x1, da_vec, y0, y1);
                ST_SP2_INC(y0, y1, y, 4);
            }

            if (n & 4)
            {
                x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
                x += inc_x;
                x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
                x += inc_x;
                x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
                x += inc_x;
                x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
                x += inc_x;

                y0 = LD_SP(py); py += 4;
                y0 += da_vec * x0;
                ST_SP(y0, y); y += 4;
            }

            if (n & 2)
            {
                FMADD2(x[0], x[inc_x], da, y[0], y[1]);

                x += 2 * inc_x;
                y += 2;
            }

            if (n & 1)
            {
                y[0] += da * x[0];
            }
        }
    }
    else   /* both strides non-unit: plain scalar path */
    {
        FLOAT x0, x1, x2, x3, y0, y1, y2, y3;   /* shadow the vector names */

        for (i = (n >> 2); i--;)
        {
            LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
            LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
            FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3);
            ST_GP4_INC(y0, y1, y2, y3, y, inc_y);
        }

        if (n & 3)
        {
            if (n & 2)
            {
                LD_GP2_INC(x, inc_x, x0, x1);
                LD_GP2_INC(py, inc_y, y0, y1);
                FMADD2(x0, x1, da, y0, y1);
                ST_GP2_INC(y0, y1, y, inc_y);
            }

            if (n & 1)
            {
                *y += da * *x;
            }
        }
    }

    return (0);
}
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* SCOPY kernel for MIPS with MSA: copies n single-precision elements,
 * y[i*inc_y] = x[i*inc_x] for i = 0 .. n-1.  Standard OpenBLAS COPY
 * kernel signature.  Always returns 0.
 *
 * Unit-stride case: software-pipelined MSA loop, 64 floats per
 * iteration with L1 prefetching; strided case: scalar loop, 8 per
 * iteration.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG i;
    v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
    FLOAT f0, f1, f2, f3, f4, f5, f6, f7;

    if (n < 0) return (0);

    if ((1 == inc_x) && (1 == inc_y))
    {
        if (n > 63)
        {
            FLOAT *x_pref;
            BLASLONG pref_offset;

            /* Align the prefetch pointer to the next cache line, then
               keep it 160 floats ahead of the loads. */
            pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
            if (pref_offset > 0)
            {
                pref_offset = L1_DATA_LINESIZE - pref_offset;
                pref_offset = pref_offset / sizeof(FLOAT);
            }
            x_pref = x + pref_offset + 128 + 32;

            /* Software pipeline: x0..x7 (32 floats) enter each iteration
               pre-loaded; loads of the next 32 floats are interleaved
               with stores of the previous 32. */
            LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
            for (i = (n >> 6) - 1; i--;)
            {
                PREF_OFFSET(x_pref, 0);
                PREF_OFFSET(x_pref, 32);
                PREF_OFFSET(x_pref, 64);
                PREF_OFFSET(x_pref, 96);
                PREF_OFFSET(x_pref, 128);
                PREF_OFFSET(x_pref, 160);
                PREF_OFFSET(x_pref, 192);
                PREF_OFFSET(x_pref, 224);
                x_pref += 64;

                x8 = LD_SP(x); x += 4;
                ST_SP(x0, y); y += 4;
                x9 = LD_SP(x); x += 4;
                ST_SP(x1, y); y += 4;
                x10 = LD_SP(x); x += 4;
                ST_SP(x2, y); y += 4;
                x11 = LD_SP(x); x += 4;
                ST_SP(x3, y); y += 4;
                x12 = LD_SP(x); x += 4;
                ST_SP(x4, y); y += 4;
                x13 = LD_SP(x); x += 4;
                ST_SP(x5, y); y += 4;
                x14 = LD_SP(x); x += 4;
                ST_SP(x6, y); y += 4;
                x15 = LD_SP(x); x += 4;
                ST_SP(x7, y); y += 4;
                x0 = LD_SP(x); x += 4;
                ST_SP(x8, y); y += 4;
                x1 = LD_SP(x); x += 4;
                ST_SP(x9, y); y += 4;
                x2 = LD_SP(x); x += 4;
                ST_SP(x10, y); y += 4;
                x3 = LD_SP(x); x += 4;
                ST_SP(x11, y); y += 4;
                x4 = LD_SP(x); x += 4;
                ST_SP(x12, y); y += 4;
                x5 = LD_SP(x); x += 4;
                ST_SP(x13, y); y += 4;
                x6 = LD_SP(x); x += 4;
                ST_SP(x14, y); y += 4;
                x7 = LD_SP(x); x += 4;
                ST_SP(x15, y); y += 4;
            }

            /* Drain: final 64 floats (x0..x7 already loaded). */
            x8 = LD_SP(x); x += 4;
            x9 = LD_SP(x); x += 4;
            ST_SP(x0, y); y += 4;
            x10 = LD_SP(x); x += 4;
            ST_SP(x1, y); y += 4;
            x11 = LD_SP(x); x += 4;
            ST_SP(x2, y); y += 4;
            x12 = LD_SP(x); x += 4;
            ST_SP(x3, y); y += 4;
            x13 = LD_SP(x); x += 4;
            ST_SP(x4, y); y += 4;
            x14 = LD_SP(x); x += 4;
            ST_SP(x5, y); y += 4;
            x15 = LD_SP(x); x += 4;
            ST_SP(x6, y); y += 4;
            ST_SP(x7, y); y += 4;

            ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
        }

        if (n & 63)   /* remaining 0..63 elements */
        {
            if (n & 32)
            {
                LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
                ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
            }

            if (n & 16)
            {
                LD_SP4_INC(x, 4, x0, x1, x2, x3);
                ST_SP4_INC(x0, x1, x2, x3, y, 4);
            }

            if (n & 8)
            {
                LD_SP2_INC(x, 4, x0, x1);
                ST_SP2_INC(x0, x1, y, 4);
            }

            if (n & 4)
            {
                LD_GP4_INC(x, 1, f0, f1, f2, f3);
                ST_GP4_INC(f0, f1, f2, f3, y, 1);
            }

            if (n & 2)
            {
                LD_GP2_INC(x, 1, f0, f1);
                ST_GP2_INC(f0, f1, y, 1);
            }

            if (n & 1)
            {
                *y = *x;
            }
        }
    }
    else   /* non-unit stride: scalar path */
    {
        for (i = (n >> 3); i--;)
        {
            LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
            ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y);
        }

        if (n & 4)
        {
            LD_GP4_INC(x, inc_x, f0, f1, f2, f3);
            ST_GP4_INC(f0, f1, f2, f3, y, inc_y);
        }

        if (n & 2)
        {
            LD_GP2_INC(x, inc_x, f0, f1);
            ST_GP2_INC(f0, f1, y, inc_y);
        }

        if (n & 1)
        {
            *y = *x;
        }
    }

    return (0);
}
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* SSCAL kernel for MIPS with MSA: scales a single-precision vector in
 * place, x[i] *= da for i = 0 .. n-1 with stride inc_x.  This is the
 * single-precision twin of dscal_msa.c (v4f32 vectors, 64 floats per
 * vectorized iteration instead of 32 doubles).
 *
 * Standard OpenBLAS SCAL kernel signature; y, inc_y and all dummy*
 * parameters are unused.  Always returns 0.
 *
 * NOTE(review): for da == 0.0 zeros are stored without reading x, so
 * NaN/Inf in x are not propagated in that case -- confirm this matches
 * the generic ../mips/scal.c kernel.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
          BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2)
{
    BLASLONG i;
    FLOAT *px;   /* read cursor; x itself is reused as the write cursor */
    FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
    v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
    v4f32 da_vec;

    px = x;

    if (1 == inc_x)
    {
        if (0.0 == da)
        {
            /* Build an all-zero vector (each 0.0 converts to integer 0
               for the insert intrinsic), then stream it out. */
            v4f32 zero_v = __msa_cast_to_vector_float(0);
            zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
            zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
            zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
            zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);

            /* 64 floats (16 vectors) per iteration. */
            for (i = (n >> 6); i--;)
            {
                ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 4);
                ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 4);
            }

            if (n & 63)   /* remaining 0..63 elements */
            {
                if (n & 32)
                {
                    ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                               zero_v, zero_v, x, 4);
                }

                if (n & 16)
                {
                    ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
                }

                if (n & 8)
                {
                    ST_SP2_INC(zero_v, zero_v, x, 4);
                }

                if (n & 4)
                {
                    *x = 0; x += 1;
                    *x = 0; x += 1;
                    *x = 0; x += 1;
                    *x = 0; x += 1;
                }

                if (n & 2)
                {
                    *x = 0; x += 1;
                    *x = 0; x += 1;
                }

                if (n & 1)
                {
                    *x = 0;
                }
            }
        }
        else
        {
            da_vec = COPY_FLOAT_TO_VECTOR(da);

            if (n > 63)
            {
                FLOAT *x_pref;
                BLASLONG pref_offset;

                /* Align the prefetch pointer to the next cache line,
                   then keep it 96 floats ahead of the loads. */
                pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
                if (pref_offset > 0)
                {
                    pref_offset = L1_DATA_LINESIZE - pref_offset;
                    pref_offset = pref_offset / sizeof(FLOAT);
                }
                x_pref = x + pref_offset + 64 + 32;

                /* Software pipeline: loads of the next 32 floats are
                   interleaved with multiplies and stores of the previous
                   32, so x0..x7 always enter an iteration pre-loaded. */
                LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
                for (i = 0; i < (n >> 6) - 1; i++)
                {
                    PREF_OFFSET(x_pref, 0);
                    PREF_OFFSET(x_pref, 32);
                    PREF_OFFSET(x_pref, 64);
                    PREF_OFFSET(x_pref, 96);
                    PREF_OFFSET(x_pref, 128);
                    PREF_OFFSET(x_pref, 160);
                    PREF_OFFSET(x_pref, 192);
                    PREF_OFFSET(x_pref, 224);
                    x_pref += 64;

                    x8 = LD_SP(px); px += 4;
                    x0 *= da_vec;
                    x9 = LD_SP(px); px += 4;
                    x1 *= da_vec;
                    x10 = LD_SP(px); px += 4;
                    x2 *= da_vec;
                    x11 = LD_SP(px); px += 4;
                    x3 *= da_vec;
                    x12 = LD_SP(px); px += 4;
                    x4 *= da_vec;
                    x13 = LD_SP(px); px += 4;
                    x5 *= da_vec;
                    x14 = LD_SP(px); px += 4;
                    x6 *= da_vec;
                    x15 = LD_SP(px); px += 4;
                    x7 *= da_vec;
                    x8 *= da_vec;
                    ST_SP(x0, x); x += 4;
                    x9 *= da_vec;
                    ST_SP(x1, x); x += 4;
                    x10 *= da_vec;
                    ST_SP(x2, x); x += 4;
                    x11 *= da_vec;
                    ST_SP(x3, x); x += 4;
                    x12 *= da_vec;
                    ST_SP(x4, x); x += 4;
                    x13 *= da_vec;
                    ST_SP(x5, x); x += 4;
                    x14 *= da_vec;
                    ST_SP(x6, x); x += 4;
                    x15 *= da_vec;
                    ST_SP(x7, x); x += 4;
                    ST_SP(x8, x); x += 4;
                    x0 = LD_SP(px); px += 4;
                    ST_SP(x9, x); x += 4;
                    x1 = LD_SP(px); px += 4;
                    ST_SP(x10, x); x += 4;
                    x2 = LD_SP(px); px += 4;
                    ST_SP(x11, x); x += 4;
                    x3 = LD_SP(px); px += 4;
                    ST_SP(x12, x); x += 4;
                    x4 = LD_SP(px); px += 4;
                    ST_SP(x13, x); x += 4;
                    x5 = LD_SP(px); px += 4;
                    ST_SP(x14, x); x += 4;
                    x6 = LD_SP(px); px += 4;
                    ST_SP(x15, x); x += 4;
                    x7 = LD_SP(px); px += 4;
                }

                /* Drain the pipeline.  Note: x15 and x14 are multiplied
                   in swapped order here (same quirk as dscal_msa.c);
                   harmless, both finish before the final block store. */
                x8 = LD_SP(px); px += 4;
                x0 *= da_vec;
                x9 = LD_SP(px); px += 4;
                x1 *= da_vec;
                x10 = LD_SP(px); px += 4;
                x2 *= da_vec;
                x11 = LD_SP(px); px += 4;
                x3 *= da_vec;
                x12 = LD_SP(px); px += 4;
                x4 *= da_vec;
                x13 = LD_SP(px); px += 4;
                x5 *= da_vec;
                x14 = LD_SP(px); px += 4;
                x6 *= da_vec;
                x15 = LD_SP(px); px += 4;
                x7 *= da_vec;
                x8 *= da_vec;
                ST_SP(x0, x); x += 4;
                x9 *= da_vec;
                ST_SP(x1, x); x += 4;
                x10 *= da_vec;
                ST_SP(x2, x); x += 4;
                x11 *= da_vec;
                ST_SP(x3, x); x += 4;
                x12 *= da_vec;
                ST_SP(x4, x); x += 4;
                x13 *= da_vec;
                ST_SP(x5, x); x += 4;
                x15 *= da_vec;
                ST_SP(x6, x); x += 4;
                x14 *= da_vec;
                ST_SP(x7, x); x += 4;

                ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 4);
            }

            if (n & 63)   /* remaining 0..63 elements */
            {
                if (n & 32)
                {
                    LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
                    MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
                    ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
                }

                if (n & 16)
                {
                    LD_SP4_INC(px, 4, x0, x1, x2, x3);
                    MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
                    ST_SP4_INC(x0, x1, x2, x3, x, 4);
                }

                if (n & 8)
                {
                    LD_SP2_INC(px, 4, x0, x1);
                    MUL2(x0, da_vec, x1, da_vec, x0, x1);
                    ST_SP2_INC(x0, x1, x, 4);
                }

                if (n & 4)
                {
                    LD_GP4_INC(px, 1, f0, f1, f2, f3);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
                    ST_GP4_INC(f0, f1, f2, f3, x, 1);
                }

                if (n & 2)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da, f1, da, f0, f1);
                    ST_GP2_INC(f0, f1, x, 1);
                }

                if (n & 1)
                {
                    *x *= da;
                }
            }
        }
    }
    else   /* non-unit stride */
    {
        if (0.0 == da)
        {
            for (i = n; i--;)
            {
                *x = 0;
                x += inc_x;
            }
        }
        else
        {
            if (n > 15)
            {
                /* Scalar software pipeline: 16 elements per iteration,
                   reloads interleaved with the trailing stores. */
                LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);

                for (i = 0; i < (n >> 4) - 1; i++)
                {
                    LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

                    f4 *= da;
                    f5 *= da;
                    *x = f0; x += inc_x;
                    f6 *= da;
                    *x = f1; x += inc_x;
                    f7 *= da;
                    *x = f2; x += inc_x;
                    f8 *= da;
                    *x = f3; x += inc_x;
                    f9 *= da;
                    *x = f4; x += inc_x;
                    f10 *= da;
                    *x = f5; x += inc_x;
                    f11 *= da;
                    *x = f6; x += inc_x;
                    f12 *= da;
                    *x = f7; x += inc_x;
                    f13 *= da;
                    *x = f8; x += inc_x;
                    f14 *= da;
                    *x = f9; x += inc_x;
                    f15 *= da;
                    *x = f10; x += inc_x;
                    *x = f11; x += inc_x;
                    f0 = *px; px += inc_x;
                    *x = f12; x += inc_x;
                    f1 = *px; px += inc_x;
                    *x = f13; x += inc_x;
                    f2 = *px; px += inc_x;
                    *x = f14; x += inc_x;
                    f3 = *px; px += inc_x;
                    *x = f15; x += inc_x;
                    f4 = *px; px += inc_x;
                    f5 = *px; px += inc_x;
                    f6 = *px; px += inc_x;
                    f7 = *px; px += inc_x;
                }

                /* Drain: final 16 elements, no further reloads. */
                LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
                MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

                f4 *= da;
                f5 *= da;
                *x = f0; x += inc_x;
                f6 *= da;
                *x = f1; x += inc_x;
                f7 *= da;
                *x = f2; x += inc_x;
                f8 *= da;
                *x = f3; x += inc_x;
                f9 *= da;
                *x = f4; x += inc_x;
                f10 *= da;
                *x = f5; x += inc_x;
                f11 *= da;
                *x = f6; x += inc_x;
                f12 *= da;
                *x = f7; x += inc_x;
                f13 *= da;
                *x = f8; x += inc_x;
                f14 *= da;
                *x = f9; x += inc_x;
                f15 *= da;
                *x = f10; x += inc_x;
                *x = f11; x += inc_x;
                *x = f12; x += inc_x;
                *x = f13; x += inc_x;
                *x = f14; x += inc_x;
                *x = f15; x += inc_x;
            }

            if (n & 15)   /* remaining 0..15 elements */
            {
                if (n & 8)
                {
                    LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
                    MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
                    ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
                }

                if (n & 4)
                {
                    LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
                    ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
                }

                if (n & 2)
                {
                    LD_GP2_INC(px, inc_x, f0, f1);
                    MUL2(f0, da, f1, da, f0, f1);
                    ST_GP2_INC(f0, f1, x, inc_x);
                }

                if (n & 1)
                {
                    *x *= da;
                }
            }
        }
    }

    return 0;
}
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, pref_offsetx, pref_offsety; + FLOAT *px, *py; + FLOAT x0, x1, x2, x3, x4, x5, x6, x7; + FLOAT y0, y1, y2, y3, y4, y5, y6, y7; + v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7; + v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7; + + if (n < 0) return (0); + + pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + px = srcx; + py = srcy; + + if ((1 == inc_x) && (1 == inc_y)) + { + if (n >> 5) + { + LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7); + + for (i = (n >> 5) - 1; i--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + yv0 = LD_SP(py); py += 4; + ST_SP(xv0, srcy); srcy += 4; + yv1 = LD_SP(py); py += 4; + ST_SP(xv1, srcy); srcy += 4; + yv2 = LD_SP(py); py += 4; + ST_SP(xv2, srcy); srcy += 4; + yv3 = LD_SP(py); py += 4; + ST_SP(xv3, srcy); srcy += 4; + yv4 = LD_SP(py); py += 4; + ST_SP(xv4, srcy); srcy += 4; + yv5 = LD_SP(py); py += 4; + ST_SP(xv5, srcy); srcy += 4; + yv6 = LD_SP(py); py += 4; + ST_SP(xv6, srcy); srcy += 4; + yv7 = LD_SP(py); py += 4; + ST_SP(xv7, srcy); srcy += 4; + + xv0 = LD_SP(px); px += 4; + ST_SP(yv0, srcx); srcx += 4; + xv1 = LD_SP(px); px += 4; + ST_SP(yv1, 
srcx); srcx += 4; + xv2 = LD_SP(px); px += 4; + ST_SP(yv2, srcx); srcx += 4; + xv3 = LD_SP(px); px += 4; + ST_SP(yv3, srcx); srcx += 4; + xv4 = LD_SP(px); px += 4; + ST_SP(yv4, srcx); srcx += 4; + xv5 = LD_SP(px); px += 4; + ST_SP(yv5, srcx); srcx += 4; + xv6 = LD_SP(px); px += 4; + ST_SP(yv6, srcx); srcx += 4; + xv7 = LD_SP(px); px += 4; + ST_SP(yv7, srcx); srcx += 4; + } + + LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7); + ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4); + ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4); + } + + if (n & 31) + { + if ((n & 16) && (n & 8) && (n & 4)) + { + LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6); + LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6); + ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4); + ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4); + } + else if ((n & 16) && (n & 8)) + { + LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5); + LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5); + ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4); + ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4); + } + else if ((n & 16) && (n & 4)) + { + LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4); + LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4); + ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4); + ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4); + } + else if ((n & 8) && (n & 4)) + { + LD_SP3_INC(px, 4, xv0, xv1, xv2); + LD_SP3_INC(py, 4, yv0, yv1, yv2); + ST_SP3_INC(xv0, xv1, xv2, srcy, 4); + ST_SP3_INC(yv0, yv1, yv2, srcx, 4); + } + else if (n & 16) + { + LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3); + LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3); + ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4); + ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4); + } + else if (n & 8) + { + LD_SP2_INC(px, 4, xv0, xv1); + LD_SP2_INC(py, 4, yv0, yv1); + ST_SP2_INC(xv0, xv1, srcy, 4); + ST_SP2_INC(yv0, yv1, srcx, 4); + } + else if (n & 4) + { + xv0 = LD_SP(px); + yv0 = LD_SP(py); + + px += 4; + py += 4; + + ST_SP(xv0, srcy); + ST_SP(yv0, 
srcx); + + srcx += 4; + srcy += 4; + } + + if ((n & 2) && (n & 1)) + { + LD_GP3_INC(px, 1, x0, x1, x3); + LD_GP3_INC(py, 1, y0, y1, y3); + ST_GP3_INC(x0, x1, x3, srcy, 1); + ST_GP3_INC(y0, y1, y3, srcx, 1); + } + else if (n & 2) + { + LD_GP2_INC(px, 1, x0, x1); + LD_GP2_INC(py, 1, y0, y1); + ST_GP2_INC(x0, x1, srcy, 1); + ST_GP2_INC(y0, y1, srcx, 1); + } + else if (n & 1) + { + x0 = px[0]; + y0 = py[0]; + srcx[0] = y0; + srcy[0] = x0; + } + } + } + else + { + for (i = (n >> 3); i--;) + { + LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); + LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7); + ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y); + ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x); + } + + if (n & 7) + { + if ((n & 4) && (n & 2) && (n & 1)) + { + LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6); + LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6); + ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y); + ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x); + } + else if ((n & 4) && (n & 2)) + { + LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5); + LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5); + ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y); + ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x); + } + else if ((n & 4) && (n & 1)) + { + LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4); + LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4); + ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y); + ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x); + } + else if ((n & 2) && (n & 1)) + { + LD_GP3_INC(px, inc_x, x0, x1, x2); + LD_GP3_INC(py, inc_y, y0, y1, y2); + ST_GP3_INC(x0, x1, x2, srcy, inc_y); + ST_GP3_INC(y0, y1, y2, srcx, inc_x); + } + else if (n & 4) + { + LD_GP4_INC(px, inc_x, x0, x1, x2, x3); + LD_GP4_INC(py, inc_y, y0, y1, y2, y3); + ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y); + ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x); + } + else if (n & 2) + { + LD_GP2_INC(px, inc_x, x0, x1); + LD_GP2_INC(py, inc_y, y0, y1); + ST_GP2_INC(x0, x1, srcy, inc_y); + ST_GP2_INC(y0, y1, 
srcx, inc_x); + } + else if (n & 1) + { + x0 = *srcx; + y0 = *srcy; + + *srcx = y0; + *srcy = x0; + } + } + } + + return (0); +} diff --git a/kernel/mips/zaxpy_msa.c b/kernel/mips/zaxpy_msa.c new file mode 100644 index 0000000..f17748f --- /dev/null +++ b/kernel/mips/zaxpy_msa.c @@ -0,0 +1,494 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#if !defined(CONJ) + #define OP0 += + #define OP1 -= + #define OP2 += +#else + #define OP0 -= + #define OP1 += + #define OP2 -= +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) +{ + BLASLONG i, inc_x2, inc_y2; + FLOAT *py; + v2f64 x0, x1, x2, x3, x4, x5, x6, x7; + v2f64 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec; + v2f64 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i; + v2f64 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i; + FLOAT xd0, xd1, yd0, yd1; + + if (n < 0) return(0); + if ((da_r == 0.0) && (da_i == 0.0)) return(0); + + py = y; + + dar_vec = COPY_DOUBLE_TO_VECTOR(da_r); + dai_vec = COPY_DOUBLE_TO_VECTOR(da_i); + + if ((1 == inc_x) && (1 == inc_y)) + { + FLOAT *x_pref, *y_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 32; + + pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + y_pref = y + pref_offset + 32; + + for (i = (n >> 3); i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(y_pref, 0); + PREF_OFFSET(y_pref, 32); + PREF_OFFSET(y_pref, 64); + PREF_OFFSET(y_pref, 96); + x_pref += 16; + y_pref += 16; + + LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); + LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + PCKEVOD_D2_DP(x3, x2, x1r, x1i); + PCKEVOD_D2_DP(y3, y2, y1r, y1i); + PCKEVOD_D2_DP(x5, x4, x2r, x2i); + PCKEVOD_D2_DP(y5, y4, y2r, y2i); + 
PCKEVOD_D2_DP(x7, x6, x3r, x3i); + PCKEVOD_D2_DP(y7, y6, y3r, y3i); + + FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); + y0i OP0 dar_vec * x0i; + y1i OP0 dar_vec * x1i; + y2i OP0 dar_vec * x2i; + y3i OP0 dar_vec * x3i; + y0r OP1 dai_vec * x0i; + y1r OP1 dai_vec * x1i; + y2r OP1 dai_vec * x2i; + y3r OP1 dai_vec * x3i; + y0i OP2 dai_vec * x0r; + y1i OP2 dai_vec * x1r; + y2i OP2 dai_vec * x2r; + y3i OP2 dai_vec * x3r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ILVRL_D2_DP(y1i, y1r, y2, y3); + ILVRL_D2_DP(y2i, y2r, y4, y5); + ILVRL_D2_DP(y3i, y3r, y6, y7); + ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); + } + + if (n & 7) + { + if (n & 4) + { + LD_DP4_INC(x, 2, x0, x1, x2, x3); + LD_DP4_INC(py, 2, y0, y1, y2, y3); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + PCKEVOD_D2_DP(x3, x2, x1r, x1i); + PCKEVOD_D2_DP(y3, y2, y1r, y1i); + + FMADD2(x0r, x1r, dar_vec, y0r, y1r); + y0i OP0 dar_vec * x0i; + y1i OP0 dar_vec * x1i; + y0r OP1 dai_vec * x0i; + y1r OP1 dai_vec * x1i; + y0i OP2 dai_vec * x0r; + y1i OP2 dai_vec * x1r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ILVRL_D2_DP(y1i, y1r, y2, y3); + ST_DP4_INC(y0, y1, y2, y3, y, 2); + } + + if (n & 2) + { + LD_DP2_INC(x, 2, x0, x1); + LD_DP2_INC(py, 2, y0, y1); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + + y0r += dar_vec * x0r; + y0i OP0 dar_vec * x0i; + y0r OP1 dai_vec * x0i; + y0i OP2 dai_vec * x0r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ST_DP2_INC(y0, y1, y, 2); + } + + if (n & 1) + { + LD_GP2_INC(x, 1, xd0, xd1); + LD_GP2_INC(py, 1, yd0, yd1); + + yd0 += da_r * xd0; + yd1 OP0 da_r * xd1; + yd0 OP1 da_i * xd1; + yd1 OP2 da_i * xd0; + + ST_GP2_INC(yd0, yd1, y, 1); + } + } + } + else if (1 == inc_y) + { + FLOAT *y_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + y_pref = y + pref_offset + 32; + + inc_x2 = 2 * inc_x; 
+ + for (i = (n >> 3); i--;) + { + PREF_OFFSET(y_pref, 0); + PREF_OFFSET(y_pref, 32); + PREF_OFFSET(y_pref, 64); + PREF_OFFSET(y_pref, 96); + y_pref += 16; + + LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); + LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + PCKEVOD_D2_DP(x3, x2, x1r, x1i); + PCKEVOD_D2_DP(y3, y2, y1r, y1i); + PCKEVOD_D2_DP(x5, x4, x2r, x2i); + PCKEVOD_D2_DP(y5, y4, y2r, y2i); + PCKEVOD_D2_DP(x7, x6, x3r, x3i); + PCKEVOD_D2_DP(y7, y6, y3r, y3i); + + FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); + y0i OP0 dar_vec * x0i; + y1i OP0 dar_vec * x1i; + y2i OP0 dar_vec * x2i; + y3i OP0 dar_vec * x3i; + y0r OP1 dai_vec * x0i; + y1r OP1 dai_vec * x1i; + y2r OP1 dai_vec * x2i; + y3r OP1 dai_vec * x3i; + y0i OP2 dai_vec * x0r; + y1i OP2 dai_vec * x1r; + y2i OP2 dai_vec * x2r; + y3i OP2 dai_vec * x3r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ILVRL_D2_DP(y1i, y1r, y2, y3); + ILVRL_D2_DP(y2i, y2r, y4, y5); + ILVRL_D2_DP(y3i, y3r, y6, y7); + ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); + } + + if (n & 7) + { + if (n & 4) + { + LD_DP4_INC(x, inc_x2, x0, x1, x2, x3); + LD_DP4_INC(py, 2, y0, y1, y2, y3); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + PCKEVOD_D2_DP(x3, x2, x1r, x1i); + PCKEVOD_D2_DP(y3, y2, y1r, y1i); + + FMADD2(x0r, x1r, dar_vec, y0r, y1r); + y0i OP0 dar_vec * x0i; + y1i OP0 dar_vec * x1i; + y0r OP1 dai_vec * x0i; + y1r OP1 dai_vec * x1i; + y0i OP2 dai_vec * x0r; + y1i OP2 dai_vec * x1r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ILVRL_D2_DP(y1i, y1r, y2, y3); + ST_DP4_INC(y0, y1, y2, y3, y, 2); + } + + if (n & 2) + { + LD_DP2_INC(x, inc_x2, x0, x1); + LD_DP2_INC(py, 2, y0, y1); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + + y0r += dar_vec * x0r; + y0i OP0 dar_vec * x0i; + y0r OP1 dai_vec * x0i; + y0i OP2 dai_vec * x0r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ST_DP2_INC(y0, y1, y, 2); + } + + if (n & 1) + { + 
LD_GP2_INC(x, 1, xd0, xd1); + LD_GP2_INC(py, 1, yd0, yd1); + + yd0 += da_r * xd0; + yd1 OP0 da_r * xd1; + yd0 OP1 da_i * xd1; + yd1 OP2 da_i * xd0; + + ST_GP2_INC(yd0, yd1, y, 1); + } + } + } + else if (1 == inc_x) + { + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 32; + + inc_y2 = 2 * inc_y; + + for (i = (n >> 3); i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + x_pref += 16; + + LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); + LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + PCKEVOD_D2_DP(x3, x2, x1r, x1i); + PCKEVOD_D2_DP(y3, y2, y1r, y1i); + PCKEVOD_D2_DP(x5, x4, x2r, x2i); + PCKEVOD_D2_DP(y5, y4, y2r, y2i); + PCKEVOD_D2_DP(x7, x6, x3r, x3i); + PCKEVOD_D2_DP(y7, y6, y3r, y3i); + + FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); + y0i OP0 dar_vec * x0i; + y1i OP0 dar_vec * x1i; + y2i OP0 dar_vec * x2i; + y3i OP0 dar_vec * x3i; + y0r OP1 dai_vec * x0i; + y1r OP1 dai_vec * x1i; + y2r OP1 dai_vec * x2i; + y3r OP1 dai_vec * x3i; + y0i OP2 dai_vec * x0r; + y1i OP2 dai_vec * x1r; + y2i OP2 dai_vec * x2r; + y3i OP2 dai_vec * x3r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ILVRL_D2_DP(y1i, y1r, y2, y3); + ILVRL_D2_DP(y2i, y2r, y4, y5); + ILVRL_D2_DP(y3i, y3r, y6, y7); + ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); + } + + if (n & 7) + { + if (n & 4) + { + LD_DP4_INC(x, 2, x0, x1, x2, x3); + LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + PCKEVOD_D2_DP(x3, x2, x1r, x1i); + PCKEVOD_D2_DP(y3, y2, y1r, y1i); + + FMADD2(x0r, x1r, dar_vec, y0r, y1r); + y0i OP0 dar_vec * x0i; + y1i OP0 dar_vec * x1i; + y0r OP1 dai_vec * x0i; + y1r OP1 dai_vec * x1i; + 
y0i OP2 dai_vec * x0r; + y1i OP2 dai_vec * x1r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ILVRL_D2_DP(y1i, y1r, y2, y3); + ST_DP4_INC(y0, y1, y2, y3, y, inc_y2); + } + + if (n & 2) + { + LD_DP2_INC(x, 2, x0, x1); + LD_DP2_INC(py, inc_y2, y0, y1); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + + y0r += dar_vec * x0r; + y0i OP0 dar_vec * x0i; + y0r OP1 dai_vec * x0i; + y0i OP2 dai_vec * x0r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ST_DP2_INC(y0, y1, y, inc_y2); + } + + if (n & 1) + { + LD_GP2_INC(x, 1, xd0, xd1); + LD_GP2_INC(py, 1, yd0, yd1); + + yd0 += da_r * xd0; + yd1 OP0 da_r * xd1; + yd0 OP1 da_i * xd1; + yd1 OP2 da_i * xd0; + + ST_GP2_INC(yd0, yd1, y, 1); + } + } + } + else + { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + for (i = (n >> 3); i--;) + { + LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); + LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + PCKEVOD_D2_DP(x3, x2, x1r, x1i); + PCKEVOD_D2_DP(y3, y2, y1r, y1i); + PCKEVOD_D2_DP(x5, x4, x2r, x2i); + PCKEVOD_D2_DP(y5, y4, y2r, y2i); + PCKEVOD_D2_DP(x7, x6, x3r, x3i); + PCKEVOD_D2_DP(y7, y6, y3r, y3i); + + FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r); + y0i OP0 dar_vec * x0i; + y1i OP0 dar_vec * x1i; + y2i OP0 dar_vec * x2i; + y3i OP0 dar_vec * x3i; + y0r OP1 dai_vec * x0i; + y1r OP1 dai_vec * x1i; + y2r OP1 dai_vec * x2i; + y3r OP1 dai_vec * x3i; + y0i OP2 dai_vec * x0r; + y1i OP2 dai_vec * x1r; + y2i OP2 dai_vec * x2r; + y3i OP2 dai_vec * x3r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ILVRL_D2_DP(y1i, y1r, y2, y3); + ILVRL_D2_DP(y2i, y2r, y4, y5); + ILVRL_D2_DP(y3i, y3r, y6, y7); + ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); + } + + if (n & 7) + { + if (n & 4) + { + LD_DP4_INC(x, inc_x2, x0, x1, x2, x3); + LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + PCKEVOD_D2_DP(x3, x2, x1r, x1i); + PCKEVOD_D2_DP(y3, y2, y1r, y1i); + + 
FMADD2(x0r, x1r, dar_vec, y0r, y1r); + y0i OP0 dar_vec * x0i; + y1i OP0 dar_vec * x1i; + y0r OP1 dai_vec * x0i; + y1r OP1 dai_vec * x1i; + y0i OP2 dai_vec * x0r; + y1i OP2 dai_vec * x1r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ILVRL_D2_DP(y1i, y1r, y2, y3); + ST_DP4_INC(y0, y1, y2, y3, y, inc_y2); + } + + if (n & 2) + { + LD_DP2_INC(x, inc_x2, x0, x1); + LD_DP2_INC(py, inc_y2, y0, y1); + PCKEVOD_D2_DP(x1, x0, x0r, x0i); + PCKEVOD_D2_DP(y1, y0, y0r, y0i); + + y0r += dar_vec * x0r; + y0i OP0 dar_vec * x0i; + y0r OP1 dai_vec * x0i; + y0i OP2 dai_vec * x0r; + + ILVRL_D2_DP(y0i, y0r, y0, y1); + ST_DP2_INC(y0, y1, y, inc_y2); + } + + if (n & 1) + { + LD_GP2_INC(x, 1, xd0, xd1); + LD_GP2_INC(py, 1, yd0, yd1); + + yd0 += da_r * xd0; + yd1 OP0 da_r * xd1; + yd0 OP1 da_i * xd1; + yd1 OP2 da_i * xd0; + + ST_GP2_INC(yd0, yd1, y, 1); + } + } + } + + return (0); +} diff --git a/kernel/mips/zcopy_msa.c b/kernel/mips/zcopy_msa.c new file mode 100644 index 0000000..690732d --- /dev/null +++ b/kernel/mips/zcopy_msa.c @@ -0,0 +1,218 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + FLOAT f0, f1; + + if (n < 0) return (0); + + if ((1 == inc_x) && (1 == inc_y)) + { + if (n > 15) + { + FLOAT *x_pref; + BLASLONG pref_offset; + + pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1); + if (pref_offset > 0) + { + pref_offset = L1_DATA_LINESIZE - pref_offset; + pref_offset = pref_offset / sizeof(FLOAT); + } + x_pref = x + pref_offset + 64 + 16; + + LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); + for (i = (n >> 4) - 1; i--;) + { + PREF_OFFSET(x_pref, 0); + PREF_OFFSET(x_pref, 32); + PREF_OFFSET(x_pref, 64); + PREF_OFFSET(x_pref, 96); + PREF_OFFSET(x_pref, 128); + PREF_OFFSET(x_pref, 160); + PREF_OFFSET(x_pref, 192); + PREF_OFFSET(x_pref, 224); + x_pref += 32; + + x8 = LD_DP(x); x += 2; + ST_DP(x0, y); y += 2; + x9 = LD_DP(x); x += 2; + ST_DP(x1, y); y += 2; + x10 = LD_DP(x); x += 2; + ST_DP(x2, y); y += 2; + x11 = LD_DP(x); x += 2; + ST_DP(x3, y); y += 2; + x12 = LD_DP(x); x += 2; + ST_DP(x4, y); y 
+= 2; + x13 = LD_DP(x); x += 2; + ST_DP(x5, y); y += 2; + x14 = LD_DP(x); x += 2; + ST_DP(x6, y); y += 2; + x15 = LD_DP(x); x += 2; + ST_DP(x7, y); y += 2; + x0 = LD_DP(x); x += 2; + ST_DP(x8, y); y += 2; + x1 = LD_DP(x); x += 2; + ST_DP(x9, y); y += 2; + x2 = LD_DP(x); x += 2; + ST_DP(x10, y); y += 2; + x3 = LD_DP(x); x += 2; + ST_DP(x11, y); y += 2; + x4 = LD_DP(x); x += 2; + ST_DP(x12, y); y += 2; + x5 = LD_DP(x); x += 2; + ST_DP(x13, y); y += 2; + x6 = LD_DP(x); x += 2; + ST_DP(x14, y); y += 2; + x7 = LD_DP(x); x += 2; + ST_DP(x15, y); y += 2; + } + + x8 = LD_DP(x); x += 2; + x9 = LD_DP(x); x += 2; + ST_DP(x0, y); y += 2; + x10 = LD_DP(x); x += 2; + ST_DP(x1, y); y += 2; + x11 = LD_DP(x); x += 2; + ST_DP(x2, y); y += 2; + x12 = LD_DP(x); x += 2; + ST_DP(x3, y); y += 2; + x13 = LD_DP(x); x += 2; + ST_DP(x4, y); y += 2; + x14 = LD_DP(x); x += 2; + ST_DP(x5, y); y += 2; + x15 = LD_DP(x); x += 2; + ST_DP(x6, y); y += 2; + ST_DP(x7, y); y += 2; + + ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2); + } + + if (n & 15) + { + if (n & 8) + { + LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7); + ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2); + } + + if (n & 4) + { + LD_DP4_INC(x, 2, x0, x1, x2, x3); + ST_DP4_INC(x0, x1, x2, x3, y, 2); + } + + if (n & 2) + { + LD_DP2_INC(x, 2, x0, x1); + ST_DP2_INC(x0, x1, y, 2); + } + + if (n & 1) + { + LD_GP2_INC(x, 1, f0, f1); + ST_GP2_INC(f0, f1, y, 1); + } + } + } + else + { + inc_x *= 2; + inc_y *= 2; + + for (i = (n >> 4); i--;) + { + x0 = LD_DP(x); x += inc_x; + x1 = LD_DP(x); x += inc_x; + x2 = LD_DP(x); x += inc_x; + x3 = LD_DP(x); x += inc_x; + x4 = LD_DP(x); x += inc_x; + x5 = LD_DP(x); x += inc_x; + x6 = LD_DP(x); x += inc_x; + x7 = LD_DP(x); x += inc_x; + x8 = LD_DP(x); x += inc_x; + ST_DP(x0, y); y += inc_y; + x9 = LD_DP(x); x += inc_x; + ST_DP(x1, y); y += inc_y; + x10 = LD_DP(x); x += inc_x; + ST_DP(x2, y); y += inc_y; + x11 = LD_DP(x); x += inc_x; + ST_DP(x3, y); y += inc_y; + x12 = LD_DP(x); x += inc_x; + 
ST_DP(x4, y); y += inc_y; + x13 = LD_DP(x); x += inc_x; + ST_DP(x5, y); y += inc_y; + x14 = LD_DP(x); x += inc_x; + ST_DP(x6, y); y += inc_y; + x15 = LD_DP(x); x += inc_x; + ST_DP(x7, y); y += inc_y; + ST_DP(x8, y); y += inc_y; + ST_DP(x9, y); y += inc_y; + ST_DP(x10, y); y += inc_y; + ST_DP(x11, y); y += inc_y; + ST_DP(x12, y); y += inc_y; + ST_DP(x13, y); y += inc_y; + ST_DP(x14, y); y += inc_y; + ST_DP(x15, y); y += inc_y; + } + + if (n & 15) + { + if (n & 8) + { + LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x7); + ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, inc_y); + } + + if (n & 4) + { + LD_DP4_INC(x, inc_x, x0, x1, x2, x3); + ST_DP4_INC(x0, x1, x2, x3, y, inc_y); + } + + if (n & 2) + { + LD_DP2_INC(x, inc_x, x0, x1); + ST_DP2_INC(x0, x1, y, inc_y); + } + + if (n & 1) + { + LD_GP2_INC(x, 1, f0, f1); + ST_GP2_INC(f0, f1, y, 1); + } + } + } + + return (0); +} diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c new file mode 100644 index 0000000..5a8766d --- /dev/null +++ b/kernel/mips/zscal_msa.c @@ -0,0 +1,717 @@ +/******************************************************************************* +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

/* This will shuffle the elements in 'in' vector as (mask needed :: 01 00 11 10)
   0 1 2 3 => 2 3 0 1
   For v2f64 the mask acts on 32-bit words, so it swaps the two 64-bit doubles
   of the register, i.e. swaps real <-> imag of one double-complex element. */
#define SHF_78 78

/* Complex in-place scale: x[i] = (da_r + I*da_i) * x[i] for n elements.
 *
 * NOTE(review): the diff header names this file caxpy_msa.c, but the body
 * operates on v2f64 (double) vectors, writes only x, and never touches y —
 * these are zscal semantics, not caxpy. Verify against the full upstream
 * commit; the web export appears to have spliced file contents.
 *
 * Parameters actually used: n (element count), da_r/da_i (scalar), x/inc_x
 * (data and complex-element stride). y, inc_y and the dummy* parameters are
 * part of the common kernel signature and are unused here.
 *
 * Vectorization scheme (grounded in the code below):
 *   - one v2f64 register holds one complex double {re, im};
 *   - da_i_vec is built via __msa_ilvev_d from {da_i} and {-da_i} so that one
 *     lane is negated (exact lane order is endian/intrinsic-defined — pairs
 *     with the SHF_78 swap below);
 *   - pure-imaginary alpha:   result = shuffle(x * {da_i, -da_i})
 *   - pure-real alpha:        result = x * da_r        (no shuffle needed)
 *   - general alpha:          result = x*da_r + shuffle(x * {da_i, -da_i})
 *     computed with FMADD4.
 * The main loops are hand-software-pipelined: loads, multiplies, shuffles and
 * stores of successive registers are interleaved so independent operations
 * overlap; statement order is deliberate — do not "clean it up".
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2)
{
    BLASLONG i, inc_x2;
    FLOAT *px;                       /* read cursor; x itself is the write cursor */
    FLOAT tp0, tp1, f0, f1;          /* scalar tail temporaries */
    v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
    v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
    v2f64 da_i_vec, da_i_vec_neg, da_r_vec;

    px = x;

    if (1 == inc_x)
    {
        if ((0.0 == da_r) && (0.0 == da_i))
        {
            /* alpha == 0: BLAS scal semantics — just zero-fill x. */
            v2f64 zero_v = __msa_cast_to_vector_double(0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);

            /* 16 complex elements (two 8-store groups) per iteration */
            for (i = (n >> 4); i--;)
            {
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 2);
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 2);
            }

            /* remaining 0..15 elements, peeled by bit */
            if (n & 15)
            {
                if (n & 8)
                {
                    ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                               zero_v, zero_v, x, 2);
                }

                if (n & 4)
                {
                    ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
                }

                if (n & 2)
                {
                    ST_DP2_INC(zero_v, zero_v, x, 2);
                }

                if (n & 1)
                {
                    ST_DP(zero_v, x);
                }
            }
        }
        else if (0.0 == da_r)
        {
            /* pure-imaginary alpha: (re,im) -> (-da_i*im, da_i*re)
               = shuffle of x * {da_i, -da_i} */
            da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

            if (n > 15)
            {
                FLOAT *x_pref;
                BLASLONG pref_offset;

                /* align the prefetch pointer to the next cache line */
                pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
                if (pref_offset > 0)
                {
                    pref_offset = L1_DATA_LINESIZE - pref_offset;
                    pref_offset = pref_offset / sizeof(FLOAT);
                }
                x_pref = x + pref_offset + 32 + 16;

                /* prologue: preload first 8 registers, loop runs one
                   iteration short, epilogue below drains the pipeline */
                LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                for (i = (n >> 4) - 1; i--;)
                {
                    PREF_OFFSET(x_pref, 0);
                    PREF_OFFSET(x_pref, 32);
                    PREF_OFFSET(x_pref, 64);
                    PREF_OFFSET(x_pref, 96);
                    PREF_OFFSET(x_pref, 128);
                    PREF_OFFSET(x_pref, 160);
                    PREF_OFFSET(x_pref, 192);
                    PREF_OFFSET(x_pref, 224);
                    x_pref += 32;

                    /* interleaved: load x8..x15 / scale x0..x15 / shuffle /
                       store x0..x15 / reload x0..x7 for the next iteration */
                    x8 = LD_DP(px); px += 2;
                    x0 *= da_i_vec;
                    x9 = LD_DP(px); px += 2;
                    x1 *= da_i_vec;
                    x10 = LD_DP(px); px += 2;
                    x2 *= da_i_vec;
                    x11 = LD_DP(px); px += 2;
                    x3 *= da_i_vec;
                    x12 = LD_DP(px); px += 2;
                    x4 *= da_i_vec;
                    x13 = LD_DP(px); px += 2;
                    x5 *= da_i_vec;
                    x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78);
                    x14 = LD_DP(px); px += 2;
                    x6 *= da_i_vec;
                    x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78);
                    x15 = LD_DP(px); px += 2;
                    x7 *= da_i_vec;
                    x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78);
                    x8 *= da_i_vec;
                    x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78);
                    ST_DP(x0, x); x += 2;
                    x9 *= da_i_vec;
                    x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78);
                    ST_DP(x1, x); x += 2;
                    x10 *= da_i_vec;
                    x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78);
                    ST_DP(x2, x); x += 2;
                    x11 *= da_i_vec;
                    x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78);
                    ST_DP(x3, x); x += 2;
                    x12 *= da_i_vec;
                    x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78);
                    ST_DP(x4, x); x += 2;
                    x13 *= da_i_vec;
                    x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78);
                    ST_DP(x5, x); x += 2;
                    x14 *= da_i_vec;
                    x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78);
                    ST_DP(x6, x); x += 2;
                    x15 *= da_i_vec;
                    x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78);
                    ST_DP(x7, x); x += 2;
                    x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78);
                    ST_DP(x8, x); x += 2;
                    x0 = LD_DP(px); px += 2;
                    x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78);
                    ST_DP(x9, x); x += 2;
                    x1 = LD_DP(px); px += 2;
                    x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78);
                    ST_DP(x10, x); x += 2;
                    x2 = LD_DP(px); px += 2;
                    x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78);
                    ST_DP(x11, x); x += 2;
                    x3 = LD_DP(px); px += 2;
                    x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78);
                    ST_DP(x12, x); x += 2;
                    x4 = LD_DP(px); px += 2;
                    ST_DP(x13, x); x += 2;
                    x5 = LD_DP(px); px += 2;
                    ST_DP(x14, x); x += 2;
                    x6 = LD_DP(px); px += 2;
                    ST_DP(x15, x); x += 2;
                    x7 = LD_DP(px); px += 2;
                }

                /* epilogue: x0..x7 are already loaded; finish the last
                   16-element group without further prefetch */
                LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     x12, x13, x14, x15);
                SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
                SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
                SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
                ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, 2);
            }

            /* remaining 0..15 elements */
            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         x4, x5, x6, x7);
                    SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                    SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, 2, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                    ST_DP4_INC(x0, x1, x2, x3, x, 2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, 2, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
                    SHF_W2_DP(x0, x1, x0, x1, SHF_78);
                    ST_DP2_INC(x0, x1, x, 2);
                }

                if (n & 1)
                {
                    /* scalar: (f0,f1) -> (-da_i*f1, da_i*f0); the swap is done
                       by storing (f1, f0) after scaling */
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_i, f1, -da_i, f0, f1);
                    ST_GP2_INC(f1, f0, x, 1);
                }
            }
        }
        else if (0.0 == da_i)
        {
            /* pure-real alpha: plain elementwise multiply, no shuffle */
            da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

            if (n > 15)
            {
                FLOAT *x_pref;
                BLASLONG pref_offset;

                pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
                if (pref_offset > 0)
                {
                    pref_offset = L1_DATA_LINESIZE - pref_offset;
                    pref_offset = pref_offset / sizeof(FLOAT);
                }
                x_pref = x + pref_offset + 32 + 16;

                /* same pipeline shape as the pure-imaginary branch,
                   minus the shuffles */
                LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                for (i = (n >> 4) - 1; i--;)
                {
                    PREF_OFFSET(x_pref, 0);
                    PREF_OFFSET(x_pref, 32);
                    PREF_OFFSET(x_pref, 64);
                    PREF_OFFSET(x_pref, 96);
                    PREF_OFFSET(x_pref, 128);
                    PREF_OFFSET(x_pref, 160);
                    PREF_OFFSET(x_pref, 192);
                    PREF_OFFSET(x_pref, 224);
                    x_pref += 32;

                    x8 = LD_DP(px); px += 2;
                    x0 *= da_r_vec;
                    x9 = LD_DP(px); px += 2;
                    x1 *= da_r_vec;
                    x10 = LD_DP(px); px += 2;
                    x2 *= da_r_vec;
                    x11 = LD_DP(px); px += 2;
                    x3 *= da_r_vec;
                    x12 = LD_DP(px); px += 2;
                    x4 *= da_r_vec;
                    x13 = LD_DP(px); px += 2;
                    x5 *= da_r_vec;
                    ST_DP(x0, x); x += 2;
                    x14 = LD_DP(px); px += 2;
                    x6 *= da_r_vec;
                    ST_DP(x1, x); x += 2;
                    x15 = LD_DP(px); px += 2;
                    x7 *= da_r_vec;
                    ST_DP(x2, x); x += 2;
                    x8 *= da_r_vec;
                    ST_DP(x3, x); x += 2;
                    x9 *= da_r_vec;
                    ST_DP(x4, x); x += 2;
                    x10 *= da_r_vec;
                    ST_DP(x5, x); x += 2;
                    x11 *= da_r_vec;
                    ST_DP(x6, x); x += 2;
                    x12 *= da_r_vec;
                    ST_DP(x7, x); x += 2;
                    x13 *= da_r_vec;
                    ST_DP(x8, x); x += 2;
                    x0 = LD_DP(px); px += 2;
                    x14 *= da_r_vec;
                    ST_DP(x9, x); x += 2;
                    x1 = LD_DP(px); px += 2;
                    x15 *= da_r_vec;
                    ST_DP(x10, x); x += 2;
                    x2 = LD_DP(px); px += 2;
                    ST_DP(x11, x); x += 2;
                    x3 = LD_DP(px); px += 2;
                    ST_DP(x12, x); x += 2;
                    x4 = LD_DP(px); px += 2;
                    ST_DP(x13, x); x += 2;
                    x5 = LD_DP(px); px += 2;
                    ST_DP(x14, x); x += 2;
                    x6 = LD_DP(px); px += 2;
                    ST_DP(x15, x); x += 2;
                    x7 = LD_DP(px); px += 2;
                }

                /* epilogue drains the preloaded x0..x7 */
                LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
                     x12, x13, x14, x15);
                ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, 2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                         x4, x5, x6, x7);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, 2, x0, x1, x2, x3);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    ST_DP4_INC(x0, x1, x2, x3, x, 2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, 2, x0, x1);
                    MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
                    ST_DP2_INC(x0, x1, x, 2);
                }

                if (n & 1)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_r, f1, da_r, f0, f1);
                    ST_GP2_INC(f0, f1, x, 1);
                }
            }
        }
        else
        {
            /* general complex alpha:
               result = x*da_r + shuffle(x * {da_i, -da_i})  (via FMADD4) */
            FLOAT *x_pref;
            BLASLONG pref_offset;

            pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
            if (pref_offset > 0)
            {
                pref_offset = L1_DATA_LINESIZE - pref_offset;
                pref_offset = pref_offset / sizeof(FLOAT);
            }
            x_pref = x + pref_offset + 32;

            da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

            da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

            /* 16 complex elements per iteration; d-registers hold the
               imaginary-part product so x-registers stay live for the FMA */
            for (i = (n >> 4); i--;)
            {
                PREF_OFFSET(x_pref, 0);
                PREF_OFFSET(x_pref, 32);
                PREF_OFFSET(x_pref, 64);
                PREF_OFFSET(x_pref, 96);
                PREF_OFFSET(x_pref, 128);
                PREF_OFFSET(x_pref, 160);
                PREF_OFFSET(x_pref, 192);
                PREF_OFFSET(x_pref, 224);
                x_pref += 32;

                LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
                            x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     d0, d1, d2, d3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     d4, d5, d6, d7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     d8, d9, d10, d11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     d12, d13, d14, d15);
                SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
                SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
                SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
                FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
                FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
                ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
                            d12, d13, d14, d15, x, 2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         d4, d5, d6, d7);
                    SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                    SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                    ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, 2, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    ST_DP4_INC(d0, d1, d2, d3, x, 2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, 2, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
                    SHF_W2_DP(d0, d1, d0, d1, SHF_78);
                    FMADD2(x0, x1, da_r_vec, d0, d1);
                    ST_DP2_INC(d0, d1, x, 2);
                }

                if (n & 1)
                {
                    /* scalar complex multiply: (tp0,tp1) = alpha * (f0,f1) */
                    LD_GP2_INC(px, 1, f0, f1);

                    tp0 = da_r * f0;
                    tp0 -= da_i * f1;
                    tp1 = da_r * f1;
                    tp1 += da_i * f0;

                    ST_GP2_INC(tp0, tp1, x, 1);
                }
            }
        }
    }
    else
    {
        /* non-unit stride: same four alpha cases, straightforward
           load/compute/store (no software pipelining or prefetch).
           inc_x is in complex elements, so the FLOAT stride is 2*inc_x. */
        inc_x2 = 2 * inc_x;

        if ((0.0 == da_r) && (0.0 == da_i))
        {
            v2f64 zero_v = __msa_cast_to_vector_double(0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);

            for (i = (n >> 4); i--;)
            {
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, inc_x2);
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, inc_x2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                               zero_v, zero_v, x, inc_x2);
                }

                if (n & 4)
                {
                    ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2);
                }

                if (n & 2)
                {
                    ST_DP2_INC(zero_v, zero_v, x, inc_x2);
                }

                if (n & 1)
                {
                    ST_DP(zero_v, x);
                }
            }
        }
        else if (0.0 == da_r)
        {
            da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

            for (i = (n >> 4); i--;)
            {
                LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
                            x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     x12, x13, x14, x15);
                SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
                SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
                SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
                ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, inc_x2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         x4, x5, x6, x7);
                    SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                    SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                    ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, inc_x2, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
                    SHF_W2_DP(x0, x1, x0, x1, SHF_78);
                    ST_DP2_INC(x0, x1, x, inc_x2);
                }

                if (n & 1)
                {
                    /* last element: real/imag pair is contiguous, so the
                       scalar tail uses increment 1 regardless of inc_x */
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_i, f1, -da_i, f0, f1);
                    ST_GP2_INC(f1, f0, x, 1);
                }
            }
        }
        else if (0.0 == da_i)
        {
            da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

            for (i = (n >> 4); i--;)
            {
                LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
                            x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
                     x12, x13, x14, x15);
                ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, inc_x2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                         x4, x5, x6, x7);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, inc_x2, x0, x1);
                    MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
                    ST_DP2_INC(x0, x1, x, inc_x2);
                }

                if (n & 1)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_r, f1, da_r, f0, f1);
                    ST_GP2_INC(f0, f1, x, 1);
                }
            }
        }
        else
        {
            /* general complex alpha, strided */
            da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

            da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

            for (i = (n >> 4); i--;)
            {
                LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
                            x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     d0, d1, d2, d3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     d4, d5, d6, d7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     d8, d9, d10, d11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     d12, d13, d14, d15);
                SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
                SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
                SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
                FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
                FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
                ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
                            d12, d13, d14, d15, x, inc_x2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         d4, d5, d6, d7);
                    SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                    SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                    ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    ST_DP4_INC(d0, d1, d2, d3, x, inc_x2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, inc_x2, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
                    SHF_W2_DP(d0, d1, d0, d1, SHF_78);
                    FMADD2(x0, x1, da_r_vec, d0, d1);
                    ST_DP2_INC(d0, d1, x, inc_x2);
                }

                if (n & 1)
                {
                    LD_GP2_INC(px, 1, f0, f1);

                    tp0 = da_r * f0;
                    tp0 -= da_i * f1;
                    tp1 = da_r * f1;
                    tp1 += da_i * f0;

                    ST_GP2_INC(tp0, tp1, x, 1);
                }
            }
        }
    }

    return (0);
}
diff --git a/kernel/mips/zswap_msa.c b/kernel/mips/zswap_msa.c
new file mode 100644
index 0000000..eaf7ec1
--- /dev/null
+++ b/kernel/mips/zswap_msa.c
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3.
Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#include "common.h"
#include "macros_msa.h"

/* ZSWAP: exchange the contents of two double-complex vectors,
 * srcx[i] <-> srcy[i] for n elements.
 *
 * Parameters actually used: n, srcx/inc_x, srcy/inc_y (strides in complex
 * elements, so the FLOAT stride is 2*inc). dummy0..dummy4 and dummy are the
 * unused slots of the common BLAS kernel signature.
 *
 * One v2f64 register holds one complex double {re, im}. The unit-stride main
 * loop is software-pipelined: x0..x7 are preloaded, then each iteration
 * interleaves loads from one array with stores to the other so that loads
 * always run ahead of the stores that overwrite the same memory — statement
 * order is deliberate.
 *
 * Returns 0 always (BLAS convention). n == 0 falls through all loops and
 * tails harmlessly; only negative n takes the early return.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
          FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy,
          BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG i, inc_x2, inc_y2, pref_offsetx, pref_offsety;
    FLOAT *px, *py;                  /* read cursors; srcx/srcy are write cursors */
    v2f64 x0, x1, x2, x3, x4, x5, x6, x7;
    v2f64 y0, y1, y2, y3, y4, y5, y6, y7;

    if (n < 0) return (0);

    /* distance (in FLOATs) from each base pointer to the next cache line,
       used to keep the PREFETCH addresses line-aligned */
    pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
    if (pref_offsetx > 0)
    {
        pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
        pref_offsetx = pref_offsetx / sizeof(FLOAT);
    }

    pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
    if (pref_offsety > 0)
    {
        pref_offsety = L1_DATA_LINESIZE - pref_offsety;
        pref_offsety = pref_offsety / sizeof(FLOAT);
    }

    inc_x2 = 2 * inc_x;
    inc_y2 = 2 * inc_y;

    px = srcx;
    py = srcy;

    if ((1 == inc_x) && (1 == inc_y))
    {
        /* 8 complex elements per iteration, pipelined */
        if (n >> 3)
        {
            /* prologue: preload x0..x7; the loop runs one iteration short
               and the epilogue below drains the pipeline */
            LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);

            for (i = (n >> 3) - 1; i--;)
            {
                PREFETCH(px + pref_offsetx + 16);
                PREFETCH(px + pref_offsetx + 20);
                PREFETCH(px + pref_offsetx + 24);
                PREFETCH(px + pref_offsetx + 28);

                PREFETCH(py + pref_offsety + 16);
                PREFETCH(py + pref_offsety + 20);
                PREFETCH(py + pref_offsety + 24);
                PREFETCH(py + pref_offsety + 28);

                /* each y-load precedes the x-store to the same slot */
                y0 = LD_DP(py); py += 2;
                ST_DP(x0, srcy); srcy += 2;
                y1 = LD_DP(py); py += 2;
                ST_DP(x1, srcy); srcy += 2;
                y2 = LD_DP(py); py += 2;
                ST_DP(x2, srcy); srcy += 2;
                y3 = LD_DP(py); py += 2;
                ST_DP(x3, srcy); srcy += 2;
                y4 = LD_DP(py); py += 2;
                ST_DP(x4, srcy); srcy += 2;
                y5 = LD_DP(py); py += 2;
                ST_DP(x5, srcy); srcy += 2;
                y6 = LD_DP(py); py += 2;
                ST_DP(x6, srcy); srcy += 2;
                y7 = LD_DP(py); py += 2;
                ST_DP(x7, srcy); srcy += 2;

                /* reload next x group while writing y back into x */
                x0 = LD_DP(px); px += 2;
                ST_DP(y0, srcx); srcx += 2;
                x1 = LD_DP(px); px += 2;
                ST_DP(y1, srcx); srcx += 2;
                x2 = LD_DP(px); px += 2;
                ST_DP(y2, srcx); srcx += 2;
                x3 = LD_DP(px); px += 2;
                ST_DP(y3, srcx); srcx += 2;
                x4 = LD_DP(px); px += 2;
                ST_DP(y4, srcx); srcx += 2;
                x5 = LD_DP(px); px += 2;
                ST_DP(y5, srcx); srcx += 2;
                x6 = LD_DP(px); px += 2;
                ST_DP(y6, srcx); srcx += 2;
                x7 = LD_DP(px); px += 2;
                ST_DP(y7, srcx); srcx += 2;
            }

            /* epilogue: x0..x7 already loaded; swap the final group */
            LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
            ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, 2);
            ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, 2);
        }

        /* remaining 0..7 elements: one branch per exact count (7,6,5,3)
           where multiple tail bits are set, else per bit (4,2,1) */
        if (n & 7)
        {
            if ((n & 4) && (n & 2) && (n & 1))
            {
                LD_DP7_INC(px, 2, x0, x1, x2, x3, x4, x5, x6);
                LD_DP7_INC(py, 2, y0, y1, y2, y3, y4, y5, y6);
                ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, 2);
                ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, 2);
            }
            else if ((n & 4) && (n & 2))
            {
                LD_DP6_INC(px, 2, x0, x1, x2, x3, x4, x5);
                LD_DP6_INC(py, 2, y0, y1, y2, y3, y4, y5);
                ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, 2);
                ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, 2);
            }
            else if ((n & 4) && (n & 1))
            {
                LD_DP5_INC(px, 2, x0, x1, x2, x3, x4);
                LD_DP5_INC(py, 2, y0, y1, y2, y3, y4);
                ST_DP5_INC(x0, x1, x2, x3, x4, srcy, 2);
                ST_DP5_INC(y0, y1, y2, y3, y4, srcx, 2);
            }
            else if ((n & 2) && (n & 1))
            {
                LD_DP3_INC(px, 2, x0, x1, x2);
                LD_DP3_INC(py, 2, y0, y1, y2);
                ST_DP3_INC(x0, x1, x2, srcy, 2);
                ST_DP3_INC(y0, y1, y2, srcx, 2);
            }
            else if (n & 4)
            {
                LD_DP4_INC(px, 2, x0, x1, x2, x3);
                LD_DP4_INC(py, 2, y0, y1, y2, y3);
                ST_DP4_INC(x0, x1, x2, x3, srcy, 2);
                ST_DP4_INC(y0, y1, y2, y3, srcx, 2);
            }
            else if (n & 2)
            {
                LD_DP2_INC(px, 2, x0, x1);
                LD_DP2_INC(py, 2, y0, y1);
                ST_DP2_INC(x0, x1, srcy, 2);
                ST_DP2_INC(y0, y1, srcx, 2);
            }
            else if (n & 1)
            {
                x0 = LD_DP(px);
                y0 = LD_DP(py);
                ST_DP(y0, srcx);
                ST_DP(x0, srcy);
            }
        }
    }
    else
    {
        /* non-unit stride(s): straightforward load-both/store-both groups,
           no prefetch or pipelining */
        for (i = (n >> 3); i--;)
        {
            LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
            LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
            ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y2);
            ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x2);
        }

        if (n & 7)
        {
            if ((n & 4) && (n & 2) && (n & 1))
            {
                LD_DP7_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6);
                LD_DP7_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6);
                ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y2);
                ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x2);
            }
            else if ((n & 4) && (n & 2))
            {
                LD_DP6_INC(px, inc_x2, x0, x1, x2, x3, x4, x5);
                LD_DP6_INC(py, inc_y2, y0, y1, y2, y3, y4, y5);
                ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y2);
                ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x2);
            }
            else if ((n & 4) && (n & 1))
            {
                LD_DP5_INC(px, inc_x2, x0, x1, x2, x3, x4);
                LD_DP5_INC(py, inc_y2, y0, y1, y2, y3, y4);
                ST_DP5_INC(x0, x1, x2, x3, x4, srcy, inc_y2);
                ST_DP5_INC(y0, y1, y2, y3, y4, srcx, inc_x2);
            }
            else if ((n & 2) && (n & 1))
            {
                LD_DP3_INC(px, inc_x2, x0, x1, x2);
                LD_DP3_INC(py, inc_y2, y0, y1, y2);
                ST_DP3_INC(x0, x1, x2, srcy, inc_y2);
                ST_DP3_INC(y0, y1, y2, srcx, inc_x2);
            }
            else if (n & 4)
            {
                LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
                LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
                ST_DP4_INC(x0, x1, x2, x3, srcy, inc_y2);
                ST_DP4_INC(y0, y1, y2, y3, srcx, inc_x2);
            }
            else if (n & 2)
            {
                LD_DP2_INC(px, inc_x2, x0, x1);
                LD_DP2_INC(py, inc_y2, y0, y1);
                ST_DP2_INC(x0, x1, srcy, inc_y2);
                ST_DP2_INC(y0, y1, srcx, inc_x2);
            }
            else if (n & 1)
            {
                x0 = LD_DP(px);
                y0 = LD_DP(py);
                ST_DP(y0, srcx);
                ST_DP(x0, srcy);
            }
        }
    }

    return (0);
}