ZASUMKERNEL = ../mips/asum.c
endif
+# Select the MSA (MIPS SIMD Architecture) kernel sources when HAVE_MSA is
+# set, falling back to the generic C implementations otherwise.
+ifdef HAVE_MSA
+SAXPYKERNEL = ../mips/saxpy_msa.c
+DAXPYKERNEL = ../mips/daxpy_msa.c
+CAXPYKERNEL = ../mips/caxpy_msa.c
+ZAXPYKERNEL = ../mips/zaxpy_msa.c
+else
SAXPYKERNEL = ../mips/axpy.c
DAXPYKERNEL = ../mips/axpy.c
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
+endif
+ifdef HAVE_MSA
+SCOPYKERNEL = ../mips/scopy_msa.c
+DCOPYKERNEL = ../mips/dcopy_msa.c
+CCOPYKERNEL = ../mips/ccopy_msa.c
+ZCOPYKERNEL = ../mips/zcopy_msa.c
+else
SCOPYKERNEL = ../mips/copy.c
DCOPYKERNEL = ../mips/copy.c
CCOPYKERNEL = ../mips/zcopy.c
ZCOPYKERNEL = ../mips/zcopy.c
+endif
ifdef HAVE_MSA
SDOTKERNEL = ../mips/sdot_msa.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
+ifdef HAVE_MSA
+SSCALKERNEL = ../mips/sscal_msa.c
+DSCALKERNEL = ../mips/dscal_msa.c
+CSCALKERNEL = ../mips/cscal_msa.c
+ZSCALKERNEL = ../mips/zscal_msa.c
+else
SSCALKERNEL = ../mips/scal.c
DSCALKERNEL = ../mips/scal.c
CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c
+endif
+# NOTE(review): unlike the other HAVE_MSA branches, this one selects the
+# plain C swap sources (no *_msa.c files are added by this change) —
+# presumably placeholders until MSA swap kernels exist; confirm the
+# ../mips/*swap_msa.c files are present before pointing these at them.
+ifdef HAVE_MSA
+SSWAPKERNEL = ../mips/sswap.c
+DSWAPKERNEL = ../mips/dswap.c
+CSWAPKERNEL = ../mips/cswap.c
+ZSWAPKERNEL = ../mips/zswap.c
+else
SSWAPKERNEL = ../mips/swap.c
DSWAPKERNEL = ../mips/swap.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
+endif
ifdef HAVE_MSA
SGEMVNKERNEL = ../mips/sgemv_n_msa.c
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Sign selectors for the complex multiply-accumulate in the kernel below.
+   Without CONJ the code computes the standard complex AXPY update:
+       y_r += da_r*x_r - da_i*x_i      (OP1 supplies the '-')
+       y_i += da_r*x_i + da_i*x_r      (OP0 and OP2 supply the '+')
+   With CONJ defined, the signs of all three of these terms are inverted
+   (the conjugating variant of this kernel). */
+#if !defined(CONJ)
+    #define OP0  +=
+    #define OP1  -=
+    #define OP2  +=
+#else
+    #define OP0  -=
+    #define OP1  +=
+    #define OP2  -=
+#endif
+
+/* Complex single-precision AXPY kernel (MIPS MSA):  y := y + da * x,
+   where da = (da_r, da_i) and n is the element count in complex units
+   (re/im interleaved in memory).  The OP0/OP1/OP2 macros above choose
+   the sign convention (CONJ variant).  dummy0/dummy1/dummy/dummy2 only
+   exist to match the common BLAS kernel signature and are unused.
+   Always returns 0. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
+          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+          BLASLONG dummy2)
+{
+    BLASLONG i, inc_x2, inc_y2;
+    FLOAT *py;
+    v4f32 x0, x1, x2, x3, x4, x5, x6, x7;
+    v4f32 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
+    v4f32 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
+    v4f32 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
+    FLOAT xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7;
+    FLOAT yd0, yd1, yd2, yd3, yd4, yd5, yd6, yd7;
+
+    /* Nothing to do for empty input or a zero scalar (y is unchanged). */
+    if (n < 0) return(0);
+    if ((da_r == 0.0) && (da_i == 0.0)) return(0);
+
+    /* py trails as the read cursor for y; the stores advance y itself,
+       so each element of y is read before it is rewritten. */
+    py = y;
+
+    /* Fast path: both vectors contiguous — 16 complex elements
+       (8 MSA vectors of 4 floats) per main-loop iteration. */
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        FLOAT *x_pref, *y_pref;
+        BLASLONG pref_offset;
+
+        /* Round the prefetch pointers up to the next L1 cache-line
+           boundary, then prefetch 64 floats ahead of the cursors. */
+        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+            pref_offset = pref_offset / sizeof(FLOAT);
+        }
+        x_pref = x + pref_offset + 64;
+
+        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+            pref_offset = pref_offset / sizeof(FLOAT);
+        }
+        y_pref = y + pref_offset + 64;
+
+        dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
+        dai_vec = COPY_FLOAT_TO_VECTOR(da_i);
+
+        /* Deinterleave re/im (PCKEVOD), run the complex MAC on separate
+           real/imaginary vectors, re-interleave (ILVRL), then store. */
+        for (i = (n >> 4); i--;)
+        {
+            PREF_OFFSET(x_pref, 0);
+            PREF_OFFSET(x_pref, 32);
+            PREF_OFFSET(x_pref, 64);
+            PREF_OFFSET(x_pref, 96);
+            PREF_OFFSET(y_pref, 0);
+            PREF_OFFSET(y_pref, 32);
+            PREF_OFFSET(y_pref, 64);
+            PREF_OFFSET(y_pref, 96);
+            x_pref += 32;
+            y_pref += 32;
+
+            LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+            LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
+            PCKEVOD_W2_SP(x1, x0, x0r, x0i);
+            PCKEVOD_W2_SP(y1, y0, y0r, y0i);
+            PCKEVOD_W2_SP(x3, x2, x1r, x1i);
+            PCKEVOD_W2_SP(y3, y2, y1r, y1i);
+            PCKEVOD_W2_SP(x5, x4, x2r, x2i);
+            PCKEVOD_W2_SP(y5, y4, y2r, y2i);
+            PCKEVOD_W2_SP(x7, x6, x3r, x3i);
+            PCKEVOD_W2_SP(y7, y6, y3r, y3i);
+
+            /* y_r += da_r*x_r; the OP lines add the remaining three
+               products with signs chosen by the CONJ macros. */
+            FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
+            y0i OP0 dar_vec * x0i;
+            y1i OP0 dar_vec * x1i;
+            y2i OP0 dar_vec * x2i;
+            y3i OP0 dar_vec * x3i;
+            y0r OP1 dai_vec * x0i;
+            y1r OP1 dai_vec * x1i;
+            y2r OP1 dai_vec * x2i;
+            y3r OP1 dai_vec * x3i;
+            y0i OP2 dai_vec * x0r;
+            y1i OP2 dai_vec * x1r;
+            y2i OP2 dai_vec * x2r;
+            y3i OP2 dai_vec * x3r;
+
+            ILVRL_W2_SP(y0i, y0r, y0, y1);
+            ILVRL_W2_SP(y1i, y1r, y2, y3);
+            ILVRL_W2_SP(y2i, y2r, y4, y5);
+            ILVRL_W2_SP(y3i, y3r, y6, y7);
+            ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
+        }
+
+        /* Remainder: 8, 4 complex elements vectorized; 2 and 1 scalar. */
+        if (n & 15)
+        {
+            if (n & 8)
+            {
+                LD_SP4_INC(x, 4, x0, x1, x2, x3);
+                LD_SP4_INC(py, 4, y0, y1, y2, y3);
+                PCKEVOD_W2_SP(x1, x0, x0r, x0i);
+                PCKEVOD_W2_SP(y1, y0, y0r, y0i);
+                PCKEVOD_W2_SP(x3, x2, x1r, x1i);
+                PCKEVOD_W2_SP(y3, y2, y1r, y1i);
+
+                FMADD2(x0r, x1r, dar_vec, y0r, y1r);
+                y0i OP0 dar_vec * x0i;
+                y1i OP0 dar_vec * x1i;
+                y0r OP1 dai_vec * x0i;
+                y1r OP1 dai_vec * x1i;
+                y0i OP2 dai_vec * x0r;
+                y1i OP2 dai_vec * x1r;
+
+                ILVRL_W2_SP(y0i, y0r, y0, y1);
+                ILVRL_W2_SP(y1i, y1r, y2, y3);
+                ST_SP4_INC(y0, y1, y2, y3, y, 4);
+            }
+
+            if (n & 4)
+            {
+                LD_SP2_INC(x, 4, x0, x1);
+                LD_SP2_INC(py, 4, y0, y1);
+                PCKEVOD_W2_SP(x1, x0, x0r, x0i);
+                PCKEVOD_W2_SP(y1, y0, y0r, y0i);
+
+                y0r += dar_vec * x0r;
+                y0i OP0 dar_vec * x0i;
+                y0r OP1 dai_vec * x0i;
+                y0i OP2 dai_vec * x0r;
+
+                ILVRL_W2_SP(y0i, y0r, y0, y1);
+                ST_SP2_INC(y0, y1, y, 4);
+            }
+
+            if (n & 2)
+            {
+                LD_GP4_INC(x, 1, xd0, xd1, xd2, xd3);
+                LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);
+
+                FMADD2(xd0, xd2, da_r, yd0, yd2);
+                yd1 OP0 da_r * xd1;
+                yd3 OP0 da_r * xd3;
+                yd0 OP1 da_i * xd1;
+                yd2 OP1 da_i * xd3;
+                yd1 OP2 da_i * xd0;
+                yd3 OP2 da_i * xd2;
+
+                ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
+            }
+
+            if (n & 1)
+            {
+                LD_GP2_INC(x, 1, xd0, xd1);
+                LD_GP2_INC(py, 1, yd0, yd1);
+
+                yd0 += da_r * xd0;
+                yd1 OP0 da_r * xd1;
+                yd0 OP1 da_i * xd1;
+                yd1 OP2 da_i * xd0;
+
+                ST_GP2_INC(yd0, yd1, y, 1);
+            }
+        }
+    }
+    /* Strided x, contiguous y: gather x with strided vector loads — only
+       the low 64 bits (one complex element) of each load are kept, packed
+       pairwise by PCKEV_D.  The 16th element is inserted from two scalar
+       loads instead of a 4-float vector load — presumably to avoid
+       reading past the end of x; NOTE(review): confirm that intent. */
+    else if (1 == inc_y)
+    {
+        FLOAT *y_pref;
+        BLASLONG pref_offset;
+        v4f32 x8, x9, x10, x11, x12, x13, x14;
+
+        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+            pref_offset = pref_offset / sizeof(FLOAT);
+        }
+        y_pref = y + pref_offset + 64;
+
+        inc_x2 = 2 * inc_x;   /* stride in floats (re/im pairs) */
+
+        dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
+        dai_vec = COPY_FLOAT_TO_VECTOR(da_i);
+
+        for (i = (n >> 4); i--;)
+        {
+            PREF_OFFSET(y_pref, 0);
+            PREF_OFFSET(y_pref, 32);
+            PREF_OFFSET(y_pref, 64);
+            PREF_OFFSET(y_pref, 96);
+            y_pref += 32;
+
+            LD_SP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x14);
+            LD_SP7_INC(x, inc_x2, x8, x9, x10, x11, x12, x13, x7);
+            PCKEV_D2_SP(x1, x0, x3, x2, x0, x1);
+            PCKEV_D2_SP(x5, x4, x14, x6, x2, x3);
+            PCKEV_D2_SP(x9, x8, x11, x10, x4, x5);
+            x6 = (v4f32) __msa_pckev_d((v2i64) x13, (v2i64) x12);
+            x7 = (v4f32) __msa_insert_w((v4i32) x7, 2, *((int *) x));
+            x7 = (v4f32) __msa_insert_w((v4i32) x7, 3, *((int *) (x + 1)));
+            x += inc_x2;
+
+            LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
+            PCKEVOD_W2_SP(x1, x0, x0r, x0i);
+            PCKEVOD_W2_SP(y1, y0, y0r, y0i);
+            PCKEVOD_W2_SP(x3, x2, x1r, x1i);
+            PCKEVOD_W2_SP(y3, y2, y1r, y1i);
+            PCKEVOD_W2_SP(x5, x4, x2r, x2i);
+            PCKEVOD_W2_SP(y5, y4, y2r, y2i);
+            PCKEVOD_W2_SP(x7, x6, x3r, x3i);
+            PCKEVOD_W2_SP(y7, y6, y3r, y3i);
+
+            FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
+            y0i OP0 dar_vec * x0i;
+            y1i OP0 dar_vec * x1i;
+            y2i OP0 dar_vec * x2i;
+            y3i OP0 dar_vec * x3i;
+            y0r OP1 dai_vec * x0i;
+            y1r OP1 dai_vec * x1i;
+            y2r OP1 dai_vec * x2i;
+            y3r OP1 dai_vec * x3i;
+            y0i OP2 dai_vec * x0r;
+            y1i OP2 dai_vec * x1r;
+            y2i OP2 dai_vec * x2r;
+            y3i OP2 dai_vec * x3r;
+
+            ILVRL_W2_SP(y0i, y0r, y0, y1);
+            ILVRL_W2_SP(y1i, y1r, y2, y3);
+            ILVRL_W2_SP(y2i, y2r, y4, y5);
+            ILVRL_W2_SP(y3i, y3r, y6, y7);
+
+            ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
+        }
+
+        if (n & 15)
+        {
+            if (n & 8)
+            {
+                LD_SP7_INC(x, inc_x2, x0, x1, x2, x6, x4, x5, x3);
+                PCKEV_D2_SP(x1, x0, x6, x2, x0, x1);
+
+                x2 = (v4f32) __msa_pckev_d((v2i64) x5, (v2i64) x4);
+                x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
+                x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) (x + 1)));
+                x += inc_x2;
+
+                LD_SP4_INC(py, 4, y0, y1, y2, y3);
+                PCKEVOD_W2_SP(x1, x0, x0r, x0i);
+                PCKEVOD_W2_SP(y1, y0, y0r, y0i);
+                PCKEVOD_W2_SP(x3, x2, x1r, x1i);
+                PCKEVOD_W2_SP(y3, y2, y1r, y1i);
+
+                FMADD2(x0r, x1r, dar_vec, y0r, y1r);
+                y0i OP0 dar_vec * x0i;
+                y1i OP0 dar_vec * x1i;
+                y0r OP1 dai_vec * x0i;
+                y1r OP1 dai_vec * x1i;
+                y0i OP2 dai_vec * x0r;
+                y1i OP2 dai_vec * x1r;
+
+                ILVRL_W2_SP(y0i, y0r, y0, y1);
+                ILVRL_W2_SP(y1i, y1r, y2, y3);
+                ST_SP4_INC(y0, y1, y2, y3, y, 4);
+            }
+
+            if (n & 4)
+            {
+                LD_SP3_INC(x, inc_x2, x0, x2, x1);
+
+                x0 = (v4f32) __msa_pckev_d((v2i64) x2, (v2i64) x0);
+                x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
+                x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) (x + 1)));
+                x += inc_x2;
+
+                LD_SP2_INC(py, 4, y0, y1);
+                PCKEVOD_W2_SP(x1, x0, x0r, x0i);
+                PCKEVOD_W2_SP(y1, y0, y0r, y0i);
+
+                y0r += dar_vec * x0r;
+                y0i OP0 dar_vec * x0i;
+                y0r OP1 dai_vec * x0i;
+                y0i OP2 dai_vec * x0r;
+
+                ILVRL_W2_SP(y0i, y0r, y0, y1);
+                ST_SP2_INC(y0, y1, y, 4);
+            }
+
+            if (n & 2)
+            {
+                xd0 = x[0];
+                xd1 = x[1];
+                x += inc_x2;
+                xd2 = x[0];
+                xd3 = x[1];
+                x += inc_x2;
+
+                LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);
+
+                FMADD2(xd0, xd2, da_r, yd0, yd2);
+                yd1 OP0 da_r * xd1;
+                yd3 OP0 da_r * xd3;
+                yd0 OP1 da_i * xd1;
+                yd2 OP1 da_i * xd3;
+                yd1 OP2 da_i * xd0;
+                yd3 OP2 da_i * xd2;
+
+                ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
+            }
+
+            if (n & 1)
+            {
+                LD_GP2_INC(x, 1, xd0, xd1);
+                LD_GP2_INC(py, 1, yd0, yd1);
+
+                yd0 += da_r * xd0;
+                yd1 OP0 da_r * xd1;
+                yd0 OP1 da_i * xd1;
+                yd1 OP2 da_i * xd0;
+
+                ST_GP2_INC(yd0, yd1, y, 1);
+            }
+        }
+    }
+    /* General strided case: plain scalar code, 4 complex elements per
+       iteration, then a 2- and 1-element tail. */
+    else
+    {
+        inc_x2 = 2 * inc_x;
+        inc_y2 = 2 * inc_y;
+
+        for (i = (n >> 2); i--;)
+        {
+            xd0 = x[0];
+            xd1 = x[1];
+            x += inc_x2;
+            xd2 = x[0];
+            xd3 = x[1];
+            x += inc_x2;
+            xd4 = x[0];
+            xd5 = x[1];
+            x += inc_x2;
+            xd6 = x[0];
+            xd7 = x[1];
+            x += inc_x2;
+
+            yd0 = py[0];
+            yd1 = py[1];
+            py += inc_y2;
+            yd2 = py[0];
+            yd3 = py[1];
+            py += inc_y2;
+            yd4 = py[0];
+            yd5 = py[1];
+            py += inc_y2;
+            yd6 = py[0];
+            yd7 = py[1];
+            py += inc_y2;
+
+            FMADD4(xd0, xd2, xd4, xd6, da_r, yd0, yd2, yd4, yd6);
+            yd1 OP0 da_r * xd1;
+            yd3 OP0 da_r * xd3;
+            yd5 OP0 da_r * xd5;
+            yd7 OP0 da_r * xd7;
+            yd0 OP1 da_i * xd1;
+            yd2 OP1 da_i * xd3;
+            yd4 OP1 da_i * xd5;
+            yd6 OP1 da_i * xd7;
+            yd1 OP2 da_i * xd0;
+            yd3 OP2 da_i * xd2;
+            yd5 OP2 da_i * xd4;
+            yd7 OP2 da_i * xd6;
+
+            y[0] = yd0;
+            y[1] = yd1;
+            y += inc_y2;
+            y[0] = yd2;
+            y[1] = yd3;
+            y += inc_y2;
+            y[0] = yd4;
+            y[1] = yd5;
+            y += inc_y2;
+            y[0] = yd6;
+            y[1] = yd7;
+            y += inc_y2;
+        }
+
+        if (n & 3)
+        {
+            if (n & 2)
+            {
+                xd0 = x[0];
+                xd1 = x[1];
+                x += inc_x2;
+                xd2 = x[0];
+                xd3 = x[1];
+                x += inc_x2;
+
+                yd0 = py[0];
+                yd1 = py[1];
+                py += inc_y2;
+                yd2 = py[0];
+                yd3 = py[1];
+                py += inc_y2;
+
+                FMADD2(xd0, xd2, da_r, yd0, yd2);
+                yd1 OP0 da_r * xd1;
+                yd3 OP0 da_r * xd3;
+                yd0 OP1 da_i * xd1;
+                yd2 OP1 da_i * xd3;
+                yd1 OP2 da_i * xd0;
+                yd3 OP2 da_i * xd2;
+
+                y[0] = yd0;
+                y[1] = yd1;
+                y += inc_y2;
+                y[0] = yd2;
+                y[1] = yd3;
+                y += inc_y2;
+            }
+
+            if (n & 1)
+            {
+                xd0 = x[0];
+                xd1 = x[1];
+
+                yd0 = y[0];
+                yd1 = y[1];
+
+                yd0 += da_r * xd0;
+                yd1 OP0 da_r * xd1;
+                yd0 OP1 da_i * xd1;
+                yd1 OP2 da_i * xd0;
+
+                y[0] = yd0;
+                y[1] = yd1;
+            }
+        }
+    }
+
+    return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Complex single-precision COPY kernel (MIPS MSA): y := x, with n counted
+   in complex elements (so 2*n floats are moved; re/im interleaved).
+   Always returns 0. */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG i, inc_x2, inc_y2;
+    v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+    FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+
+    if (n < 0) return (0);
+
+    /* Fast path: both vectors contiguous.  The main loop moves 32 complex
+       elements (16 MSA vectors) per iteration, software-pipelined so that
+       loads of the next batch overlap stores of the current one. */
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        if (n > 31)
+        {
+            FLOAT *x_pref;
+            BLASLONG pref_offset;
+
+            /* Round the prefetch pointer up to the next L1 cache-line
+               boundary, then prefetch well ahead of the read cursor. */
+            pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+            if (pref_offset > 0)
+            {
+                pref_offset = L1_DATA_LINESIZE - pref_offset;
+                pref_offset = pref_offset / sizeof(FLOAT);
+            }
+            x_pref = x + pref_offset + 128 + 32;
+
+            /* Prime the pipeline: first 8 vectors are loaded before the
+               loop; the loop runs (n>>5)-1 times and the epilogue below
+               drains the final batch. */
+            LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+            for (i = (n >> 5) - 1; i--;)
+            {
+                PREF_OFFSET(x_pref, 0);
+                PREF_OFFSET(x_pref, 32);
+                PREF_OFFSET(x_pref, 64);
+                PREF_OFFSET(x_pref, 96);
+                PREF_OFFSET(x_pref, 128);
+                PREF_OFFSET(x_pref, 160);
+                PREF_OFFSET(x_pref, 192);
+                PREF_OFFSET(x_pref, 224);
+                x_pref += 64;
+
+                x8 = LD_SP(x); x += 4;
+                ST_SP(x0, y); y += 4;
+                x9 = LD_SP(x); x += 4;
+                ST_SP(x1, y); y += 4;
+                x10 = LD_SP(x); x += 4;
+                ST_SP(x2, y); y += 4;
+                x11 = LD_SP(x); x += 4;
+                ST_SP(x3, y); y += 4;
+                x12 = LD_SP(x); x += 4;
+                ST_SP(x4, y); y += 4;
+                x13 = LD_SP(x); x += 4;
+                ST_SP(x5, y); y += 4;
+                x14 = LD_SP(x); x += 4;
+                ST_SP(x6, y); y += 4;
+                x15 = LD_SP(x); x += 4;
+                ST_SP(x7, y); y += 4;
+                x0 = LD_SP(x); x += 4;
+                ST_SP(x8, y); y += 4;
+                x1 = LD_SP(x); x += 4;
+                ST_SP(x9, y); y += 4;
+                x2 = LD_SP(x); x += 4;
+                ST_SP(x10, y); y += 4;
+                x3 = LD_SP(x); x += 4;
+                ST_SP(x11, y); y += 4;
+                x4 = LD_SP(x); x += 4;
+                ST_SP(x12, y); y += 4;
+                x5 = LD_SP(x); x += 4;
+                ST_SP(x13, y); y += 4;
+                x6 = LD_SP(x); x += 4;
+                ST_SP(x14, y); y += 4;
+                x7 = LD_SP(x); x += 4;
+                ST_SP(x15, y); y += 4;
+            }
+
+            /* Pipeline epilogue: load the last 8 vectors while storing the
+               8 still held in registers, then flush everything. */
+            x8 = LD_SP(x); x += 4;
+            x9 = LD_SP(x); x += 4;
+            ST_SP(x0, y); y += 4;
+            x10 = LD_SP(x); x += 4;
+            ST_SP(x1, y); y += 4;
+            x11 = LD_SP(x); x += 4;
+            ST_SP(x2, y); y += 4;
+            x12 = LD_SP(x); x += 4;
+            ST_SP(x3, y); y += 4;
+            x13 = LD_SP(x); x += 4;
+            ST_SP(x4, y); y += 4;
+            x14 = LD_SP(x); x += 4;
+            ST_SP(x5, y); y += 4;
+            x15 = LD_SP(x); x += 4;
+            ST_SP(x6, y); y += 4;
+            ST_SP(x7, y); y += 4;
+
+            ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
+        }
+
+        /* Remainder: 16, 8, 4 complex elements vectorized; 2 and 1 via
+           scalar loads/stores. */
+        if (n & 31)
+        {
+            if (n & 16)
+            {
+                LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+                ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
+            }
+
+            if (n & 8)
+            {
+                LD_SP4_INC(x, 4, x0, x1, x2, x3);
+                ST_SP4_INC(x0, x1, x2, x3, y, 4);
+            }
+
+            if (n & 4)
+            {
+                LD_SP2_INC(x, 4, x0, x1);
+                ST_SP2_INC(x0, x1, y, 4);
+            }
+
+            if (n & 2)
+            {
+                LD_GP4_INC(x, 1, f0, f1, f2, f3);
+                ST_GP4_INC(f0, f1, f2, f3, y, 1);
+            }
+
+            if (n & 1)
+            {
+                LD_GP2_INC(x, 1, f0, f1);
+                ST_GP2_INC(f0, f1, y, 1);
+            }
+        }
+    }
+    /* Strided case: scalar copy of re/im pairs, 4 complex elements per
+       iteration with a 2- and 1-element tail. */
+    else
+    {
+        inc_x2 = 2 * inc_x;
+        inc_y2 = 2 * inc_y;
+
+        for (i = (n >> 2); i--;)
+        {
+            f0 = *x;
+            f1 = *(x+1); x += inc_x2;
+            f2 = *x;
+            f3 = *(x+1); x += inc_x2;
+            f4 = *x;
+            f5 = *(x+1); x += inc_x2;
+            f6 = *x;
+            f7 = *(x+1); x += inc_x2;
+
+            *y = f0;
+            *(y+1) = f1; y += inc_y2;
+            *y = f2;
+            *(y+1) = f3; y += inc_y2;
+            *y = f4;
+            *(y+1) = f5; y += inc_y2;
+            *y = f6;
+            *(y+1) = f7; y += inc_y2;
+        }
+
+        if (n & 2)
+        {
+            f0 = *x;
+            f1 = *(x+1); x += inc_x2;
+            f2 = *x;
+            f3 = *(x+1); x += inc_x2;
+
+            *y = f0;
+            *(y+1) = f1; y += inc_y2;
+            *y = f2;
+            *(y+1) = f3; y += inc_y2;
+        }
+
+        if (n & 1)
+        {
+            LD_GP2_INC(x, 1, f0, f1);
+            ST_GP2_INC(f0, f1, y, 1);
+        }
+    }
+
+    return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* This will shuffle the elements in 'in' vector as (mask needed :: 10 11 00 01)
+ 0 1 2 3 => 1 0 3 2 */
+#define SHF_177 177
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
+ FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+ BLASLONG dummy2)
+{
+ BLASLONG i, inc_x2;
+ FLOAT *px;
+ FLOAT tp0, tp1, tp2, tp3, f0, f1, f2, f3;
+ v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ v4f32 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
+ v4f32 da_i_vec, da_i_vec_neg, da_r_vec;
+
+ px = x;
+
+ if (1 == inc_x)
+ {
+ if ((0.0 == da_r) && (0.0 == da_i))
+ {
+ v4f32 zero_v = __msa_cast_to_vector_float(0);
+ zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
+ zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
+ zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
+ zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
+
+ for (i = (n >> 5); i--;)
+ {
+ ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 4);
+ ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 4);
+ }
+
+ if (n & 31)
+ {
+ if (n & 16)
+ {
+ ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 4);
+ }
+
+ if (n & 8)
+ {
+ ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
+ }
+
+ if (n & 4)
+ {
+ ST_SP2_INC(zero_v, zero_v, x, 4);
+ }
+
+ if (n & 2)
+ {
+ ST_SP(zero_v, x); x += 4;
+ }
+
+ if (n & 1)
+ {
+ *x = 0; x += 1;
+ *x = 0;
+ }
+ }
+ }
+ else if (0.0 == da_r)
+ {
+ da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
+ da_i_vec_neg = -da_i_vec;
+ da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
+
+ if (n > 31)
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 64 + 32;
+
+ LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ for (i = (n >> 5)- 1; i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 64;
+
+ x8 = LD_SP(px); px += 4;
+ x0 *= da_i_vec;
+ x9 = LD_SP(px); px += 4;
+ x1 *= da_i_vec;
+ x10 = LD_SP(px); px += 4;
+ x2 *= da_i_vec;
+ x11 = LD_SP(px); px += 4;
+ x3 *= da_i_vec;
+ x12 = LD_SP(px); px += 4;
+ x4 *= da_i_vec;
+ x13 = LD_SP(px); px += 4;
+ x5 *= da_i_vec;
+ x0 = (v4f32) __msa_shf_w((v4i32) x0, SHF_177);
+ x14 = LD_SP(px); px += 4;
+ x6 *= da_i_vec;
+ x1 = (v4f32) __msa_shf_w((v4i32) x1, SHF_177);
+ x15 = LD_SP(px); px += 4;
+ x7 *= da_i_vec;
+ x2 = (v4f32) __msa_shf_w((v4i32) x2, SHF_177);
+ x8 *= da_i_vec;
+ x3 = (v4f32) __msa_shf_w((v4i32) x3, SHF_177);
+ ST_SP(x0, x); x += 4;
+ x9 *= da_i_vec;
+ x4 = (v4f32) __msa_shf_w((v4i32) x4, SHF_177);
+ ST_SP(x1, x); x += 4;
+ x10 *= da_i_vec;
+ x5 = (v4f32) __msa_shf_w((v4i32) x5, SHF_177);
+ ST_SP(x2, x); x += 4;
+ x11 *= da_i_vec;
+ x6 = (v4f32) __msa_shf_w((v4i32) x6, SHF_177);
+ ST_SP(x3, x); x += 4;
+ x12 *= da_i_vec;
+ x7 = (v4f32) __msa_shf_w((v4i32) x7, SHF_177);
+ ST_SP(x4, x); x += 4;
+ x13 *= da_i_vec;
+ x8 = (v4f32) __msa_shf_w((v4i32) x8, SHF_177);
+ ST_SP(x5, x); x += 4;
+ x14 *= da_i_vec;
+ x9 = (v4f32) __msa_shf_w((v4i32) x9, SHF_177);
+ ST_SP(x6, x); x += 4;
+ x15 *= da_i_vec;
+ x10 = (v4f32) __msa_shf_w((v4i32) x10, SHF_177);
+ ST_SP(x7, x); x += 4;
+ x11 = (v4f32) __msa_shf_w((v4i32) x11, SHF_177);
+ ST_SP(x8, x); x += 4;
+ x0 = LD_SP(px); px += 4;
+ x12 = (v4f32) __msa_shf_w((v4i32) x12, SHF_177);
+ ST_SP(x9, x); x += 4;
+ x1 = LD_SP(px); px += 4;
+ x13 = (v4f32) __msa_shf_w((v4i32) x13, SHF_177);
+ ST_SP(x10, x); x += 4;
+ x2 = LD_SP(px); px += 4;
+ x14 = (v4f32) __msa_shf_w((v4i32) x14, SHF_177);
+ ST_SP(x11, x); x += 4;
+ x3 = LD_SP(px); px += 4;
+ x15 = (v4f32) __msa_shf_w((v4i32) x15, SHF_177);
+ ST_SP(x12, x); x += 4;
+ x4 = LD_SP(px); px += 4;
+ ST_SP(x13, x); x += 4;
+ x5 = LD_SP(px); px += 4;
+ ST_SP(x14, x); x += 4;
+ x6 = LD_SP(px); px += 4;
+ ST_SP(x15, x); x += 4;
+ x7 = LD_SP(px); px += 4;
+ }
+
+ LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ x4, x5, x6, x7);
+ MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
+ x8, x9, x10, x11);
+ MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
+ x12, x13, x14, x15);
+ SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
+ SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
+ SHF_W4_SP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_177);
+ SHF_W4_SP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_177);
+ ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
+ x12, x13, x14, x15, x, 4);
+ }
+
+ if (n & 31)
+ {
+ if (n & 16)
+ {
+ LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ x4, x5, x6, x7);
+ SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
+ SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
+ ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
+ }
+
+ if (n & 8)
+ {
+ LD_SP4_INC(px, 4, x0, x1, x2, x3);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ x0, x1, x2, x3);
+ SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
+ ST_SP4_INC(x0, x1, x2, x3, x, 4);
+ }
+
+ if (n & 4)
+ {
+ LD_SP2_INC(px, 4, x0, x1);
+ MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
+ SHF_W2_SP(x0, x1, x0, x1, SHF_177);
+ ST_SP2_INC(x0, x1, x, 4);
+ }
+
+ if (n & 2)
+ {
+ LD_GP4_INC(px, 1, f0, f1, f2, f3);
+ MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i,
+ f0, f1, f2, f3);
+ ST_GP4_INC(f1, f0, f3, f2, x, 1);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+ MUL2(f0, da_i, f1, -da_i, f0, f1);
+ ST_GP2_INC(f1, f0, x, 1);
+ }
+ }
+ }
+ else if (0.0 == da_i)
+ {
+ da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
+
+ if (n > 31)
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 64 + 32;
+
+ LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ for (i = (n >> 5)- 1; i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 64;
+
+ x8 = LD_SP(px); px += 4;
+ x0 *= da_r_vec;
+ x9 = LD_SP(px); px += 4;
+ x1 *= da_r_vec;
+ x10 = LD_SP(px); px += 4;
+ x2 *= da_r_vec;
+ x11 = LD_SP(px); px += 4;
+ x3 *= da_r_vec;
+ x12 = LD_SP(px); px += 4;
+ x4 *= da_r_vec;
+ x13 = LD_SP(px); px += 4;
+ x5 *= da_r_vec;
+ ST_SP(x0, x); x += 4;
+ x14 = LD_SP(px); px += 4;
+ x6 *= da_r_vec;
+ ST_SP(x1, x); x += 4;
+ x15 = LD_SP(px); px += 4;
+ x7 *= da_r_vec;
+ ST_SP(x2, x); x += 4;
+ x8 *= da_r_vec;
+ ST_SP(x3, x); x += 4;
+ x9 *= da_r_vec;
+ ST_SP(x4, x); x += 4;
+ x10 *= da_r_vec;
+ ST_SP(x5, x); x += 4;
+ x11 *= da_r_vec;
+ ST_SP(x6, x); x += 4;
+ x12 *= da_r_vec;
+ ST_SP(x7, x); x += 4;
+ x13 *= da_r_vec;
+ ST_SP(x8, x); x += 4;
+ x0 = LD_SP(px); px += 4;
+ x14 *= da_r_vec;
+ ST_SP(x9, x); x += 4;
+ x1 = LD_SP(px); px += 4;
+ x15 *= da_r_vec;
+ ST_SP(x10, x); x += 4;
+ x2 = LD_SP(px); px += 4;
+ ST_SP(x11, x); x += 4;
+ x3 = LD_SP(px); px += 4;
+ ST_SP(x12, x); x += 4;
+ x4 = LD_SP(px); px += 4;
+ ST_SP(x13, x); x += 4;
+ x5 = LD_SP(px); px += 4;
+ ST_SP(x14, x); x += 4;
+ x6 = LD_SP(px); px += 4;
+ ST_SP(x15, x); x += 4;
+ x7 = LD_SP(px); px += 4;
+ }
+
+ LD_SP8_INC(px, 4, x8, x9, x10, x11, x12, x13, x14, x15);
+ MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
+ x4, x5, x6, x7);
+ MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
+ x8, x9, x10, x11);
+ MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
+ x12, x13, x14, x15);
+ ST_SP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
+ x12, x13, x14, x15, x, 4);
+ }
+
+ if (n & 31)
+ {
+ if (n & 16)
+ {
+ LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
+ x4, x5, x6, x7);
+ ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
+ }
+
+ if (n & 8)
+ {
+ LD_SP4_INC(px, 4, x0, x1, x2, x3);
+ MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
+ x0, x1, x2, x3);
+ ST_SP4_INC(x0, x1, x2, x3, x, 4);
+ }
+
+ if (n & 4)
+ {
+ LD_SP2_INC(px, 4, x0, x1);
+ MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
+ ST_SP2_INC(x0, x1, x, 4);
+ }
+
+ if (n & 2)
+ {
+ LD_GP4_INC(px, 1, f0, f1, f2, f3);
+ MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);
+ ST_GP4_INC(f0, f1, f2, f3, x, 1);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+ MUL2(f0, da_r, f1, da_r, f0, f1);
+ ST_GP2_INC(f0, f1, x, 1);
+ }
+ }
+ }
+ else
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 64;
+
+ da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
+ da_i_vec_neg = -da_i_vec;
+ da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
+
+ da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
+
+ for (i = (n >> 5); i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 64;
+
+ LD_SP16_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
+ x11, x12, x13, x14, x15);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ d0, d1, d2, d3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ d4, d5, d6, d7);
+ MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
+ d8, d9, d10, d11);
+ MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
+ d12, d13, d14, d15);
+ SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
+ SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
+ SHF_W4_SP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_177);
+ SHF_W4_SP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_177);
+ FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
+ FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
+ FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
+ FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
+ ST_SP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
+ d12, d13, d14, d15, x, 4);
+ }
+
+ if (n & 31)
+ {
+ if (n & 16)
+ {
+ LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ d0, d1, d2, d3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ d4, d5, d6, d7);
+ SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
+ SHF_W4_SP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_177);
+ FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
+ FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
+ ST_SP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 4);
+ }
+
+ if (n & 8)
+ {
+ LD_SP4_INC(px, 4, x0, x1, x2, x3);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ d0, d1, d2, d3);
+ SHF_W4_SP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_177);
+ FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
+ ST_SP4_INC(d0, d1, d2, d3, x, 4);
+ }
+
+ if (n & 4)
+ {
+ LD_SP2_INC(px, 4, x0, x1);
+ MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
+ SHF_W2_SP(d0, d1, d0, d1, SHF_177);
+ FMADD2(x0, x1, da_r_vec, d0, d1);
+ ST_SP2_INC(d0, d1, x, 4);
+ }
+
+ if (n & 2)
+ {
+ LD_GP4_INC(px, 1, f0, f1, f2, f3);
+
+ tp0 = da_r * f0;
+ tp0 -= da_i * f1;
+ tp1 = da_r * f1;
+ tp1 += da_i * f0;
+ tp2 = da_r * f2;
+ tp2 -= da_i * f3;
+ tp3 = da_r * f3;
+ tp3 += da_i * f2;
+
+ ST_GP4_INC(tp0, tp1, tp2, tp3, x, 1);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+
+ tp0 = da_r * f0;
+ tp0 -= da_i * f1;
+ tp1 = da_r * f1;
+ tp1 += da_i * f0;
+
+ ST_GP2_INC(tp0, tp1, x, 1);
+ }
+ }
+ }
+ }
+ else
+ {
+ inc_x2 = 2 * inc_x;
+
+ if ((0.0 == da_r) && (0.0 == da_i))
+ {
+ for (i = n; i--;)
+ {
+ *x = 0;
+ *(x + 1) = 0;
+
+ x += inc_x2;
+ }
+ }
+ else if (0.0 == da_r)
+ {
+ da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
+ da_i_vec_neg = -da_i_vec;
+ da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
+
+ for (i = (n >> 4); i--;)
+ {
+ LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
+ x10, x11, x12, x13, x14, x15);
+ PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
+ PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
+ MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
+ d0, d1, d2, d3);
+ MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
+ d4, d5, d6, d7);
+
+ *x = d0[1];
+ *(x + 1) = d0[0];
+ x += inc_x2;
+ *x = d0[3];
+ *(x + 1) = d0[2];
+ x += inc_x2;
+ *x = d1[1];
+ *(x + 1) = d1[0];
+ x += inc_x2;
+ *x = d1[3];
+ *(x + 1) = d1[2];
+ x += inc_x2;
+ *x = d2[1];
+ *(x + 1) = d2[0];
+ x += inc_x2;
+ *x = d2[3];
+ *(x + 1) = d2[2];
+ x += inc_x2;
+ *x = d3[1];
+ *(x + 1) = d3[0];
+ x += inc_x2;
+ *x = d3[3];
+ *(x + 1) = d3[2];
+ x += inc_x2;
+ *x = d4[1];
+ *(x + 1) = d4[0];
+ x += inc_x2;
+ *x = d4[3];
+ *(x + 1) = d4[2];
+ x += inc_x2;
+ *x = d5[1];
+ *(x + 1) = d5[0];
+ x += inc_x2;
+ *x = d5[3];
+ *(x + 1) = d5[2];
+ x += inc_x2;
+ *x = d6[1];
+ *(x + 1) = d6[0];
+ x += inc_x2;
+ *x = d6[3];
+ *(x + 1) = d6[2];
+ x += inc_x2;
+ *x = d7[1];
+ *(x + 1) = d7[0];
+ x += inc_x2;
+ *x = d7[3];
+ *(x + 1) = d7[2];
+ x += inc_x2;
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
+ PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
+ MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
+ d0, d1, d2, d3);
+
+ *x = d0[1];
+ *(x + 1) = d0[0];
+ x += inc_x2;
+ *x = d0[3];
+ *(x + 1) = d0[2];
+ x += inc_x2;
+ *x = d1[1];
+ *(x + 1) = d1[0];
+ x += inc_x2;
+ *x = d1[3];
+ *(x + 1) = d1[2];
+ x += inc_x2;
+ *x = d2[1];
+ *(x + 1) = d2[0];
+ x += inc_x2;
+ *x = d2[3];
+ *(x + 1) = d2[2];
+ x += inc_x2;
+ *x = d3[1];
+ *(x + 1) = d3[0];
+ x += inc_x2;
+ *x = d3[3];
+ *(x + 1) = d3[2];
+ x += inc_x2;
+ }
+
+ if (n & 4)
+ {
+ LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
+ PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
+ MUL2(d0, da_i_vec, d1, da_i_vec, d0, d1);
+
+ *x = d0[1];
+ *(x + 1) = d0[0];
+ x += inc_x2;
+ *x = d0[3];
+ *(x + 1) = d0[2];
+ x += inc_x2;
+ *x = d1[1];
+ *(x + 1) = d1[0];
+ x += inc_x2;
+ *x = d1[3];
+ *(x + 1) = d1[2];
+ x += inc_x2;
+ }
+
+ if (n & 2)
+ {
+ f0 = *px;
+ f1 = *(px + 1);
+ px += inc_x2;
+ f2 = *px;
+ f3 = *(px + 1);
+ px += inc_x2;
+
+ MUL4(f0, da_i, f1, -da_i, f2, da_i, f3, -da_i, f0, f1, f2, f3);
+
+ *x = f1;
+ *(x + 1) = f0;
+ x += inc_x2;
+ *x = f3;
+ *(x + 1) = f2;
+ x += inc_x2;
+ }
+
+ if (n & 1)
+ {
+ f0 = *x;
+ f1 = *(x + 1);
+
+ MUL2(f0, da_i, f1, -da_i, f0, f1);
+
+ *x = f1;
+ *(x + 1) = f0;
+ }
+ }
+ }
+ else if (0.0 == da_i)
+ {
+ da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
+
+ for (i = (n >> 4); i--;)
+ {
+ LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
+ x10, x11, x12, x13, x14, x15);
+ PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
+ PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
+ MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
+ d0, d1, d2, d3);
+ MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
+ d4, d5, d6, d7);
+
+ *x = d0[0];
+ *(x + 1) = d0[1];
+ x += inc_x2;
+ *x = d0[2];
+ *(x + 1) = d0[3];
+ x += inc_x2;
+ *x = d1[0];
+ *(x + 1) = d1[1];
+ x += inc_x2;
+ *x = d1[2];
+ *(x + 1) = d1[3];
+ x += inc_x2;
+ *x = d2[0];
+ *(x + 1) = d2[1];
+ x += inc_x2;
+ *x = d2[2];
+ *(x + 1) = d2[3];
+ x += inc_x2;
+ *x = d3[0];
+ *(x + 1) = d3[1];
+ x += inc_x2;
+ *x = d3[2];
+ *(x + 1) = d3[3];
+ x += inc_x2;
+ *x = d4[0];
+ *(x + 1) = d4[1];
+ x += inc_x2;
+ *x = d4[2];
+ *(x + 1) = d4[3];
+ x += inc_x2;
+ *x = d5[0];
+ *(x + 1) = d5[1];
+ x += inc_x2;
+ *x = d5[2];
+ *(x + 1) = d5[3];
+ x += inc_x2;
+ *x = d6[0];
+ *(x + 1) = d6[1];
+ x += inc_x2;
+ *x = d6[2];
+ *(x + 1) = d6[3];
+ x += inc_x2;
+ *x = d7[0];
+ *(x + 1) = d7[1];
+ x += inc_x2;
+ *x = d7[2];
+ *(x + 1) = d7[3];
+ x += inc_x2;
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
+ PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
+ MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
+ d0, d1, d2, d3);
+
+ *x = d0[0];
+ *(x + 1) = d0[1];
+ x += inc_x2;
+ *x = d0[2];
+ *(x + 1) = d0[3];
+ x += inc_x2;
+ *x = d1[0];
+ *(x + 1) = d1[1];
+ x += inc_x2;
+ *x = d1[2];
+ *(x + 1) = d1[3];
+ x += inc_x2;
+ *x = d2[0];
+ *(x + 1) = d2[1];
+ x += inc_x2;
+ *x = d2[2];
+ *(x + 1) = d2[3];
+ x += inc_x2;
+ *x = d3[0];
+ *(x + 1) = d3[1];
+ x += inc_x2;
+ *x = d3[2];
+ *(x + 1) = d3[3];
+ x += inc_x2;
+ }
+
+ if (n & 4)
+ {
+ LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
+ PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
+ MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);
+
+ *x = d0[0];
+ *(x + 1) = d0[1];
+ x += inc_x2;
+ *x = d0[2];
+ *(x + 1) = d0[3];
+ x += inc_x2;
+ *x = d1[0];
+ *(x + 1) = d1[1];
+ x += inc_x2;
+ *x = d1[2];
+ *(x + 1) = d1[3];
+ x += inc_x2;
+ }
+
+ if (n & 2)
+ {
+ f0 = *px;
+ f1 = *(px + 1);
+ px += inc_x2;
+ f2 = *px;
+ f3 = *(px + 1);
+ px += inc_x2;
+
+ MUL4(f0, da_r, f1, da_r, f2, da_r, f3, da_r, f0, f1, f2, f3);
+
+ *x = f0;
+ *(x + 1) = f1;
+ x += inc_x2;
+ *x = f2;
+ *(x + 1) = f3;
+ x += inc_x2;
+ }
+
+ if (n & 1)
+ {
+ f0 = *x;
+ f1 = *(x + 1);
+
+ MUL2(f0, da_r, f1, da_r, f0, f1);
+
+ *x = f0;
+ *(x + 1) = f1;
+ }
+ }
+ }
+ else
+ {
+ da_i_vec = COPY_FLOAT_TO_VECTOR(da_i);
+ da_i_vec_neg = -da_i_vec;
+ da_i_vec = (v4f32) __msa_ilvev_w((v4i32) da_i_vec_neg, (v4i32) da_i_vec);
+
+ da_r_vec = COPY_FLOAT_TO_VECTOR(da_r);
+
+ for (i = (n >> 4); i--;)
+ {
+ LD_SP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
+ x10, x11, x12, x13, x14, x15);
+ PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
+ PCKEV_D4_SP(x9, x8, x11, x10, x13, x12, x15, x14, d4, d5, d6, d7);
+ MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
+ x0, x1, x2, x3);
+ MUL4(d4, da_i_vec, d5, da_i_vec, d6, da_i_vec, d7, da_i_vec,
+ x4, x5, x6, x7);
+ MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
+ d0, d1, d2, d3);
+ MUL4(d4, da_r_vec, d5, da_r_vec, d6, da_r_vec, d7, da_r_vec,
+ d4, d5, d6, d7);
+ SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
+ SHF_W4_SP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_177);
+ ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);
+ ADD4(d4, x4, d5, x5, d6, x6, d7, x7, d4, d5, d6, d7);
+
+ *x = d0[0];
+ *(x + 1) = d0[1];
+ x += inc_x2;
+ *x = d0[2];
+ *(x + 1) = d0[3];
+ x += inc_x2;
+ *x = d1[0];
+ *(x + 1) = d1[1];
+ x += inc_x2;
+ *x = d1[2];
+ *(x + 1) = d1[3];
+ x += inc_x2;
+ *x = d2[0];
+ *(x + 1) = d2[1];
+ x += inc_x2;
+ *x = d2[2];
+ *(x + 1) = d2[3];
+ x += inc_x2;
+ *x = d3[0];
+ *(x + 1) = d3[1];
+ x += inc_x2;
+ *x = d3[2];
+ *(x + 1) = d3[3];
+ x += inc_x2;
+ *x = d4[0];
+ *(x + 1) = d4[1];
+ x += inc_x2;
+ *x = d4[2];
+ *(x + 1) = d4[3];
+ x += inc_x2;
+ *x = d5[0];
+ *(x + 1) = d5[1];
+ x += inc_x2;
+ *x = d5[2];
+ *(x + 1) = d5[3];
+ x += inc_x2;
+ *x = d6[0];
+ *(x + 1) = d6[1];
+ x += inc_x2;
+ *x = d6[2];
+ *(x + 1) = d6[3];
+ x += inc_x2;
+ *x = d7[0];
+ *(x + 1) = d7[1];
+ x += inc_x2;
+ *x = d7[2];
+ *(x + 1) = d7[3];
+ x += inc_x2;
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_SP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
+ PCKEV_D4_SP(x1, x0, x3, x2, x5, x4, x7, x6, d0, d1, d2, d3);
+ MUL4(d0, da_i_vec, d1, da_i_vec, d2, da_i_vec, d3, da_i_vec,
+ x0, x1, x2, x3);
+ MUL4(d0, da_r_vec, d1, da_r_vec, d2, da_r_vec, d3, da_r_vec,
+ d0, d1, d2, d3);
+ SHF_W4_SP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_177);
+ ADD4(d0, x0, d1, x1, d2, x2, d3, x3, d0, d1, d2, d3);
+
+ *x = d0[0];
+ *(x + 1) = d0[1];
+ x += inc_x2;
+ *x = d0[2];
+ *(x + 1) = d0[3];
+ x += inc_x2;
+ *x = d1[0];
+ *(x + 1) = d1[1];
+ x += inc_x2;
+ *x = d1[2];
+ *(x + 1) = d1[3];
+ x += inc_x2;
+ *x = d2[0];
+ *(x + 1) = d2[1];
+ x += inc_x2;
+ *x = d2[2];
+ *(x + 1) = d2[3];
+ x += inc_x2;
+ *x = d3[0];
+ *(x + 1) = d3[1];
+ x += inc_x2;
+ *x = d3[2];
+ *(x + 1) = d3[3];
+ x += inc_x2;
+ }
+
+ if (n & 4)
+ {
+ LD_SP4_INC(px, inc_x2, x0, x1, x2, x3);
+ PCKEV_D2_SP(x1, x0, x3, x2, d0, d1);
+ MUL2(d0, da_i_vec, d1, da_i_vec, x0, x1);
+ MUL2(d0, da_r_vec, d1, da_r_vec, d0, d1);
+ SHF_W2_SP(x0, x1, x0, x1, SHF_177);
+ ADD2(d0, x0, d1, x1, d0, d1);
+
+ *x = d0[0];
+ *(x + 1) = d0[1];
+ x += inc_x2;
+ *x = d0[2];
+ *(x + 1) = d0[3];
+ x += inc_x2;
+ *x = d1[0];
+ *(x + 1) = d1[1];
+ x += inc_x2;
+ *x = d1[2];
+ *(x + 1) = d1[3];
+ x += inc_x2;
+ }
+
+ if (n & 2)
+ {
+ f0 = *px;;
+ f1 = *(px + 1);
+ px += inc_x2;
+ f2 = *px;
+ f3 = *(px + 1);
+ px += inc_x2;
+
+ tp0 = da_r * f0;
+ tp0 -= da_i * f1;
+ tp1 = da_r * f1;
+ tp1 += da_i * f0;
+ tp2 = da_r * f2;
+ tp2 -= da_i * f3;
+ tp3 = da_r * f3;
+ tp3 += da_i * f2;
+
+ *x = tp0;
+ *(x + 1) = tp1;
+ x += inc_x2;
+ *x = tp2;
+ *(x + 1) = tp3;
+ x += inc_x2;
+ }
+
+ if (n & 1)
+ {
+ f0 = *px; px += 1;
+ f1 = *px;
+
+ tp0 = da_r * f0;
+ tp0 -= da_i * f1;
+ tp1 = da_r * f1;
+ tp1 += da_i * f0;
+
+ *x = tp0; x += 1;
+ *x = tp1;
+ }
+ }
+ }
+ }
+
+ return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Complex single-precision SWAP kernel (MIPS MSA): exchanges n complex
+ * elements between srcx (stride inc_x complex) and srcy (stride inc_y
+ * complex).  dummy0..dummy4, dummy and dummy2 only pad the common BLAS
+ * kernel signature and are never read.  Always returns 0.
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
+          FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy,
+          BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    BLASLONG i = 0, pref_offsetx, pref_offsety;
+    FLOAT *px, *py;
+    BLASLONG inc_x2, inc_y2;
+    FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
+    FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
+    v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
+    v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
+
+    if (n < 0) return (0);
+
+    /* Distance (in FLOATs) from each base pointer to the next L1
+       cache-line boundary, so the PREFETCH hints below land on
+       line-aligned addresses. */
+    pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
+    if (pref_offsetx > 0)
+    {
+        pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
+        pref_offsetx = pref_offsetx / sizeof(FLOAT);
+    }
+
+    pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
+    if (pref_offsety > 0)
+    {
+        pref_offsety = L1_DATA_LINESIZE - pref_offsety;
+        pref_offsety = pref_offsety / sizeof(FLOAT);
+    }
+
+    /* px/py are the read cursors; srcx/srcy double as the write cursors
+       in the unit-stride path below. */
+    px = srcx;
+    py = srcy;
+
+    /* Strides in FLOATs: one complex element is 2 FLOATs. */
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        /* Contiguous case.  One v4f32 vector holds 2 complex elements,
+           so the main loop swaps 16 complex (8 vectors) per iteration. */
+        if (n >> 4)
+        {
+            /* Pre-load the first batch of x so each iteration can overlap
+               loads of one stream with stores of the other
+               (software pipelining). */
+            LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
+
+            for (i = (n >> 4) - 1; i--;)
+            {
+                PREFETCH(px + pref_offsetx + 32);
+                PREFETCH(px + pref_offsetx + 40);
+                PREFETCH(px + pref_offsetx + 48);
+                PREFETCH(px + pref_offsetx + 56);
+
+                PREFETCH(py + pref_offsety + 32);
+                PREFETCH(py + pref_offsety + 40);
+                PREFETCH(py + pref_offsety + 48);
+                PREFETCH(py + pref_offsety + 56);
+
+                /* Read y while storing the previously loaded x into y... */
+                yv0 = LD_SP(py); py += 4;
+                ST_SP(xv0, srcy); srcy += 4;
+                yv1 = LD_SP(py); py += 4;
+                ST_SP(xv1, srcy); srcy += 4;
+                yv2 = LD_SP(py); py += 4;
+                ST_SP(xv2, srcy); srcy += 4;
+                yv3 = LD_SP(py); py += 4;
+                ST_SP(xv3, srcy); srcy += 4;
+                yv4 = LD_SP(py); py += 4;
+                ST_SP(xv4, srcy); srcy += 4;
+                yv5 = LD_SP(py); py += 4;
+                ST_SP(xv5, srcy); srcy += 4;
+                yv6 = LD_SP(py); py += 4;
+                ST_SP(xv6, srcy); srcy += 4;
+                yv7 = LD_SP(py); py += 4;
+                ST_SP(xv7, srcy); srcy += 4;
+
+                /* ...then read the next x batch while storing y into x. */
+                xv0 = LD_SP(px); px += 4;
+                ST_SP(yv0, srcx); srcx += 4;
+                xv1 = LD_SP(px); px += 4;
+                ST_SP(yv1, srcx); srcx += 4;
+                xv2 = LD_SP(px); px += 4;
+                ST_SP(yv2, srcx); srcx += 4;
+                xv3 = LD_SP(px); px += 4;
+                ST_SP(yv3, srcx); srcx += 4;
+                xv4 = LD_SP(px); px += 4;
+                ST_SP(yv4, srcx); srcx += 4;
+                xv5 = LD_SP(px); px += 4;
+                ST_SP(yv5, srcx); srcx += 4;
+                xv6 = LD_SP(px); px += 4;
+                ST_SP(yv6, srcx); srcx += 4;
+                xv7 = LD_SP(px); px += 4;
+                ST_SP(yv7, srcx); srcx += 4;
+            }
+
+            /* Pipeline epilogue: the final x batch is already in xv0..xv7. */
+            LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
+            ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4);
+            ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4);
+        }
+
+        if (n & 15)
+        {
+            /* Remainder: n&8 -> 4 vectors, n&4 -> 2, n&2 -> 1 (2 complex
+               per vector).  The combined tests below merge the leftover
+               into a single 7/6/5/3-vector transfer. */
+            if ((n & 8) && (n & 4) && (n & 2))
+            {
+                LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
+                LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
+                ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4);
+                ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4);
+            }
+            else if ((n & 8) && (n & 4))
+            {
+                LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5);
+                LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5);
+                ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4);
+                ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4);
+            }
+            else if ((n & 8) && (n & 2))
+            {
+                LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4);
+                LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4);
+                ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4);
+                ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4);
+            }
+            else if ((n & 4) && (n & 2))
+            {
+                LD_SP3_INC(px, 4, xv0, xv1, xv2);
+                LD_SP3_INC(py, 4, yv0, yv1, yv2);
+                ST_SP3_INC(xv0, xv1, xv2, srcy, 4);
+                ST_SP3_INC(yv0, yv1, yv2, srcx, 4);
+            }
+            else if (n & 8)
+            {
+                LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3);
+                LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3);
+                ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4);
+                ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4);
+            }
+            else if (n & 4)
+            {
+                LD_SP2_INC(px, 4, xv0, xv1);
+                LD_SP2_INC(py, 4, yv0, yv1);
+                ST_SP2_INC(xv0, xv1, srcy, 4);
+                ST_SP2_INC(yv0, yv1, srcx, 4);
+            }
+            else if (n & 2)
+            {
+                /* Exactly 2 complex left: one vector each way. */
+                xv0 = LD_SP(px);
+                yv0 = LD_SP(py);
+
+                px += 4;
+                py += 4;
+
+                ST_SP(xv0, srcy);
+                ST_SP(yv0, srcx);
+
+                srcx += 4;
+                srcy += 4;
+            }
+
+            if (n & 1)
+            {
+                /* Final single complex element swapped via scalar loads. */
+                LD_GP2_INC(px, 1, x0, x1);
+                LD_GP2_INC(py, 1, y0, y1);
+                ST_GP2_INC(x0, x1, srcy, 1);
+                ST_GP2_INC(y0, y1, srcx, 1);
+            }
+        }
+    }
+    else
+    {
+        /* Non-unit stride: plain scalar swap, unrolled 4 / 2 / 1 complex. */
+        for (i = (n >> 2); i--;)
+        {
+            x0 = srcx[0 * inc_x2];
+            x1 = srcx[0 * inc_x2 + 1];
+            x2 = srcx[1 * inc_x2];
+            x3 = srcx[1 * inc_x2 + 1];
+            x4 = srcx[2 * inc_x2];
+            x5 = srcx[2 * inc_x2 + 1];
+            x6 = srcx[3 * inc_x2];
+            x7 = srcx[3 * inc_x2 + 1];
+
+            y0 = srcy[0 * inc_y2];
+            y1 = srcy[0 * inc_y2 + 1];
+            y2 = srcy[1 * inc_y2];
+            y3 = srcy[1 * inc_y2 + 1];
+            y4 = srcy[2 * inc_y2];
+            y5 = srcy[2 * inc_y2 + 1];
+            y6 = srcy[3 * inc_y2];
+            y7 = srcy[3 * inc_y2 + 1];
+
+            srcx[0 * inc_x2] = y0;
+            srcx[0 * inc_x2 + 1] = y1;
+            srcx[1 * inc_x2] = y2;
+            srcx[1 * inc_x2 + 1] = y3;
+            srcx[2 * inc_x2] = y4;
+            srcx[2 * inc_x2 + 1] = y5;
+            srcx[3 * inc_x2] = y6;
+            srcx[3 * inc_x2 + 1] = y7;
+
+            srcy[0 * inc_y2] = x0;
+            srcy[0 * inc_y2 + 1] = x1;
+            srcy[1 * inc_y2] = x2;
+            srcy[1 * inc_y2 + 1] = x3;
+            srcy[2 * inc_y2] = x4;
+            srcy[2 * inc_y2 + 1] = x5;
+            srcy[3 * inc_y2] = x6;
+            srcy[3 * inc_y2 + 1] = x7;
+
+            srcx += 4 * inc_x2;
+            srcy += 4 * inc_y2;
+        }
+
+        if (n & 2)
+        {
+            x0 = srcx[0 * inc_x2];
+            x1 = srcx[0 * inc_x2 + 1];
+            x2 = srcx[1 * inc_x2];
+            x3 = srcx[1 * inc_x2 + 1];
+
+            y0 = srcy[0 * inc_y2];
+            y1 = srcy[0 * inc_y2 + 1];
+            y2 = srcy[1 * inc_y2];
+            y3 = srcy[1 * inc_y2 + 1];
+
+            srcx[0 * inc_x2] = y0;
+            srcx[0 * inc_x2 + 1] = y1;
+            srcx[1 * inc_x2] = y2;
+            srcx[1 * inc_x2 + 1] = y3;
+
+            srcy[0 * inc_y2] = x0;
+            srcy[0 * inc_y2 + 1] = x1;
+            srcy[1 * inc_y2] = x2;
+            srcy[1 * inc_y2 + 1] = x3;
+
+            srcx += 2 * inc_x2;
+            srcy += 2 * inc_y2;
+        }
+
+        if (n & 1)
+        {
+            x0 = srcx[0 * inc_x2];
+            x1 = srcx[0 * inc_x2 + 1];
+
+            y0 = srcy[0 * inc_y2];
+            y1 = srcy[0 * inc_y2 + 1];
+
+            srcx[0 * inc_x2] = y0;
+            srcx[0 * inc_x2 + 1] = y1;
+
+            srcy[0 * inc_y2] = x0;
+            srcy[0 * inc_y2 + 1] = x1;
+
+            srcx += inc_x2;
+            srcy += inc_y2;
+        }
+    }
+
+    return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+#if !defined(CONJ)
+ #define OP0 +=
+ #define OP1 -=
+ #define OP2 +=
+#else
+ #define OP0 -=
+ #define OP1 +=
+ #define OP2 -=
+#endif
+
+/* Double-precision AXPY kernel (MIPS MSA): y := y + da * x over n
+ * elements, with strides inc_x / inc_y (in elements).  dummy0/dummy1,
+ * dummy and dummy2 only pad the common BLAS kernel signature.
+ * Early-out when n < 0 or da == 0.0 (y is left untouched).
+ * Always returns 0.
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
+          BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+          BLASLONG dummy2)
+{
+    BLASLONG i;
+    FLOAT *py;
+    v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
+    v2f64 da_vec, zero_v = {0};
+
+    if ((n < 0) || (da == 0.0)) return(0);
+
+    /* py is the read cursor for y; y itself is reused as the write cursor. */
+    py = y;
+
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        /* Fully contiguous path: 16 doubles (8 v2f64 FMAs) per iteration. */
+        FLOAT *x_pref, *y_pref;
+        BLASLONG pref_offset;
+
+        /* Align prefetch addresses to the next L1 cache-line boundary. */
+        pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+            pref_offset = pref_offset / sizeof(FLOAT);
+        }
+        x_pref = x + pref_offset + 32;
+
+        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+            pref_offset = pref_offset / sizeof(FLOAT);
+        }
+        y_pref = y + pref_offset + 32;
+
+        /* Broadcast da into both lanes of a v2f64. */
+        da_vec = COPY_DOUBLE_TO_VECTOR(da);
+
+        for (i = (n >> 4); i--;)
+        {
+            PREF_OFFSET(x_pref, 0);
+            PREF_OFFSET(x_pref, 32);
+            PREF_OFFSET(x_pref, 64);
+            PREF_OFFSET(x_pref, 96);
+            PREF_OFFSET(y_pref, 0);
+            PREF_OFFSET(y_pref, 32);
+            PREF_OFFSET(y_pref, 64);
+            PREF_OFFSET(y_pref, 96);
+            x_pref += 16;
+            y_pref += 16;
+
+            LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+            LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
+            FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
+            FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
+            ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
+        }
+
+        if (n & 15)
+        {
+            /* Remainder: 8, 4, 2 doubles vectorized, final odd one scalar. */
+            if (n & 8)
+            {
+                LD_DP4_INC(x, 2, x0, x1, x2, x3);
+                LD_DP4_INC(py, 2, y0, y1, y2, y3);
+                FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
+                ST_DP4_INC(y0, y1, y2, y3, y, 2);
+            }
+
+            if (n & 4)
+            {
+                LD_DP2_INC(x, 2, x0, x1);
+                LD_DP2_INC(py, 2, y0, y1);
+                FMADD2(x0, x1, da_vec, y0, y1);
+                ST_DP2_INC(y0, y1, y, 2);
+            }
+
+            if (n & 2)
+            {
+                x0 = LD_DP(x); x += 2;
+                y0 = LD_DP(py); py += 2;
+                y0 += da_vec * x0;
+                ST_DP(y0, y); y += 2;
+            }
+
+            if (n & 1)
+            {
+                y[0] += da * x[0];
+            }
+        }
+    }
+    else if (1 == inc_y)
+    {
+        /* Strided x, contiguous y: gather x with vector loads at stride
+           inc_x and pack the low lane of each pair (pckev) into dense
+           vectors, then do vector FMAs against contiguous y. */
+        FLOAT *y_pref;
+        BLASLONG pref_offset;
+        v2f64 x8, x9, x10, x11, x12, x13, x14;
+
+        pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+        if (pref_offset > 0)
+        {
+            pref_offset = L1_DATA_LINESIZE - pref_offset;
+            pref_offset = pref_offset / sizeof(FLOAT);
+        }
+        y_pref = y + pref_offset + 32;
+
+        da_vec = COPY_DOUBLE_TO_VECTOR(da);
+
+        for (i = (n >> 4); i--;)
+        {
+            PREF_OFFSET(y_pref, 0);
+            PREF_OFFSET(y_pref, 32);
+            PREF_OFFSET(y_pref, 64);
+            PREF_OFFSET(y_pref, 96);
+            y_pref += 16;
+
+            /* 15 vector loads + 1 scalar insert cover 16 x elements; the
+               last element is fetched as a lone 64-bit value so no read
+               goes past x[15*inc_x].  NOTE(review): the *(BLASLONG *)
+               gather assumes sizeof(BLASLONG) == sizeof(FLOAT) == 8 —
+               i.e. a 64-bit build; confirm against the build config. */
+            LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x14);
+            LD_DP7_INC(x, inc_x, x8, x9, x10, x11, x12, x13, x7);
+
+            PCKEV_D2_SD(x1, x0, x3, x2, x0, x1);
+            PCKEV_D2_SD(x5, x4, x14, x6, x2, x3);
+            PCKEV_D2_SD(x9, x8, x11, x10, x4, x5);
+            x6 = (v2f64) __msa_pckev_d((v2i64) x13, (v2i64) x12);
+            x7 = (v2f64) __msa_insert_d((v2i64) x7, 1, *((BLASLONG *) x));
+            x += inc_x;
+
+            LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
+            FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
+            FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
+            ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
+        }
+
+        if (n & 15)
+        {
+            if (n & 8)
+            {
+                /* Same gather scheme for 8 elements (7 loads + 1 insert). */
+                LD_DP7_INC(x, inc_x, x0, x1, x2, x6, x4, x5, x3);
+
+                PCKEV_D2_SD(x1, x0, x6, x2, x0, x1);
+                x2 = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);
+                x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((BLASLONG *) x));
+                x += inc_x;
+
+                LD_DP4_INC(py, 2, y0, y1, y2, y3);
+                FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
+                ST_DP4_INC(y0, y1, y2, y3, y, 2);
+            }
+
+            if (n & 4)
+            {
+                LD_DP3_INC(x, inc_x, x0, x2, x1);
+
+                x0 = (v2f64) __msa_pckev_d((v2i64) x2, (v2i64) x0);
+                x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((BLASLONG *) x));
+                x += inc_x;
+
+                LD_DP2_INC(py, 2, y0, y1);
+                FMADD2(x0, x1, da_vec, y0, y1);
+                ST_DP2_INC(y0, y1, y, 2);
+            }
+
+            if (n & 2)
+            {
+                /* Build one vector from two strided scalar inserts. */
+                x0 = (v2f64) __msa_insert_d((v2i64) zero_v, 0, *((BLASLONG *) x));
+                x += inc_x;
+                x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((BLASLONG *) x));
+                x += inc_x;
+
+                y0 = LD_DP(py); py += 2;
+                y0 += da_vec * x0;
+                ST_DP(y0, y); y += 2;
+            }
+
+            if (n & 1)
+            {
+                y[0] += da * x[0];
+            }
+        }
+    }
+    else
+    {
+        /* General strides: scalar FMAs, unrolled by 4.  These scalar
+           x0..y3 shadow the vector names declared at function scope. */
+        FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
+
+        for (i = (n >> 2); i--;)
+        {
+            LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
+            LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
+            FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3);
+            ST_GP4_INC(y0, y1, y2, y3, y, inc_y);
+        }
+
+        if (n & 3)
+        {
+            if (n & 2)
+            {
+                LD_GP2_INC(x, inc_x, x0, x1);
+                LD_GP2_INC(py, inc_y, y0, y1);
+                FMADD2(x0, x1, da, y0, y1);
+                ST_GP2_INC(y0, y1, y, inc_y);
+            }
+
+            if (n & 1)
+            {
+                *y += da * *x;
+            }
+        }
+    }
+
+    return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Double-precision COPY kernel (MIPS MSA): copies n elements from x
+ * (stride inc_x) to y (stride inc_y).  x is not modified.
+ * Always returns 0.
+ */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG i;
+    v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+    FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+
+    if (n < 0) return (0);
+
+    if ((1 == inc_x) && (1 == inc_y))
+    {
+        /* Contiguous path: 32 doubles (16 v2f64 vectors) per main-loop
+           iteration, software-pipelined so loads of the next batch overlap
+           stores of the previous one. */
+        if (n > 31)
+        {
+            FLOAT *x_pref;
+            BLASLONG pref_offset;
+
+            /* Align the prefetch cursor to the next L1 cache-line boundary. */
+            pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+            if (pref_offset > 0)
+            {
+                pref_offset = L1_DATA_LINESIZE - pref_offset;
+                pref_offset = pref_offset / sizeof(FLOAT);
+            }
+            x_pref = x + pref_offset + 64 + 16;
+
+            /* Pipeline prologue: pre-load the first 8 vectors. */
+            LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+            for (i = (n >> 5) - 1; i--;)
+            {
+                PREF_OFFSET(x_pref, 0);
+                PREF_OFFSET(x_pref, 32);
+                PREF_OFFSET(x_pref, 64);
+                PREF_OFFSET(x_pref, 96);
+                PREF_OFFSET(x_pref, 128);
+                PREF_OFFSET(x_pref, 160);
+                PREF_OFFSET(x_pref, 192);
+                PREF_OFFSET(x_pref, 224);
+                x_pref += 32;
+
+                /* Interleave: load x8..x15 while storing x0..x7, then
+                   reload x0..x7 while storing x8..x15. */
+                x8 = LD_DP(x); x += 2;
+                ST_DP(x0, y); y += 2;
+                x9 = LD_DP(x); x += 2;
+                ST_DP(x1, y); y += 2;
+                x10 = LD_DP(x); x += 2;
+                ST_DP(x2, y); y += 2;
+                x11 = LD_DP(x); x += 2;
+                ST_DP(x3, y); y += 2;
+                x12 = LD_DP(x); x += 2;
+                ST_DP(x4, y); y += 2;
+                x13 = LD_DP(x); x += 2;
+                ST_DP(x5, y); y += 2;
+                x14 = LD_DP(x); x += 2;
+                ST_DP(x6, y); y += 2;
+                x15 = LD_DP(x); x += 2;
+                ST_DP(x7, y); y += 2;
+                x0 = LD_DP(x); x += 2;
+                ST_DP(x8, y); y += 2;
+                x1 = LD_DP(x); x += 2;
+                ST_DP(x9, y); y += 2;
+                x2 = LD_DP(x); x += 2;
+                ST_DP(x10, y); y += 2;
+                x3 = LD_DP(x); x += 2;
+                ST_DP(x11, y); y += 2;
+                x4 = LD_DP(x); x += 2;
+                ST_DP(x12, y); y += 2;
+                x5 = LD_DP(x); x += 2;
+                ST_DP(x13, y); y += 2;
+                x6 = LD_DP(x); x += 2;
+                ST_DP(x14, y); y += 2;
+                x7 = LD_DP(x); x += 2;
+                ST_DP(x15, y); y += 2;
+            }
+
+            /* Pipeline epilogue: x0..x7 are already loaded; fetch the last
+               8 vectors and drain all stores. */
+            x8 = LD_DP(x); x += 2;
+            x9 = LD_DP(x); x += 2;
+            ST_DP(x0, y); y += 2;
+            x10 = LD_DP(x); x += 2;
+            ST_DP(x1, y); y += 2;
+            x11 = LD_DP(x); x += 2;
+            ST_DP(x2, y); y += 2;
+            x12 = LD_DP(x); x += 2;
+            ST_DP(x3, y); y += 2;
+            x13 = LD_DP(x); x += 2;
+            ST_DP(x4, y); y += 2;
+            x14 = LD_DP(x); x += 2;
+            ST_DP(x5, y); y += 2;
+            x15 = LD_DP(x); x += 2;
+            ST_DP(x6, y); y += 2;
+            ST_DP(x7, y); y += 2;
+
+            ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2);
+        }
+
+        if (n & 31)
+        {
+            /* Remainder: 16 and 8 doubles vectorized, the rest scalar. */
+            if (n & 16)
+            {
+                LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+                ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2);
+            }
+
+            if (n & 8)
+            {
+                LD_DP4_INC(x, 2, x0, x1, x2, x3);
+                ST_DP4_INC(x0, x1, x2, x3, y, 2);
+            }
+
+            if (n & 4)
+            {
+                LD_GP4_INC(x, 1, f0, f1, f2, f3);
+                ST_GP4_INC(f0, f1, f2, f3, y, 1);
+            }
+
+            if (n & 2)
+            {
+                LD_GP2_INC(x, 1, f0, f1);
+                ST_GP2_INC(f0, f1, y, 1);
+            }
+
+            if (n & 1)
+            {
+                *y = *x;
+            }
+        }
+    }
+    else
+    {
+        /* Strided path: scalar copy, unrolled by 8. */
+        for (i = (n >> 3); i--;)
+        {
+            LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
+            ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y);
+        }
+
+        if (n & 4)
+        {
+            LD_GP4_INC(x, inc_x, f0, f1, f2, f3);
+            ST_GP4_INC(f0, f1, f2, f3, y, inc_y);
+        }
+
+        if (n & 2)
+        {
+            LD_GP2_INC(x, inc_x, f0, f1);
+            ST_GP2_INC(f0, f1, y, inc_y);
+        }
+
+        if (n & 1)
+        {
+            *y = *x;
+        }
+    }
+
+    return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
+ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+ BLASLONG dummy2)
+{
+ BLASLONG i;
+ FLOAT *px;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
+ v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ v2f64 da_vec;
+
+ px = x;
+
+ if (1 == inc_x)
+ {
+ if (0.0 == da)
+ {
+ v2f64 zero_v = __msa_cast_to_vector_double(0);
+ zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
+ zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
+
+ for (i = (n >> 5); i--;)
+ {
+ ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 2);
+ ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 2);
+ }
+
+ if (n & 31)
+ {
+ if (n & 16)
+ {
+ ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 2);
+ }
+
+ if (n & 8)
+ {
+ ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
+ }
+
+ if (n & 4)
+ {
+ ST_DP2_INC(zero_v, zero_v, x, 2);
+ }
+
+ if (n & 2)
+ {
+ *x = 0; x += 1;
+ *x = 0; x += 1;
+ }
+
+ if (n & 1)
+ {
+ *x = 0;
+ }
+ }
+ }
+ else
+ {
+ da_vec = COPY_DOUBLE_TO_VECTOR(da);
+
+ if (n > 31)
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 32 + 16;
+
+ LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ for (i = 0; i < (n >> 5) - 1; i++)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 32;
+
+ x8 = LD_DP(px); px += 2;
+ x0 *= da_vec;
+ x9 = LD_DP(px); px += 2;
+ x1 *= da_vec;
+ x10 = LD_DP(px); px += 2;
+ x2 *= da_vec;
+ x11 = LD_DP(px); px += 2;
+ x3 *= da_vec;
+ x12 = LD_DP(px); px += 2;
+ x4 *= da_vec;
+ x13 = LD_DP(px); px += 2;
+ x5 *= da_vec;
+ x14 = LD_DP(px); px += 2;
+ x6 *= da_vec;
+ x15 = LD_DP(px); px += 2;
+ x7 *= da_vec;
+ x8 *= da_vec;
+ ST_DP(x0, x); x += 2;
+ x9 *= da_vec;
+ ST_DP(x1, x); x += 2;
+ x10 *= da_vec;
+ ST_DP(x2, x); x += 2;
+ x11 *= da_vec;
+ ST_DP(x3, x); x += 2;
+ x12 *= da_vec;
+ ST_DP(x4, x); x += 2;
+ x13 *= da_vec;
+ ST_DP(x5, x); x += 2;
+ x14 *= da_vec;
+ ST_DP(x6, x); x += 2;
+ x15 *= da_vec;
+ ST_DP(x7, x); x += 2;
+ ST_DP(x8, x); x += 2;
+ x0 = LD_DP(px); px += 2;
+ ST_DP(x9, x); x += 2;
+ x1 = LD_DP(px); px += 2;
+ ST_DP(x10, x); x += 2;
+ x2 = LD_DP(px); px += 2;
+ ST_DP(x11, x); x += 2;
+ x3 = LD_DP(px); px += 2;
+ ST_DP(x12, x); x += 2;
+ x4 = LD_DP(px); px += 2;
+ ST_DP(x13, x); x += 2;
+ x5 = LD_DP(px); px += 2;
+ ST_DP(x14, x); x += 2;
+ x6 = LD_DP(px); px += 2;
+ ST_DP(x15, x); x += 2;
+ x7 = LD_DP(px); px += 2;
+ }
+
+ x8 = LD_DP(px); px += 2;
+ x0 *= da_vec;
+ x9 = LD_DP(px); px += 2;
+ x1 *= da_vec;
+ x10 = LD_DP(px); px += 2;
+ x2 *= da_vec;
+ x11 = LD_DP(px); px += 2;
+ x3 *= da_vec;
+ x12 = LD_DP(px); px += 2;
+ x4 *= da_vec;
+ x13 = LD_DP(px); px += 2;
+ x5 *= da_vec;
+ x14 = LD_DP(px); px += 2;
+ x6 *= da_vec;
+ x15 = LD_DP(px); px += 2;
+ x7 *= da_vec;
+ x8 *= da_vec;
+ ST_DP(x0, x); x += 2;
+ x9 *= da_vec;
+ ST_DP(x1, x); x += 2;
+ x10 *= da_vec;
+ ST_DP(x2, x); x += 2;
+ x11 *= da_vec;
+ ST_DP(x3, x); x += 2;
+ x12 *= da_vec;
+ ST_DP(x4, x); x += 2;
+ x13 *= da_vec;
+ ST_DP(x5, x); x += 2;
+ x15 *= da_vec;
+ ST_DP(x6, x); x += 2;
+ x14 *= da_vec;
+ ST_DP(x7, x); x += 2;
+
+ ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 2);
+ }
+
+ if (n & 31)
+ {
+ if (n & 16)
+ {
+ LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
+ MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
+ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
+ }
+
+ if (n & 8)
+ {
+ LD_DP4_INC(px, 2, x0, x1, x2, x3);
+ MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
+ ST_DP4_INC(x0, x1, x2, x3, x, 2);
+ }
+
+ if (n & 4)
+ {
+ LD_DP2_INC(px, 2, x0, x1);
+ MUL2(x0, da_vec, x1, da_vec, x0, x1);
+ ST_DP2_INC(x0, x1, x, 2);
+ }
+
+ if (n & 2)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+ MUL2(f0, da, f1, da, f0, f1);
+ ST_GP2_INC(f0, f1, x, 1);
+ }
+
+ if (n & 1)
+ {
+ *x *= da;
+ }
+ }
+ }
+ }
+ else
+ {
+ if (da == 0.0)
+ {
+ for (i = n; i--;)
+ {
+ *x = 0.0;
+
+ x += inc_x;
+ }
+ }
+ else
+ {
+ if (n > 15)
+ {
+ LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
+ for (i = 0; i < (n >> 4) - 1; i++)
+ {
+ LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
+ MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
+
+ f4 *= da;
+ f5 *= da;
+ *x = f0; x += inc_x;
+ f6 *= da;
+ *x = f1; x += inc_x;
+ f7 *= da;
+ *x = f2; x += inc_x;
+ f8 *= da;
+ *x = f3; x += inc_x;
+ f9 *= da;
+ *x = f4; x += inc_x;
+ f10 *= da;
+ *x = f5; x += inc_x;
+ f11 *= da;
+ *x = f6; x += inc_x;
+ f12 *= da;
+ *x = f7; x += inc_x;
+ f13 *= da;
+ *x = f8; x += inc_x;
+ f14 *= da;
+ *x = f9; x += inc_x;
+ f15 *= da;
+ *x = f10; x += inc_x;
+ *x = f11; x += inc_x;
+ f0 = *px; px += inc_x;
+ *x = f12; x += inc_x;
+ f1 = *px; px += inc_x;
+ *x = f13; x += inc_x;
+ f2 = *px; px += inc_x;
+ *x = f14; x += inc_x;
+ f3 = *px; px += inc_x;
+ *x = f15; x += inc_x;
+ f4 = *px; px += inc_x;
+ f5 = *px; px += inc_x;
+ f6 = *px; px += inc_x;
+ f7 = *px; px += inc_x;
+ }
+
+ LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
+ MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
+
+ f4 *= da;
+ f5 *= da;
+ *x = f0; x += inc_x;
+ f6 *= da;
+ *x = f1; x += inc_x;
+ f7 *= da;
+ *x = f2; x += inc_x;
+ f8 *= da;
+ *x = f3; x += inc_x;
+ f9 *= da;
+ *x = f4; x += inc_x;
+ f10 *= da;
+ *x = f5; x += inc_x;
+ f11 *= da;
+ *x = f6; x += inc_x;
+ f12 *= da;
+ *x = f7; x += inc_x;
+ f13 *= da;
+ *x = f8; x += inc_x;
+ f14 *= da;
+ *x = f9; x += inc_x;
+ f15 *= da;
+ *x = f10; x += inc_x;
+ *x = f11; x += inc_x;
+ *x = f12; x += inc_x;
+ *x = f13; x += inc_x;
+ *x = f14; x += inc_x;
+ *x = f15; x += inc_x;
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
+ MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
+ MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
+ ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
+ }
+
+ if (n & 4)
+ {
+ LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
+ MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
+ ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
+ }
+
+ if (n & 2)
+ {
+ LD_GP2_INC(px, inc_x, f0, f1);
+ MUL2(f0, da, f1, da, f0, f1);
+ ST_GP2_INC(f0, f1, x, inc_x);
+ }
+
+ if (n & 1)
+ {
+ *x *= da;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Double-precision SWAP kernel (MSA): exchanges the first n elements of
+ vectors srcx and srcy. The dummy* parameters only pad out the common
+ BLAS kernel signature and are never read. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
+ FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
+ FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i = 0, pref_offsetx, pref_offsety;
+ FLOAT *px, *py;
+ FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
+ FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
+ v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
+ v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
+
+ /* Nothing to do for a negative count (n == 0 falls through harmlessly:
+ every loop below runs zero times). */
+ if (n < 0) return (0);
+
+ /* Elements from srcx/srcy to the next L1 cache-line boundary; used to
+ align the software prefetches issued in the main loop. */
+ pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
+ if (pref_offsetx > 0)
+ {
+ pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
+ pref_offsetx = pref_offsetx / sizeof(FLOAT);
+ }
+
+ pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
+ if (pref_offsety > 0)
+ {
+ pref_offsety = L1_DATA_LINESIZE - pref_offsety;
+ pref_offsety = pref_offsety / sizeof(FLOAT);
+ }
+
+ /* px/py advance as read cursors; srcx/srcy advance as write cursors. */
+ px = srcx;
+ py = srcy;
+
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ /* Unit strides: vectorized path, 16 doubles (8 v2f64) per step. */
+ if (n >> 4)
+ {
+ /* Software pipeline: preload 8 vectors of x so each iteration
+ can overlap its loads with the previous stores. */
+ LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
+
+ for (i = (n >> 4) - 1; i--;)
+ {
+ PREFETCH(px + pref_offsetx + 16);
+ PREFETCH(px + pref_offsetx + 20);
+ PREFETCH(px + pref_offsetx + 24);
+ PREFETCH(px + pref_offsetx + 28);
+
+ PREFETCH(py + pref_offsety + 16);
+ PREFETCH(py + pref_offsety + 20);
+ PREFETCH(py + pref_offsety + 24);
+ PREFETCH(py + pref_offsety + 28);
+
+ /* Load the next y vectors while storing the buffered x
+ vectors into y ... */
+ yv0 = LD_DP(py); py += 2;
+ ST_DP(xv0, srcy); srcy += 2;
+ yv1 = LD_DP(py); py += 2;
+ ST_DP(xv1, srcy); srcy += 2;
+ yv2 = LD_DP(py); py += 2;
+ ST_DP(xv2, srcy); srcy += 2;
+ yv3 = LD_DP(py); py += 2;
+ ST_DP(xv3, srcy); srcy += 2;
+ yv4 = LD_DP(py); py += 2;
+ ST_DP(xv4, srcy); srcy += 2;
+ yv5 = LD_DP(py); py += 2;
+ ST_DP(xv5, srcy); srcy += 2;
+ yv6 = LD_DP(py); py += 2;
+ ST_DP(xv6, srcy); srcy += 2;
+ yv7 = LD_DP(py); py += 2;
+ ST_DP(xv7, srcy); srcy += 2;
+
+ /* ... then reload x for the next iteration while storing the
+ just-read y vectors into x. */
+ xv0 = LD_DP(px); px += 2;
+ ST_DP(yv0, srcx); srcx += 2;
+ xv1 = LD_DP(px); px += 2;
+ ST_DP(yv1, srcx); srcx += 2;
+ xv2 = LD_DP(px); px += 2;
+ ST_DP(yv2, srcx); srcx += 2;
+ xv3 = LD_DP(px); px += 2;
+ ST_DP(yv3, srcx); srcx += 2;
+ xv4 = LD_DP(px); px += 2;
+ ST_DP(yv4, srcx); srcx += 2;
+ xv5 = LD_DP(px); px += 2;
+ ST_DP(yv5, srcx); srcx += 2;
+ xv6 = LD_DP(px); px += 2;
+ ST_DP(yv6, srcx); srcx += 2;
+ xv7 = LD_DP(px); px += 2;
+ ST_DP(yv7, srcx); srcx += 2;
+ }
+
+ /* Pipeline drain: xv0..xv7 still hold the final x batch. */
+ LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
+ ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2);
+ ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2);
+ }
+
+ /* Tail: 1..15 remaining elements, dispatched by exact bit pattern so
+ each case issues a single LD/ST macro pair of the right width. */
+ if (n & 15)
+ {
+ if ((n & 8) && (n & 4) && (n & 2))
+ {
+ LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
+ LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
+ ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2);
+ ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2);
+ }
+ else if ((n & 8) && (n & 4))
+ {
+ LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5);
+ LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5);
+ ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2);
+ ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2);
+ }
+ else if ((n & 8) && (n & 2))
+ {
+ LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4);
+ LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4);
+ ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2);
+ ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2);
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_DP3_INC(px, 2, xv0, xv1, xv2);
+ LD_DP3_INC(py, 2, yv0, yv1, yv2);
+ ST_DP3_INC(xv0, xv1, xv2, srcy, 2);
+ ST_DP3_INC(yv0, yv1, yv2, srcx, 2);
+ }
+ else if (n & 8)
+ {
+ LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3);
+ LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3);
+ ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2);
+ ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2);
+ }
+ else if (n & 4)
+ {
+ LD_DP2_INC(px, 2, xv0, xv1);
+ LD_DP2_INC(py, 2, yv0, yv1);
+ ST_DP2_INC(xv0, xv1, srcy, 2);
+ ST_DP2_INC(yv0, yv1, srcx, 2);
+ }
+ else if (n & 2)
+ {
+ xv0 = LD_DP(px);
+ yv0 = LD_DP(py);
+
+ px += 2;
+ py += 2;
+
+ ST_DP(xv0, srcy);
+ ST_DP(yv0, srcx);
+
+ srcx += 2;
+ srcy += 2;
+ }
+
+ /* Last odd element is swapped with scalar moves. */
+ if (n & 1)
+ {
+ x0 = px[0];
+ y0 = py[0];
+ srcx[0] = y0;
+ srcy[0] = x0;
+ }
+ }
+ }
+ else
+ {
+ /* General strides: scalar path, 8 elements per iteration. */
+ for (i = (n >> 3); i--;)
+ {
+ LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
+ LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
+ ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
+ ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
+ }
+
+ /* Tail of 1..7 elements, again dispatched by exact bit pattern. */
+ if (n & 7)
+ {
+ if ((n & 4) && (n & 2) && (n & 1))
+ {
+ LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
+ LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
+ ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
+ ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
+ LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
+ ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
+ ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
+ }
+ else if ((n & 4) && (n & 1))
+ {
+ LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
+ LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
+ ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
+ ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
+ }
+ else if ((n & 2) && (n & 1))
+ {
+ LD_GP3_INC(px, inc_x, x0, x1, x2);
+ LD_GP3_INC(py, inc_y, y0, y1, y2);
+ ST_GP3_INC(x0, x1, x2, srcy, inc_y);
+ ST_GP3_INC(y0, y1, y2, srcx, inc_x);
+ }
+ else if (n & 4)
+ {
+ LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
+ LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
+ ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
+ ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
+ }
+ else if (n & 2)
+ {
+ LD_GP2_INC(px, inc_x, x0, x1);
+ LD_GP2_INC(py, inc_y, y0, y1);
+ ST_GP2_INC(x0, x1, srcy, inc_y);
+ ST_GP2_INC(y0, y1, srcx, inc_x);
+ }
+ else if (n & 1)
+ {
+ x0 = *srcx;
+ y0 = *srcy;
+
+ *srcx = y0;
+ *srcy = x0;
+ }
+ }
+ }
+
+ return (0);
+}
MUL2(in4, in5, in6, in7, out2, out3); \
}
+/* Description : Multiply-accumulate on pairs of vectors
+ Arguments : Inputs - in0, in1, vec, inout0, inout1
+ Outputs - inout0, inout1 (updated in place)
+ Details : Each element from 'in0' is multiplied with elements from 'vec'
+ and the result is added to 'inout0' (likewise 'in1' into 'inout1').
+ Note : plain-brace statement macros (no do/while(0) wrapper), matching
+ the style of the other macros in this header.
+*/
+#define FMADD2(in0, in1, vec, inout0, inout1) \
+{ \
+ inout0 += in0 * vec; \
+ inout1 += in1 * vec; \
+}
+/* Three-way variant: inout{0,1,2} += in{0,1,2} * vec */
+#define FMADD3(in0, in1, in2, vec, \
+ inout0, inout1, inout2) \
+{ \
+ inout0 += in0 * vec; \
+ inout1 += in1 * vec; \
+ inout2 += in2 * vec; \
+}
+/* Four-way variant, expressed as two FMADD2 expansions */
+#define FMADD4(in0, in1, in2, in3, vec, \
+ inout0, inout1, inout2, inout3) \
+{ \
+ FMADD2(in0, in1, vec, inout0, inout1); \
+ FMADD2(in2, in3, vec, inout2, inout3); \
+}
+
/* Description : Addition of 2 pairs of variables
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Conjugation-dependent accumulate operators, switched on -DCONJ.
+ NOTE(review): these look copied from the complex axpy kernels; the
+ real-valued kernel below never references OP0/OP1/OP2 -- confirm they
+ can be dropped. */
+#if !defined(CONJ)
+ #define OP0 +=
+ #define OP1 -=
+ #define OP2 +=
+#else
+ #define OP0 -=
+ #define OP1 +=
+ #define OP2 -=
+#endif
+
+/* Single-precision AXPY kernel (MSA): y := y + da * x for the first n
+ elements. The dummy* parameters only pad out the common BLAS kernel
+ signature and are never read. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
+ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i;
+ FLOAT *py;
+ v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
+ v4f32 da_vec, zero_v = {0};
+
+ /* AXPY with a zero scale (or negative count) is a no-op. */
+ if ((n < 0) || (da == 0.0)) return(0);
+
+ /* py is the read cursor for y; y itself becomes the write cursor. */
+ py = y;
+
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ /* Unit strides: fully vectorized, 32 floats (8 v4f32) per step. */
+ FLOAT *x_pref, *y_pref;
+ BLASLONG pref_offset;
+
+ /* Aim the prefetch cursors at the next cache-line boundaries. */
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 64;
+
+ pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ y_pref = y + pref_offset + 64;
+
+ /* Broadcast the scalar multiplier into all four vector lanes. */
+ da_vec = COPY_FLOAT_TO_VECTOR(da);
+
+ for (i = (n >> 5); i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(y_pref, 0);
+ PREF_OFFSET(y_pref, 32);
+ PREF_OFFSET(y_pref, 64);
+ PREF_OFFSET(y_pref, 96);
+ x_pref += 32;
+ y_pref += 32;
+
+ LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
+ FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
+ FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
+ ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
+ }
+
+ /* Tail: 1..31 remaining elements, processed in halving chunks. */
+ if (n & 31)
+ {
+ if (n & 16)
+ {
+ LD_SP4_INC(x, 4, x0, x1, x2, x3);
+ LD_SP4_INC(py, 4, y0, y1, y2, y3);
+ FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
+ ST_SP4_INC(y0, y1, y2, y3, y, 4);
+ }
+
+ if (n & 8)
+ {
+ LD_SP2_INC(x, 4, x0, x1);
+ LD_SP2_INC(py, 4, y0, y1);
+ FMADD2(x0, x1, da_vec, y0, y1);
+ ST_SP2_INC(y0, y1, y, 4);
+ }
+
+ if (n & 4)
+ {
+ x0 = LD_SP(x); x += 4;
+ y0 = LD_SP(py); py += 4;
+ y0 += da_vec * x0;
+ ST_SP(y0, y); y += 4;
+ }
+
+ if (n & 2)
+ {
+ /* FMADD2 works on scalars too: y[i] += da * x[i]. */
+ FMADD2(x[0], x[1], da, y[0], y[1]);
+ x += 2;
+ y += 2;
+ }
+
+ if (n & 1)
+ {
+ y[0] += da * x[0];
+ }
+ }
+ }
+ else if (1 == inc_y)
+ {
+ /* Strided x, unit y: gather four strided x elements into each lane
+ of a vector via __msa_insert_w, then use vector FMA on y.
+ NOTE(review): the loads go through (int *) casts, relying on
+ sizeof(FLOAT) == sizeof(int) for single precision. */
+ da_vec = COPY_FLOAT_TO_VECTOR(da);
+
+ for (i = (n >> 4); i--;)
+ {
+ x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
+ x += inc_x;
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
+ x += inc_x;
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
+ x += inc_x;
+ x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
+ x += inc_x;
+ x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
+ x += inc_x;
+ x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
+ x += inc_x;
+ x2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ x2 = (v4f32) __msa_insert_w((v4i32) x2, 1, *((int *) x));
+ x += inc_x;
+ x2 = (v4f32) __msa_insert_w((v4i32) x2, 2, *((int *) x));
+ x += inc_x;
+ x2 = (v4f32) __msa_insert_w((v4i32) x2, 3, *((int *) x));
+ x += inc_x;
+ x3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ x3 = (v4f32) __msa_insert_w((v4i32) x3, 1, *((int *) x));
+ x += inc_x;
+ x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
+ x += inc_x;
+ x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) x));
+ x += inc_x;
+
+ LD_SP4_INC(py, 4, y0, y1, y2, y3);
+ FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
+ ST_SP4_INC(y0, y1, y2, y3, y, 4);
+ }
+
+ /* Tail: 1..15 remaining elements. */
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
+ x += inc_x;
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
+ x += inc_x;
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
+ x += inc_x;
+ x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
+ x += inc_x;
+ x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
+ x += inc_x;
+ x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
+ x += inc_x;
+
+ LD_SP2_INC(py, 4, y0, y1);
+ FMADD2(x0, x1, da_vec, y0, y1);
+ ST_SP2_INC(y0, y1, y, 4);
+ }
+
+ if (n & 4)
+ {
+ x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
+ x += inc_x;
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
+ x += inc_x;
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
+ x += inc_x;
+ x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
+ x += inc_x;
+
+ y0 = LD_SP(py); py += 4;
+ y0 += da_vec * x0;
+ ST_SP(y0, y); y += 4;
+ }
+
+ if (n & 2)
+ {
+ FMADD2(x[0], x[inc_x], da, y[0], y[1]);
+
+ x += 2 * inc_x;
+ y += 2;
+ }
+
+ if (n & 1)
+ {
+ y[0] += da * x[0];
+ }
+ }
+ }
+ else
+ {
+ /* General strides: scalar path, 4 elements per iteration. The
+ scalar locals shadow the vector variables of the same names. */
+ FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
+
+ for (i = (n >> 2); i--;)
+ {
+ LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
+ LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
+ FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3);
+ ST_GP4_INC(y0, y1, y2, y3, y, inc_y);
+ }
+
+ if (n & 3)
+ {
+ if (n & 2)
+ {
+ LD_GP2_INC(x, inc_x, x0, x1);
+ LD_GP2_INC(py, inc_y, y0, y1);
+ FMADD2(x0, x1, da, y0, y1);
+ ST_GP2_INC(y0, y1, y, inc_y);
+ }
+
+ if (n & 1)
+ {
+ *y += da * *x;
+ }
+ }
+ }
+
+ return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Single-precision COPY kernel (MSA): copies the first n elements of x
+ into y. Uses the level-1 COPY kernel signature (no scale argument). */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i;
+ v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+
+ /* Negative count: nothing to copy (n == 0 falls through harmlessly). */
+ if (n < 0) return (0);
+
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ /* Unit strides: vectorized path, 64 floats (16 v4f32) per step. */
+ if (n > 63)
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ /* Aim the prefetch cursor at the next cache-line boundary. */
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 128 + 32;
+
+ /* Software pipeline: preload the first 8 vectors so loads and
+ stores overlap inside the loop. */
+ LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ for (i = (n >> 6) - 1; i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 64;
+
+ x8 = LD_SP(x); x += 4;
+ ST_SP(x0, y); y += 4;
+ x9 = LD_SP(x); x += 4;
+ ST_SP(x1, y); y += 4;
+ x10 = LD_SP(x); x += 4;
+ ST_SP(x2, y); y += 4;
+ x11 = LD_SP(x); x += 4;
+ ST_SP(x3, y); y += 4;
+ x12 = LD_SP(x); x += 4;
+ ST_SP(x4, y); y += 4;
+ x13 = LD_SP(x); x += 4;
+ ST_SP(x5, y); y += 4;
+ x14 = LD_SP(x); x += 4;
+ ST_SP(x6, y); y += 4;
+ x15 = LD_SP(x); x += 4;
+ ST_SP(x7, y); y += 4;
+ x0 = LD_SP(x); x += 4;
+ ST_SP(x8, y); y += 4;
+ x1 = LD_SP(x); x += 4;
+ ST_SP(x9, y); y += 4;
+ x2 = LD_SP(x); x += 4;
+ ST_SP(x10, y); y += 4;
+ x3 = LD_SP(x); x += 4;
+ ST_SP(x11, y); y += 4;
+ x4 = LD_SP(x); x += 4;
+ ST_SP(x12, y); y += 4;
+ x5 = LD_SP(x); x += 4;
+ ST_SP(x13, y); y += 4;
+ x6 = LD_SP(x); x += 4;
+ ST_SP(x14, y); y += 4;
+ x7 = LD_SP(x); x += 4;
+ ST_SP(x15, y); y += 4;
+ }
+
+ /* Pipeline drain: flush the buffered vectors and the final
+ 8-vector load. */
+ x8 = LD_SP(x); x += 4;
+ x9 = LD_SP(x); x += 4;
+ ST_SP(x0, y); y += 4;
+ x10 = LD_SP(x); x += 4;
+ ST_SP(x1, y); y += 4;
+ x11 = LD_SP(x); x += 4;
+ ST_SP(x2, y); y += 4;
+ x12 = LD_SP(x); x += 4;
+ ST_SP(x3, y); y += 4;
+ x13 = LD_SP(x); x += 4;
+ ST_SP(x4, y); y += 4;
+ x14 = LD_SP(x); x += 4;
+ ST_SP(x5, y); y += 4;
+ x15 = LD_SP(x); x += 4;
+ ST_SP(x6, y); y += 4;
+ ST_SP(x7, y); y += 4;
+
+ ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
+ }
+
+ /* Tail: 1..63 remaining elements, in halving chunks (vector down
+ to scalar moves). */
+ if (n & 63)
+ {
+ if (n & 32)
+ {
+ LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
+ }
+
+ if (n & 16)
+ {
+ LD_SP4_INC(x, 4, x0, x1, x2, x3);
+ ST_SP4_INC(x0, x1, x2, x3, y, 4);
+ }
+
+ if (n & 8)
+ {
+ LD_SP2_INC(x, 4, x0, x1);
+ ST_SP2_INC(x0, x1, y, 4);
+ }
+
+ if (n & 4)
+ {
+ LD_GP4_INC(x, 1, f0, f1, f2, f3);
+ ST_GP4_INC(f0, f1, f2, f3, y, 1);
+ }
+
+ if (n & 2)
+ {
+ LD_GP2_INC(x, 1, f0, f1);
+ ST_GP2_INC(f0, f1, y, 1);
+ }
+
+ if (n & 1)
+ {
+ *y = *x;
+ }
+ }
+ }
+ else
+ {
+ /* General strides: scalar path, 8 elements per iteration. */
+ for (i = (n >> 3); i--;)
+ {
+ LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
+ ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y);
+ }
+
+ if (n & 4)
+ {
+ LD_GP4_INC(x, inc_x, f0, f1, f2, f3);
+ ST_GP4_INC(f0, f1, f2, f3, y, inc_y);
+ }
+
+ if (n & 2)
+ {
+ LD_GP2_INC(x, inc_x, f0, f1);
+ ST_GP2_INC(f0, f1, y, inc_y);
+ }
+
+ if (n & 1)
+ {
+ *y = *x;
+ }
+ }
+
+ return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Single-precision SCAL kernel (MSA): x := da * x for the first n
+ elements. The dummy* parameters only pad out the common BLAS kernel
+ signature and are never read. da == 0.0 is short-circuited into a
+ plain store of zeros. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
+ BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+ BLASLONG dummy2)
+{
+ BLASLONG i;
+ FLOAT *px;
+ FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
+ v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ v4f32 da_vec;
+
+ /* Guard added for consistency with the sibling MSA kernels (swap,
+ copy, axpy): a negative n would otherwise make the "i--" countdown
+ loops below run effectively forever. */
+ if (n < 0) return (0);
+
+ /* px is the read cursor; x itself becomes the write cursor. */
+ px = x;
+
+ if (1 == inc_x)
+ {
+ if (0.0 == da)
+ {
+ /* Zero fill. A plain initializer replaces the previous
+ __msa_cast_to_vector_float + four __msa_insert_w calls: the
+ builtin is missing from some GCC releases, and the inserts
+ passed float literals to an int-typed argument. */
+ v4f32 zero_v = {0.0, 0.0, 0.0, 0.0};
+
+ /* 64 floats (16 v4f32 stores) per iteration. */
+ for (i = (n >> 6); i--;)
+ {
+ ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 4);
+ ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 4);
+ }
+
+ /* Tail: 1..63 remaining elements, in halving chunks. */
+ if (n & 63)
+ {
+ if (n & 32)
+ {
+ ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 4);
+ }
+
+ if (n & 16)
+ {
+ ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
+ }
+
+ if (n & 8)
+ {
+ ST_SP2_INC(zero_v, zero_v, x, 4);
+ }
+
+ if (n & 4)
+ {
+ *x = 0; x += 1;
+ *x = 0; x += 1;
+ *x = 0; x += 1;
+ *x = 0; x += 1;
+ }
+
+ if (n & 2)
+ {
+ *x = 0; x += 1;
+ *x = 0; x += 1;
+ }
+
+ if (n & 1)
+ {
+ *x = 0;
+ }
+ }
+ }
+ else
+ {
+ /* Broadcast the multiplier into all four vector lanes. */
+ da_vec = COPY_FLOAT_TO_VECTOR(da);
+
+ if (n > 63)
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ /* Aim the prefetch cursor at the next cache-line boundary. */
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 64 + 32;
+
+ /* Software pipeline: preload 8 vectors so each iteration
+ interleaves next-batch loads, multiplies, and stores. */
+ LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ for (i = 0; i < (n >> 6) - 1; i++)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 64;
+
+ x8 = LD_SP(px); px += 4;
+ x0 *= da_vec;
+ x9 = LD_SP(px); px += 4;
+ x1 *= da_vec;
+ x10 = LD_SP(px); px += 4;
+ x2 *= da_vec;
+ x11 = LD_SP(px); px += 4;
+ x3 *= da_vec;
+ x12 = LD_SP(px); px += 4;
+ x4 *= da_vec;
+ x13 = LD_SP(px); px += 4;
+ x5 *= da_vec;
+ x14 = LD_SP(px); px += 4;
+ x6 *= da_vec;
+ x15 = LD_SP(px); px += 4;
+ x7 *= da_vec;
+ x8 *= da_vec;
+ ST_SP(x0, x); x += 4;
+ x9 *= da_vec;
+ ST_SP(x1, x); x += 4;
+ x10 *= da_vec;
+ ST_SP(x2, x); x += 4;
+ x11 *= da_vec;
+ ST_SP(x3, x); x += 4;
+ x12 *= da_vec;
+ ST_SP(x4, x); x += 4;
+ x13 *= da_vec;
+ ST_SP(x5, x); x += 4;
+ x14 *= da_vec;
+ ST_SP(x6, x); x += 4;
+ x15 *= da_vec;
+ ST_SP(x7, x); x += 4;
+ ST_SP(x8, x); x += 4;
+ x0 = LD_SP(px); px += 4;
+ ST_SP(x9, x); x += 4;
+ x1 = LD_SP(px); px += 4;
+ ST_SP(x10, x); x += 4;
+ x2 = LD_SP(px); px += 4;
+ ST_SP(x11, x); x += 4;
+ x3 = LD_SP(px); px += 4;
+ ST_SP(x12, x); x += 4;
+ x4 = LD_SP(px); px += 4;
+ ST_SP(x13, x); x += 4;
+ x5 = LD_SP(px); px += 4;
+ ST_SP(x14, x); x += 4;
+ x6 = LD_SP(px); px += 4;
+ ST_SP(x15, x); x += 4;
+ x7 = LD_SP(px); px += 4;
+ }
+
+ /* Pipeline drain: scale and store the final two batches.
+ (x14/x15 are multiplied out of order here; both complete
+ before the trailing ST_SP8_INC, so the result is the same.) */
+ x8 = LD_SP(px); px += 4;
+ x0 *= da_vec;
+ x9 = LD_SP(px); px += 4;
+ x1 *= da_vec;
+ x10 = LD_SP(px); px += 4;
+ x2 *= da_vec;
+ x11 = LD_SP(px); px += 4;
+ x3 *= da_vec;
+ x12 = LD_SP(px); px += 4;
+ x4 *= da_vec;
+ x13 = LD_SP(px); px += 4;
+ x5 *= da_vec;
+ x14 = LD_SP(px); px += 4;
+ x6 *= da_vec;
+ x15 = LD_SP(px); px += 4;
+ x7 *= da_vec;
+ x8 *= da_vec;
+ ST_SP(x0, x); x += 4;
+ x9 *= da_vec;
+ ST_SP(x1, x); x += 4;
+ x10 *= da_vec;
+ ST_SP(x2, x); x += 4;
+ x11 *= da_vec;
+ ST_SP(x3, x); x += 4;
+ x12 *= da_vec;
+ ST_SP(x4, x); x += 4;
+ x13 *= da_vec;
+ ST_SP(x5, x); x += 4;
+ x15 *= da_vec;
+ ST_SP(x6, x); x += 4;
+ x14 *= da_vec;
+ ST_SP(x7, x); x += 4;
+
+ ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 4);
+ }
+
+ /* Tail: 1..63 remaining elements, in halving chunks. */
+ if (n & 63)
+ {
+ if (n & 32)
+ {
+ LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
+ MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
+ ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
+ }
+
+ if (n & 16)
+ {
+ LD_SP4_INC(px, 4, x0, x1, x2, x3);
+ MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
+ ST_SP4_INC(x0, x1, x2, x3, x, 4);
+ }
+
+ if (n & 8)
+ {
+ LD_SP2_INC(px, 4, x0, x1);
+ MUL2(x0, da_vec, x1, da_vec, x0, x1);
+ ST_SP2_INC(x0, x1, x, 4);
+ }
+
+ if (n & 4)
+ {
+ LD_GP4_INC(px, 1, f0, f1, f2, f3);
+ MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
+ ST_GP4_INC(f0, f1, f2, f3, x, 1);
+ }
+
+ if (n & 2)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+ MUL2(f0, da, f1, da, f0, f1);
+ ST_GP2_INC(f0, f1, x, 1);
+ }
+
+ if (n & 1)
+ {
+ *x *= da;
+ }
+ }
+ }
+ }
+ else
+ {
+ if (0.0 == da)
+ {
+ /* Strided zero fill. */
+ for (i = n; i--;)
+ {
+ *x = 0;
+ x += inc_x;
+ }
+ }
+ else
+ {
+ /* Strided scale: scalar path, software-pipelined in batches of
+ 16 elements. */
+ if (n > 15)
+ {
+ LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
+
+ for (i = 0; i < (n >> 4) - 1; i++)
+ {
+ LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
+ MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
+
+ f4 *= da;
+ f5 *= da;
+ *x = f0; x += inc_x;
+ f6 *= da;
+ *x = f1; x += inc_x;
+ f7 *= da;
+ *x = f2; x += inc_x;
+ f8 *= da;
+ *x = f3; x += inc_x;
+ f9 *= da;
+ *x = f4; x += inc_x;
+ f10 *= da;
+ *x = f5; x += inc_x;
+ f11 *= da;
+ *x = f6; x += inc_x;
+ f12 *= da;
+ *x = f7; x += inc_x;
+ f13 *= da;
+ *x = f8; x += inc_x;
+ f14 *= da;
+ *x = f9; x += inc_x;
+ f15 *= da;
+ *x = f10; x += inc_x;
+ *x = f11; x += inc_x;
+ f0 = *px; px += inc_x;
+ *x = f12; x += inc_x;
+ f1 = *px; px += inc_x;
+ *x = f13; x += inc_x;
+ f2 = *px; px += inc_x;
+ *x = f14; x += inc_x;
+ f3 = *px; px += inc_x;
+ *x = f15; x += inc_x;
+ f4 = *px; px += inc_x;
+ f5 = *px; px += inc_x;
+ f6 = *px; px += inc_x;
+ f7 = *px; px += inc_x;
+ }
+
+ /* Drain the final preloaded batch. */
+ LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
+ MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
+
+ f4 *= da;
+ f5 *= da;
+ *x = f0; x += inc_x;
+ f6 *= da;
+ *x = f1; x += inc_x;
+ f7 *= da;
+ *x = f2; x += inc_x;
+ f8 *= da;
+ *x = f3; x += inc_x;
+ f9 *= da;
+ *x = f4; x += inc_x;
+ f10 *= da;
+ *x = f5; x += inc_x;
+ f11 *= da;
+ *x = f6; x += inc_x;
+ f12 *= da;
+ *x = f7; x += inc_x;
+ f13 *= da;
+ *x = f8; x += inc_x;
+ f14 *= da;
+ *x = f9; x += inc_x;
+ f15 *= da;
+ *x = f10; x += inc_x;
+ *x = f11; x += inc_x;
+ *x = f12; x += inc_x;
+ *x = f13; x += inc_x;
+ *x = f14; x += inc_x;
+ *x = f15; x += inc_x;
+ }
+
+ /* Tail: 1..15 remaining strided elements. */
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
+ MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
+ MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
+ ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
+ }
+
+ if (n & 4)
+ {
+ LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
+ MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
+ ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
+ }
+
+ if (n & 2)
+ {
+ LD_GP2_INC(px, inc_x, f0, f1);
+ MUL2(f0, da, f1, da, f0, f1);
+ ST_GP2_INC(f0, f1, x, inc_x);
+ }
+
+ if (n & 1)
+ {
+ *x *= da;
+ }
+ }
+ }
+ }
+
+ return 0;
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* SWAP kernel (MIPS MSA): exchanges the contents of vector x (srcx,
+ stride inc_x) and vector y (srcy, stride inc_y), n elements each.
+ The dummy0..dummy3 parameters are unused slots of the generic BLAS
+ Level-1 kernel signature. Returns 0 always. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
+ FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
+ FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i = 0, pref_offsetx, pref_offsety;
+ FLOAT *px, *py;
+ FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
+ FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
+ v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
+ v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
+
+ if (n < 0) return (0);
+
+ /* Round the prefetch base up to the next L1 cache-line boundary;
+ the byte offset is converted to FLOAT elements. */
+ pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
+ if (pref_offsetx > 0)
+ {
+ pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
+ pref_offsetx = pref_offsetx / sizeof(FLOAT);
+ }
+
+ pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
+ if (pref_offsety > 0)
+ {
+ pref_offsety = L1_DATA_LINESIZE - pref_offsety;
+ pref_offsety = pref_offsety / sizeof(FLOAT);
+ }
+
+ /* px/py are the read cursors; srcx/srcy double as the write cursors. */
+ px = srcx;
+ py = srcy;
+
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ /* Contiguous case: 32 elements per iteration. The loop is
+ software-pipelined: the x batch is pre-loaded before the loop,
+ and each iteration interleaves stores of the current batch with
+ loads of the next one, presumably to hide load latency. */
+ if (n >> 5)
+ {
+ LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
+
+ for (i = (n >> 5) - 1; i--;)
+ {
+ PREFETCH(px + pref_offsetx + 32);
+ PREFETCH(px + pref_offsetx + 40);
+ PREFETCH(px + pref_offsetx + 48);
+ PREFETCH(px + pref_offsetx + 56);
+
+ PREFETCH(py + pref_offsety + 32);
+ PREFETCH(py + pref_offsety + 40);
+ PREFETCH(py + pref_offsety + 48);
+ PREFETCH(py + pref_offsety + 56);
+
+ /* Load y while storing the pre-loaded x into y... */
+ yv0 = LD_SP(py); py += 4;
+ ST_SP(xv0, srcy); srcy += 4;
+ yv1 = LD_SP(py); py += 4;
+ ST_SP(xv1, srcy); srcy += 4;
+ yv2 = LD_SP(py); py += 4;
+ ST_SP(xv2, srcy); srcy += 4;
+ yv3 = LD_SP(py); py += 4;
+ ST_SP(xv3, srcy); srcy += 4;
+ yv4 = LD_SP(py); py += 4;
+ ST_SP(xv4, srcy); srcy += 4;
+ yv5 = LD_SP(py); py += 4;
+ ST_SP(xv5, srcy); srcy += 4;
+ yv6 = LD_SP(py); py += 4;
+ ST_SP(xv6, srcy); srcy += 4;
+ yv7 = LD_SP(py); py += 4;
+ ST_SP(xv7, srcy); srcy += 4;
+
+ /* ...then load the next x batch while storing y into x. */
+ xv0 = LD_SP(px); px += 4;
+ ST_SP(yv0, srcx); srcx += 4;
+ xv1 = LD_SP(px); px += 4;
+ ST_SP(yv1, srcx); srcx += 4;
+ xv2 = LD_SP(px); px += 4;
+ ST_SP(yv2, srcx); srcx += 4;
+ xv3 = LD_SP(px); px += 4;
+ ST_SP(yv3, srcx); srcx += 4;
+ xv4 = LD_SP(px); px += 4;
+ ST_SP(yv4, srcx); srcx += 4;
+ xv5 = LD_SP(px); px += 4;
+ ST_SP(yv5, srcx); srcx += 4;
+ xv6 = LD_SP(px); px += 4;
+ ST_SP(yv6, srcx); srcx += 4;
+ xv7 = LD_SP(px); px += 4;
+ ST_SP(yv7, srcx); srcx += 4;
+ }
+
+ /* Epilogue: flush the last pre-loaded x batch and the final y batch. */
+ LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
+ ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4);
+ ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4);
+ }
+
+ /* Remainder (n & 31): the 16/8/4 bits are handled with one combined
+ vector group per bit pattern; the 2/1 bits are handled scalar. */
+ if (n & 31)
+ {
+ if ((n & 16) && (n & 8) && (n & 4))
+ {
+ LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
+ LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
+ ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4);
+ ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4);
+ }
+ else if ((n & 16) && (n & 8))
+ {
+ LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5);
+ LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5);
+ ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4);
+ ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4);
+ }
+ else if ((n & 16) && (n & 4))
+ {
+ LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4);
+ LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4);
+ ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4);
+ ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4);
+ }
+ else if ((n & 8) && (n & 4))
+ {
+ LD_SP3_INC(px, 4, xv0, xv1, xv2);
+ LD_SP3_INC(py, 4, yv0, yv1, yv2);
+ ST_SP3_INC(xv0, xv1, xv2, srcy, 4);
+ ST_SP3_INC(yv0, yv1, yv2, srcx, 4);
+ }
+ else if (n & 16)
+ {
+ LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3);
+ LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3);
+ ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4);
+ ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4);
+ }
+ else if (n & 8)
+ {
+ LD_SP2_INC(px, 4, xv0, xv1);
+ LD_SP2_INC(py, 4, yv0, yv1);
+ ST_SP2_INC(xv0, xv1, srcy, 4);
+ ST_SP2_INC(yv0, yv1, srcx, 4);
+ }
+ else if (n & 4)
+ {
+ xv0 = LD_SP(px);
+ yv0 = LD_SP(py);
+
+ px += 4;
+ py += 4;
+
+ ST_SP(xv0, srcy);
+ ST_SP(yv0, srcx);
+
+ srcx += 4;
+ srcy += 4;
+ }
+
+ /* Final 3/2/1 scalar elements. */
+ if ((n & 2) && (n & 1))
+ {
+ LD_GP3_INC(px, 1, x0, x1, x3);
+ LD_GP3_INC(py, 1, y0, y1, y3);
+ ST_GP3_INC(x0, x1, x3, srcy, 1);
+ ST_GP3_INC(y0, y1, y3, srcx, 1);
+ }
+ else if (n & 2)
+ {
+ LD_GP2_INC(px, 1, x0, x1);
+ LD_GP2_INC(py, 1, y0, y1);
+ ST_GP2_INC(x0, x1, srcy, 1);
+ ST_GP2_INC(y0, y1, srcx, 1);
+ }
+ else if (n & 1)
+ {
+ x0 = px[0];
+ y0 = py[0];
+ srcx[0] = y0;
+ srcy[0] = x0;
+ }
+ }
+ }
+ else
+ {
+ /* Strided case: swap 8 scalars per iteration, then the n&7 tail
+ grouped by its bit pattern. */
+ for (i = (n >> 3); i--;)
+ {
+ LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
+ LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
+ ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
+ ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
+ }
+
+ if (n & 7)
+ {
+ if ((n & 4) && (n & 2) && (n & 1))
+ {
+ LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
+ LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
+ ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
+ ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
+ LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
+ ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
+ ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
+ }
+ else if ((n & 4) && (n & 1))
+ {
+ LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
+ LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
+ ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
+ ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
+ }
+ else if ((n & 2) && (n & 1))
+ {
+ LD_GP3_INC(px, inc_x, x0, x1, x2);
+ LD_GP3_INC(py, inc_y, y0, y1, y2);
+ ST_GP3_INC(x0, x1, x2, srcy, inc_y);
+ ST_GP3_INC(y0, y1, y2, srcx, inc_x);
+ }
+ else if (n & 4)
+ {
+ LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
+ LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
+ ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
+ ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
+ }
+ else if (n & 2)
+ {
+ LD_GP2_INC(px, inc_x, x0, x1);
+ LD_GP2_INC(py, inc_y, y0, y1);
+ ST_GP2_INC(x0, x1, srcy, inc_y);
+ ST_GP2_INC(y0, y1, srcx, inc_x);
+ }
+ else if (n & 1)
+ {
+ x0 = *srcx;
+ y0 = *srcy;
+
+ *srcx = y0;
+ *srcy = x0;
+ }
+ }
+ }
+
+ return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* Sign selection for the complex multiply-accumulate y += alpha * x.
+ Without CONJ the plain complex product is accumulated; with CONJ the
+ imaginary-term signs are flipped. NOTE(review): which operand is
+ conjugated (x vs. alpha) depends on the build's CONJ/XCONJ convention
+ — confirm against the generic zaxpy kernel. */
+#if !defined(CONJ)
+ #define OP0 +=
+ #define OP1 -=
+ #define OP2 +=
+#else
+ #define OP0 -=
+ #define OP1 +=
+ #define OP2 -=
+#endif
+
+/* ZAXPY kernel (double-precision complex, MIPS MSA):
+ y[i] += (da_r + i*da_i) * x[i] for n complex elements, with the
+ imaginary-term signs selected by OP0/OP1/OP2 (CONJ variant).
+ inc_x / inc_y are strides in complex elements; each v2f64 holds one
+ (re, im) pair. dummy* parameters are unused signature slots. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
+ FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+ BLASLONG dummy2)
+{
+ BLASLONG i, inc_x2, inc_y2;
+ FLOAT *py;
+ v2f64 x0, x1, x2, x3, x4, x5, x6, x7;
+ v2f64 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
+ v2f64 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
+ v2f64 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
+ FLOAT xd0, xd1, yd0, yd1;
+
+ /* alpha == 0 leaves y untouched, so return early. */
+ if (n < 0) return(0);
+ if ((da_r == 0.0) && (da_i == 0.0)) return(0);
+
+ /* py is the read cursor for y; y itself is the write cursor. */
+ py = y;
+
+ /* Broadcast the two alpha components across vector lanes. */
+ dar_vec = COPY_DOUBLE_TO_VECTOR(da_r);
+ dai_vec = COPY_DOUBLE_TO_VECTOR(da_i);
+
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ /* Both vectors contiguous: process 8 complex elements per
+ iteration with cache-line-aligned prefetching. */
+ FLOAT *x_pref, *y_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 32;
+
+ pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ y_pref = y + pref_offset + 32;
+
+ for (i = (n >> 3); i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(y_pref, 0);
+ PREF_OFFSET(y_pref, 32);
+ PREF_OFFSET(y_pref, 64);
+ PREF_OFFSET(y_pref, 96);
+ x_pref += 16;
+ y_pref += 16;
+
+ /* De-interleave (re,im) pairs into separate real/imag vectors. */
+ LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+ PCKEVOD_D2_DP(x3, x2, x1r, x1i);
+ PCKEVOD_D2_DP(y3, y2, y1r, y1i);
+ PCKEVOD_D2_DP(x5, x4, x2r, x2i);
+ PCKEVOD_D2_DP(y5, y4, y2r, y2i);
+ PCKEVOD_D2_DP(x7, x6, x3r, x3i);
+ PCKEVOD_D2_DP(y7, y6, y3r, y3i);
+
+ /* Complex multiply-accumulate:
+ yr += dar*xr; yi OP0 dar*xi; yr OP1 dai*xi; yi OP2 dai*xr. */
+ FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
+ y0i OP0 dar_vec * x0i;
+ y1i OP0 dar_vec * x1i;
+ y2i OP0 dar_vec * x2i;
+ y3i OP0 dar_vec * x3i;
+ y0r OP1 dai_vec * x0i;
+ y1r OP1 dai_vec * x1i;
+ y2r OP1 dai_vec * x2i;
+ y3r OP1 dai_vec * x3i;
+ y0i OP2 dai_vec * x0r;
+ y1i OP2 dai_vec * x1r;
+ y2i OP2 dai_vec * x2r;
+ y3i OP2 dai_vec * x3r;
+
+ /* Re-interleave real/imag back into (re,im) storage order. */
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ILVRL_D2_DP(y1i, y1r, y2, y3);
+ ILVRL_D2_DP(y2i, y2r, y4, y5);
+ ILVRL_D2_DP(y3i, y3r, y6, y7);
+ ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
+ }
+
+ /* Tail: 4, 2, then 1 complex elements. */
+ if (n & 7)
+ {
+ if (n & 4)
+ {
+ LD_DP4_INC(x, 2, x0, x1, x2, x3);
+ LD_DP4_INC(py, 2, y0, y1, y2, y3);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+ PCKEVOD_D2_DP(x3, x2, x1r, x1i);
+ PCKEVOD_D2_DP(y3, y2, y1r, y1i);
+
+ FMADD2(x0r, x1r, dar_vec, y0r, y1r);
+ y0i OP0 dar_vec * x0i;
+ y1i OP0 dar_vec * x1i;
+ y0r OP1 dai_vec * x0i;
+ y1r OP1 dai_vec * x1i;
+ y0i OP2 dai_vec * x0r;
+ y1i OP2 dai_vec * x1r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ILVRL_D2_DP(y1i, y1r, y2, y3);
+ ST_DP4_INC(y0, y1, y2, y3, y, 2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(x, 2, x0, x1);
+ LD_DP2_INC(py, 2, y0, y1);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+
+ y0r += dar_vec * x0r;
+ y0i OP0 dar_vec * x0i;
+ y0r OP1 dai_vec * x0i;
+ y0i OP2 dai_vec * x0r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ST_DP2_INC(y0, y1, y, 2);
+ }
+
+ if (n & 1)
+ {
+ /* Scalar path: xd0/xd1 and yd0/yd1 are re/im of one element. */
+ LD_GP2_INC(x, 1, xd0, xd1);
+ LD_GP2_INC(py, 1, yd0, yd1);
+
+ yd0 += da_r * xd0;
+ yd1 OP0 da_r * xd1;
+ yd0 OP1 da_i * xd1;
+ yd1 OP2 da_i * xd0;
+
+ ST_GP2_INC(yd0, yd1, y, 1);
+ }
+ }
+ }
+ else if (1 == inc_y)
+ {
+ /* x strided, y contiguous: same computation, x loaded with inc_x2
+ (stride in doubles), only y prefetched. */
+ FLOAT *y_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ y_pref = y + pref_offset + 32;
+
+ inc_x2 = 2 * inc_x;
+
+ for (i = (n >> 3); i--;)
+ {
+ PREF_OFFSET(y_pref, 0);
+ PREF_OFFSET(y_pref, 32);
+ PREF_OFFSET(y_pref, 64);
+ PREF_OFFSET(y_pref, 96);
+ y_pref += 16;
+
+ LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
+ LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+ PCKEVOD_D2_DP(x3, x2, x1r, x1i);
+ PCKEVOD_D2_DP(y3, y2, y1r, y1i);
+ PCKEVOD_D2_DP(x5, x4, x2r, x2i);
+ PCKEVOD_D2_DP(y5, y4, y2r, y2i);
+ PCKEVOD_D2_DP(x7, x6, x3r, x3i);
+ PCKEVOD_D2_DP(y7, y6, y3r, y3i);
+
+ FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
+ y0i OP0 dar_vec * x0i;
+ y1i OP0 dar_vec * x1i;
+ y2i OP0 dar_vec * x2i;
+ y3i OP0 dar_vec * x3i;
+ y0r OP1 dai_vec * x0i;
+ y1r OP1 dai_vec * x1i;
+ y2r OP1 dai_vec * x2i;
+ y3r OP1 dai_vec * x3i;
+ y0i OP2 dai_vec * x0r;
+ y1i OP2 dai_vec * x1r;
+ y2i OP2 dai_vec * x2r;
+ y3i OP2 dai_vec * x3r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ILVRL_D2_DP(y1i, y1r, y2, y3);
+ ILVRL_D2_DP(y2i, y2r, y4, y5);
+ ILVRL_D2_DP(y3i, y3r, y6, y7);
+ ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
+ }
+
+ if (n & 7)
+ {
+ if (n & 4)
+ {
+ LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
+ LD_DP4_INC(py, 2, y0, y1, y2, y3);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+ PCKEVOD_D2_DP(x3, x2, x1r, x1i);
+ PCKEVOD_D2_DP(y3, y2, y1r, y1i);
+
+ FMADD2(x0r, x1r, dar_vec, y0r, y1r);
+ y0i OP0 dar_vec * x0i;
+ y1i OP0 dar_vec * x1i;
+ y0r OP1 dai_vec * x0i;
+ y1r OP1 dai_vec * x1i;
+ y0i OP2 dai_vec * x0r;
+ y1i OP2 dai_vec * x1r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ILVRL_D2_DP(y1i, y1r, y2, y3);
+ ST_DP4_INC(y0, y1, y2, y3, y, 2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(x, inc_x2, x0, x1);
+ LD_DP2_INC(py, 2, y0, y1);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+
+ y0r += dar_vec * x0r;
+ y0i OP0 dar_vec * x0i;
+ y0r OP1 dai_vec * x0i;
+ y0i OP2 dai_vec * x0r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ST_DP2_INC(y0, y1, y, 2);
+ }
+
+ if (n & 1)
+ {
+ /* Stride 1 here is within the single complex element (re,im). */
+ LD_GP2_INC(x, 1, xd0, xd1);
+ LD_GP2_INC(py, 1, yd0, yd1);
+
+ yd0 += da_r * xd0;
+ yd1 OP0 da_r * xd1;
+ yd0 OP1 da_i * xd1;
+ yd1 OP2 da_i * xd0;
+
+ ST_GP2_INC(yd0, yd1, y, 1);
+ }
+ }
+ }
+ else if (1 == inc_x)
+ {
+ /* x contiguous, y strided: mirror of the previous branch. */
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 32;
+
+ inc_y2 = 2 * inc_y;
+
+ for (i = (n >> 3); i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ x_pref += 16;
+
+ LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+ PCKEVOD_D2_DP(x3, x2, x1r, x1i);
+ PCKEVOD_D2_DP(y3, y2, y1r, y1i);
+ PCKEVOD_D2_DP(x5, x4, x2r, x2i);
+ PCKEVOD_D2_DP(y5, y4, y2r, y2i);
+ PCKEVOD_D2_DP(x7, x6, x3r, x3i);
+ PCKEVOD_D2_DP(y7, y6, y3r, y3i);
+
+ FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
+ y0i OP0 dar_vec * x0i;
+ y1i OP0 dar_vec * x1i;
+ y2i OP0 dar_vec * x2i;
+ y3i OP0 dar_vec * x3i;
+ y0r OP1 dai_vec * x0i;
+ y1r OP1 dai_vec * x1i;
+ y2r OP1 dai_vec * x2i;
+ y3r OP1 dai_vec * x3i;
+ y0i OP2 dai_vec * x0r;
+ y1i OP2 dai_vec * x1r;
+ y2i OP2 dai_vec * x2r;
+ y3i OP2 dai_vec * x3r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ILVRL_D2_DP(y1i, y1r, y2, y3);
+ ILVRL_D2_DP(y2i, y2r, y4, y5);
+ ILVRL_D2_DP(y3i, y3r, y6, y7);
+ ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
+ }
+
+ if (n & 7)
+ {
+ if (n & 4)
+ {
+ LD_DP4_INC(x, 2, x0, x1, x2, x3);
+ LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+ PCKEVOD_D2_DP(x3, x2, x1r, x1i);
+ PCKEVOD_D2_DP(y3, y2, y1r, y1i);
+
+ FMADD2(x0r, x1r, dar_vec, y0r, y1r);
+ y0i OP0 dar_vec * x0i;
+ y1i OP0 dar_vec * x1i;
+ y0r OP1 dai_vec * x0i;
+ y1r OP1 dai_vec * x1i;
+ y0i OP2 dai_vec * x0r;
+ y1i OP2 dai_vec * x1r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ILVRL_D2_DP(y1i, y1r, y2, y3);
+ ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(x, 2, x0, x1);
+ LD_DP2_INC(py, inc_y2, y0, y1);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+
+ y0r += dar_vec * x0r;
+ y0i OP0 dar_vec * x0i;
+ y0r OP1 dai_vec * x0i;
+ y0i OP2 dai_vec * x0r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ST_DP2_INC(y0, y1, y, inc_y2);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(x, 1, xd0, xd1);
+ LD_GP2_INC(py, 1, yd0, yd1);
+
+ yd0 += da_r * xd0;
+ yd1 OP0 da_r * xd1;
+ yd0 OP1 da_i * xd1;
+ yd1 OP2 da_i * xd0;
+
+ ST_GP2_INC(yd0, yd1, y, 1);
+ }
+ }
+ }
+ else
+ {
+ /* Both strided: no prefetching, strides converted to doubles. */
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ for (i = (n >> 3); i--;)
+ {
+ LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
+ LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+ PCKEVOD_D2_DP(x3, x2, x1r, x1i);
+ PCKEVOD_D2_DP(y3, y2, y1r, y1i);
+ PCKEVOD_D2_DP(x5, x4, x2r, x2i);
+ PCKEVOD_D2_DP(y5, y4, y2r, y2i);
+ PCKEVOD_D2_DP(x7, x6, x3r, x3i);
+ PCKEVOD_D2_DP(y7, y6, y3r, y3i);
+
+ FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
+ y0i OP0 dar_vec * x0i;
+ y1i OP0 dar_vec * x1i;
+ y2i OP0 dar_vec * x2i;
+ y3i OP0 dar_vec * x3i;
+ y0r OP1 dai_vec * x0i;
+ y1r OP1 dai_vec * x1i;
+ y2r OP1 dai_vec * x2i;
+ y3r OP1 dai_vec * x3i;
+ y0i OP2 dai_vec * x0r;
+ y1i OP2 dai_vec * x1r;
+ y2i OP2 dai_vec * x2r;
+ y3i OP2 dai_vec * x3r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ILVRL_D2_DP(y1i, y1r, y2, y3);
+ ILVRL_D2_DP(y2i, y2r, y4, y5);
+ ILVRL_D2_DP(y3i, y3r, y6, y7);
+ ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
+ }
+
+ if (n & 7)
+ {
+ if (n & 4)
+ {
+ LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
+ LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+ PCKEVOD_D2_DP(x3, x2, x1r, x1i);
+ PCKEVOD_D2_DP(y3, y2, y1r, y1i);
+
+ FMADD2(x0r, x1r, dar_vec, y0r, y1r);
+ y0i OP0 dar_vec * x0i;
+ y1i OP0 dar_vec * x1i;
+ y0r OP1 dai_vec * x0i;
+ y1r OP1 dai_vec * x1i;
+ y0i OP2 dai_vec * x0r;
+ y1i OP2 dai_vec * x1r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ILVRL_D2_DP(y1i, y1r, y2, y3);
+ ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(x, inc_x2, x0, x1);
+ LD_DP2_INC(py, inc_y2, y0, y1);
+ PCKEVOD_D2_DP(x1, x0, x0r, x0i);
+ PCKEVOD_D2_DP(y1, y0, y0r, y0i);
+
+ y0r += dar_vec * x0r;
+ y0i OP0 dar_vec * x0i;
+ y0r OP1 dai_vec * x0i;
+ y0i OP2 dai_vec * x0r;
+
+ ILVRL_D2_DP(y0i, y0r, y0, y1);
+ ST_DP2_INC(y0, y1, y, inc_y2);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(x, 1, xd0, xd1);
+ LD_GP2_INC(py, 1, yd0, yd1);
+
+ yd0 += da_r * xd0;
+ yd1 OP0 da_r * xd1;
+ yd0 OP1 da_i * xd1;
+ yd1 OP2 da_i * xd0;
+
+ ST_GP2_INC(yd0, yd1, y, 1);
+ }
+ }
+ }
+
+ return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* ZCOPY kernel (double-precision complex, MIPS MSA): copies n complex
+ elements from x (stride inc_x) to y (stride inc_y). Strides are in
+ complex elements; each v2f64 holds one (re, im) pair. Returns 0. */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+ BLASLONG i;
+ v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ FLOAT f0, f1;
+
+ if (n < 0) return (0);
+
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ /* Contiguous case: 16 complex elements per iteration,
+ software-pipelined (the first 8 vectors are pre-loaded; each
+ iteration interleaves loads of the next batch with stores of
+ the current one). */
+ if (n > 15)
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ /* Align the prefetch base to the next L1 line (offset in FLOATs). */
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 64 + 16;
+
+ LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ for (i = (n >> 4) - 1; i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 32;
+
+ x8 = LD_DP(x); x += 2;
+ ST_DP(x0, y); y += 2;
+ x9 = LD_DP(x); x += 2;
+ ST_DP(x1, y); y += 2;
+ x10 = LD_DP(x); x += 2;
+ ST_DP(x2, y); y += 2;
+ x11 = LD_DP(x); x += 2;
+ ST_DP(x3, y); y += 2;
+ x12 = LD_DP(x); x += 2;
+ ST_DP(x4, y); y += 2;
+ x13 = LD_DP(x); x += 2;
+ ST_DP(x5, y); y += 2;
+ x14 = LD_DP(x); x += 2;
+ ST_DP(x6, y); y += 2;
+ x15 = LD_DP(x); x += 2;
+ ST_DP(x7, y); y += 2;
+ x0 = LD_DP(x); x += 2;
+ ST_DP(x8, y); y += 2;
+ x1 = LD_DP(x); x += 2;
+ ST_DP(x9, y); y += 2;
+ x2 = LD_DP(x); x += 2;
+ ST_DP(x10, y); y += 2;
+ x3 = LD_DP(x); x += 2;
+ ST_DP(x11, y); y += 2;
+ x4 = LD_DP(x); x += 2;
+ ST_DP(x12, y); y += 2;
+ x5 = LD_DP(x); x += 2;
+ ST_DP(x13, y); y += 2;
+ x6 = LD_DP(x); x += 2;
+ ST_DP(x14, y); y += 2;
+ x7 = LD_DP(x); x += 2;
+ ST_DP(x15, y); y += 2;
+ }
+
+ /* Epilogue: load the last 8 vectors and flush all 16 in flight. */
+ x8 = LD_DP(x); x += 2;
+ x9 = LD_DP(x); x += 2;
+ ST_DP(x0, y); y += 2;
+ x10 = LD_DP(x); x += 2;
+ ST_DP(x1, y); y += 2;
+ x11 = LD_DP(x); x += 2;
+ ST_DP(x2, y); y += 2;
+ x12 = LD_DP(x); x += 2;
+ ST_DP(x3, y); y += 2;
+ x13 = LD_DP(x); x += 2;
+ ST_DP(x4, y); y += 2;
+ x14 = LD_DP(x); x += 2;
+ ST_DP(x5, y); y += 2;
+ x15 = LD_DP(x); x += 2;
+ ST_DP(x6, y); y += 2;
+ ST_DP(x7, y); y += 2;
+
+ ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2);
+ }
+
+ /* Tail: 8, 4, 2 complex elements vectorized, last one scalar. */
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(x, 2, x0, x1, x2, x3);
+ ST_DP4_INC(x0, x1, x2, x3, y, 2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(x, 2, x0, x1);
+ ST_DP2_INC(x0, x1, y, 2);
+ }
+
+ if (n & 1)
+ {
+ /* f0/f1 are the re/im parts of the final complex element. */
+ LD_GP2_INC(x, 1, f0, f1);
+ ST_GP2_INC(f0, f1, y, 1);
+ }
+ }
+ }
+ else
+ {
+ /* Strided case: convert strides from complex elements to doubles,
+ then move 16 (re,im) vectors per iteration with loads and stores
+ interleaved. */
+ inc_x *= 2;
+ inc_y *= 2;
+
+ for (i = (n >> 4); i--;)
+ {
+ x0 = LD_DP(x); x += inc_x;
+ x1 = LD_DP(x); x += inc_x;
+ x2 = LD_DP(x); x += inc_x;
+ x3 = LD_DP(x); x += inc_x;
+ x4 = LD_DP(x); x += inc_x;
+ x5 = LD_DP(x); x += inc_x;
+ x6 = LD_DP(x); x += inc_x;
+ x7 = LD_DP(x); x += inc_x;
+ x8 = LD_DP(x); x += inc_x;
+ ST_DP(x0, y); y += inc_y;
+ x9 = LD_DP(x); x += inc_x;
+ ST_DP(x1, y); y += inc_y;
+ x10 = LD_DP(x); x += inc_x;
+ ST_DP(x2, y); y += inc_y;
+ x11 = LD_DP(x); x += inc_x;
+ ST_DP(x3, y); y += inc_y;
+ x12 = LD_DP(x); x += inc_x;
+ ST_DP(x4, y); y += inc_y;
+ x13 = LD_DP(x); x += inc_x;
+ ST_DP(x5, y); y += inc_y;
+ x14 = LD_DP(x); x += inc_x;
+ ST_DP(x6, y); y += inc_y;
+ x15 = LD_DP(x); x += inc_x;
+ ST_DP(x7, y); y += inc_y;
+ ST_DP(x8, y); y += inc_y;
+ ST_DP(x9, y); y += inc_y;
+ ST_DP(x10, y); y += inc_y;
+ ST_DP(x11, y); y += inc_y;
+ ST_DP(x12, y); y += inc_y;
+ ST_DP(x13, y); y += inc_y;
+ ST_DP(x14, y); y += inc_y;
+ ST_DP(x15, y); y += inc_y;
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
+ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, inc_y);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(x, inc_x, x0, x1, x2, x3);
+ ST_DP4_INC(x0, x1, x2, x3, y, inc_y);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(x, inc_x, x0, x1);
+ ST_DP2_INC(x0, x1, y, inc_y);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(x, 1, f0, f1);
+ ST_GP2_INC(f0, f1, y, 1);
+ }
+ }
+ }
+
+ return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* This will shuffle the elements in 'in' vector as (mask needed :: 01 00 11 10)
+ 0 1 2 3 => 2 3 0 1 */
+#define SHF_78 78
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
+ FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
+ BLASLONG dummy2)
+{
+ BLASLONG i, inc_x2;
+ FLOAT *px;
+ FLOAT tp0, tp1, f0, f1;
+ v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
+ v2f64 da_i_vec, da_i_vec_neg, da_r_vec;
+
+ px = x;
+
+ if (1 == inc_x)
+ {
+ if ((0.0 == da_r) && (0.0 == da_i))
+ {
+ v2f64 zero_v = __msa_cast_to_vector_double(0);
+ zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
+ zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
+
+ for (i = (n >> 4); i--;)
+ {
+ ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 2);
+ ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 2);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, 2);
+ }
+
+ if (n & 4)
+ {
+ ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
+ }
+
+ if (n & 2)
+ {
+ ST_DP2_INC(zero_v, zero_v, x, 2);
+ }
+
+ if (n & 1)
+ {
+ ST_DP(zero_v, x);
+ }
+ }
+ }
+ else if (0.0 == da_r)
+ {
+ da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
+ da_i_vec_neg = -da_i_vec;
+ da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
+
+ if (n > 15)
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 32 + 16;
+
+ LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ for (i = (n >> 4)- 1; i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 32;
+
+ x8 = LD_DP(px); px += 2;
+ x0 *= da_i_vec;
+ x9 = LD_DP(px); px += 2;
+ x1 *= da_i_vec;
+ x10 = LD_DP(px); px += 2;
+ x2 *= da_i_vec;
+ x11 = LD_DP(px); px += 2;
+ x3 *= da_i_vec;
+ x12 = LD_DP(px); px += 2;
+ x4 *= da_i_vec;
+ x13 = LD_DP(px); px += 2;
+ x5 *= da_i_vec;
+ x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78);
+ x14 = LD_DP(px); px += 2;
+ x6 *= da_i_vec;
+ x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78);
+ x15 = LD_DP(px); px += 2;
+ x7 *= da_i_vec;
+ x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78);
+ x8 *= da_i_vec;
+ x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78);
+ ST_DP(x0, x); x += 2;
+ x9 *= da_i_vec;
+ x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78);
+ ST_DP(x1, x); x += 2;
+ x10 *= da_i_vec;
+ x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78);
+ ST_DP(x2, x); x += 2;
+ x11 *= da_i_vec;
+ x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78);
+ ST_DP(x3, x); x += 2;
+ x12 *= da_i_vec;
+ x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78);
+ ST_DP(x4, x); x += 2;
+ x13 *= da_i_vec;
+ x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78);
+ ST_DP(x5, x); x += 2;
+ x14 *= da_i_vec;
+ x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78);
+ ST_DP(x6, x); x += 2;
+ x15 *= da_i_vec;
+ x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78);
+ ST_DP(x7, x); x += 2;
+ x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78);
+ ST_DP(x8, x); x += 2;
+ x0 = LD_DP(px); px += 2;
+ x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78);
+ ST_DP(x9, x); x += 2;
+ x1 = LD_DP(px); px += 2;
+ x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78);
+ ST_DP(x10, x); x += 2;
+ x2 = LD_DP(px); px += 2;
+ x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78);
+ ST_DP(x11, x); x += 2;
+ x3 = LD_DP(px); px += 2;
+ x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78);
+ ST_DP(x12, x); x += 2;
+ x4 = LD_DP(px); px += 2;
+ ST_DP(x13, x); x += 2;
+ x5 = LD_DP(px); px += 2;
+ ST_DP(x14, x); x += 2;
+ x6 = LD_DP(px); px += 2;
+ ST_DP(x15, x); x += 2;
+ x7 = LD_DP(px); px += 2;
+ }
+
+ LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ x4, x5, x6, x7);
+ MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
+ x8, x9, x10, x11);
+ MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
+ x12, x13, x14, x15);
+ SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
+ SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
+ SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
+ SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
+ ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
+ x12, x13, x14, x15, x, 2);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ x4, x5, x6, x7);
+ SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
+ SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
+ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(px, 2, x0, x1, x2, x3);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ x0, x1, x2, x3);
+ SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
+ ST_DP4_INC(x0, x1, x2, x3, x, 2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(px, 2, x0, x1);
+ MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
+ SHF_W2_DP(x0, x1, x0, x1, SHF_78);
+ ST_DP2_INC(x0, x1, x, 2);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+ MUL2(f0, da_i, f1, -da_i, f0, f1);
+ ST_GP2_INC(f1, f0, x, 1);
+ }
+ }
+ }
+ else if (0.0 == da_i)
+ {
+ da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
+
+ if (n > 15)
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 32 + 16;
+
+ LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ for (i = (n >> 4)- 1; i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 32;
+
+ x8 = LD_DP(px); px += 2;
+ x0 *= da_r_vec;
+ x9 = LD_DP(px); px += 2;
+ x1 *= da_r_vec;
+ x10 = LD_DP(px); px += 2;
+ x2 *= da_r_vec;
+ x11 = LD_DP(px); px += 2;
+ x3 *= da_r_vec;
+ x12 = LD_DP(px); px += 2;
+ x4 *= da_r_vec;
+ x13 = LD_DP(px); px += 2;
+ x5 *= da_r_vec;
+ ST_DP(x0, x); x += 2;
+ x14 = LD_DP(px); px += 2;
+ x6 *= da_r_vec;
+ ST_DP(x1, x); x += 2;
+ x15 = LD_DP(px); px += 2;
+ x7 *= da_r_vec;
+ ST_DP(x2, x); x += 2;
+ x8 *= da_r_vec;
+ ST_DP(x3, x); x += 2;
+ x9 *= da_r_vec;
+ ST_DP(x4, x); x += 2;
+ x10 *= da_r_vec;
+ ST_DP(x5, x); x += 2;
+ x11 *= da_r_vec;
+ ST_DP(x6, x); x += 2;
+ x12 *= da_r_vec;
+ ST_DP(x7, x); x += 2;
+ x13 *= da_r_vec;
+ ST_DP(x8, x); x += 2;
+ x0 = LD_DP(px); px += 2;
+ x14 *= da_r_vec;
+ ST_DP(x9, x); x += 2;
+ x1 = LD_DP(px); px += 2;
+ x15 *= da_r_vec;
+ ST_DP(x10, x); x += 2;
+ x2 = LD_DP(px); px += 2;
+ ST_DP(x11, x); x += 2;
+ x3 = LD_DP(px); px += 2;
+ ST_DP(x12, x); x += 2;
+ x4 = LD_DP(px); px += 2;
+ ST_DP(x13, x); x += 2;
+ x5 = LD_DP(px); px += 2;
+ ST_DP(x14, x); x += 2;
+ x6 = LD_DP(px); px += 2;
+ ST_DP(x15, x); x += 2;
+ x7 = LD_DP(px); px += 2;
+ }
+
+ LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
+ MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
+ x4, x5, x6, x7);
+ MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
+ x8, x9, x10, x11);
+ MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
+ x12, x13, x14, x15);
+ ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
+ x12, x13, x14, x15, x, 2);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
+ x4, x5, x6, x7);
+ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(px, 2, x0, x1, x2, x3);
+ MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
+ x0, x1, x2, x3);
+ ST_DP4_INC(x0, x1, x2, x3, x, 2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(px, 2, x0, x1);
+ MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
+ ST_DP2_INC(x0, x1, x, 2);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+ MUL2(f0, da_r, f1, da_r, f0, f1);
+ ST_GP2_INC(f0, f1, x, 1);
+ }
+ }
+ }
+ else
+ {
+ FLOAT *x_pref;
+ BLASLONG pref_offset;
+
+ pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
+ if (pref_offset > 0)
+ {
+ pref_offset = L1_DATA_LINESIZE - pref_offset;
+ pref_offset = pref_offset / sizeof(FLOAT);
+ }
+ x_pref = x + pref_offset + 32;
+
+ da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
+ da_i_vec_neg = -da_i_vec;
+ da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
+
+ da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
+
+ for (i = (n >> 4); i--;)
+ {
+ PREF_OFFSET(x_pref, 0);
+ PREF_OFFSET(x_pref, 32);
+ PREF_OFFSET(x_pref, 64);
+ PREF_OFFSET(x_pref, 96);
+ PREF_OFFSET(x_pref, 128);
+ PREF_OFFSET(x_pref, 160);
+ PREF_OFFSET(x_pref, 192);
+ PREF_OFFSET(x_pref, 224);
+ x_pref += 32;
+
+ LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
+ x11, x12, x13, x14, x15);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ d0, d1, d2, d3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ d4, d5, d6, d7);
+ MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
+ d8, d9, d10, d11);
+ MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
+ d12, d13, d14, d15);
+ SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
+ SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
+ SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
+ SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
+ FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
+ FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
+ FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
+ FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
+ ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
+ d12, d13, d14, d15, x, 2);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ d0, d1, d2, d3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ d4, d5, d6, d7);
+ SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
+ SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
+ FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
+ FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
+ ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(px, 2, x0, x1, x2, x3);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ d0, d1, d2, d3);
+ SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
+ FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
+ ST_DP4_INC(d0, d1, d2, d3, x, 2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(px, 2, x0, x1);
+ MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
+ SHF_W2_DP(d0, d1, d0, d1, SHF_78);
+ FMADD2(x0, x1, da_r_vec, d0, d1);
+ ST_DP2_INC(d0, d1, x, 2);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+
+ tp0 = da_r * f0;
+ tp0 -= da_i * f1;
+ tp1 = da_r * f1;
+ tp1 += da_i * f0;
+
+ ST_GP2_INC(tp0, tp1, x, 1);
+ }
+ }
+ }
+ }
+ else
+ {
+ inc_x2 = 2 * inc_x;
+
+ if ((0.0 == da_r) && (0.0 == da_i))
+ {
+ v2f64 zero_v = __msa_cast_to_vector_double(0);
+ zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
+ zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
+
+ for (i = (n >> 4); i--;)
+ {
+ ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, inc_x2);
+ ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, inc_x2);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
+ zero_v, zero_v, x, inc_x2);
+ }
+
+ if (n & 4)
+ {
+ ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2);
+ }
+
+ if (n & 2)
+ {
+ ST_DP2_INC(zero_v, zero_v, x, inc_x2);
+ }
+
+ if (n & 1)
+ {
+ ST_DP(zero_v, x);
+ }
+ }
+ }
+ else if (0.0 == da_r)
+ {
+ da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
+ da_i_vec_neg = -da_i_vec;
+ da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
+
+ for (i = (n >> 4); i--;)
+ {
+ LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
+ x10, x11, x12, x13, x14, x15);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ x4, x5, x6, x7);
+ MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
+ x8, x9, x10, x11);
+ MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
+ x12, x13, x14, x15);
+ SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
+ SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
+ SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
+ SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
+ ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
+ x12, x13, x14, x15, x, inc_x2);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ x4, x5, x6, x7);
+ SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
+ SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
+ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ x0, x1, x2, x3);
+ SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
+ ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(px, inc_x2, x0, x1);
+ MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
+ SHF_W2_DP(x0, x1, x0, x1, SHF_78);
+ ST_DP2_INC(x0, x1, x, inc_x2);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+ MUL2(f0, da_i, f1, -da_i, f0, f1);
+ ST_GP2_INC(f1, f0, x, 1);
+ }
+ }
+ }
+ else if (0.0 == da_i)
+ {
+ da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
+
+ for (i = (n >> 4); i--;)
+ {
+ LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
+ x10, x11, x12, x13, x14, x15);
+ MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
+ x4, x5, x6, x7);
+ MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
+ x8, x9, x10, x11);
+ MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
+ x12, x13, x14, x15);
+ ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
+ x12, x13, x14, x15, x, inc_x2);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
+ x0, x1, x2, x3);
+ MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
+ x4, x5, x6, x7);
+ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
+ MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
+ x0, x1, x2, x3);
+ ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(px, inc_x2, x0, x1);
+ MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
+ ST_DP2_INC(x0, x1, x, inc_x2);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+ MUL2(f0, da_r, f1, da_r, f0, f1);
+ ST_GP2_INC(f0, f1, x, 1);
+ }
+ }
+ }
+ else
+ {
+ da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
+ da_i_vec_neg = -da_i_vec;
+ da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
+
+ da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
+
+ for (i = (n >> 4); i--;)
+ {
+ LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
+ x10, x11, x12, x13, x14, x15);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ d0, d1, d2, d3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ d4, d5, d6, d7);
+ MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
+ d8, d9, d10, d11);
+ MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
+ d12, d13, d14, d15);
+ SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
+ SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
+ SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
+ SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
+ FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
+ FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
+ FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
+ FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
+ ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
+ d12, d13, d14, d15, x, inc_x2);
+ }
+
+ if (n & 15)
+ {
+ if (n & 8)
+ {
+ LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ d0, d1, d2, d3);
+ MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
+ d4, d5, d6, d7);
+ SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
+ SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
+ FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
+ FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
+ ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2);
+ }
+
+ if (n & 4)
+ {
+ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
+ MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
+ d0, d1, d2, d3);
+ SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
+ FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
+ ST_DP4_INC(d0, d1, d2, d3, x, inc_x2);
+ }
+
+ if (n & 2)
+ {
+ LD_DP2_INC(px, inc_x2, x0, x1);
+ MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
+ SHF_W2_DP(d0, d1, d0, d1, SHF_78);
+ FMADD2(x0, x1, da_r_vec, d0, d1);
+ ST_DP2_INC(d0, d1, x, inc_x2);
+ }
+
+ if (n & 1)
+ {
+ LD_GP2_INC(px, 1, f0, f1);
+
+ tp0 = da_r * f0;
+ tp0 -= da_i * f1;
+ tp1 = da_r * f1;
+ tp1 += da_i * f0;
+
+ ST_GP2_INC(tp0, tp1, x, 1);
+ }
+ }
+ }
+ }
+
+ return (0);
+}
--- /dev/null
+/*******************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include "common.h"
+#include "macros_msa.h"
+
+/* ZSWAP kernel for MIPS MSA: exchange the contents of two double-precision
+ complex vectors, x <-> y. One 128-bit v2f64 register holds exactly one
+ complex element (real, imag), so the per-element stride in FLOATs is
+ 2 * inc (see inc_x2/inc_y2 below).
+
+ n - number of complex elements to swap; negative n is a no-op,
+ and n == 0 falls through harmlessly (every loop and tail
+ condition below is then zero).
+ srcx, inc_x - vector x and its stride in complex elements.
+ srcy, inc_y - vector y and its stride in complex elements.
+ dummy0..dummy4, dummy, dummy2 - unused; they pad the signature to the
+ common OpenBLAS level-1 kernel prototype.
+
+ Always returns 0. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
+ FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy,
+ BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+ BLASLONG i, inc_x2, inc_y2, pref_offsetx, pref_offsety;
+ FLOAT *px, *py;
+ v2f64 x0, x1, x2, x3, x4, x5, x6, x7;
+ v2f64 y0, y1, y2, y3, y4, y5, y6, y7;
+
+ if (n < 0) return (0);
+
+ /* Distance (in FLOATs) from srcx up to the next L1 cache-line
+ boundary; added to the prefetch addresses below so each PREFETCH
+ lands cache-line aligned. */
+ pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
+ if (pref_offsetx > 0)
+ {
+ pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
+ pref_offsetx = pref_offsetx / sizeof(FLOAT);
+ }
+
+ /* Same alignment computation for the y vector. */
+ pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
+ if (pref_offsety > 0)
+ {
+ pref_offsety = L1_DATA_LINESIZE - pref_offsety;
+ pref_offsety = pref_offsety / sizeof(FLOAT);
+ }
+
+ /* Strides in FLOAT units: one complex element = 2 doubles. */
+ inc_x2 = 2 * inc_x;
+ inc_y2 = 2 * inc_y;
+
+ /* px/py are the read cursors; srcx/srcy are reused as the write
+ cursors over the very same memory. The swap is in place, so every
+ load through px/py must happen before the store through srcx/srcy
+ reaches that address - the statement order below is load-before-
+ overwrite and must not be rearranged. */
+ px = srcx;
+ py = srcy;
+
+ if ((1 == inc_x) && (1 == inc_y))
+ {
+ /* Unit-stride fast path: software-pipelined loop swapping 8
+ complex elements per iteration, with interleaved loads/stores
+ to hide latency. */
+ if (n >> 3)
+ {
+ /* Prime the pipeline: preload the first 8 elements of x. */
+ LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
+
+ /* One iteration fewer than n/8: the preloaded batch is
+ drained after the loop. */
+ for (i = (n >> 3) - 1; i--;)
+ {
+ PREFETCH(px + pref_offsetx + 16);
+ PREFETCH(px + pref_offsetx + 20);
+ PREFETCH(px + pref_offsetx + 24);
+ PREFETCH(px + pref_offsetx + 28);
+
+ PREFETCH(py + pref_offsety + 16);
+ PREFETCH(py + pref_offsety + 20);
+ PREFETCH(py + pref_offsety + 24);
+ PREFETCH(py + pref_offsety + 28);
+
+ /* Load y[k] just before x overwrites that slot; py and
+ srcy advance in lockstep over the same addresses. */
+ y0 = LD_DP(py); py += 2;
+ ST_DP(x0, srcy); srcy += 2;
+ y1 = LD_DP(py); py += 2;
+ ST_DP(x1, srcy); srcy += 2;
+ y2 = LD_DP(py); py += 2;
+ ST_DP(x2, srcy); srcy += 2;
+ y3 = LD_DP(py); py += 2;
+ ST_DP(x3, srcy); srcy += 2;
+ y4 = LD_DP(py); py += 2;
+ ST_DP(x4, srcy); srcy += 2;
+ y5 = LD_DP(py); py += 2;
+ ST_DP(x5, srcy); srcy += 2;
+ y6 = LD_DP(py); py += 2;
+ ST_DP(x6, srcy); srcy += 2;
+ y7 = LD_DP(py); py += 2;
+ ST_DP(x7, srcy); srcy += 2;
+
+ /* Load the NEXT batch of x (pipelining) while storing the
+ captured y values back into x's current slots. */
+ x0 = LD_DP(px); px += 2;
+ ST_DP(y0, srcx); srcx += 2;
+ x1 = LD_DP(px); px += 2;
+ ST_DP(y1, srcx); srcx += 2;
+ x2 = LD_DP(px); px += 2;
+ ST_DP(y2, srcx); srcx += 2;
+ x3 = LD_DP(px); px += 2;
+ ST_DP(y3, srcx); srcx += 2;
+ x4 = LD_DP(px); px += 2;
+ ST_DP(y4, srcx); srcx += 2;
+ x5 = LD_DP(px); px += 2;
+ ST_DP(y5, srcx); srcx += 2;
+ x6 = LD_DP(px); px += 2;
+ ST_DP(y6, srcx); srcx += 2;
+ x7 = LD_DP(px); px += 2;
+ ST_DP(y7, srcx); srcx += 2;
+ }
+
+ /* Drain: the last preloaded x batch is still in registers;
+ load the matching y batch and complete the swap. */
+ LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
+ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, 2);
+ ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, 2);
+ }
+
+ /* Tail: up to 7 remaining elements. The combined bit tests select
+ a single bulk macro of exactly the leftover count
+ (7, 6, 5, 4, 3, 2 or 1 elements). */
+ if (n & 7)
+ {
+ if ((n & 4) && (n & 2) && (n & 1))
+ {
+ LD_DP7_INC(px, 2, x0, x1, x2, x3, x4, x5, x6);
+ LD_DP7_INC(py, 2, y0, y1, y2, y3, y4, y5, y6);
+ ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, 2);
+ ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, 2);
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_DP6_INC(px, 2, x0, x1, x2, x3, x4, x5);
+ LD_DP6_INC(py, 2, y0, y1, y2, y3, y4, y5);
+ ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, 2);
+ ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, 2);
+ }
+ else if ((n & 4) && (n & 1))
+ {
+ LD_DP5_INC(px, 2, x0, x1, x2, x3, x4);
+ LD_DP5_INC(py, 2, y0, y1, y2, y3, y4);
+ ST_DP5_INC(x0, x1, x2, x3, x4, srcy, 2);
+ ST_DP5_INC(y0, y1, y2, y3, y4, srcx, 2);
+ }
+ else if ((n & 2) && (n & 1))
+ {
+ LD_DP3_INC(px, 2, x0, x1, x2);
+ LD_DP3_INC(py, 2, y0, y1, y2);
+ ST_DP3_INC(x0, x1, x2, srcy, 2);
+ ST_DP3_INC(y0, y1, y2, srcx, 2);
+ }
+ else if (n & 4)
+ {
+ LD_DP4_INC(px, 2, x0, x1, x2, x3);
+ LD_DP4_INC(py, 2, y0, y1, y2, y3);
+ ST_DP4_INC(x0, x1, x2, x3, srcy, 2);
+ ST_DP4_INC(y0, y1, y2, y3, srcx, 2);
+ }
+ else if (n & 2)
+ {
+ LD_DP2_INC(px, 2, x0, x1);
+ LD_DP2_INC(py, 2, y0, y1);
+ ST_DP2_INC(x0, x1, srcy, 2);
+ ST_DP2_INC(y0, y1, srcx, 2);
+ }
+ else if (n & 1)
+ {
+ /* Single element: both loads precede both stores. */
+ x0 = LD_DP(px);
+ y0 = LD_DP(py);
+ ST_DP(y0, srcx);
+ ST_DP(x0, srcy);
+ }
+ }
+ }
+ else
+ {
+ /* Strided path: same in-place swap via bulk load/store macros,
+ 8 complex elements per iteration, no prefetch or pipelining.
+ Each LD_DP*_INC fully reads a batch before the matching
+ ST_DP*_INC overwrites it. */
+ for (i = (n >> 3); i--;)
+ {
+ LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
+ LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
+ ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y2);
+ ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x2);
+ }
+
+ /* Tail: identical remainder selection as the unit-stride path,
+ but advancing by the real strides. */
+ if (n & 7)
+ {
+ if ((n & 4) && (n & 2) && (n & 1))
+ {
+ LD_DP7_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6);
+ LD_DP7_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6);
+ ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y2);
+ ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x2);
+ }
+ else if ((n & 4) && (n & 2))
+ {
+ LD_DP6_INC(px, inc_x2, x0, x1, x2, x3, x4, x5);
+ LD_DP6_INC(py, inc_y2, y0, y1, y2, y3, y4, y5);
+ ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y2);
+ ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x2);
+ }
+ else if ((n & 4) && (n & 1))
+ {
+ LD_DP5_INC(px, inc_x2, x0, x1, x2, x3, x4);
+ LD_DP5_INC(py, inc_y2, y0, y1, y2, y3, y4);
+ ST_DP5_INC(x0, x1, x2, x3, x4, srcy, inc_y2);
+ ST_DP5_INC(y0, y1, y2, y3, y4, srcx, inc_x2);
+ }
+ else if ((n & 2) && (n & 1))
+ {
+ LD_DP3_INC(px, inc_x2, x0, x1, x2);
+ LD_DP3_INC(py, inc_y2, y0, y1, y2);
+ ST_DP3_INC(x0, x1, x2, srcy, inc_y2);
+ ST_DP3_INC(y0, y1, y2, srcx, inc_x2);
+ }
+ else if (n & 4)
+ {
+ LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
+ LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
+ ST_DP4_INC(x0, x1, x2, x3, srcy, inc_y2);
+ ST_DP4_INC(y0, y1, y2, y3, srcx, inc_x2);
+ }
+ else if (n & 2)
+ {
+ LD_DP2_INC(px, inc_x2, x0, x1);
+ LD_DP2_INC(py, inc_y2, y0, y1);
+ ST_DP2_INC(x0, x1, srcy, inc_y2);
+ ST_DP2_INC(y0, y1, srcx, inc_x2);
+ }
+ else if (n & 1)
+ {
+ /* Single element: both loads precede both stores. */
+ x0 = LD_DP(px);
+ y0 = LD_DP(py);
+ ST_DP(y0, srcx);
+ ST_DP(x0, srcy);
+ }
+ }
+ }
+
+ return (0);
+}