From: Shivraj Patil Date: Tue, 17 Jan 2017 06:45:07 +0000 (+0530) Subject: Added rot functions. X-Git-Tag: upstream/0.2.20^2~70^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a4d97d980ff1e5f0a84668235db37cd7b8cf0bcb;p=platform%2Fupstream%2Fopenblas.git Added rot functions. Signed-off-by: Shivraj Patil --- diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 78cb00c..9a16704 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -83,10 +83,17 @@ DNRM2KERNEL = ../mips/nrm2.c CNRM2KERNEL = ../mips/znrm2.c ZNRM2KERNEL = ../mips/znrm2.c +ifdef HAVE_MSA +SROTKERNEL = ../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +else SROTKERNEL = ../mips/rot.c DROTKERNEL = ../mips/rot.c CROTKERNEL = ../mips/zrot.c ZROTKERNEL = ../mips/zrot.c +endif ifdef HAVE_MSA SSCALKERNEL = ../mips/sscal_msa.c diff --git a/kernel/mips/crot_msa.c b/kernel/mips/crot_msa.c new file mode 100644 index 0000000..5273e38 --- /dev/null +++ b/kernel/mips/crot_msa.c @@ -0,0 +1,1072 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) +{ + BLASLONG i, j; + FLOAT *px, *py; + FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3; + BLASLONG inc_x2, inc_y2; + v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; + v4f32 out0, out1, out2, out3, out4, out5, out6, out7; + v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0; + + if (n <= 0) return (0); + + px = x; + py = y; + + if ((1 == inc_x) && (1 == inc_y)) + { + if ((0 == c) && (0 == s)) + { + v4f32 zero = __msa_cast_to_vector_float(0); + zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); + zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); + zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); + zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + + /* process 2 elements */ + for (j = (n >> 1); j--;) + { + ST_SP(zero, px); + ST_SP(zero, py); + + px += 4; + py += 4; + } + if (n & 1) + { + px[0] = 0; + px[1] = 0; + py[0] = 0; + py[1] = 0; + } + } + else if ((1 == c) && (1 == s)) + { + if (n >> 4) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + x0 = LD_SP(px); px += 4; + x1 = LD_SP(px); px += 4; + x2 = LD_SP(px); px += 4; + x3 = LD_SP(px); px += 4; + y0 = LD_SP(py); py += 4; + y1 = LD_SP(py); py += 4; + y2 = LD_SP(py); py += 4; + y3 = LD_SP(py); py += 4; + + for (j = (n >> 4) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + out0 = x0 + y0; + x4 = LD_SP(px); px += 4; + out1 = y0 - x0; + x5 = LD_SP(px); px += 4; + out2 = x1 + y1; + x6 = LD_SP(px); px += 4; + out3 = y1 - x1; + x7 = LD_SP(px); px += 4; + out4 = x2 + y2; + y4 = LD_SP(py); py += 4; + out5 = y2 - x2; + y5 = LD_SP(py); py += 4; + out6 = x3 + y3; + y6 = LD_SP(py); py += 4; + out7 = y3 - x3; + y7 = LD_SP(py); py += 4; + + ST_SP(out0, x); x += 4; + out8 = x4 + y4; + ST_SP(out1, y); y += 4; + out9 = y4 - x4; + ST_SP(out2, x); x += 4; + out10 = x5 + y5; + ST_SP(out3, y); y += 4; + out11 = y5 - x5; + ST_SP(out4, x); x += 4; + out12 = x6 + y6; + ST_SP(out5, y); y += 4; + out13 = y6 - x6; + ST_SP(out6, x); x += 4; + out14 = x7 + y7; + ST_SP(out7, y); y += 4; + out15 = y7 - x7; + + x0 = LD_SP(px); px += 4; + ST_SP(out8, x); x += 4; + x1 = LD_SP(px); px += 4; + ST_SP(out10, x); x += 4; + x2 = LD_SP(px); px += 4; + ST_SP(out12, x); x += 4; + x3 = LD_SP(px); px += 4; + ST_SP(out14, x); x += 4; + + y0 = LD_SP(py); py += 4; + ST_SP(out9, y); y += 4; + y1 = LD_SP(py); py += 4; + ST_SP(out11, y); y += 4; + y2 = LD_SP(py); py += 4; + ST_SP(out13, y); y += 4; + y3 = LD_SP(py); py += 4; + ST_SP(out15, y); y += 4; + } + + x4 = LD_SP(px); px += 4; + x5 = LD_SP(px); px += 4; + x6 = LD_SP(px); px += 4; + x7 = LD_SP(px); px += 4; + y4 = LD_SP(py); py += 4; + y5 = LD_SP(py); py += 4; + y6 = LD_SP(py); py += 4; + y7 = LD_SP(py); py += 4; + + out0 = x0 + y0; + out1 = y0 - x0; + out2 = x1 + y1; + out3 = y1 - x1; + out4 = x2 + y2; + out5 = y2 - x2; + out6 = x3 + y3; + out7 = y3 - x3; + out8 = x4 + y4; + out9 = y4 - x4; + out10 = x5 + y5; + out11 = y5 - x5; + out12 = x6 + y6; + out13 = y6 - x6; + out14 = x7 + y7; + out15 = y7 - x7; + + ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4); + ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4); + } + if (n & 8) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + out0 = x0 + y0; + out1 = y0 - x0; + out2 = x1 + y1; + out3 = y1 - x1; + out4 = x2 + y2; + out5 = y2 - x2; + out6 = x3 + y3; + out7 = y3 - x3; + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + if (n & 4) + { + LD_SP2_INC(px, 4, x0, x1); + LD_SP2_INC(py, 4, y0, y1); + + out0 = x0 + y0; + out1 = y0 - x0; + out2 = x1 + y1; + out3 = y1 - x1; + + ST_SP2_INC(out0, out2, x, 4); + ST_SP2_INC(out1, out3, y, 4); + } + if (n & 2) + { + x0 = LD_SP(px); + y0 = LD_SP(py); + px += 4; + py += 4; + + out0 = x0 + y0; + out1 = y0 - x0; + + ST_SP(out0, x); + ST_SP(out1, y); + x += 4; + y += 4; + } + if (n & 1) + { + LD_GP2_INC(px, 1, fx0, fx1); + LD_GP2_INC(py, 1, fy0, fy1); + + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + tp2 = fx1 + fy1; + tp3 = fy1 - fx1; + + ST_GP2_INC(tp0, tp2, x, 1); + ST_GP2_INC(tp1, tp3, y, 1); + } + } + else if (0 == s) + { + + c0 = COPY_FLOAT_TO_VECTOR(c); + + if (n >> 4) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); + + for (j = (n >> 4) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + y0 = LD_SP(py); py += 4; + x0 *= c0; + y1 = LD_SP(py); py += 4; + x1 *= c0; + y2 = LD_SP(py); py += 4; + x2 *= c0; + y3 = LD_SP(py); py += 4; + x3 *= c0; + y4 = LD_SP(py); py += 4; + x4 *= c0; + y5 = LD_SP(py); py += 4; + x5 *= c0; + y6 = LD_SP(py); py += 4; + x6 *= c0; + y7 = LD_SP(py); py += 4; + x7 *= c0; + + ST_SP(x0, x); x += 4; + y0 *= c0; + ST_SP(x1, x); x += 4; + y1 *= c0; + ST_SP(x2, x); x += 4; + y2 *= c0; + ST_SP(x3, x); x += 4; + y3 *= c0; + ST_SP(x4, x); x += 4; + y4 *= c0; + ST_SP(x5, x); x += 4; + y5 *= c0; + ST_SP(x6, x); x += 4; + y6 *= c0; + ST_SP(x7, x); x += 4; + y7 *= c0; + + x0 = LD_SP(px); px += 4; + ST_SP(y0, y); y += 4; + x1 = LD_SP(px); px += 4; + ST_SP(y1, y); y += 4; + x2 = LD_SP(px); px += 4; + ST_SP(y2, y); y += 4; + x3 = LD_SP(px); px += 4; + ST_SP(y3, y); y += 4; + x4 = LD_SP(px); px += 4; + ST_SP(y4, y); y += 4; + x5 = LD_SP(px); px += 4; + ST_SP(y5, y); y += 4; + x6 = LD_SP(px); px += 4; + ST_SP(y6, y); y += 4; + x7 = LD_SP(px); px += 4; + ST_SP(y7, y); y += 4; + } + + LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); + + x0 *= c0; + y0 *= c0; + x1 *= c0; + y1 *= c0; + x2 *= c0; + y2 *= c0; + x3 *= c0; + y3 *= c0; + x4 *= c0; + y4 *= c0; + x5 *= c0; + y5 *= c0; + x6 *= c0; + y6 *= c0; + x7 *= c0; + y7 *= c0; + + ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); + ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); + } + if (n & 8) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + x0 *= c0; + y0 *= c0; + x1 *= c0; + y1 *= c0; + x2 *= c0; + y2 *= c0; + x3 *= c0; + y3 *= c0; + + ST_SP4_INC(x0, x1, x2, x3, x, 4); + ST_SP4_INC(y0, y1, y2, y3, y, 4); + } + if (n & 4) + { + LD_SP2_INC(px, 4, x0, x1); + LD_SP2_INC(py, 4, y0, y1); + + x0 *= c0; + y0 *= c0; + x1 *= c0; + y1 *= c0; + + ST_SP2_INC(x0, x1, x, 4); + ST_SP2_INC(y0, y1, y, 4); + } + if (n & 2) + { + x0 = LD_SP(px); + y0 = LD_SP(py); + px += 4; + py += 4; + + x0 *= c0; + y0 *= c0; + + ST_SP(x0, x); + ST_SP(y0, y); + x += 4; + y += 4; + } + if (n & 1) + { + LD_GP2_INC(px, 1, fx0, fx1); + LD_GP2_INC(py, 1, fy0, fy1); + + tp0 = (c * fx0); + tp1 = (c * fy0); + tp2 = (c * fx1); + tp3 = (c * fy1); + + ST_GP2_INC(tp0, tp2, x, 1); + ST_GP2_INC(tp1, tp3, y, 1); + } + } + else if (0 == c) + { + s0 = COPY_FLOAT_TO_VECTOR(s); + + /* process 16 floats */ + if (n >> 4) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + for (j = (n >> 4) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + x4 = LD_SP(px); px += 4; + out0 = s0 * y0; + x5 = LD_SP(px); px += 4; + out2 = s0 * y1; + x6 = LD_SP(px); px += 4; + out4 = s0 * y2; + x7 = LD_SP(px); px += 4; + out6 = s0 * y3; + y4 = LD_SP(py); py += 4; + out1 = -(s0 * x0); + y5 = LD_SP(py); py += 4; + out3 = -(s0 * x1); + y6 = LD_SP(py); py += 4; + out5 = -(s0 * x2); + y7 = LD_SP(py); py += 4; + out7 = -(s0 * x3); + + ST_SP(out0, x); x += 4; + out0 = s0 * y4; + ST_SP(out2, x); x += 4; + out2 = s0 * y5; + ST_SP(out4, x); x += 4; + out4 = s0 * y6; + ST_SP(out6, x); x += 4; + out6 = s0 * y7; + ST_SP(out1, y); y += 4; + out1 = -(s0 * x4); + ST_SP(out3, y); y += 4; + out3 = -(s0 * x5); + ST_SP(out5, y); y += 4; + out5 = -(s0 * x6); + ST_SP(out7, y); y += 4; + out7 = -(s0 * x7); + + x0 = LD_SP(px); px += 4; + ST_SP(out0, x); x += 4; + x1 = LD_SP(px); px += 4; + ST_SP(out2, x); x += 4; + x2 = LD_SP(px); px += 4; + ST_SP(out4, x); x += 4; + x3 = LD_SP(px); px += 4; + ST_SP(out6, x); x += 4; + y0 = LD_SP(py); py += 4; + ST_SP(out1, y); y += 4; + y1 = LD_SP(py); py += 4; + ST_SP(out3, y); y += 4; + y2 = LD_SP(py); py += 4; + ST_SP(out5, y); y += 4; + y3 = LD_SP(py); py += 4; + ST_SP(out7, y); y += 4; + } + + out0 = s0 * y0; + out2 = s0 * y1; + out4 = s0 * y2; + out6 = s0 * y3; + out1 = -(s0 * x0); + out3 = -(s0 * x1); + out5 = -(s0 * x2); + out7 = -(s0 * x3); + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + + LD_SP4_INC(px, 4, x4, x5, x6, x7); + LD_SP4_INC(py, 4, y4, y5, y6, y7); + + out0 = s0 * y4; + out2 = s0 * y5; + out4 = s0 * y6; + out6 = s0 * y7; + out1 = -(s0 * x4); + out3 = -(s0 * x5); + out5 = -(s0 * x6); + out7 = -(s0 * x7); + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + if (n & 8) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + out0 = s0 * y0; + out1 = - (s0 * x0); + out2 = s0 * y1; + out3 = - (s0 * x1); + out4 = s0 * y2; + out5 = - (s0 * x2); + out6 = s0 * y3; + out7 = - (s0 * x3); + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + if (n & 4) + { + LD_SP2_INC(px, 4, x0, x1); + LD_SP2_INC(py, 4, y0, y1); + + out0 = s0 * y0; + out1 = - (s0 * x0); + out2 = s0 * y1; + out3 = - (s0 * x1); + + ST_SP2_INC(out0, out2, x, 4); + ST_SP2_INC(out1, out3, y, 4); + } + if (n & 2) + { + x0 = LD_SP(px); px += 4; + y0 = LD_SP(py); py += 4; + + out0 = s0 * y0; + out1 = - (s0 * x0); + + ST_SP(out0, x); x += 4; + ST_SP(out1, y); y += 4; + } + if (n & 1) + { + LD_GP2_INC(px, 1, fx0, fx1); + LD_GP2_INC(py, 1, fy0, fy1); + + tp0 = s * fy0; + tp1 = - (s * fx0); + tp2 = s * fy1; + tp3 = - (s * fx1); + + ST_GP2_INC(tp0, tp2, x, 1); + ST_GP2_INC(tp1, tp3, y, 1); + } + } + else + { + c0 = COPY_FLOAT_TO_VECTOR(c); + s0 = COPY_FLOAT_TO_VECTOR(s); + + if (n >> 4) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + for (j = (n >> 4) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + x4 = LD_SP(px); px += 4; + out0 = c0 * x0; + x5 = LD_SP(px); px += 4; + out1 = c0 * y0; + x6 = LD_SP(px); px += 4; + out2 = c0 * x1; + x7 = LD_SP(px); px += 4; + out3 = c0 * y1; + y4 = LD_SP(py); py += 4; + out4 = c0 * x2; + y5 = LD_SP(py); py += 4; + out5 = c0 * y2; + y6 = LD_SP(py); py += 4; + out6 = c0 * x3; + y7 = LD_SP(py); py += 4; + out7 = c0 * y3; + + out0 += s0 * y0; + out1 -= s0 * x0; + out2 += s0 * y1; + out3 -= s0 * x1; + out4 += s0 * y2; + out5 -= s0 * x2; + out6 += s0 * y3; + out7 -= s0 * x3; + + ST_SP(out0, x); x += 4; + out8 = c0 * x4; + ST_SP(out2, x); x += 4; + out9 = c0 * y4; + ST_SP(out4, x); x += 4; + out10 = c0 * x5; + ST_SP(out6, x); x += 4; + out11 = c0 * y5; + ST_SP(out1, y); y += 4; + out12 = c0 * x6; + ST_SP(out3, y); y += 4; + out13 = c0 * y6; + ST_SP(out5, y); y += 4; + out14 = c0 * x7; + ST_SP(out7, y); y += 4; + out15 = c0 * y7; + + x0 = LD_SP(px); px += 4; + out8 += s0 * y4; + x1 = LD_SP(px); px += 4; + out9 -= s0 * x4; + x2 = LD_SP(px); px += 4; + out10 += s0 * y5; + x3 = LD_SP(px); px += 4; + out11 -= s0 * x5; + y0 = LD_SP(py); py += 4; + out12 += s0 * y6; + y1 = LD_SP(py); py += 4; + out13 -= s0 * x6; + y2 = LD_SP(py); py += 4; + out14 += s0 * y7; + y3 = LD_SP(py); py += 4; + out15 -= s0 * x7; + + ST_SP(out8, x); x += 4; + ST_SP(out10, x); x += 4; + ST_SP(out12, x); x += 4; + ST_SP(out14, x); x += 4; + ST_SP(out9, y); y += 4; + ST_SP(out11, y); y += 4; + ST_SP(out13, y); y += 4; + ST_SP(out15, y); y += 4; + } + + out0 = c0 * x0; + out0 += s0 * y0; + out1 = c0 * y0; + out1 -= s0 * x0; + out2 = c0 * x1; + out2 += s0 * y1; + out3 = c0 * y1; + out3 -= s0 * x1; + out4 = c0 * x2; + out4 += s0 * y2; + out5 = c0 * y2; + out5 -= s0 * x2; + out6 = c0 * x3; + out6 += s0 * y3; + out7 = c0 * y3; + out7 -= s0 * x3; + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + + LD_SP4_INC(px, 4, x4, x5, x6, x7); + LD_SP4_INC(py, 4, y4, y5, y6, y7); + + out8 = c0 * x4; + out8 += s0 * y4; + out9 = c0 * y4; + out9 -= s0 * x4; + out10 = c0 * x5; + out10 += s0 * y5; + out11 = c0 * y5; + out11 -= s0 * x5; + out12 = c0 * x6; + out12 += s0 * y6; + out13 = c0 * y6; + out13 -= s0 * x6; + out14 = c0 * x7; + out14 += s0 * y7; + out15 = c0 * y7; + out15 -= s0 * x7; + + ST_SP4_INC(out8, out10, out12, out14, x, 4); + ST_SP4_INC(out9, out11, out13, out15, y, 4); + } + if (n & 8) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + out0 = (c0 * x0) + (s0 * y0); + out1 = (c0 * y0) - (s0 * x0); + out2 = (c0 * x1) + (s0 * y1); + out3 = (c0 * y1) - (s0 * x1); + out4 = (c0 * x2) + (s0 * y2); + out5 = (c0 * y2) - (s0 * x2); + out6 = (c0 * x3) + (s0 * y3); + out7 = (c0 * y3) - (s0 * x3); + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + if (n & 4) + { + LD_SP2_INC(px, 4, x0, x1); + LD_SP2_INC(py, 4, y0, y1); + + out0 = (c0 * x0) + (s0 * y0); + out1 = (c0 * y0) - (s0 * x0); + out2 = (c0 * x1) + (s0 * y1); + out3 = (c0 * y1) - (s0 * x1); + + ST_SP2_INC(out0, out2, x, 4); + ST_SP2_INC(out1, out3, y, 4); + } + if (n & 2) + { + x0 = LD_SP(px); + y0 = LD_SP(py); + px += 4; + py += 4; + + out0 = (c0 * x0) + (s0 * y0); + out1 = (c0 * y0) - (s0 * x0); + + ST_SP(out0, x); + ST_SP(out1, y); + x += 4; + y += 4; + } + if (n & 1) + { + LD_GP2_INC(px, 1, fx0, fx1); + LD_GP2_INC(py, 1, fy0, fy1); + + tp0 = (c * fx0) + (s * fy0); + tp1 = (c * fy0) - (s * fx0); + tp2 = (c * fx1) + (s * fy1); + tp3 = (c * fy1) - (s * fx1); + + ST_GP2_INC(tp0, tp2, x, 1); + ST_GP2_INC(tp1, tp3, y, 1); + } + } + } + else + { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ((0 == c) && (0 == s)) + { + for (i = n; i--;) + { + *x = 0; + *(x + 1) = 0; + *y = 0; + *(y + 1) = 0; + + x += inc_x2; + y += inc_y2; + } + } + else if ((1 == c) && (1 == s)) + { + if (n >> 1) + { + fx0 = *px; + fx1 = *(px+1); px += inc_x2; + fx2 = *px; + fx3 = *(px+1); px += inc_x2; + + fy0 = *py; + fy1 = *(py+1); py += inc_y2; + fy2 = *py; + fy3 = *(py+1); py += inc_y2; + + for (i = (n >> 1) - 1; i--;) + { + tp0 = fx0 + fy0; + tp1 = fx1 + fy1; + tp2 = fy0 - fx0; + tp3 = fy1 - fx1; + tp4 = fx2 + fy2; + tp5 = fx3 + fy3; + tp6 = fy2 - fx2; + tp7 = fy3 - fx3; + + fx0 = *px; + *x = tp0; + fx1 = *(px+1); px += inc_x2; + *(x+1) = tp1; x += inc_x2; + fx2 = *px; + *x = tp4; + fx3 = *(px+1); px += inc_x2; + *(x+1) = tp5; x += inc_x2; + + fy0 = *py; + *y = tp2; + fy1 = *(py+1); py += inc_y2; + *(y+1) = tp3; y += inc_y2; + fy2 = *py; + *y = tp6; + fy3 = *(py+1); py += inc_y2; + *(y+1) = tp7; y += inc_y2; + } + + tp0 = fx0 + fy0; + tp1 = fx1 + fy1; + tp2 = fy0 - fx0; + tp3 = fy1 - fx1; + tp4 = fx2 + fy2; + tp5 = fx3 + fy3; + tp6 = fy2 - fx2; + tp7 = fy3 - fx3; + + *x = tp0; + *(x+1) = tp1; x += inc_x2; + *x = tp4; + *(x+1) = tp5; x += inc_x2; + + *y = tp2; + *(y+1) = tp3; y += inc_y2; + *y = tp6; + *(y+1) = tp7; y += inc_y2; + } + if (n & 1) + { + fx0 = *px; + fx1 = *(px+1); + + fy0 = *py; + fy1 = *(py+1); + + tp0 = fx0 + fy0; + tp1 = fx1 + fy1; + tp2 = fy0 - fx0; + tp3 = fy1 - fx1; + + *x = tp0; + *(x+1) = tp1; + + *y = tp2; + *(y+1) = tp3; + } + } + else if (0 == s) + { + if (n >> 1) + { + fx0 = *px; + fx1 = *(px+1); px += inc_x2; + fx2 = *px; + fx3 = *(px+1); px += inc_x2; + + fy0 = *py; + fy1 = *(py+1); py += inc_y2; + fy2 = *py; + fy3 = *(py+1); py += inc_y2; + + for (i = (n >> 1) - 1; i--;) + { + tp0 = c * fx0; + tp1 = c * fx1; + tp2 = c * fx2; + tp3 = c * fx3; + tp4 = c * fy0; + tp5 = c * fy1; + tp6 = c * fy2; + tp7 = c * fy3; + + fx0 = *px; + *x = tp0; + fx1 = *(px+1); px += inc_x2; + *(x+1) = tp1; x += inc_x2; + fx2 = *px; + *x = tp2; + fx3 = *(px+1); px += inc_x2; + *(x+1) = tp3; x += inc_x2; + fy0 = *py; + *y = tp4; + fy1 = *(py+1); py += inc_y2; + *(y+1) = tp5; y += inc_y2; + fy2 = *py; + *y = tp6; + fy3 = *(py+1); py += inc_y2; + *(y+1) = tp7; y += inc_y2; + } + + tp0 = c * fx0; + tp1 = c * fx1; + tp2 = c * fx2; + tp3 = c * fx3; + tp4 = c * fy0; + tp5 = c * fy1; + tp6 = c * fy2; + tp7 = c * fy3; + + *x = tp0; + *(x+1) = tp1; x += inc_x2; + *x = tp2; + *(x+1) = tp3; x += inc_x2; + + *y = tp4; + *(y+1) = tp5; y += inc_y2; + *y = tp6; + *(y+1) = tp7; y += inc_y2; + } + if (n & 1) + { + fx0 = *px; + fx1 = *(px+1); + + fy0 = *py; + fy1 = *(py+1); + + tp0 = c * fx0; + tp1 = c * fx1; + tp2 = c * fy0; + tp3 = c * fy1; + + *x = tp0; + *(x+1) = tp1; + + *y = tp2; + *(y+1) = tp3; + } + } + else + { + if (n >> 1) + { + fx0 = *px; + fx1 = *(px+1); px += inc_x2; + fx2 = *px; + fx3 = *(px+1); px += inc_x2; + fy0 = *py; + fy1 = *(py+1); py += inc_y2; + fy2 = *py; + fy3 = *(py+1); py += inc_y2; + + for (i = (n >> 1) - 1; i--;) + { + tp0 = c * fx0 + s * fy0; + tp1 = c * fx1 + s * fy1; + tp2 = c * fy0 - s * fx0; + tp3 = c * fy1 - s * fx1; + tp4 = c * fx2 + s * fy2; + tp5 = c * fx3 + s * fy3; + tp6 = c * fy2 - s * fx2; + tp7 = c * fy3 - s * fx3; + + fx0 = *px; + *x = tp0; + fx1 = *(px+1); px += inc_x2; + *(x+1) = tp1; x += inc_x2; + fx2 = *px; + *x = tp4; + fx3 = *(px+1); px += inc_x2; + *(x+1) = tp5; x += inc_x2; + fy0 = *py; + *y = tp2; + fy1 = *(py+1); py += inc_y2; + *(y+1) = tp3; y += inc_y2; + fy2 = *py; + *y = tp6; + fy3 = *(py+1); py += inc_y2; + *(y+1) = tp7; y += inc_y2; + } + + tp0 = c * fx0 + s * fy0; + tp1 = c * fx1 + s * fy1; + tp2 = c * fy0 - s * fx0; + tp3 = c * fy1 - s * fx1; + tp4 = c * fx2 + s * fy2; + tp5 = c * fx3 + s * fy3; + tp6 = c * fy2 - s * fx2; + tp7 = c * fy3 - s * fx3; + + *x = tp0; + *(x+1) = tp1; x += inc_x2; + *x = tp4; + *(x+1) = tp5; x += inc_x2; + *y = tp2; + *(y+1) = tp3; y += inc_y2; + *y = tp6; + *(y+1) = tp7; y += inc_y2; + } + if (n & 1) + { + fx0 = *px; + fx1 = *(px+1); + + fy0 = *py; + fy1 = *(py+1); + + tp0 = c * fx0 + s * fy0; + tp1 = c * fx1 + s * fy1; + tp2 = c * fy0 - s * fx0; + tp3 = c * fy1 - s * fx1; + + *x = tp0; + *(x+1) = tp1; + + *y = tp2; + *(y+1) = tp3; + } + } + } + + return 0; +} diff --git a/kernel/mips/drot_msa.c b/kernel/mips/drot_msa.c new file mode 100644 index 0000000..2fb9392 --- /dev/null +++ b/kernel/mips/drot_msa.c @@ -0,0 +1,1055 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) +{ + BLASLONG i, j; + FLOAT *px, *py; + FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3; + v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; + v2f64 out0, out1, out2, out3, out4, out5, out6, out7; + v2f64 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0; + + if (n <= 0) return (0); + + px = x; + py = y; + + if ((1 == inc_x) && (1 == inc_y)) + { + if ((0 == c) && (0 == s)) + { + v2f64 zero = {0, 0}; + zero = (v2f64) __msa_insert_d((v2i64) zero, 0, 0.0); + zero = (v2f64) __msa_insert_d((v2i64) zero, 1, 0.0); + + /* process 4 floats */ + for (j = (n >> 2); j--;) + { + ST_DP(zero, px); + ST_DP(zero, py); + px += 2; + py += 2; + ST_DP(zero, px); + ST_DP(zero, py); + px += 2; + py += 2; + } + if (n & 2) + { + ST_DP(zero, px); + ST_DP(zero, py); + px += 2; + py += 2; + } + if (n & 1) + { + px[0] = 0; + py[0] = 0; + } + } + else if ((1 == c) && (1 == s)) + { + if (n >> 4) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + x0 = LD_DP(px); px += 2; + x1 = LD_DP(px); px += 2; + x2 = LD_DP(px); px += 2; + x3 = LD_DP(px); px += 2; + y0 = LD_DP(py); py += 2; + y1 = LD_DP(py); py += 2; + y2 = LD_DP(py); py += 2; + y3 = LD_DP(py); py += 2; + + for (j = (n >> 4) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 16); + PREFETCH(px + pref_offsetx + 20); + PREFETCH(px + pref_offsetx + 24); + PREFETCH(px + pref_offsetx + 28); + PREFETCH(py + pref_offsety + 16); + PREFETCH(py + pref_offsety + 20); + PREFETCH(py + pref_offsety + 24); + PREFETCH(py + pref_offsety + 28); + + out0 = x0 + y0; + x4 = LD_DP(px); px += 2; + out1 = y0 - x0; + x5 = LD_DP(px); px += 2; + out2 = x1 + y1; + x6 = LD_DP(px); px += 2; + out3 = y1 - x1; + x7 = LD_DP(px); px += 2; + out4 = x2 + y2; + y4 = LD_DP(py); py += 2; + out5 = y2 - x2; + y5 = LD_DP(py); py += 2; + out6 = x3 + y3; + y6 = LD_DP(py); py += 2; + out7 = y3 - x3; + y7 = LD_DP(py); py += 2; + + ST_DP(out0, x); x += 2; + out8 = x4 + y4; + ST_DP(out1, y); y += 2; + out9 = y4 - x4; + ST_DP(out2, x); x += 2; + out10 = x5 + y5; + ST_DP(out3, y); y += 2; + out11 = y5 - x5; + ST_DP(out4, x); x += 2; + out12 = x6 + y6; + ST_DP(out5, y); y += 2; + out13 = y6 - x6; + ST_DP(out6, x); x += 2; + out14 = x7 + y7; + ST_DP(out7, y); y += 2; + out15 = y7 - x7; + + x0 = LD_DP(px); px += 2; + ST_DP(out8, x); x += 2; + x1 = LD_DP(px); px += 2; + ST_DP(out10, x); x += 2; + x2 = LD_DP(px); px += 2; + ST_DP(out12, x); x += 2; + x3 = LD_DP(px); px += 2; + ST_DP(out14, x); x += 2; + + y0 = LD_DP(py); py += 2; + ST_DP(out9, y); y += 2; + y1 = LD_DP(py); py += 2; + ST_DP(out11, y); y += 2; + y2 = LD_DP(py); py += 2; + ST_DP(out13, y); y += 2; + y3 = LD_DP(py); py += 2; + ST_DP(out15, y); y += 2; + } + + x4 = LD_DP(px); px += 2; + x5 = LD_DP(px); px += 2; + x6 = LD_DP(px); px += 2; + x7 = LD_DP(px); px += 2; + y4 = LD_DP(py); py += 2; + y5 = LD_DP(py); py += 2; + y6 = LD_DP(py); py += 2; + y7 = LD_DP(py); py += 2; + + out0 = x0 + y0; + out1 = y0 - x0; + out2 = x1 + y1; + out3 = y1 - x1; + out4 = x2 + y2; + out5 = y2 - x2; + out6 = x3 + y3; + out7 = y3 - x3; + out8 = x4 + y4; + out9 = y4 - x4; + out10 = x5 + y5; + out11 = y5 - x5; + out12 = x6 + y6; + out13 = y6 - x6; + out14 = x7 + y7; + out15 = y7 - x7; + + ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 2); + ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 2); + } + if (n & 8) + { + LD_DP4_INC(px, 2, x0, x1, x2, x3); + LD_DP4_INC(py, 2, y0, y1, y2, y3); + + out0 = x0 + y0; + out1 = y0 - x0; + out2 = x1 + y1; + out3 = y1 - x1; + out4 = x2 + y2; + out5 = y2 - x2; + out6 = x3 + y3; + out7 = y3 - x3; + + ST_DP4_INC(out0, out2, out4, out6, x, 2); + ST_DP4_INC(out1, out3, out5, out7, y, 2); + } + if (n & 4) + { + LD_DP2_INC(px, 2, x0, x1); + LD_DP2_INC(py, 2, y0, y1); + + out0 = x0 + y0; + out1 = y0 - x0; + out2 = x1 + y1; + out3 = y1 - x1; + + ST_DP2_INC(out0, out2, x, 2); + ST_DP2_INC(out1, out3, y, 2); + } + if (n & 2) + { + x0 = LD_DP(px); + y0 = LD_DP(py); + px += 2; + py += 2; + + out0 = x0 + y0; + out1 = y0 - x0; + + ST_DP(out0, x); + ST_DP(out1, y); + x += 2; + y += 2; + } + if (n & 1) + { + tp0 = *x + *y; + *y = *y - *x; + *x = tp0; + } + } + else if (0 == s) + { + c0 = COPY_DOUBLE_TO_VECTOR(c); + + if (n >> 4) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7); + + for (j = (n >> 4) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 16); + PREFETCH(px + pref_offsetx + 20); + PREFETCH(px + pref_offsetx + 24); + PREFETCH(px + pref_offsetx + 28); + PREFETCH(py + pref_offsety + 16); + PREFETCH(py + pref_offsety + 20); + PREFETCH(py + pref_offsety + 24); + PREFETCH(py + pref_offsety + 28); + + y0 = LD_DP(py); py += 2; + x0 *= c0; + y1 = LD_DP(py); py += 2; + x1 *= c0; + y2 = LD_DP(py); py += 2; + x2 *= c0; + y3 = LD_DP(py); py += 2; + x3 *= c0; + y4 = LD_DP(py); py += 2; + x4 *= c0; + y5 = LD_DP(py); py += 2; + x5 *= c0; + y6 = LD_DP(py); py += 2; + x6 *= c0; + y7 = LD_DP(py); py += 2; + x7 *= c0; + + ST_DP(x0, x); x += 2; + y0 *= c0; + ST_DP(x1, x); x += 2; + y1 *= c0; + ST_DP(x2, x); x += 2; + y2 *= c0; + ST_DP(x3, x); x += 2; + y3 *= c0; + ST_DP(x4, x); x += 2; + y4 *= c0; + ST_DP(x5, x); x += 2; + y5 *= c0; + ST_DP(x6, x); x += 2; + y6 *= c0; + ST_DP(x7, x); x += 2; + y7 *= c0; + + x0 = LD_DP(px); px += 2; + ST_DP(y0, y); y += 2; + x1 = LD_DP(px); px += 2; + ST_DP(y1, y); y += 2; + x2 = LD_DP(px); px += 2; + ST_DP(y2, y); y += 2; + x3 = LD_DP(px); px += 2; + ST_DP(y3, y); y += 2; + x4 = LD_DP(px); px += 2; + ST_DP(y4, y); y += 2; + x5 = LD_DP(px); px += 2; + ST_DP(y5, y); y += 2; + x6 = LD_DP(px); px += 2; + ST_DP(y6, y); y += 2; + x7 = LD_DP(px); px += 2; + ST_DP(y7, y); y += 2; + } + + LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7); + + x0 *= c0; + y0 *= c0; + x1 *= c0; + y1 *= c0; + x2 *= c0; + y2 *= c0; + x3 *= c0; + y3 *= c0; + x4 *= c0; + y4 *= c0; + x5 *= c0; + y5 *= c0; + x6 *= c0; + y6 *= c0; + x7 *= c0; + y7 *= c0; + + ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2); + ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2); + } + if (n & 8) + { + LD_DP4_INC(px, 2, x0, x1, x2, x3); + LD_DP4_INC(py, 2, y0, y1, y2, y3); + + out0 = c0 * x0; + out1 = c0 * y0; + out2 = c0 * x1; + out3 = c0 * y1; + out4 = c0 * x2; + out5 = c0 * y2; + out6 = c0 * x3; + out7 = c0 * y3; + + ST_DP4_INC(out0, out2, out4, out6, x, 2); + ST_DP4_INC(out1, out3, out5, out7, y, 2); + } + if (n & 4) + { + LD_DP2_INC(px, 2, x0, x1); + LD_DP2_INC(py, 2, y0, y1); + + out0 = c0 * x0; + out1 = c0 * y0; + out2 = c0 * x1; + out3 = c0 * y1; + + ST_DP2_INC(out0, out2, x, 2); + ST_DP2_INC(out1, out3, y, 2); + } + if (n & 2) + { + x0 = LD_DP(px); + y0 = LD_DP(py); + px += 2; + py += 2; + + out0 = c0 * x0; + out1 = c0 * y0; + + ST_DP(out0, x); + ST_DP(out1, y); + x += 2; + y += 2; + } + if (n & 1) + { + *x *= c; + *y *= c; + } + } + else if (0 == c) + { + s0 = COPY_DOUBLE_TO_VECTOR(s); + + /* process 16 floats */ + if (n >> 4) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + LD_DP4_INC(px, 2, x0, x1, x2, x3); + LD_DP4_INC(py, 2, y0, y1, y2, y3); + + for (j = (n >> 4) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 16); + PREFETCH(px + pref_offsetx + 20); + PREFETCH(px + pref_offsetx + 24); + PREFETCH(px + pref_offsetx + 28); + PREFETCH(py + pref_offsety + 16); + PREFETCH(py + pref_offsety + 20); + PREFETCH(py + pref_offsety + 24); + PREFETCH(py + pref_offsety + 28); + + x4 = LD_DP(px); px += 2; + out0 = s0 * y0; + x5 = LD_DP(px); px += 2; + out2 = s0 * y1; + x6 = LD_DP(px); px += 2; + out4 = s0 * y2; + x7 = LD_DP(px); px += 2; + out6 = s0 * y3; + y4 = LD_DP(py); py += 2; + out1 = -(s0 * x0); + y5 = LD_DP(py); py += 2; + out3 = -(s0 * x1); + y6 = LD_DP(py); py += 2; + out5 = -(s0 * x2); + y7 = LD_DP(py); py += 2; + out7 = -(s0 * x3); + + ST_DP(out0, x); x += 2; + out0 = s0 * y4; + ST_DP(out2, x); x += 2; + out2 = s0 * y5; + ST_DP(out4, x); x += 2; + out4 = s0 * y6; + ST_DP(out6, x); x += 2; + out6 = s0 * y7; + ST_DP(out1, y); y += 2; + out1 = -(s0 * x4); + ST_DP(out3, y); y += 2; + out3 = -(s0 * x5); + ST_DP(out5, y); y += 2; + out5 = -(s0 * x6); + ST_DP(out7, y); y += 2; + out7 = -(s0 * x7); + + x0 = LD_DP(px); px += 2; + ST_DP(out0, x); x += 2; + x1 = LD_DP(px); px += 2; + ST_DP(out2, x); x += 2; + x2 = LD_DP(px); px += 2; + ST_DP(out4, x); x += 2; + x3 = LD_DP(px); px += 2; + ST_DP(out6, x); x += 2; + y0 = LD_DP(py); py += 2; + ST_DP(out1, y); y += 2; + y1 = LD_DP(py); py += 2; + ST_DP(out3, y); y += 2; + y2 = LD_DP(py); py += 2; + ST_DP(out5, y); y += 2; + y3 = LD_DP(py); py += 2; + ST_DP(out7, y); y += 2; + } + + out0 = s0 * y0; + out2 = s0 * y1; + out4 = s0 * y2; + out6 = s0 * y3; + out1 = -(s0 * x0); + out3 = -(s0 * x1); + out5 = -(s0 * x2); + out7 = -(s0 * x3); + + ST_DP4_INC(out0, out2, out4, out6, x, 2); + ST_DP4_INC(out1, out3, out5, out7, y, 2); + + LD_DP4_INC(px, 2, x4, x5, x6, x7); + LD_DP4_INC(py, 2, y4, y5, y6, y7); + + out0 = s0 * y4; + out2 = s0 * y5; + out4 = s0 * y6; + out6 = s0 * y7; + out1 = -(s0 * x4); + out3 = -(s0 * x5); + out5 = -(s0 * x6); + out7 = -(s0 * x7); + + ST_DP4_INC(out0, out2, out4, out6, x, 2); + ST_DP4_INC(out1, out3, out5, out7, y, 2); + } + if (n & 8) + { + LD_DP4_INC(px, 2, x0, x1, x2, x3); + LD_DP4_INC(py, 2, y0, y1, y2, y3); + + out0 = s0 * y0; + out1 = - (s0 * x0); + out2 = s0 * y1; + out3 = - (s0 * x1); + out4 = s0 * y2; + out5 = - (s0 * x2); + out6 = s0 * y3; + out7 = - (s0 * x3); + + ST_DP4_INC(out0, out2, out4, out6, x, 2); + ST_DP4_INC(out1, out3, out5, out7, y, 2); + } + if (n & 4) + { + LD_DP2_INC(px, 2, x0, x1); + LD_DP2_INC(py, 2, y0, y1); + + out0 = s0 * y0; + out1 = - (s0 * x0); + out2 = s0 * y1; + out3 = - (s0 * x1); + + ST_DP2_INC(out0, out2, x, 2); + ST_DP2_INC(out1, out3, y, 2); + } + if (n & 2) + { + x0 = LD_DP(px); px += 2; + y0 = LD_DP(py); py += 2; + + out0 = s0 * y0; + out1 = - (s0 * x0); + + ST_DP(out0, x); x += 2; + ST_DP(out1, y); y += 2; + } + if (n & 1) + { + LD_GP2_INC(px, 1, fx0, fx1); + LD_GP2_INC(py, 1, fy0, fy1); + + tp0 = s * fy0; + tp1 = - (s * fx0); + tp2 = s * fy1; + tp3 = - (s * fx1); + + ST_GP2_INC(tp0, tp2, x, 1); + ST_GP2_INC(tp1, tp3, y, 1); + } + } + else + { + c0 = COPY_DOUBLE_TO_VECTOR(c); + s0 = COPY_DOUBLE_TO_VECTOR(s); + + /* process 14 doubles */ + if (n >> 4) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + LD_DP4_INC(px, 2, x0, x1, x2, x3); + LD_DP4_INC(py, 2, y0, y1, y2, y3); + + for (j = (n >> 4) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 16); + PREFETCH(px + pref_offsetx + 20); + PREFETCH(px + pref_offsetx + 24); + PREFETCH(px + pref_offsetx + 28); + PREFETCH(py + pref_offsety + 16); + PREFETCH(py + pref_offsety + 20); + PREFETCH(py + pref_offsety + 24); + PREFETCH(py + pref_offsety + 28); + + x4 = LD_DP(px); px += 2; + out0 = c0 * x0; + x5 = LD_DP(px); px += 2; + out2 = c0 * x1; + x6 = LD_DP(px); px += 2; + out4 = c0 * x2; + x7 = LD_DP(px); px += 2; + out6 = c0 * x3; + y4 = LD_DP(py); py += 2; + out1 = c0 * y0; + y5 = LD_DP(py); py += 2; + out3 = c0 * y1; + y6 = LD_DP(py); py += 2; + out5 = c0 * y2; + y7 = LD_DP(py); py += 2; + out7 = c0 * y3; + + out0 += s0 * y0; + out2 += s0 * y1; + out4 += s0 * y2; + out6 += s0 * y3; + out1 -= s0 * x0; + out3 -= s0 * x1; + out5 -= s0 * x2; + out7 -= s0 * x3; + + ST_DP(out0, x); x += 2; + out0 = c0 * x4; + ST_DP(out2, x); x += 2; + out2 = c0 * x5; + ST_DP(out4, x); x += 2; + out4 = c0 * x6; + ST_DP(out6, x); x += 2; + out6 = c0 * x7; + ST_DP(out1, y); y += 2; + out1 = c0 * y4; + ST_DP(out3, y); y += 2; + out3 = c0 * y5; + ST_DP(out5, y); y += 2; + out5 = c0 * y6; + ST_DP(out7, y); y += 2; + out7 = c0 * y7; + + x0 = LD_DP(px); px += 2; + out0 += s0 * y4; + x1 = LD_DP(px); px += 2; + out2 += s0 * y5; + x2 = LD_DP(px); px += 2; + out4 += s0 * y6; + x3 = LD_DP(px); px += 2; + out6 += s0 * y7; + y0 = LD_DP(py); py += 2; + out1 -= s0 * x4; + y1 = LD_DP(py); py += 2; + out3 -= s0 * x5; + y2 = LD_DP(py); py += 2; + out5 -= s0 * x6; + y3 = LD_DP(py); py += 2; + out7 -= s0 * x7; + + ST_DP4_INC(out0, out2, out4, out6, x, 2); + ST_DP4_INC(out1, out3, out5, out7, y, 2); + } + + out0 = c0 * x0; + out0 += s0 * y0; + out1 = c0 * y0; + out1 -= s0 * x0; + out2 = c0 * x1; + out2 += s0 * y1; + out3 = c0 * y1; + out3 -= s0 * x1; + out4 = c0 * x2; + out4 += s0 * y2; + out5 = c0 * y2; + out5 -= s0 * x2; + out6 = c0 * x3; + out6 += s0 * y3; + out7 = c0 * y3; + out7 -= s0 * x3; + + ST_DP4_INC(out0, out2, out4, out6, x, 2); + ST_DP4_INC(out1, out3, out5, out7, y, 2); + + LD_DP4_INC(px, 2, x4, x5, x6, x7); + LD_DP4_INC(py, 2, y4, y5, y6, y7); + + out8 = c0 * x4; + out8 += s0 * y4; + out9 = c0 * y4; + out9 -= s0 * x4; + out10 = c0 * x5; + out10 += s0 * y5; + out11 = c0 * y5; + out11 -= s0 * x5; + out12 = c0 * x6; + out12 += s0 * y6; + out13 = c0 * y6; + out13 -= s0 * x6; + out14 = c0 * x7; + out14 += s0 * y7; + out15 = c0 * y7; + out15 -= s0 * x7; + + ST_DP4_INC(out8, out10, out12, out14, x, 2); + ST_DP4_INC(out9, out11, out13, out15, y, 2); + } + if (n & 8) + { + LD_DP4_INC(px, 2, x0, x1, x2, x3); + LD_DP4_INC(py, 2, y0, y1, y2, y3); + + out0 = (c0 * x0) + (s0 * y0); + out1 = (c0 * y0) - (s0 * x0); + out2 = (c0 * x1) + (s0 * y1); + out3 = (c0 * y1) - (s0 * x1); + out4 = (c0 * x2) + (s0 * y2); + out5 = (c0 * y2) - (s0 * x2); + out6 = (c0 * x3) + (s0 * y3); + out7 = (c0 * y3) - (s0 * x3); + + ST_DP4_INC(out0, out2, out4, out6, x, 2); + ST_DP4_INC(out1, out3, out5, out7, y, 2); + } + if (n & 4) + { + LD_DP2_INC(px, 2, x0, x1); + LD_DP2_INC(py, 2, y0, y1); + + out0 = (c0 * x0) + (s0 * y0); + out1 = (c0 * y0) - (s0 * x0); + out2 = (c0 * x1) + (s0 * y1); + out3 = (c0 * y1) - (s0 * x1); + + ST_DP2_INC(out0, out2, x, 2); + ST_DP2_INC(out1, out3, y, 2); + } + if (n & 2) + { + x0 = LD_DP(px); + y0 = LD_DP(py); + px += 2; + py += 2; + + out0 = (c0 * x0) + (s0 * y0); + out1 = (c0 * y0) - (s0 * x0); + + ST_DP(out0, x); + ST_DP(out1, y); + x += 2; + y += 2; + } + if (n & 1) + { + tp0 = c * *x + s * *y; + *y = c * *y - s * *x; + *x = tp0; + } + } + } + else + { + if ((0 == c) && (0 == s)) + { + for (i = n; i--;) + { + *x = 0; + *y = 0; + + x += inc_x; + y += inc_y; + } + } + else if ((1 == c) && (1 == s)) + { + if (n >> 2) + { + fx0 = *px; px += inc_x; + fx1 = *px; px += inc_x; + fx2 = *px; px += inc_x; + fx3 = *px; px += inc_x; + fy0 = *py; py += inc_y; + fy1 = *py; py += inc_y; + fy2 = *py; py += inc_y; + fy3 = *py; py += inc_y; + + for (i = (n >> 2) -1; i--;) + { + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + tp2 = fx1 + fy1; + tp3 = fy1 - fx1; + tp4 = fx2 + fy2; + tp5 = fy2 - fx2; + tp6 = fx3 + fy3; + tp7 = fy3 - fx3; + + fx0 = *px; px += inc_x; + *x = tp0; x += inc_x; + fx1 = *px; px += inc_x; + *x = tp2; x += inc_x; + fx2 = *px; px += inc_x; + *x = tp4; x += inc_x; + fx3 = *px; px += inc_x; + *x = tp6; x += inc_x; + fy0 = *py; py += inc_y; + *y = tp1; y += inc_y; + fy1 = *py; py += inc_y; + *y = tp3; y += inc_y; + fy2 = *py; py += inc_y; + *y = tp5; y += inc_y; + fy3 = *py; py += inc_y; + *y = tp7; y += inc_y; + } + + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + tp2 = fx1 + fy1; + tp3 = fy1 - fx1; + tp4 = fx2 + fy2; + tp5 = fy2 - fx2; + tp6 = fx3 + fy3; + tp7 = fy3 - fx3; + + *x = tp0; x += inc_x; + *x = tp2; x += inc_x; + *x = tp4; x += inc_x; + *x = tp6; x += inc_x; + *y = tp1; y += inc_y; + *y = tp3; y += inc_y; + *y = tp5; y += inc_y; + *y = tp7; y += inc_y; + } + + if (n & 2) + { + LD_GP2_INC(px, inc_x, fx0, fx1); + LD_GP2_INC(py, inc_y, fy0, fy1); + + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + tp2 = fx1 + fy1; + tp3 = fy1 - fx1; + + ST_GP2_INC(tp0, tp2, x, inc_x); + ST_GP2_INC(tp1, tp3, y, inc_y); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + + *x = tp0; + *y = tp1; + } + } + else if (0 == s) + { + if (n >> 2) + { + fx0 = *px; px += inc_x; + fx1 = *px; px += inc_x; + fx2 = *px; px += inc_x; + fx3 = *px; px += inc_x; + fy0 = *py; py += inc_y; + fy1 = *py; py += inc_y; + fy2 = *py; py += inc_y; + fy3 = *py; py += inc_y; + + for (i = (n >> 2) - 1; i--;) + { + tp0 = c * fx0; + tp1 = c * fy0; + tp2 = c * fx1; + tp3 = c * fy1; + tp4 = c * fx2; + tp5 = c * fy2; + tp6 = c * fx3; + tp7 = c * fy3; + + fx0 = *px; px += inc_x; + *x = tp0; x += inc_x; + fx1 = *px; px += inc_x; + *x = tp2; x += inc_x; + fx2 = *px; px += inc_x; + *x = tp4; x += inc_x; + fx3 = *px; px += inc_x; + *x = tp6; x += inc_x; + fy0 = *py; py += inc_y; + *y = tp1; y += inc_y; + fy1 = *py; py += inc_y; + *y = tp3; y += inc_y; + fy2 = *py; py += inc_y; + *y = tp5; y += inc_y; + fy3 = *py; py += inc_y; + *y = tp7; y += inc_y; + } + + tp0 = c * fx0; + tp1 = c * fy0; + tp2 = c * fx1; + tp3 = c * fy1; + tp4 = c * fx2; + tp5 = c * fy2; + tp6 = c * fx3; + tp7 = c * fy3; + + *x = tp0; x += inc_x; + *x = tp2; x += inc_x; + *x = tp4; x += inc_x; + *x = tp6; x += inc_x; + *y = tp1; y += inc_y; + *y = tp3; y += inc_y; + *y = tp5; y += inc_y; + *y = tp7; y += inc_y; + } + if (n & 2) + { + LD_GP2_INC(px, inc_x, fx0, fx1); + LD_GP2_INC(py, inc_y, fy0, fy1); + + tp0 = c * fx0; + tp1 = c * fy0; + tp2 = c * fx1; + tp3 = c * fy1; + + ST_GP2_INC(tp0, tp2, x, inc_x); + ST_GP2_INC(tp1, tp3, y, inc_y); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = c * fx0; + tp1 = c * fy0; + + *x = tp0; + *y = tp1; + } + } + else + { + if (n >> 2) + { + fx0 = *px; px += inc_x; + fx1 = *px; px += inc_x; + fx2 = *px; px += inc_x; + fx3 = *px; px += inc_x; + fy0 = *py; py += inc_y; + fy1 = *py; py += inc_y; + fy2 = *py; py += inc_y; + fy3 = *py; py += inc_y; + + for (i = (n >> 2) - 1; i--;) + { + tp0 = c * fx0 + s * fy0; + tp1 = c * fy0 - s * fx0; + tp2 = c * fx1 + s * fy1; + tp3 = c * fy1 - s * fx1; + tp4 = c * fx2 + s * fy2; + tp5 = c * fy2 - s * fx2; + tp6 = c * fx3 + s * fy3; + tp7 = c * fy3 - s * fx3; + + fx0 = *px; px += inc_x; + *x = tp0; x += inc_x; + fx1 = *px; px += inc_x; + *x = tp2; x += inc_x; + fx2 = *px; px += inc_x; + *x = tp4; x += inc_x; + fx3 = *px; px += inc_x; + *x = tp6; x += inc_x; + fy0 = *py; py += inc_y; + *y = tp1; y += inc_y; + fy1 = *py; py += inc_y; + *y = tp3; y += inc_y; + fy2 = *py; py += inc_y; + *y = tp5; y += inc_y; + fy3 = *py; py += inc_y; + *y = tp7; y += inc_y; + } + + tp0 = c * fx0 + s * fy0; + tp1 = c * fy0 - s * fx0; + tp2 = c * fx1 + s * fy1; + tp3 = c * fy1 - s * fx1; + tp4 = c * fx2 + s * fy2; + tp5 = c * fy2 - s * fx2; + tp6 = c * fx3 + s * fy3; + tp7 = c * fy3 - s * fx3; + + *x = tp0; x += inc_x; + *x = tp2; x += inc_x; + *x = tp4; x += inc_x; + *x = tp6; x += inc_x; + *y = tp1; y += inc_y; + *y = tp3; y += inc_y; + *y = tp5; y += inc_y; + *y = tp7; y += inc_y; + } + if (n & 2) + { + LD_GP2_INC(px, inc_x, fx0, fx1); + LD_GP2_INC(py, inc_y, fy0, fy1); + + tp0 = (c * fx0) + (s * fy0); + tp1 = (c * fy0) - (s * fx0); + tp2 = (c * fx1) + (s * fy1); + tp3 = (c * fy1) - (s * fx1); + + ST_GP2_INC(tp0, tp2, x, inc_x); + ST_GP2_INC(tp1, tp3, y, inc_y); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = (c * fx0) + (s * fy0); + tp1 = (c * fy0) - (s * fx0); + + *x = tp0; + *y = tp1; + } + } + } + + return 0; +} diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c new file mode 100644 index 0000000..7573024 --- /dev/null +++ b/kernel/mips/srot_msa.c @@ -0,0 +1,1123 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) +{ + BLASLONG i, j; + FLOAT *px, *py; + FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7; + FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3; + v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; + v4f32 out0, out1, out2, out3, out4, out5, out6, out7; + v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0; + + if (n <= 0) return (0); + + px = x; + py = y; + + if ((1 == inc_x) && (1 == inc_y)) + { + if ((0 == c) && (0 == s)) + { + v4f32 zero = __msa_cast_to_vector_float(0); + zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); + zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); + zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); + zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + + /* process 4 floats */ + for (j = (n >> 2); j--;) + { + ST_SP(zero, px); + ST_SP(zero, py); + px += 4; + py += 4; + } + if (n & 2) + { + px[0] = 0; + py[0] = 0; + px[1] = 0; + py[1] = 0; + px += 2; + py += 2; + } + if (n & 1) + { + px[0] = 0; + py[0] = 0; + } + } + else if ((1 == c) && (1 == s)) + { + if (n >> 5) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + x0 = LD_SP(px); px += 4; + x1 = LD_SP(px); px += 4; + x2 = LD_SP(px); px += 4; + x3 = LD_SP(px); px += 4; + y0 = LD_SP(py); py += 4; + y1 = LD_SP(py); py += 4; + y2 = LD_SP(py); py += 4; + y3 = LD_SP(py); py += 4; + + for (j = (n >> 5) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + out0 = x0 + y0; + x4 = LD_SP(px); px += 4; + out1 = y0 - x0; + x5 = LD_SP(px); px += 4; + out2 = x1 + y1; + x6 = LD_SP(px); px += 4; + out3 = y1 - x1; + x7 = LD_SP(px); px += 4; + out4 = x2 + y2; + y4 = LD_SP(py); py += 4; + out5 = y2 - x2; + y5 = LD_SP(py); py += 4; + out6 = x3 + y3; + y6 = LD_SP(py); py += 4; + out7 = y3 - x3; + y7 = LD_SP(py); py += 4; + + ST_SP(out0, x); x += 4; + out8 = x4 + y4; + ST_SP(out1, y); y += 4; + out9 = y4 - x4; + ST_SP(out2, x); x += 4; + out10 = x5 + y5; + ST_SP(out3, y); y += 4; + out11 = y5 - x5; + ST_SP(out4, x); x += 4; + out12 = x6 + y6; + ST_SP(out5, y); y += 4; + out13 = y6 - x6; + ST_SP(out6, x); x += 4; + out14 = x7 + y7; + ST_SP(out7, y); y += 4; + out15 = y7 - x7; + + x0 = LD_SP(px); px += 4; + ST_SP(out8, x); x += 4; + x1 = LD_SP(px); px += 4; + ST_SP(out10, x); x += 4; + x2 = LD_SP(px); px += 4; + ST_SP(out12, x); x += 4; + x3 = LD_SP(px); px += 4; + ST_SP(out14, x); x += 4; + y0 = LD_SP(py); py += 4; + ST_SP(out9, y); y += 4; + y1 = LD_SP(py); py += 4; + ST_SP(out11, y); y += 4; + y2 = LD_SP(py); py += 4; + ST_SP(out13, y); y += 4; + y3 = LD_SP(py); py += 4; + ST_SP(out15, y); y += 4; + } + + x4 = LD_SP(px); px += 4; + x5 = LD_SP(px); px += 4; + x6 = LD_SP(px); px += 4; + x7 = LD_SP(px); px += 4; + y4 = LD_SP(py); py += 4; + y5 = LD_SP(py); py += 4; + y6 = LD_SP(py); py += 4; + y7 = LD_SP(py); py += 4; + + out0 = x0 + y0; + out1 = y0 - x0; + out2 = x1 + y1; + out3 = y1 - x1; + out4 = x2 + y2; + out5 = y2 - x2; + out6 = x3 + y3; + out7 = y3 - x3; + out8 = x4 + y4; + out9 = y4 - x4; + out10 = x5 + y5; + out11 = y5 - x5; + out12 = x6 + y6; + out13 = y6 - x6; + out14 = x7 + y7; + out15 = y7 - x7; + + ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4); + ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4); + } + if (n & 16) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + out0 = x0 + y0; + out1 = y0 - x0; + out2 = x1 + y1; + out3 = y1 - x1; + out4 = x2 + y2; + out5 = y2 - x2; + out6 = x3 + y3; + out7 = y3 - x3; + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + if (n & 8) + { + LD_SP2_INC(px, 4, x0, x1); + LD_SP2_INC(py, 4, y0, y1); + + out0 = x0 + y0; + out1 = y0 - x0; + out2 = x1 + y1; + out3 = y1 - x1; + + ST_SP2_INC(out0, out2, x, 4); + ST_SP2_INC(out1, out3, y, 4); + } + if (n & 4) + { + x0 = LD_SP(px); + y0 = LD_SP(py); + px += 4; + py += 4; + + out0 = x0 + y0; + out1 = y0 - x0; + + ST_SP(out0, x); + ST_SP(out1, y); + x += 4; + y += 4; + } + if (n & 2) + { + LD_GP2_INC(px, 1, fx0, fx1); + LD_GP2_INC(py, 1, fy0, fy1); + + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + tp2 = fx1 + fy1; + tp3 = fy1 - fx1; + + ST_GP2_INC(tp0, tp2, x, 1); + ST_GP2_INC(tp1, tp3, y, 1); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + + *x = tp0; + *y = tp1; + } + } + else if (0 == s) + { + c0 = COPY_FLOAT_TO_VECTOR(c); + + if (n >> 5) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7); + + for (j = (n >> 5) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + y0 = LD_SP(py); py += 4; + x0 *= c0; + y1 = LD_SP(py); py += 4; + x1 *= c0; + y2 = LD_SP(py); py += 4; + x2 *= c0; + y3 = LD_SP(py); py += 4; + x3 *= c0; + y4 = LD_SP(py); py += 4; + x4 *= c0; + y5 = LD_SP(py); py += 4; + x5 *= c0; + y6 = LD_SP(py); py += 4; + x6 *= c0; + y7 = LD_SP(py); py += 4; + x7 *= c0; + + ST_SP(x0, x); x += 4; + y0 *= c0; + ST_SP(x1, x); x += 4; + y1 *= c0; + ST_SP(x2, x); x += 4; + y2 *= c0; + ST_SP(x3, x); x += 4; + y3 *= c0; + ST_SP(x4, x); x += 4; + y4 *= c0; + ST_SP(x5, x); x += 4; + y5 *= c0; + ST_SP(x6, x); x += 4; + y6 *= c0; + ST_SP(x7, x); x += 4; + y7 *= c0; + + x0 = LD_SP(px); px += 4; + ST_SP(y0, y); y += 4; + x1 = LD_SP(px); px += 4; + ST_SP(y1, y); y += 4; + x2 = LD_SP(px); px += 4; + ST_SP(y2, y); y += 4; + x3 = LD_SP(px); px += 4; + ST_SP(y3, y); y += 4; + x4 = LD_SP(px); px += 4; + ST_SP(y4, y); y += 4; + x5 = LD_SP(px); px += 4; + ST_SP(y5, y); y += 4; + x6 = LD_SP(px); px += 4; + ST_SP(y6, y); y += 4; + x7 = LD_SP(px); px += 4; + ST_SP(y7, y); y += 4; + } + + LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7); + + x0 *= c0; + y0 *= c0; + x1 *= c0; + y1 *= c0; + x2 *= c0; + y2 *= c0; + x3 *= c0; + y3 *= c0; + x4 *= c0; + y4 *= c0; + x5 *= c0; + y5 *= c0; + x6 *= c0; + y6 *= c0; + x7 *= c0; + y7 *= c0; + + ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4); + ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4); + } + if (n & 16) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + x0 *= c0; + y0 *= c0; + x1 *= c0; + y1 *= c0; + x2 *= c0; + y2 *= c0; + x3 *= c0; + y3 *= c0; + + ST_SP4_INC(x0, x1, x2, x3, x, 4); + ST_SP4_INC(y0, y1, y2, y3, y, 4); + } + if (n & 8) + { + LD_SP2_INC(px, 4, x0, x1); + LD_SP2_INC(py, 4, y0, y1); + + x0 *= c0; + y0 *= c0; + x1 *= c0; + y1 *= c0; + + ST_SP2_INC(x0, x1, x, 4); + ST_SP2_INC(y0, y1, y, 4); + } + if (n & 4) + { + x0 = LD_SP(px); + y0 = LD_SP(py); + px += 4; + py += 4; + + x0 *= c0; + y0 *= c0; + + ST_SP(x0, x); + ST_SP(y0, y); + x += 4; + y += 4; + } + if (n & 2) + { + LD_GP2_INC(px, 1, fx0, fx1); + LD_GP2_INC(py, 1, fy0, fy1); + + tp0 = (c * fx0); + tp1 = (c * fy0); + tp2 = (c * fx1); + tp3 = (c * fy1); + + ST_GP2_INC(tp0, tp2, x, 1); + ST_GP2_INC(tp1, tp3, y, 1); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = (c * fx0); + tp1 = (c * fy0); + + *x = tp0; + *y = tp1; + } + } + else if (0 == c) + { + s0 = COPY_FLOAT_TO_VECTOR(s); + + /* process 16 floats */ + if (n >> 5) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + for (j = (n >> 5) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + x4 = LD_SP(px); px += 4; + out0 = s0 * y0; + x5 = LD_SP(px); px += 4; + out2 = s0 * y1; + x6 = LD_SP(px); px += 4; + out4 = s0 * y2; + x7 = LD_SP(px); px += 4; + out6 = s0 * y3; + y4 = LD_SP(py); py += 4; + out1 = -(s0 * x0); + y5 = LD_SP(py); py += 4; + out3 = -(s0 * x1); + y6 = LD_SP(py); py += 4; + out5 = -(s0 * x2); + y7 = LD_SP(py); py += 4; + out7 = -(s0 * x3); + + ST_SP(out0, x); x += 4; + out0 = s0 * y4; + ST_SP(out2, x); x += 4; + out2 = s0 * y5; + ST_SP(out4, x); x += 4; + out4 = s0 * y6; + ST_SP(out6, x); x += 4; + out6 = s0 * y7; + ST_SP(out1, y); y += 4; + out1 = -(s0 * x4); + ST_SP(out3, y); y += 4; + out3 = -(s0 * x5); + ST_SP(out5, y); y += 4; + out5 = -(s0 * x6); + ST_SP(out7, y); y += 4; + out7 = -(s0 * x7); + + x0 = LD_SP(px); px += 4; + ST_SP(out0, x); x += 4; + x1 = LD_SP(px); px += 4; + ST_SP(out2, x); x += 4; + x2 = LD_SP(px); px += 4; + ST_SP(out4, x); x += 4; + x3 = LD_SP(px); px += 4; + ST_SP(out6, x); x += 4; + y0 = LD_SP(py); py += 4; + ST_SP(out1, y); y += 4; + y1 = LD_SP(py); py += 4; + ST_SP(out3, y); y += 4; + y2 = LD_SP(py); py += 4; + ST_SP(out5, y); y += 4; + y3 = LD_SP(py); py += 4; + ST_SP(out7, y); y += 4; + + } + + out0 = s0 * y0; + out2 = s0 * y1; + out4 = s0 * y2; + out6 = s0 * y3; + out1 = -(s0 * x0); + out3 = -(s0 * x1); + out5 = -(s0 * x2); + out7 = -(s0 * x3); + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + + LD_SP4_INC(px, 4, x4, x5, x6, x7); + LD_SP4_INC(py, 4, y4, y5, y6, y7); + + out0 = s0 * y4; + out2 = s0 * y5; + out4 = s0 * y6; + out6 = s0 * y7; + out1 = -(s0 * x4); + out3 = -(s0 * x5); + out5 = -(s0 * x6); + out7 = -(s0 * x7); + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + if (n & 16) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + out0 = s0 * y0; + out1 = - (s0 * x0); + out2 = s0 * y1; + out3 = - (s0 * x1); + out4 = s0 * y2; + out5 = - (s0 * x2); + out6 = s0 * y3; + out7 = - (s0 * x3); + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + if (n & 8) + { + LD_SP2_INC(px, 4, x0, x1); + LD_SP2_INC(py, 4, y0, y1); + + out0 = s0 * y0; + out1 = - (s0 * x0); + out2 = s0 * y1; + out3 = - (s0 * x1); + + ST_SP2_INC(out0, out2, x, 4); + ST_SP2_INC(out1, out3, y, 4); + } + if (n & 4) + { + x0 = LD_SP(px); px += 4; + y0 = LD_SP(py); py += 4; + + out0 = s0 * y0; + out1 = - (s0 * x0); + + ST_SP(out0, x); x += 4; + ST_SP(out1, y); y += 4; + } + if (n & 2) + { + LD_GP2_INC(px, 1, fx0, fx1); + LD_GP2_INC(py, 1, fy0, fy1); + + tp0 = s * fy0; + tp1 = - (s * fx0); + tp2 = s * fy1; + tp3 = - (s * fx1); + + ST_GP2_INC(tp0, tp2, x, 1); + ST_GP2_INC(tp1, tp3, y, 1); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = s * fy0; + tp1 = - (s * fx0); + + *x = tp0; + *y = tp1; + } + } + else + { + c0 = COPY_FLOAT_TO_VECTOR(c); + s0 = COPY_FLOAT_TO_VECTOR(s); + + /* process 16 floats */ + if (n >> 5) + { + BLASLONG pref_offsetx, pref_offsety; + + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); + if (pref_offsetx > 0) + { + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; + pref_offsetx = pref_offsetx / sizeof(FLOAT); + } + + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); + if (pref_offsety > 0) + { + pref_offsety = L1_DATA_LINESIZE - pref_offsety; + pref_offsety = pref_offsety / sizeof(FLOAT); + } + + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + for (j = (n >> 5) - 1; j--;) + { + PREFETCH(px + pref_offsetx + 32); + PREFETCH(px + pref_offsetx + 40); + PREFETCH(px + pref_offsetx + 48); + PREFETCH(px + pref_offsetx + 56); + PREFETCH(py + pref_offsety + 32); + PREFETCH(py + pref_offsety + 40); + PREFETCH(py + pref_offsety + 48); + PREFETCH(py + pref_offsety + 56); + + x4 = LD_SP(px); px += 4; + out0 = c0 * x0; + x5 = LD_SP(px); px += 4; + out2 = c0 * x1; + x6 = LD_SP(px); px += 4; + out4 = c0 * x2; + x7 = LD_SP(px); px += 4; + out6 = c0 * x3; + y4 = LD_SP(py); py += 4; + out1 = c0 * y0; + y5 = LD_SP(py); py += 4; + out3 = c0 * y1; + y6 = LD_SP(py); py += 4; + out5 = c0 * y2; + y7 = LD_SP(py); py += 4; + out7 = c0 * y3; + + out0 += s0 * y0; + out2 += s0 * y1; + out4 += s0 * y2; + out6 += s0 * y3; + out1 -= s0 * x0; + out3 -= s0 * x1; + out5 -= s0 * x2; + out7 -= s0 * x3; + + ST_SP(out0, x); x += 4; + out0 = c0 * x4; + ST_SP(out2, x); x += 4; + out2 = c0 * x5; + ST_SP(out4, x); x += 4; + out4 = c0 * x6; + ST_SP(out6, x); x += 4; + out6 = c0 * x7; + ST_SP(out1, y); y += 4; + out1 = c0 * y4; + ST_SP(out3, y); y += 4; + out3 = c0 * y5; + ST_SP(out5, y); y += 4; + out5 = c0 * y6; + ST_SP(out7, y); y += 4; + out7 = c0 * y7; + + x0 = LD_SP(px); px += 4; + out0 += s0 * y4; + x1 = LD_SP(px); px += 4; + out2 += s0 * y5; + x2 = LD_SP(px); px += 4; + out4 += s0 * y6; + x3 = LD_SP(px); px += 4; + out6 += s0 * y7; + y0 = LD_SP(py); py += 4; + out1 -= s0 * x4; + y1 = LD_SP(py); py += 4; + out3 -= s0 * x5; + y2 = LD_SP(py); py += 4; + out5 -= s0 * x6; + y3 = LD_SP(py); py += 4; + out7 -= s0 * x7; + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + + out0 = c0 * x0; + out2 = c0 * x1; + out4 = c0 * x2; + out6 = c0 * x3; + out1 = c0 * y0; + out3 = c0 * y1; + out5 = c0 * y2; + out7 = c0 * y3; + + out0 += s0 * y0; + out2 += s0 * y1; + out4 += s0 * y2; + out6 += s0 * y3; + out1 -= s0 * x0; + out3 -= s0 * x1; + out5 -= s0 * x2; + out7 -= s0 * x3; + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + + LD_SP4_INC(px, 4, x4, x5, x6, x7); + LD_SP4_INC(py, 4, y4, y5, y6, y7); + + out0 = c0 * x4; + out2 = c0 * x5; + out4 = c0 * x6; + out6 = c0 * x7; + out1 = c0 * y4; + out3 = c0 * y5; + out5 = c0 * y6; + out7 = c0 * y7; + + out0 += s0 * y4; + out2 += s0 * y5; + out4 += s0 * y6; + out6 += s0 * y7; + out1 -= s0 * x4; + out3 -= s0 * x5; + out5 -= s0 * x6; + out7 -= s0 * x7; + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + if (n & 16) + { + LD_SP4_INC(px, 4, x0, x1, x2, x3); + LD_SP4_INC(py, 4, y0, y1, y2, y3); + + out0 = (c0 * x0) + (s0 * y0); + out1 = (c0 * y0) - (s0 * x0); + out2 = (c0 * x1) + (s0 * y1); + out3 = (c0 * y1) - (s0 * x1); + out4 = (c0 * x2) + (s0 * y2); + out5 = (c0 * y2) - (s0 * x2); + out6 = (c0 * x3) + (s0 * y3); + out7 = (c0 * y3) - (s0 * x3); + + ST_SP4_INC(out0, out2, out4, out6, x, 4); + ST_SP4_INC(out1, out3, out5, out7, y, 4); + } + if (n & 8) + { + LD_SP2_INC(px, 4, x0, x1); + LD_SP2_INC(py, 4, y0, y1); + + out0 = (c0 * x0) + (s0 * y0); + out1 = (c0 * y0) - (s0 * x0); + out2 = (c0 * x1) + (s0 * y1); + out3 = (c0 * y1) - (s0 * x1); + + ST_SP2_INC(out0, out2, x, 4); + ST_SP2_INC(out1, out3, y, 4); + } + if (n & 4) + { + x0 = LD_SP(px); + y0 = LD_SP(py); + px += 4; + py += 4; + + out0 = (c0 * x0) + (s0 * y0); + out1 = (c0 * y0) - (s0 * x0); + + ST_SP(out0, x); + ST_SP(out1, y); + x += 4; + y += 4; + } + if (n & 2) + { + LD_GP2_INC(px, 1, fx0, fx1); + LD_GP2_INC(py, 1, fy0, fy1); + + tp0 = (c * fx0) + (s * fy0); + tp1 = (c * fy0) - (s * fx0); + tp2 = (c * fx1) + (s * fy1); + tp3 = (c * fy1) - (s * fx1); + + ST_GP2_INC(tp0, tp2, x, 1); + ST_GP2_INC(tp1, tp3, y, 1); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = (c * fx0) + (s * fy0); + tp1 = (c * fy0) - (s * fx0); + + *x = tp0; + *y = tp1; + } + } + } + else + { + if ((0 == c) && (0 == s)) + { + for (i = n; i--;) + { + *x = 0; + *y = 0; + x += inc_x; + y += inc_y; + } + } + else if ((1 == c) && (1 == s)) + { + if (n >> 2) + { + fx0 = *px; px += inc_x; + fx1 = *px; px += inc_x; + fx2 = *px; px += inc_x; + fx3 = *px; px += inc_x; + fy0 = *py; py += inc_y; + fy1 = *py; py += inc_y; + fy2 = *py; py += inc_y; + fy3 = *py; py += inc_y; + + for (i = (n >> 2) -1; i--;) + { + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + tp2 = fx1 + fy1; + tp3 = fy1 - fx1; + tp4 = fx2 + fy2; + tp5 = fy2 - fx2; + tp6 = fx3 + fy3; + tp7 = fy3 - fx3; + + fx0 = *px; px += inc_x; + *x = tp0; x += inc_x; + fx1 = *px; px += inc_x; + *x = tp2; x += inc_x; + fx2 = *px; px += inc_x; + *x = tp4; x += inc_x; + fx3 = *px; px += inc_x; + *x = tp6; x += inc_x; + fy0 = *py; py += inc_y; + *y = tp1; y += inc_y; + fy1 = *py; py += inc_y; + *y = tp3; y += inc_y; + fy2 = *py; py += inc_y; + *y = tp5; y += inc_y; + fy3 = *py; py += inc_y; + *y = tp7; y += inc_y; + } + + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + tp2 = fx1 + fy1; + tp3 = fy1 - fx1; + tp4 = fx2 + fy2; + tp5 = fy2 - fx2; + tp6 = fx3 + fy3; + tp7 = fy3 - fx3; + + *x = tp0; x += inc_x; + *x = tp2; x += inc_x; + *x = tp4; x += inc_x; + *x = tp6; x += inc_x; + *y = tp1; y += inc_y; + *y = tp3; y += inc_y; + *y = tp5; y += inc_y; + *y = tp7; y += inc_y; + } + + if (n & 2) + { + LD_GP2_INC(px, inc_x, fx0, fx1); + LD_GP2_INC(py, inc_y, fy0, fy1); + + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + tp2 = fx1 + fy1; + tp3 = fy1 - fx1; + + ST_GP2_INC(tp0, tp2, x, inc_x); + ST_GP2_INC(tp1, tp3, y, inc_y); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = fx0 + fy0; + tp1 = fy0 - fx0; + + *x = tp0; + *y = tp1; + } + } + else if (0 == s) + { + if (n >> 2) + { + fx0 = *px; px += inc_x; + fx1 = *px; px += inc_x; + fx2 = *px; px += inc_x; + fx3 = *px; px += inc_x; + fy0 = *py; py += inc_y; + fy1 = *py; py += inc_y; + fy2 = *py; py += inc_y; + fy3 = *py; py += inc_y; + + for (i = (n >> 2) - 1; i--;) + { + tp0 = c * fx0; + tp1 = c * fy0; + tp2 = c * fx1; + tp3 = c * fy1; + tp4 = c * fx2; + tp5 = c * fy2; + tp6 = c * fx3; + tp7 = c * fy3; + + fx0 = *px; px += inc_x; + *x = tp0; x += inc_x; + fx1 = *px; px += inc_x; + *x = tp2; x += inc_x; + fx2 = *px; px += inc_x; + *x = tp4; x += inc_x; + fx3 = *px; px += inc_x; + *x = tp6; x += inc_x; + fy0 = *py; py += inc_y; + *y = tp1; y += inc_y; + fy1 = *py; py += inc_y; + *y = tp3; y += inc_y; + fy2 = *py; py += inc_y; + *y = tp5; y += inc_y; + fy3 = *py; py += inc_y; + *y = tp7; y += inc_y; + } + + tp0 = c * fx0; + tp1 = c * fy0; + tp2 = c * fx1; + tp3 = c * fy1; + tp4 = c * fx2; + tp5 = c * fy2; + tp6 = c * fx3; + tp7 = c * fy3; + + *x = tp0; x += inc_x; + *x = tp2; x += inc_x; + *x = tp4; x += inc_x; + *x = tp6; x += inc_x; + *y = tp1; y += inc_y; + *y = tp3; y += inc_y; + *y = tp5; y += inc_y; + *y = tp7; y += inc_y; + } + if (n & 2) + { + LD_GP2_INC(px, inc_x, fx0, fx1); + LD_GP2_INC(py, inc_y, fy0, fy1); + + tp0 = c * fx0; + tp1 = c * fy0; + tp2 = c * fx1; + tp3 = c * fy1; + + ST_GP2_INC(tp0, tp2, x, inc_x); + ST_GP2_INC(tp1, tp3, y, inc_y); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = c * fx0; + tp1 = c * fy0; + + *x = tp0; + *y = tp1; + } + } + else + { + if (n >> 2) + { + fx0 = *px; px += inc_x; + fx1 = *px; px += inc_x; + fx2 = *px; px += inc_x; + fx3 = *px; px += inc_x; + fy0 = *py; py += inc_y; + fy1 = *py; py += inc_y; + fy2 = *py; py += inc_y; + fy3 = *py; py += inc_y; + + for (i = (n >> 2) - 1; i--;) + { + tp0 = c * fx0 + s * fy0; + tp1 = c * fy0 - s * fx0; + tp2 = c * fx1 + s * fy1; + tp3 = c * fy1 - s * fx1; + tp4 = c * fx2 + s * fy2; + tp5 = c * fy2 - s * fx2; + tp6 = c * fx3 + s * fy3; + tp7 = c * fy3 - s * fx3; + + fx0 = *px; px += inc_x; + *x = tp0; x += inc_x; + fx1 = *px; px += inc_x; + *x = tp2; x += inc_x; + fx2 = *px; px += inc_x; + *x = tp4; x += inc_x; + fx3 = *px; px += inc_x; + *x = tp6; x += inc_x; + fy0 = *py; py += inc_y; + *y = tp1; y += inc_y; + fy1 = *py; py += inc_y; + *y = tp3; y += inc_y; + fy2 = *py; py += inc_y; + *y = tp5; y += inc_y; + fy3 = *py; py += inc_y; + *y = tp7; y += inc_y; + } + + tp0 = c * fx0 + s * fy0; + tp1 = c * fy0 - s * fx0; + tp2 = c * fx1 + s * fy1; + tp3 = c * fy1 - s * fx1; + tp4 = c * fx2 + s * fy2; + tp5 = c * fy2 - s * fx2; + tp6 = c * fx3 + s * fy3; + tp7 = c * fy3 - s * fx3; + + *x = tp0; x += inc_x; + *x = tp2; x += inc_x; + *x = tp4; x += inc_x; + *x = tp6; x += inc_x; + *y = tp1; y += inc_y; + *y = tp3; y += inc_y; + *y = tp5; y += inc_y; + *y = tp7; y += inc_y; + } + if (n & 2) + { + LD_GP2_INC(px, inc_x, fx0, fx1); + LD_GP2_INC(py, inc_y, fy0, fy1); + + tp0 = c * fx0 + s * fy0; + tp1 = c * fy0 - s * fx0; + tp2 = c * fx1 + s * fy1; + tp3 = c * fy1 - s * fx1; + + ST_GP2_INC(tp0, tp2, x, inc_x); + ST_GP2_INC(tp1, tp3, y, inc_y); + } + if (n & 1) + { + fx0 = *px; + fy0 = *py; + + tp0 = c * fx0 + s * fy0; + tp1 = c * fy0 - s * fx0; + + *x = tp0; + *y = tp1; + } + } + } + + return 0; +} diff --git a/kernel/mips/zrot_msa.c b/kernel/mips/zrot_msa.c new file mode 100644 index 0000000..c71b157 --- /dev/null +++ b/kernel/mips/zrot_msa.c @@ -0,0 +1,738 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +#define PROCESS_ZROT(inc_x2, inc_y2) \ + if ((0 == c) && (0 == s)) \ + { \ + v2f64 zero = {0, 0}; \ + zero = (v2f64) __msa_insert_d((v2i64) zero, 0, 0.0); \ + zero = (v2f64) __msa_insert_d((v2i64) zero, 1, 0.0); \ + \ + /* process 4 floats */ \ + for (j = (n >> 1); j--;) \ + { \ + ST_DP2_INC(zero, zero, px, inc_x2); \ + ST_DP2_INC(zero, zero, py, inc_y2); \ + } \ + \ + if (n & 1) \ + { \ + ST_DP(zero, px); \ + ST_DP(zero, py); \ + } \ + } \ + else if ((1 == c) && (1 == s)) \ + { \ + /* process 8 elements */ \ + if (n >> 3) \ + { \ + BLASLONG pref_offsetx, pref_offsety; \ + \ + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \ + if (pref_offsetx > 0) \ + { \ + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \ + pref_offsetx = pref_offsetx / sizeof(FLOAT); \ + } \ + \ + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \ + if (pref_offsety > 0) \ + { \ + pref_offsety = L1_DATA_LINESIZE - pref_offsety; \ + pref_offsety = pref_offsety / sizeof(FLOAT); \ + } \ + \ + x0 = LD_DP(px); px += inc_x2; \ + x1 = LD_DP(px); px += inc_x2; \ + x2 = LD_DP(px); px += inc_x2; \ + x3 = LD_DP(px); px += inc_x2; \ + y0 = LD_DP(py); py += inc_y2; \ + y1 = LD_DP(py); py += inc_y2; \ + y2 = LD_DP(py); py += inc_y2; \ + y3 = LD_DP(py); py += inc_y2; \ + \ + for (j = (n >> 3) - 1; j--;) \ + { \ + PREFETCH(px + pref_offsetx + 16); \ + PREFETCH(px + pref_offsetx + 20); \ + PREFETCH(px + pref_offsetx + 24); \ + PREFETCH(px + pref_offsetx + 28); \ + PREFETCH(py + pref_offsety + 16); \ + PREFETCH(py + pref_offsety + 20); \ + PREFETCH(py + pref_offsety + 24); \ + PREFETCH(py + pref_offsety + 28); \ + \ + out0 = x0 + y0; \ + x4 = LD_DP(px); px += inc_x2; \ + out1 = y0 - x0; \ + x5 = LD_DP(px); px += inc_x2; \ + out2 = x1 + y1; \ + x6 = LD_DP(px); px += inc_x2; \ + out3 = y1 - x1; \ + x7 = LD_DP(px); px += inc_x2; \ + out4 = x2 + y2; \ + y4 = LD_DP(py); py += inc_y2; \ + out5 = y2 - x2; \ + y5 = LD_DP(py); py += inc_y2; \ + out6 = x3 + y3; \ + y6 = LD_DP(py); py += inc_y2; \ + out7 = y3 - x3; \ + y7 = LD_DP(py); py += inc_y2; \ + \ + ST_DP(out0, x); x += inc_x2; \ + out8 = x4 + y4; \ + ST_DP(out1, y); y += inc_y2; \ + out9 = y4 - x4; \ + ST_DP(out2, x); x += inc_x2; \ + out10 = x5 + y5; \ + ST_DP(out3, y); y += inc_y2; \ + out11 = y5 - x5; \ + ST_DP(out4, x); x += inc_x2; \ + out12 = x6 + y6; \ + ST_DP(out5, y); y += inc_y2; \ + out13 = y6 - x6; \ + ST_DP(out6, x); x += inc_x2; \ + out14 = x7 + y7; \ + ST_DP(out7, y); y += inc_y2; \ + out15 = y7 - x7; \ + \ + x0 = LD_DP(px); px += inc_x2; \ + ST_DP(out8, x); x += inc_x2; \ + x1 = LD_DP(px); px += inc_x2; \ + ST_DP(out10, x); x += inc_x2; \ + x2 = LD_DP(px); px += inc_x2; \ + ST_DP(out12, x); x += inc_x2; \ + x3 = LD_DP(px); px += inc_x2; \ + ST_DP(out14, x); x += inc_x2; \ + \ + y0 = LD_DP(py); py += inc_y2; \ + ST_DP(out9, y); y += inc_y2; \ + y1 = LD_DP(py); py += inc_y2; \ + ST_DP(out11, y); y += inc_y2; \ + y2 = LD_DP(py); py += inc_y2; \ + ST_DP(out13, y); y += inc_y2; \ + y3 = LD_DP(py); py += inc_y2; \ + ST_DP(out15, y); y += inc_y2; \ + } \ + \ + x4 = LD_DP(px); px += inc_x2; \ + x5 = LD_DP(px); px += inc_x2; \ + x6 = LD_DP(px); px += inc_x2; \ + x7 = LD_DP(px); px += inc_x2; \ + y4 = LD_DP(py); py += inc_y2; \ + y5 = LD_DP(py); py += inc_y2; \ + y6 = LD_DP(py); py += inc_y2; \ + y7 = LD_DP(py); py += inc_y2; \ + \ + out0 = x0 + y0; \ + out1 = y0 - x0; \ + out2 = x1 + y1; \ + out3 = y1 - x1; \ + out4 = x2 + y2; \ + out5 = y2 - x2; \ + out6 = x3 + y3; \ + out7 = y3 - x3; \ + out8 = x4 + y4; \ + out9 = y4 - x4; \ + out10 = x5 + y5; \ + out11 = y5 - x5; \ + out12 = x6 + y6; \ + out13 = y6 - x6; \ + out14 = x7 + y7; \ + out15 = y7 - x7; \ + \ + ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, inc_x2); \ + ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, inc_y2); \ + } \ + if (n & 4) \ + { \ + LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ + LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ + \ + out0 = x0 + y0; \ + out1 = y0 - x0; \ + out2 = x1 + y1; \ + out3 = y1 - x1; \ + out4 = x2 + y2; \ + out5 = y2 - x2; \ + out6 = x3 + y3; \ + out7 = y3 - x3; \ + \ + ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ + ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ + } \ + if (n & 2) \ + { \ + LD_DP2_INC(px, inc_x2, x0, x1); \ + LD_DP2_INC(py, inc_y2, y0, y1); \ + \ + out0 = x0 + y0; \ + out1 = y0 - x0; \ + out2 = x1 + y1; \ + out3 = y1 - x1; \ + \ + ST_DP2_INC(out0, out2, x, inc_x2); \ + ST_DP2_INC(out1, out3, y, inc_y2); \ + } \ + if (n & 1) \ + { \ + x0 = LD_DP(px); \ + y0 = LD_DP(py); \ + \ + out0 = x0 + y0; \ + out1 = y0 - x0; \ + \ + ST_DP(out0, px); \ + ST_DP(out1, py); \ + } \ + } \ + else if (0 == s) \ + { \ + c0 = COPY_DOUBLE_TO_VECTOR(c); \ + \ + if (n >> 3) \ + { \ + BLASLONG pref_offsetx, pref_offsety; \ + \ + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \ + if (pref_offsetx > 0) \ + { \ + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \ + pref_offsetx = pref_offsetx / sizeof(FLOAT); \ + } \ + \ + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \ + if (pref_offsety > 0) \ + { \ + pref_offsety = L1_DATA_LINESIZE - pref_offsety; \ + pref_offsety = pref_offsety / sizeof(FLOAT); \ + } \ + \ + LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); \ + \ + for (j = (n >> 3) - 1; j--;) \ + { \ + PREFETCH(px + pref_offsetx + 16); \ + PREFETCH(px + pref_offsetx + 20); \ + PREFETCH(px + pref_offsetx + 24); \ + PREFETCH(px + pref_offsetx + 28); \ + PREFETCH(py + pref_offsety + 16); \ + PREFETCH(py + pref_offsety + 20); \ + PREFETCH(py + pref_offsety + 24); \ + PREFETCH(py + pref_offsety + 28); \ + \ + y0 = LD_DP(py); py += inc_y2; \ + x0 *= c0; \ + y1 = LD_DP(py); py += inc_y2; \ + x1 *= c0; \ + y2 = LD_DP(py); py += inc_y2; \ + x2 *= c0; \ + y3 = LD_DP(py); py += inc_y2; \ + x3 *= c0; \ + y4 = LD_DP(py); py += inc_y2; \ + x4 *= c0; \ + y5 = LD_DP(py); py += inc_y2; \ + x5 *= c0; \ + y6 = LD_DP(py); py += inc_y2; \ + x6 *= c0; \ + y7 = LD_DP(py); py += inc_y2; \ + x7 *= c0; \ + \ + ST_DP(x0, x); x += inc_x2; \ + y0 *= c0; \ + ST_DP(x1, x); x += inc_x2; \ + y1 *= c0; \ + ST_DP(x2, x); x += inc_x2; \ + y2 *= c0; \ + ST_DP(x3, x); x += inc_x2; \ + y3 *= c0; \ + ST_DP(x4, x); x += inc_x2; \ + y4 *= c0; \ + ST_DP(x5, x); x += inc_x2; \ + y5 *= c0; \ + ST_DP(x6, x); x += inc_x2; \ + y6 *= c0; \ + ST_DP(x7, x); x += inc_x2; \ + y7 *= c0; \ + \ + x0 = LD_DP(px); px += inc_x2; \ + ST_DP(y0, y); y += inc_y2; \ + x1 = LD_DP(px); px += inc_x2; \ + ST_DP(y1, y); y += inc_y2; \ + x2 = LD_DP(px); px += inc_x2; \ + ST_DP(y2, y); y += inc_y2; \ + x3 = LD_DP(px); px += inc_x2; \ + ST_DP(y3, y); y += inc_y2; \ + x4 = LD_DP(px); px += inc_x2; \ + ST_DP(y4, y); y += inc_y2; \ + x5 = LD_DP(px); px += inc_x2; \ + ST_DP(y5, y); y += inc_y2; \ + x6 = LD_DP(px); px += inc_x2; \ + ST_DP(y6, y); y += inc_y2; \ + x7 = LD_DP(px); px += inc_x2; \ + ST_DP(y7, y); y += inc_y2; \ + } \ + \ + LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); \ + \ + x0 *= c0; \ + y0 *= c0; \ + x1 *= c0; \ + y1 *= c0; \ + x2 *= c0; \ + y2 *= c0; \ + x3 *= c0; \ + y3 *= c0; \ + x4 *= c0; \ + y4 *= c0; \ + x5 *= c0; \ + y5 *= c0; \ + x6 *= c0; \ + y6 *= c0; \ + x7 *= c0; \ + y7 *= c0; \ + \ + ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); \ + ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); \ + } \ + \ + if (n & 4) \ + { \ + LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ + LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ + \ + out0 = c0 * x0; \ + out1 = c0 * y0; \ + out2 = c0 * x1; \ + out3 = c0 * y1; \ + out4 = c0 * x2; \ + out5 = c0 * y2; \ + out6 = c0 * x3; \ + out7 = c0 * y3; \ + \ + ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ + ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ + } \ + if (n & 2) \ + { \ + LD_DP2_INC(px, inc_x2, x0, x1); \ + LD_DP2_INC(py, inc_y2, y0, y1); \ + \ + out0 = c0 * x0; \ + out1 = c0 * y0; \ + out2 = c0 * x1; \ + out3 = c0 * y1; \ + \ + ST_DP2_INC(out0, out2, x, inc_x2); \ + ST_DP2_INC(out1, out3, y, inc_y2); \ + } \ + if (n & 1) \ + { \ + x0 = LD_DP(px); \ + y0 = LD_DP(py); \ + \ + out0 = c0 * x0; \ + out1 = c0 * y0; \ + \ + ST_DP(out0, px); \ + ST_DP(out1, py); \ + } \ + } \ + else if (0 == c) \ + { \ + s0 = COPY_DOUBLE_TO_VECTOR(s); \ + \ + /* process 16 floats */ \ + if (n >> 3) \ + { \ + BLASLONG pref_offsetx, pref_offsety; \ + \ + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \ + if (pref_offsetx > 0) \ + { \ + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \ + pref_offsetx = pref_offsetx / sizeof(FLOAT); \ + } \ + \ + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \ + if (pref_offsety > 0) \ + { \ + pref_offsety = L1_DATA_LINESIZE - pref_offsety; \ + pref_offsety = pref_offsety / sizeof(FLOAT); \ + } \ + \ + LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ + LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ + \ + for (j = (n >> 3) - 1; j--;) \ + { \ + PREFETCH(px + pref_offsetx + 16); \ + PREFETCH(px + pref_offsetx + 20); \ + PREFETCH(px + pref_offsetx + 24); \ + PREFETCH(px + pref_offsetx + 28); \ + PREFETCH(py + pref_offsety + 16); \ + PREFETCH(py + pref_offsety + 20); \ + PREFETCH(py + pref_offsety + 24); \ + PREFETCH(py + pref_offsety + 28); \ + \ + x4 = LD_DP(px); px += inc_x2; \ + out0 = s0 * y0; \ + x5 = LD_DP(px); px += inc_x2; \ + out2 = s0 * y1; \ + x6 = LD_DP(px); px += inc_x2; \ + out4 = s0 * y2; \ + x7 = LD_DP(px); px += inc_x2; \ + out6 = s0 * y3; \ + y4 = LD_DP(py); py += inc_y2; \ + out1 = -(s0 * x0); \ + y5 = LD_DP(py); py += inc_y2; \ + out3 = -(s0 * x1); \ + y6 = LD_DP(py); py += inc_y2; \ + out5 = -(s0 * x2); \ + y7 = LD_DP(py); py += inc_y2; \ + out7 = -(s0 * x3); \ + \ + ST_DP(out0, x); x += inc_y2; \ + out0 = s0 * y4; \ + ST_DP(out2, x); x += inc_y2; \ + out2 = s0 * y5; \ + ST_DP(out4, x); x += inc_y2; \ + out4 = s0 * y6; \ + ST_DP(out6, x); x += inc_y2; \ + out6 = s0 * y7; \ + ST_DP(out1, y); y += inc_y2; \ + out1 = -(s0 * x4); \ + ST_DP(out3, y); y += inc_y2; \ + out3 = -(s0 * x5); \ + ST_DP(out5, y); y += inc_y2; \ + out5 = -(s0 * x6); \ + ST_DP(out7, y); y += inc_y2; \ + out7 = -(s0 * x7); \ + \ + x0 = LD_DP(px); px += inc_x2; \ + ST_DP(out0, x); x += inc_y2; \ + x1 = LD_DP(px); px += inc_x2; \ + ST_DP(out2, x); x += inc_y2; \ + x2 = LD_DP(px); px += inc_x2; \ + ST_DP(out4, x); x += inc_y2; \ + x3 = LD_DP(px); px += inc_x2; \ + ST_DP(out6, x); x += inc_y2; \ + y0 = LD_DP(py); py += inc_y2; \ + ST_DP(out1, y); y += inc_y2; \ + y1 = LD_DP(py); py += inc_y2; \ + ST_DP(out3, y); y += inc_y2; \ + y2 = LD_DP(py); py += inc_y2; \ + ST_DP(out5, y); y += inc_y2; \ + y3 = LD_DP(py); py += inc_y2; \ + ST_DP(out7, y); y += inc_y2; \ + } \ + \ + out0 = s0 * y0; \ + out2 = s0 * y1; \ + out4 = s0 * y2; \ + out6 = s0 * y3; \ + out1 = -(s0 * x0); \ + out3 = -(s0 * x1); \ + out5 = -(s0 * x2); \ + out7 = -(s0 * x3); \ + \ + ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ + ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ + \ + LD_DP4_INC(px, inc_x2, x4, x5, x6, x7); \ + LD_DP4_INC(py, inc_y2, y4, y5, y6, y7); \ + \ + out0 = s0 * y4; \ + out2 = s0 * y5; \ + out4 = s0 * y6; \ + out6 = s0 * y7; \ + out1 = -(s0 * x4); \ + out3 = -(s0 * x5); \ + out5 = -(s0 * x6); \ + out7 = -(s0 * x7); \ + \ + ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ + ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ + } \ + if (n & 4) \ + { \ + LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ + LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ + \ + out0 = s0 * y0; \ + out1 = - (s0 * x0); \ + out2 = s0 * y1; \ + out3 = - (s0 * x1); \ + out4 = s0 * y2; \ + out5 = - (s0 * x2); \ + out6 = s0 * y3; \ + out7 = - (s0 * x3); \ + \ + ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ + ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ + } \ + if (n & 2) \ + { \ + LD_DP2_INC(px, inc_x2, x0, x1); \ + LD_DP2_INC(py, inc_y2, y0, y1); \ + \ + out0 = s0 * y0; \ + out1 = - (s0 * x0); \ + out2 = s0 * y1; \ + out3 = - (s0 * x1); \ + \ + ST_DP2_INC(out0, out2, x, inc_x2); \ + ST_DP2_INC(out1, out3, y, inc_y2); \ + } \ + if (n & 1) \ + { \ + x0 = LD_DP(px); px += inc_x2; \ + y0 = LD_DP(py); py += inc_y2; \ + \ + out0 = s0 * y0; \ + out1 = - (s0 * x0); \ + \ + ST_DP(out0, x); x += inc_x2; \ + ST_DP(out1, y); y += inc_y2; \ + } \ + } \ + else \ + { \ + c0 = COPY_DOUBLE_TO_VECTOR(c); \ + s0 = COPY_DOUBLE_TO_VECTOR(s); \ + \ + if (n >> 3) \ + { \ + BLASLONG pref_offsetx, pref_offsety; \ + \ + pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \ + if (pref_offsetx > 0) \ + { \ + pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \ + pref_offsetx = pref_offsetx / sizeof(FLOAT); \ + } \ + \ + pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \ + if (pref_offsety > 0) \ + { \ + pref_offsety = L1_DATA_LINESIZE - pref_offsety; \ + pref_offsety = pref_offsety / sizeof(FLOAT); \ + } \ + \ + LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ + LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ + \ + for (j = (n >> 3) - 1; j--;) \ + { \ + PREFETCH(px + pref_offsetx + 16); \ + PREFETCH(px + pref_offsetx + 20); \ + PREFETCH(px + pref_offsetx + 24); \ + PREFETCH(px + pref_offsetx + 28); \ + PREFETCH(py + pref_offsety + 16); \ + PREFETCH(py + pref_offsety + 20); \ + PREFETCH(py + pref_offsety + 24); \ + PREFETCH(py + pref_offsety + 28); \ + \ + x4 = LD_DP(px); px += inc_x2; \ + out0 = c0 * x0; \ + x5 = LD_DP(px); px += inc_x2; \ + out2 = c0 * x1; \ + x6 = LD_DP(px); px += inc_x2; \ + out4 = c0 * x2; \ + x7 = LD_DP(px); px += inc_x2; \ + out6 = c0 * x3; \ + y4 = LD_DP(py); py += inc_y2; \ + out1 = c0 * y0; \ + y5 = LD_DP(py); py += inc_y2; \ + out3 = c0 * y1; \ + y6 = LD_DP(py); py += inc_y2; \ + out5 = c0 * y2; \ + y7 = LD_DP(py); py += inc_y2; \ + out7 = c0 * y3; \ + \ + out0 += s0 * y0; \ + out2 += s0 * y1; \ + out4 += s0 * y2; \ + out6 += s0 * y3; \ + out1 -= s0 * x0; \ + out3 -= s0 * x1; \ + out5 -= s0 * x2; \ + out7 -= s0 * x3; \ + \ + ST_DP(out0, x); x += inc_x2; \ + out0 = c0 * x4; \ + ST_DP(out2, x); x += inc_x2; \ + out2 = c0 * x5; \ + ST_DP(out4, x); x += inc_x2; \ + out4 = c0 * x6; \ + ST_DP(out6, x); x += inc_x2; \ + out6 = c0 * x7; \ + ST_DP(out1, y); y += inc_y2; \ + out1 = c0 * y4; \ + ST_DP(out3, y); y += inc_y2; \ + out3 = c0 * y5; \ + ST_DP(out5, y); y += inc_y2; \ + out5 = c0 * y6; \ + ST_DP(out7, y); y += inc_y2; \ + out7 = c0 * y7; \ + \ + x0 = LD_DP(px); px += inc_x2; \ + out0 += s0 * y4; \ + x1 = LD_DP(px); px += inc_x2; \ + out2 += s0 * y5; \ + x2 = LD_DP(px); px += inc_x2; \ + out4 += s0 * y6; \ + x3 = LD_DP(px); px += inc_x2; \ + out6 += s0 * y7; \ + y0 = LD_DP(py); py += inc_y2; \ + out1 -= s0 * x4; \ + y1 = LD_DP(py); py += inc_y2; \ + out3 -= s0 * x5; \ + y2 = LD_DP(py); py += inc_y2; \ + out5 -= s0 * x6; \ + y3 = LD_DP(py); py += inc_y2; \ + out7 -= s0 * x7; \ + \ + ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ + ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ + } \ + \ + out0 = c0 * x0; \ + out0 += s0 * y0; \ + out1 = c0 * y0; \ + out1 -= s0 * x0; \ + out2 = c0 * x1; \ + out2 += s0 * y1; \ + out3 = c0 * y1; \ + out3 -= s0 * x1; \ + out4 = c0 * x2; \ + out4 += s0 * y2; \ + out5 = c0 * y2; \ + out5 -= s0 * x2; \ + out6 = c0 * x3; \ + out6 += s0 * y3; \ + out7 = c0 * y3; \ + out7 -= s0 * x3; \ + \ + ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ + ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ + \ + LD_DP4_INC(px, inc_x2, x4, x5, x6, x7); \ + LD_DP4_INC(py, inc_y2, y4, y5, y6, y7); \ + \ + out8 = c0 * x4; \ + out8 += s0 * y4; \ + out9 = c0 * y4; \ + out9 -= s0 * x4; \ + out10 = c0 * x5; \ + out10 += s0 * y5; \ + out11 = c0 * y5; \ + out11 -= s0 * x5; \ + out12 = c0 * x6; \ + out12 += s0 * y6; \ + out13 = c0 * y6; \ + out13 -= s0 * x6; \ + out14 = c0 * x7; \ + out14 += s0 * y7; \ + out15 = c0 * y7; \ + out15 -= s0 * x7; \ + \ + ST_DP4_INC(out8, out10, out12, out14, x, inc_x2); \ + ST_DP4_INC(out9, out11, out13, out15, y, inc_y2); \ + } \ + if (n & 4) \ + { \ + LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \ + LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \ + \ + out0 = (c0 * x0) + (s0 * y0); \ + out1 = (c0 * y0) - (s0 * x0); \ + out2 = (c0 * x1) + (s0 * y1); \ + out3 = (c0 * y1) - (s0 * x1); \ + out4 = (c0 * x2) + (s0 * y2); \ + out5 = (c0 * y2) - (s0 * x2); \ + out6 = (c0 * x3) + (s0 * y3); \ + out7 = (c0 * y3) - (s0 * x3); \ + \ + ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \ + ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \ + } \ + if (n & 2) \ + { \ + LD_DP2_INC(px, inc_x2, x0, x1); \ + LD_DP2_INC(py, inc_y2, y0, y1); \ + \ + out0 = (c0 * x0) + (s0 * y0); \ + out1 = (c0 * y0) - (s0 * x0); \ + out2 = (c0 * x1) + (s0 * y1); \ + out3 = (c0 * y1) - (s0 * x1); \ + \ + ST_DP2_INC(out0, out2, x, inc_x2); \ + ST_DP2_INC(out1, out3, y, inc_y2); \ + } \ + if (n & 1) \ + { \ + x0 = LD_DP(px); \ + y0 = LD_DP(py); \ + \ + out0 = (c0 * x0) + (s0 * y0); \ + out1 = (c0 * y0) - (s0 * x0); \ + \ + ST_DP(out0, px); \ + ST_DP(out1, py); \ + } \ + } + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) +{ + BLASLONG j; + FLOAT *px, *py; + v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7; + v2f64 out0, out1, out2, out3, out4, out5, out6, out7, c0, s0; + v2f64 out8, out9, out10, out11, out12, out13, out14, out15; + + px = x; + py = y; + + if ((1 == inc_x) && (1 == inc_y)) + { + PROCESS_ZROT(2, 2); + } + else + { + inc_x *= 2; + inc_y *= 2; + + PROCESS_ZROT(inc_x, inc_y); + } + + return 0; +}