1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
32 FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
33 FLOAT *dummy, BLASLONG dummy2)
35 BLASLONG i = 0, pref_offsetx, pref_offsety;
37 FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
38 FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
39 v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
40 v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
42 if (n < 0) return (0);
44 pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
47 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
48 pref_offsetx = pref_offsetx / sizeof(FLOAT);
51 pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
54 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
55 pref_offsety = pref_offsety / sizeof(FLOAT);
61 if ((1 == inc_x) && (1 == inc_y))
65 LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
67 for (i = (n >> 4) - 1; i--;)
69 PREFETCH(px + pref_offsetx + 16);
70 PREFETCH(px + pref_offsetx + 20);
71 PREFETCH(px + pref_offsetx + 24);
72 PREFETCH(px + pref_offsetx + 28);
74 PREFETCH(py + pref_offsety + 16);
75 PREFETCH(py + pref_offsety + 20);
76 PREFETCH(py + pref_offsety + 24);
77 PREFETCH(py + pref_offsety + 28);
79 yv0 = LD_DP(py); py += 2;
80 ST_DP(xv0, srcy); srcy += 2;
81 yv1 = LD_DP(py); py += 2;
82 ST_DP(xv1, srcy); srcy += 2;
83 yv2 = LD_DP(py); py += 2;
84 ST_DP(xv2, srcy); srcy += 2;
85 yv3 = LD_DP(py); py += 2;
86 ST_DP(xv3, srcy); srcy += 2;
87 yv4 = LD_DP(py); py += 2;
88 ST_DP(xv4, srcy); srcy += 2;
89 yv5 = LD_DP(py); py += 2;
90 ST_DP(xv5, srcy); srcy += 2;
91 yv6 = LD_DP(py); py += 2;
92 ST_DP(xv6, srcy); srcy += 2;
93 yv7 = LD_DP(py); py += 2;
94 ST_DP(xv7, srcy); srcy += 2;
96 xv0 = LD_DP(px); px += 2;
97 ST_DP(yv0, srcx); srcx += 2;
98 xv1 = LD_DP(px); px += 2;
99 ST_DP(yv1, srcx); srcx += 2;
100 xv2 = LD_DP(px); px += 2;
101 ST_DP(yv2, srcx); srcx += 2;
102 xv3 = LD_DP(px); px += 2;
103 ST_DP(yv3, srcx); srcx += 2;
104 xv4 = LD_DP(px); px += 2;
105 ST_DP(yv4, srcx); srcx += 2;
106 xv5 = LD_DP(px); px += 2;
107 ST_DP(yv5, srcx); srcx += 2;
108 xv6 = LD_DP(px); px += 2;
109 ST_DP(yv6, srcx); srcx += 2;
110 xv7 = LD_DP(px); px += 2;
111 ST_DP(yv7, srcx); srcx += 2;
114 LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
115 ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2);
116 ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2);
121 if ((n & 8) && (n & 4) && (n & 2))
123 LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
124 LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
125 ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2);
126 ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2);
128 else if ((n & 8) && (n & 4))
130 LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5);
131 LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5);
132 ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2);
133 ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2);
135 else if ((n & 8) && (n & 2))
137 LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4);
138 LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4);
139 ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2);
140 ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2);
142 else if ((n & 4) && (n & 2))
144 LD_DP3_INC(px, 2, xv0, xv1, xv2);
145 LD_DP3_INC(py, 2, yv0, yv1, yv2);
146 ST_DP3_INC(xv0, xv1, xv2, srcy, 2);
147 ST_DP3_INC(yv0, yv1, yv2, srcx, 2);
151 LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3);
152 LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3);
153 ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2);
154 ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2);
158 LD_DP2_INC(px, 2, xv0, xv1);
159 LD_DP2_INC(py, 2, yv0, yv1);
160 ST_DP2_INC(xv0, xv1, srcy, 2);
161 ST_DP2_INC(yv0, yv1, srcx, 2);
189 for (i = (n >> 3); i--;)
191 LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
192 LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
193 ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
194 ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
199 if ((n & 4) && (n & 2) && (n & 1))
201 LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
202 LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
203 ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
204 ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
206 else if ((n & 4) && (n & 2))
208 LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
209 LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
210 ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
211 ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
213 else if ((n & 4) && (n & 1))
215 LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
216 LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
217 ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
218 ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
220 else if ((n & 2) && (n & 1))
222 LD_GP3_INC(px, inc_x, x0, x1, x2);
223 LD_GP3_INC(py, inc_y, y0, y1, y2);
224 ST_GP3_INC(x0, x1, x2, srcy, inc_y);
225 ST_GP3_INC(y0, y1, y2, srcx, inc_x);
229 LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
230 LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
231 ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
232 ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
236 LD_GP2_INC(px, inc_x, x0, x1);
237 LD_GP2_INC(py, inc_y, y0, y1);
238 ST_GP2_INC(x0, x1, srcy, inc_y);
239 ST_GP2_INC(y0, y1, srcx, inc_x);