1 /*******************************************************************************
2 Copyright (c) 2017, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
32 BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
37 FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
38 v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
47 v2f64 zero_v = __msa_cast_to_vector_double(0);
48 zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
49 zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
51 for (i = (n >> 5); i--;)
53 ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
54 zero_v, zero_v, x, 2);
55 ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
56 zero_v, zero_v, x, 2);
63 ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
64 zero_v, zero_v, x, 2);
69 ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
74 ST_DP2_INC(zero_v, zero_v, x, 2);
91 da_vec = COPY_DOUBLE_TO_VECTOR(da);
98 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
101 pref_offset = L1_DATA_LINESIZE - pref_offset;
102 pref_offset = pref_offset / sizeof(FLOAT);
104 x_pref = x + pref_offset + 32 + 16;
106 LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
107 for (i = 0; i < (n >> 5) - 1; i++)
109 PREF_OFFSET(x_pref, 0);
110 PREF_OFFSET(x_pref, 32);
111 PREF_OFFSET(x_pref, 64);
112 PREF_OFFSET(x_pref, 96);
113 PREF_OFFSET(x_pref, 128);
114 PREF_OFFSET(x_pref, 160);
115 PREF_OFFSET(x_pref, 192);
116 PREF_OFFSET(x_pref, 224);
119 x8 = LD_DP(px); px += 2;
121 x9 = LD_DP(px); px += 2;
123 x10 = LD_DP(px); px += 2;
125 x11 = LD_DP(px); px += 2;
127 x12 = LD_DP(px); px += 2;
129 x13 = LD_DP(px); px += 2;
131 x14 = LD_DP(px); px += 2;
133 x15 = LD_DP(px); px += 2;
136 ST_DP(x0, x); x += 2;
138 ST_DP(x1, x); x += 2;
140 ST_DP(x2, x); x += 2;
142 ST_DP(x3, x); x += 2;
144 ST_DP(x4, x); x += 2;
146 ST_DP(x5, x); x += 2;
148 ST_DP(x6, x); x += 2;
150 ST_DP(x7, x); x += 2;
151 ST_DP(x8, x); x += 2;
152 x0 = LD_DP(px); px += 2;
153 ST_DP(x9, x); x += 2;
154 x1 = LD_DP(px); px += 2;
155 ST_DP(x10, x); x += 2;
156 x2 = LD_DP(px); px += 2;
157 ST_DP(x11, x); x += 2;
158 x3 = LD_DP(px); px += 2;
159 ST_DP(x12, x); x += 2;
160 x4 = LD_DP(px); px += 2;
161 ST_DP(x13, x); x += 2;
162 x5 = LD_DP(px); px += 2;
163 ST_DP(x14, x); x += 2;
164 x6 = LD_DP(px); px += 2;
165 ST_DP(x15, x); x += 2;
166 x7 = LD_DP(px); px += 2;
169 x8 = LD_DP(px); px += 2;
171 x9 = LD_DP(px); px += 2;
173 x10 = LD_DP(px); px += 2;
175 x11 = LD_DP(px); px += 2;
177 x12 = LD_DP(px); px += 2;
179 x13 = LD_DP(px); px += 2;
181 x14 = LD_DP(px); px += 2;
183 x15 = LD_DP(px); px += 2;
186 ST_DP(x0, x); x += 2;
188 ST_DP(x1, x); x += 2;
190 ST_DP(x2, x); x += 2;
192 ST_DP(x3, x); x += 2;
194 ST_DP(x4, x); x += 2;
196 ST_DP(x5, x); x += 2;
198 ST_DP(x6, x); x += 2;
200 ST_DP(x7, x); x += 2;
202 ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 2);
209 LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
210 MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
211 MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
212 ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
217 LD_DP4_INC(px, 2, x0, x1, x2, x3);
218 MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
219 ST_DP4_INC(x0, x1, x2, x3, x, 2);
224 LD_DP2_INC(px, 2, x0, x1);
225 MUL2(x0, da_vec, x1, da_vec, x0, x1);
226 ST_DP2_INC(x0, x1, x, 2);
231 LD_GP2_INC(px, 1, f0, f1);
232 MUL2(f0, da, f1, da, f0, f1);
233 ST_GP2_INC(f0, f1, x, 1);
258 LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
259 for (i = 0; i < (n >> 4) - 1; i++)
261 LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
262 MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
286 *x = f10; x += inc_x;
287 *x = f11; x += inc_x;
288 f0 = *px; px += inc_x;
289 *x = f12; x += inc_x;
290 f1 = *px; px += inc_x;
291 *x = f13; x += inc_x;
292 f2 = *px; px += inc_x;
293 *x = f14; x += inc_x;
294 f3 = *px; px += inc_x;
295 *x = f15; x += inc_x;
296 f4 = *px; px += inc_x;
297 f5 = *px; px += inc_x;
298 f6 = *px; px += inc_x;
299 f7 = *px; px += inc_x;
302 LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
303 MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
327 *x = f10; x += inc_x;
328 *x = f11; x += inc_x;
329 *x = f12; x += inc_x;
330 *x = f13; x += inc_x;
331 *x = f14; x += inc_x;
332 *x = f15; x += inc_x;
339 LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
340 MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
341 MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
342 ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
347 LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
348 MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
349 ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
354 LD_GP2_INC(px, inc_x, f0, f1);
355 MUL2(f0, da, f1, da, f0, f1);
356 ST_GP2_INC(f0, f1, x, inc_x);