1 /*******************************************************************************
2 Copyright (c) 2017, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
32 BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
37 FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
38 v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
47 v4f32 zero_v = __msa_cast_to_vector_float(0);
48 zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
49 zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
50 zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
51 zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
53 for (i = (n >> 6); i--;)
55 ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
56 zero_v, zero_v, x, 4);
57 ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
58 zero_v, zero_v, x, 4);
65 ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
66 zero_v, zero_v, x, 4);
71 ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
76 ST_SP2_INC(zero_v, zero_v, x, 4);
101 da_vec = COPY_FLOAT_TO_VECTOR(da);
106 BLASLONG pref_offset;
108 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
111 pref_offset = L1_DATA_LINESIZE - pref_offset;
112 pref_offset = pref_offset / sizeof(FLOAT);
114 x_pref = x + pref_offset + 64 + 32;
116 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
117 for (i = 0; i < (n >> 6) - 1; i++)
119 PREF_OFFSET(x_pref, 0);
120 PREF_OFFSET(x_pref, 32);
121 PREF_OFFSET(x_pref, 64);
122 PREF_OFFSET(x_pref, 96);
123 PREF_OFFSET(x_pref, 128);
124 PREF_OFFSET(x_pref, 160);
125 PREF_OFFSET(x_pref, 192);
126 PREF_OFFSET(x_pref, 224);
129 x8 = LD_SP(px); px += 4;
131 x9 = LD_SP(px); px += 4;
133 x10 = LD_SP(px); px += 4;
135 x11 = LD_SP(px); px += 4;
137 x12 = LD_SP(px); px += 4;
139 x13 = LD_SP(px); px += 4;
141 x14 = LD_SP(px); px += 4;
143 x15 = LD_SP(px); px += 4;
146 ST_SP(x0, x); x += 4;
148 ST_SP(x1, x); x += 4;
150 ST_SP(x2, x); x += 4;
152 ST_SP(x3, x); x += 4;
154 ST_SP(x4, x); x += 4;
156 ST_SP(x5, x); x += 4;
158 ST_SP(x6, x); x += 4;
160 ST_SP(x7, x); x += 4;
161 ST_SP(x8, x); x += 4;
162 x0 = LD_SP(px); px += 4;
163 ST_SP(x9, x); x += 4;
164 x1 = LD_SP(px); px += 4;
165 ST_SP(x10, x); x += 4;
166 x2 = LD_SP(px); px += 4;
167 ST_SP(x11, x); x += 4;
168 x3 = LD_SP(px); px += 4;
169 ST_SP(x12, x); x += 4;
170 x4 = LD_SP(px); px += 4;
171 ST_SP(x13, x); x += 4;
172 x5 = LD_SP(px); px += 4;
173 ST_SP(x14, x); x += 4;
174 x6 = LD_SP(px); px += 4;
175 ST_SP(x15, x); x += 4;
176 x7 = LD_SP(px); px += 4;
179 x8 = LD_SP(px); px += 4;
181 x9 = LD_SP(px); px += 4;
183 x10 = LD_SP(px); px += 4;
185 x11 = LD_SP(px); px += 4;
187 x12 = LD_SP(px); px += 4;
189 x13 = LD_SP(px); px += 4;
191 x14 = LD_SP(px); px += 4;
193 x15 = LD_SP(px); px += 4;
196 ST_SP(x0, x); x += 4;
198 ST_SP(x1, x); x += 4;
200 ST_SP(x2, x); x += 4;
202 ST_SP(x3, x); x += 4;
204 ST_SP(x4, x); x += 4;
206 ST_SP(x5, x); x += 4;
208 ST_SP(x6, x); x += 4;
210 ST_SP(x7, x); x += 4;
212 ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 4);
219 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
220 MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
221 MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
222 ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
227 LD_SP4_INC(px, 4, x0, x1, x2, x3);
228 MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
229 ST_SP4_INC(x0, x1, x2, x3, x, 4);
234 LD_SP2_INC(px, 4, x0, x1);
235 MUL2(x0, da_vec, x1, da_vec, x0, x1);
236 ST_SP2_INC(x0, x1, x, 4);
241 LD_GP4_INC(px, 1, f0, f1, f2, f3);
242 MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
243 ST_GP4_INC(f0, f1, f2, f3, x, 1);
248 LD_GP2_INC(px, 1, f0, f1);
249 MUL2(f0, da, f1, da, f0, f1);
250 ST_GP2_INC(f0, f1, x, 1);
274 LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
276 for (i = 0; i < (n >> 4) - 1; i++)
278 LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
279 MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
303 *x = f10; x += inc_x;
304 *x = f11; x += inc_x;
305 f0 = *px; px += inc_x;
306 *x = f12; x += inc_x;
307 f1 = *px; px += inc_x;
308 *x = f13; x += inc_x;
309 f2 = *px; px += inc_x;
310 *x = f14; x += inc_x;
311 f3 = *px; px += inc_x;
312 *x = f15; x += inc_x;
313 f4 = *px; px += inc_x;
314 f5 = *px; px += inc_x;
315 f6 = *px; px += inc_x;
316 f7 = *px; px += inc_x;
319 LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
320 MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
344 *x = f10; x += inc_x;
345 *x = f11; x += inc_x;
346 *x = f12; x += inc_x;
347 *x = f13; x += inc_x;
348 *x = f14; x += inc_x;
349 *x = f15; x += inc_x;
356 LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
357 MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
358 MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
359 ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
364 LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
365 MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
366 ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
371 LD_GP2_INC(px, inc_x, f0, f1);
372 MUL2(f0, da, f1, da, f0, f1);
373 ST_GP2_INC(f0, f1, x, inc_x);