1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
41 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
42 FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
45 BLASLONG i, inc_x2, inc_y2;
47 v2f64 x0, x1, x2, x3, x4, x5, x6, x7;
48 v2f64 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
49 v2f64 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
50 v2f64 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
51 FLOAT xd0, xd1, yd0, yd1;
54 if ((da_r == 0.0) && (da_i == 0.0)) return(0);
58 dar_vec = COPY_DOUBLE_TO_VECTOR(da_r);
59 dai_vec = COPY_DOUBLE_TO_VECTOR(da_i);
61 if ((1 == inc_x) && (1 == inc_y))
63 FLOAT *x_pref, *y_pref;
66 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
69 pref_offset = L1_DATA_LINESIZE - pref_offset;
70 pref_offset = pref_offset / sizeof(FLOAT);
72 x_pref = x + pref_offset + 32;
74 pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
77 pref_offset = L1_DATA_LINESIZE - pref_offset;
78 pref_offset = pref_offset / sizeof(FLOAT);
80 y_pref = y + pref_offset + 32;
82 for (i = (n >> 3); i--;)
84 PREF_OFFSET(x_pref, 0);
85 PREF_OFFSET(x_pref, 32);
86 PREF_OFFSET(x_pref, 64);
87 PREF_OFFSET(x_pref, 96);
88 PREF_OFFSET(y_pref, 0);
89 PREF_OFFSET(y_pref, 32);
90 PREF_OFFSET(y_pref, 64);
91 PREF_OFFSET(y_pref, 96);
95 LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
96 LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
97 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
98 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
99 PCKEVOD_D2_DP(x3, x2, x1r, x1i);
100 PCKEVOD_D2_DP(y3, y2, y1r, y1i);
101 PCKEVOD_D2_DP(x5, x4, x2r, x2i);
102 PCKEVOD_D2_DP(y5, y4, y2r, y2i);
103 PCKEVOD_D2_DP(x7, x6, x3r, x3i);
104 PCKEVOD_D2_DP(y7, y6, y3r, y3i);
106 FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
107 y0i OP0 dar_vec * x0i;
108 y1i OP0 dar_vec * x1i;
109 y2i OP0 dar_vec * x2i;
110 y3i OP0 dar_vec * x3i;
111 y0r OP1 dai_vec * x0i;
112 y1r OP1 dai_vec * x1i;
113 y2r OP1 dai_vec * x2i;
114 y3r OP1 dai_vec * x3i;
115 y0i OP2 dai_vec * x0r;
116 y1i OP2 dai_vec * x1r;
117 y2i OP2 dai_vec * x2r;
118 y3i OP2 dai_vec * x3r;
120 ILVRL_D2_DP(y0i, y0r, y0, y1);
121 ILVRL_D2_DP(y1i, y1r, y2, y3);
122 ILVRL_D2_DP(y2i, y2r, y4, y5);
123 ILVRL_D2_DP(y3i, y3r, y6, y7);
124 ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
131 LD_DP4_INC(x, 2, x0, x1, x2, x3);
132 LD_DP4_INC(py, 2, y0, y1, y2, y3);
133 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
134 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
135 PCKEVOD_D2_DP(x3, x2, x1r, x1i);
136 PCKEVOD_D2_DP(y3, y2, y1r, y1i);
138 FMADD2(x0r, x1r, dar_vec, y0r, y1r);
139 y0i OP0 dar_vec * x0i;
140 y1i OP0 dar_vec * x1i;
141 y0r OP1 dai_vec * x0i;
142 y1r OP1 dai_vec * x1i;
143 y0i OP2 dai_vec * x0r;
144 y1i OP2 dai_vec * x1r;
146 ILVRL_D2_DP(y0i, y0r, y0, y1);
147 ILVRL_D2_DP(y1i, y1r, y2, y3);
148 ST_DP4_INC(y0, y1, y2, y3, y, 2);
153 LD_DP2_INC(x, 2, x0, x1);
154 LD_DP2_INC(py, 2, y0, y1);
155 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
156 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
158 y0r += dar_vec * x0r;
159 y0i OP0 dar_vec * x0i;
160 y0r OP1 dai_vec * x0i;
161 y0i OP2 dai_vec * x0r;
163 ILVRL_D2_DP(y0i, y0r, y0, y1);
164 ST_DP2_INC(y0, y1, y, 2);
169 LD_GP2_INC(x, 1, xd0, xd1);
170 LD_GP2_INC(py, 1, yd0, yd1);
177 ST_GP2_INC(yd0, yd1, y, 1);
184 BLASLONG pref_offset;
186 pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
189 pref_offset = L1_DATA_LINESIZE - pref_offset;
190 pref_offset = pref_offset / sizeof(FLOAT);
192 y_pref = y + pref_offset + 32;
196 for (i = (n >> 3); i--;)
198 PREF_OFFSET(y_pref, 0);
199 PREF_OFFSET(y_pref, 32);
200 PREF_OFFSET(y_pref, 64);
201 PREF_OFFSET(y_pref, 96);
204 LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
205 LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
206 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
207 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
208 PCKEVOD_D2_DP(x3, x2, x1r, x1i);
209 PCKEVOD_D2_DP(y3, y2, y1r, y1i);
210 PCKEVOD_D2_DP(x5, x4, x2r, x2i);
211 PCKEVOD_D2_DP(y5, y4, y2r, y2i);
212 PCKEVOD_D2_DP(x7, x6, x3r, x3i);
213 PCKEVOD_D2_DP(y7, y6, y3r, y3i);
215 FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
216 y0i OP0 dar_vec * x0i;
217 y1i OP0 dar_vec * x1i;
218 y2i OP0 dar_vec * x2i;
219 y3i OP0 dar_vec * x3i;
220 y0r OP1 dai_vec * x0i;
221 y1r OP1 dai_vec * x1i;
222 y2r OP1 dai_vec * x2i;
223 y3r OP1 dai_vec * x3i;
224 y0i OP2 dai_vec * x0r;
225 y1i OP2 dai_vec * x1r;
226 y2i OP2 dai_vec * x2r;
227 y3i OP2 dai_vec * x3r;
229 ILVRL_D2_DP(y0i, y0r, y0, y1);
230 ILVRL_D2_DP(y1i, y1r, y2, y3);
231 ILVRL_D2_DP(y2i, y2r, y4, y5);
232 ILVRL_D2_DP(y3i, y3r, y6, y7);
233 ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
240 LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
241 LD_DP4_INC(py, 2, y0, y1, y2, y3);
242 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
243 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
244 PCKEVOD_D2_DP(x3, x2, x1r, x1i);
245 PCKEVOD_D2_DP(y3, y2, y1r, y1i);
247 FMADD2(x0r, x1r, dar_vec, y0r, y1r);
248 y0i OP0 dar_vec * x0i;
249 y1i OP0 dar_vec * x1i;
250 y0r OP1 dai_vec * x0i;
251 y1r OP1 dai_vec * x1i;
252 y0i OP2 dai_vec * x0r;
253 y1i OP2 dai_vec * x1r;
255 ILVRL_D2_DP(y0i, y0r, y0, y1);
256 ILVRL_D2_DP(y1i, y1r, y2, y3);
257 ST_DP4_INC(y0, y1, y2, y3, y, 2);
262 LD_DP2_INC(x, inc_x2, x0, x1);
263 LD_DP2_INC(py, 2, y0, y1);
264 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
265 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
267 y0r += dar_vec * x0r;
268 y0i OP0 dar_vec * x0i;
269 y0r OP1 dai_vec * x0i;
270 y0i OP2 dai_vec * x0r;
272 ILVRL_D2_DP(y0i, y0r, y0, y1);
273 ST_DP2_INC(y0, y1, y, 2);
278 LD_GP2_INC(x, 1, xd0, xd1);
279 LD_GP2_INC(py, 1, yd0, yd1);
286 ST_GP2_INC(yd0, yd1, y, 1);
293 BLASLONG pref_offset;
295 pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
298 pref_offset = L1_DATA_LINESIZE - pref_offset;
299 pref_offset = pref_offset / sizeof(FLOAT);
301 x_pref = x + pref_offset + 32;
305 for (i = (n >> 3); i--;)
307 PREF_OFFSET(x_pref, 0);
308 PREF_OFFSET(x_pref, 32);
309 PREF_OFFSET(x_pref, 64);
310 PREF_OFFSET(x_pref, 96);
313 LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
314 LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
315 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
316 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
317 PCKEVOD_D2_DP(x3, x2, x1r, x1i);
318 PCKEVOD_D2_DP(y3, y2, y1r, y1i);
319 PCKEVOD_D2_DP(x5, x4, x2r, x2i);
320 PCKEVOD_D2_DP(y5, y4, y2r, y2i);
321 PCKEVOD_D2_DP(x7, x6, x3r, x3i);
322 PCKEVOD_D2_DP(y7, y6, y3r, y3i);
324 FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
325 y0i OP0 dar_vec * x0i;
326 y1i OP0 dar_vec * x1i;
327 y2i OP0 dar_vec * x2i;
328 y3i OP0 dar_vec * x3i;
329 y0r OP1 dai_vec * x0i;
330 y1r OP1 dai_vec * x1i;
331 y2r OP1 dai_vec * x2i;
332 y3r OP1 dai_vec * x3i;
333 y0i OP2 dai_vec * x0r;
334 y1i OP2 dai_vec * x1r;
335 y2i OP2 dai_vec * x2r;
336 y3i OP2 dai_vec * x3r;
338 ILVRL_D2_DP(y0i, y0r, y0, y1);
339 ILVRL_D2_DP(y1i, y1r, y2, y3);
340 ILVRL_D2_DP(y2i, y2r, y4, y5);
341 ILVRL_D2_DP(y3i, y3r, y6, y7);
342 ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
349 LD_DP4_INC(x, 2, x0, x1, x2, x3);
350 LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
351 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
352 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
353 PCKEVOD_D2_DP(x3, x2, x1r, x1i);
354 PCKEVOD_D2_DP(y3, y2, y1r, y1i);
356 FMADD2(x0r, x1r, dar_vec, y0r, y1r);
357 y0i OP0 dar_vec * x0i;
358 y1i OP0 dar_vec * x1i;
359 y0r OP1 dai_vec * x0i;
360 y1r OP1 dai_vec * x1i;
361 y0i OP2 dai_vec * x0r;
362 y1i OP2 dai_vec * x1r;
364 ILVRL_D2_DP(y0i, y0r, y0, y1);
365 ILVRL_D2_DP(y1i, y1r, y2, y3);
366 ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
371 LD_DP2_INC(x, 2, x0, x1);
372 LD_DP2_INC(py, inc_y2, y0, y1);
373 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
374 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
376 y0r += dar_vec * x0r;
377 y0i OP0 dar_vec * x0i;
378 y0r OP1 dai_vec * x0i;
379 y0i OP2 dai_vec * x0r;
381 ILVRL_D2_DP(y0i, y0r, y0, y1);
382 ST_DP2_INC(y0, y1, y, inc_y2);
387 LD_GP2_INC(x, 1, xd0, xd1);
388 LD_GP2_INC(py, 1, yd0, yd1);
395 ST_GP2_INC(yd0, yd1, y, 1);
404 for (i = (n >> 3); i--;)
406 LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
407 LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
408 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
409 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
410 PCKEVOD_D2_DP(x3, x2, x1r, x1i);
411 PCKEVOD_D2_DP(y3, y2, y1r, y1i);
412 PCKEVOD_D2_DP(x5, x4, x2r, x2i);
413 PCKEVOD_D2_DP(y5, y4, y2r, y2i);
414 PCKEVOD_D2_DP(x7, x6, x3r, x3i);
415 PCKEVOD_D2_DP(y7, y6, y3r, y3i);
417 FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
418 y0i OP0 dar_vec * x0i;
419 y1i OP0 dar_vec * x1i;
420 y2i OP0 dar_vec * x2i;
421 y3i OP0 dar_vec * x3i;
422 y0r OP1 dai_vec * x0i;
423 y1r OP1 dai_vec * x1i;
424 y2r OP1 dai_vec * x2i;
425 y3r OP1 dai_vec * x3i;
426 y0i OP2 dai_vec * x0r;
427 y1i OP2 dai_vec * x1r;
428 y2i OP2 dai_vec * x2r;
429 y3i OP2 dai_vec * x3r;
431 ILVRL_D2_DP(y0i, y0r, y0, y1);
432 ILVRL_D2_DP(y1i, y1r, y2, y3);
433 ILVRL_D2_DP(y2i, y2r, y4, y5);
434 ILVRL_D2_DP(y3i, y3r, y6, y7);
435 ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
442 LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
443 LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
444 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
445 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
446 PCKEVOD_D2_DP(x3, x2, x1r, x1i);
447 PCKEVOD_D2_DP(y3, y2, y1r, y1i);
449 FMADD2(x0r, x1r, dar_vec, y0r, y1r);
450 y0i OP0 dar_vec * x0i;
451 y1i OP0 dar_vec * x1i;
452 y0r OP1 dai_vec * x0i;
453 y1r OP1 dai_vec * x1i;
454 y0i OP2 dai_vec * x0r;
455 y1i OP2 dai_vec * x1r;
457 ILVRL_D2_DP(y0i, y0r, y0, y1);
458 ILVRL_D2_DP(y1i, y1r, y2, y3);
459 ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
464 LD_DP2_INC(x, inc_x2, x0, x1);
465 LD_DP2_INC(py, inc_y2, y0, y1);
466 PCKEVOD_D2_DP(x1, x0, x0r, x0i);
467 PCKEVOD_D2_DP(y1, y0, y0r, y0i);
469 y0r += dar_vec * x0r;
470 y0i OP0 dar_vec * x0i;
471 y0r OP1 dai_vec * x0i;
472 y0i OP2 dai_vec * x0r;
474 ILVRL_D2_DP(y0i, y0r, y0, y1);
475 ST_DP2_INC(y0, y1, y, inc_y2);
480 LD_GP2_INC(x, 1, xd0, xd1);
481 LD_GP2_INC(py, 1, yd0, yd1);
488 ST_GP2_INC(yd0, yd1, y, 1);