1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
/*
 * PROCESS_ZROT(inc_x2, inc_y2)
 *
 * Vectorised body of the rotation kernel; it is expanded inside CNAME and
 * relies on names from the enclosing scope: n, c, s, j, the read pointers
 * px/py, the write pointers x/y, and the v2f64 temporaries x0..x7, y0..y7,
 * out0..out15, c0 and s0.
 *
 *   inc_x2 - amount added to px / x after every vector load / store
 *   inc_y2 - amount added to py / y after every vector load / store
 *
 * The general section computes, per vector,
 *     out_x = c0 * x + s0 * y
 *     out_y = c0 * y - s0 * x
 * and the earlier sections are cheaper variants selected on the values of
 * c and s.  The large loops are software pipelined: the loads for the next
 * batch are interleaved with the stores of the batch just computed, and the
 * data ahead of px/py is prefetched.
 *
 * NOTE(review): several guarding conditions and most arithmetic lines are
 * not visible in this excerpt; section comments below that depend on them
 * are marked as assumptions.
 */
31 #define PROCESS_ZROT(inc_x2, inc_y2) \
/* c == 0 and s == 0: both results are zero, so only stores are issued. */ \
32 if ((0 == c) && (0 == s)) \
34 v2f64 zero = {0, 0}; \
35 zero = (v2f64) __msa_insert_d((v2i64) zero, 0, 0.0); \
36 zero = (v2f64) __msa_insert_d((v2i64) zero, 1, 0.0); \
38 /* two vector stores (4 doubles) to each array per iteration */ \
39 for (j = (n >> 1); j--;) \
41 ST_DP2_INC(zero, zero, px, inc_x2); \
42 ST_DP2_INC(zero, zero, py, inc_y2); \
/* c == 1 and s == 1: no broadcast of c or s is needed in this section. */ \
51 else if ((1 == c) && (1 == s)) \
53 /* process 8 elements */ \
56 BLASLONG pref_offsetx, pref_offsety; \
/* Distance, in FLOAT elements, from px / py to the next L1 line boundary. */ \
58 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \
59 if (pref_offsetx > 0) \
61 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \
62 pref_offsetx = pref_offsetx / sizeof(FLOAT); \
65 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \
66 if (pref_offsety > 0) \
68 pref_offsety = L1_DATA_LINESIZE - pref_offsety; \
69 pref_offsety = pref_offsety / sizeof(FLOAT); \
/* Pipeline prologue: preload the first four vectors of each array. */ \
72 x0 = LD_DP(px); px += inc_x2; \
73 x1 = LD_DP(px); px += inc_x2; \
74 x2 = LD_DP(px); px += inc_x2; \
75 x3 = LD_DP(px); px += inc_x2; \
76 y0 = LD_DP(py); py += inc_y2; \
77 y1 = LD_DP(py); py += inc_y2; \
78 y2 = LD_DP(py); py += inc_y2; \
79 y3 = LD_DP(py); py += inc_y2; \
81 for (j = (n >> 3) - 1; j--;) \
83 PREFETCH(px + pref_offsetx + 16); \
84 PREFETCH(px + pref_offsetx + 20); \
85 PREFETCH(px + pref_offsetx + 24); \
86 PREFETCH(px + pref_offsetx + 28); \
87 PREFETCH(py + pref_offsety + 16); \
88 PREFETCH(py + pref_offsety + 20); \
89 PREFETCH(py + pref_offsety + 24); \
90 PREFETCH(py + pref_offsety + 28); \
93 x4 = LD_DP(px); px += inc_x2; \
95 x5 = LD_DP(px); px += inc_x2; \
97 x6 = LD_DP(px); px += inc_x2; \
99 x7 = LD_DP(px); px += inc_x2; \
101 y4 = LD_DP(py); py += inc_y2; \
103 y5 = LD_DP(py); py += inc_y2; \
105 y6 = LD_DP(py); py += inc_y2; \
107 y7 = LD_DP(py); py += inc_y2; \
109 ST_DP(out0, x); x += inc_x2; \
111 ST_DP(out1, y); y += inc_y2; \
113 ST_DP(out2, x); x += inc_x2; \
115 ST_DP(out3, y); y += inc_y2; \
117 ST_DP(out4, x); x += inc_x2; \
119 ST_DP(out5, y); y += inc_y2; \
121 ST_DP(out6, x); x += inc_x2; \
123 ST_DP(out7, y); y += inc_y2; \
/* Reload x0..x3 / y0..y3 for the next pass while storing out8..out15. */ \
126 x0 = LD_DP(px); px += inc_x2; \
127 ST_DP(out8, x); x += inc_x2; \
128 x1 = LD_DP(px); px += inc_x2; \
129 ST_DP(out10, x); x += inc_x2; \
130 x2 = LD_DP(px); px += inc_x2; \
131 ST_DP(out12, x); x += inc_x2; \
132 x3 = LD_DP(px); px += inc_x2; \
133 ST_DP(out14, x); x += inc_x2; \
135 y0 = LD_DP(py); py += inc_y2; \
136 ST_DP(out9, y); y += inc_y2; \
137 y1 = LD_DP(py); py += inc_y2; \
138 ST_DP(out11, y); y += inc_y2; \
139 y2 = LD_DP(py); py += inc_y2; \
140 ST_DP(out13, y); y += inc_y2; \
141 y3 = LD_DP(py); py += inc_y2; \
142 ST_DP(out15, y); y += inc_y2; \
/* Pipeline epilogue: finish the batch whose first half is already loaded. */ \
145 x4 = LD_DP(px); px += inc_x2; \
146 x5 = LD_DP(px); px += inc_x2; \
147 x6 = LD_DP(px); px += inc_x2; \
148 x7 = LD_DP(px); px += inc_x2; \
149 y4 = LD_DP(py); py += inc_y2; \
150 y5 = LD_DP(py); py += inc_y2; \
151 y6 = LD_DP(py); py += inc_y2; \
152 y7 = LD_DP(py); py += inc_y2; \
171 ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, inc_x2); \
172 ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, inc_y2); \
/* Remainder handling: a group of four vectors, then a group of two. */ \
176 LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
177 LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
188 ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
189 ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
193 LD_DP2_INC(px, inc_x2, x0, x1); \
194 LD_DP2_INC(py, inc_y2, y0, y1); \
201 ST_DP2_INC(out0, out2, x, inc_x2); \
202 ST_DP2_INC(out1, out3, y, inc_y2); \
/* Section using only c0; the x/y registers are stored back directly. */ \
/* NOTE(review): guarding condition not visible here - presumably s == 0. */ \
218 c0 = COPY_DOUBLE_TO_VECTOR(c); \
222 BLASLONG pref_offsetx, pref_offsety; \
224 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \
225 if (pref_offsetx > 0) \
227 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \
228 pref_offsetx = pref_offsetx / sizeof(FLOAT); \
231 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \
232 if (pref_offsety > 0) \
234 pref_offsety = L1_DATA_LINESIZE - pref_offsety; \
235 pref_offsety = pref_offsety / sizeof(FLOAT); \
/* Prologue preloads all eight x vectors; y is loaded inside the loop. */ \
238 LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7); \
240 for (j = (n >> 3) - 1; j--;) \
242 PREFETCH(px + pref_offsetx + 16); \
243 PREFETCH(px + pref_offsetx + 20); \
244 PREFETCH(px + pref_offsetx + 24); \
245 PREFETCH(px + pref_offsetx + 28); \
246 PREFETCH(py + pref_offsety + 16); \
247 PREFETCH(py + pref_offsety + 20); \
248 PREFETCH(py + pref_offsety + 24); \
249 PREFETCH(py + pref_offsety + 28); \
251 y0 = LD_DP(py); py += inc_y2; \
253 y1 = LD_DP(py); py += inc_y2; \
255 y2 = LD_DP(py); py += inc_y2; \
257 y3 = LD_DP(py); py += inc_y2; \
259 y4 = LD_DP(py); py += inc_y2; \
261 y5 = LD_DP(py); py += inc_y2; \
263 y6 = LD_DP(py); py += inc_y2; \
265 y7 = LD_DP(py); py += inc_y2; \
268 ST_DP(x0, x); x += inc_x2; \
270 ST_DP(x1, x); x += inc_x2; \
272 ST_DP(x2, x); x += inc_x2; \
274 ST_DP(x3, x); x += inc_x2; \
276 ST_DP(x4, x); x += inc_x2; \
278 ST_DP(x5, x); x += inc_x2; \
280 ST_DP(x6, x); x += inc_x2; \
282 ST_DP(x7, x); x += inc_x2; \
/* Reload the next eight x vectors while the y results are written out. */ \
285 x0 = LD_DP(px); px += inc_x2; \
286 ST_DP(y0, y); y += inc_y2; \
287 x1 = LD_DP(px); px += inc_x2; \
288 ST_DP(y1, y); y += inc_y2; \
289 x2 = LD_DP(px); px += inc_x2; \
290 ST_DP(y2, y); y += inc_y2; \
291 x3 = LD_DP(px); px += inc_x2; \
292 ST_DP(y3, y); y += inc_y2; \
293 x4 = LD_DP(px); px += inc_x2; \
294 ST_DP(y4, y); y += inc_y2; \
295 x5 = LD_DP(px); px += inc_x2; \
296 ST_DP(y5, y); y += inc_y2; \
297 x6 = LD_DP(px); px += inc_x2; \
298 ST_DP(y6, y); y += inc_y2; \
299 x7 = LD_DP(px); px += inc_x2; \
300 ST_DP(y7, y); y += inc_y2; \
/* Epilogue: x0..x7 are already loaded, so only y needs loading. */ \
303 LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7); \
322 ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2); \
323 ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2); \
/* Remainder handling: a group of four vectors, then a group of two. */ \
328 LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
329 LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
340 ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
341 ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
345 LD_DP2_INC(px, inc_x2, x0, x1); \
346 LD_DP2_INC(py, inc_y2, y0, y1); \
353 ST_DP2_INC(out0, out2, x, inc_x2); \
354 ST_DP2_INC(out1, out3, y, inc_y2); \
/* Section using only s0; the y results are formed as -(s0 * x). */ \
/* NOTE(review): guarding condition not visible here - presumably c == 0. */ \
370 s0 = COPY_DOUBLE_TO_VECTOR(s); \
372 /* process 16 doubles (8 vectors) of each array per iteration */ \
375 BLASLONG pref_offsetx, pref_offsety; \
377 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \
378 if (pref_offsetx > 0) \
380 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \
381 pref_offsetx = pref_offsetx / sizeof(FLOAT); \
384 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \
385 if (pref_offsety > 0) \
387 pref_offsety = L1_DATA_LINESIZE - pref_offsety; \
388 pref_offsety = pref_offsety / sizeof(FLOAT); \
391 LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
392 LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
394 for (j = (n >> 3) - 1; j--;) \
396 PREFETCH(px + pref_offsetx + 16); \
397 PREFETCH(px + pref_offsetx + 20); \
398 PREFETCH(px + pref_offsetx + 24); \
399 PREFETCH(px + pref_offsetx + 28); \
400 PREFETCH(py + pref_offsety + 16); \
401 PREFETCH(py + pref_offsety + 20); \
402 PREFETCH(py + pref_offsety + 24); \
403 PREFETCH(py + pref_offsety + 28); \
405 x4 = LD_DP(px); px += inc_x2; \
407 x5 = LD_DP(px); px += inc_x2; \
409 x6 = LD_DP(px); px += inc_x2; \
411 x7 = LD_DP(px); px += inc_x2; \
413 y4 = LD_DP(py); py += inc_y2; \
415 y5 = LD_DP(py); py += inc_y2; \
417 y6 = LD_DP(py); py += inc_y2; \
419 y7 = LD_DP(py); py += inc_y2; \
/* x must advance by its own stride (inc_x2), in step with px; using */ \
/* the y stride here misplaces results whenever the strides differ.  */ \
422 ST_DP(out0, x); x += inc_x2; \
424 ST_DP(out2, x); x += inc_x2; \
426 ST_DP(out4, x); x += inc_x2; \
428 ST_DP(out6, x); x += inc_x2; \
430 ST_DP(out1, y); y += inc_y2; \
432 ST_DP(out3, y); y += inc_y2; \
434 ST_DP(out5, y); y += inc_y2; \
436 ST_DP(out7, y); y += inc_y2; \
/* Second half of the batch: reload x0..x3 / y0..y3 while storing. */ \
439 x0 = LD_DP(px); px += inc_x2; \
440 ST_DP(out0, x); x += inc_x2; \
441 x1 = LD_DP(px); px += inc_x2; \
442 ST_DP(out2, x); x += inc_x2; \
443 x2 = LD_DP(px); px += inc_x2; \
444 ST_DP(out4, x); x += inc_x2; \
445 x3 = LD_DP(px); px += inc_x2; \
446 ST_DP(out6, x); x += inc_x2; \
447 y0 = LD_DP(py); py += inc_y2; \
448 ST_DP(out1, y); y += inc_y2; \
449 y1 = LD_DP(py); py += inc_y2; \
450 ST_DP(out3, y); y += inc_y2; \
451 y2 = LD_DP(py); py += inc_y2; \
452 ST_DP(out5, y); y += inc_y2; \
453 y3 = LD_DP(py); py += inc_y2; \
454 ST_DP(out7, y); y += inc_y2; \
/* Epilogue: store the preloaded half, then load and store the last half. */ \
466 ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
467 ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
469 LD_DP4_INC(px, inc_x2, x4, x5, x6, x7); \
470 LD_DP4_INC(py, inc_y2, y4, y5, y6, y7); \
481 ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
482 ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
/* Remainder handling: groups of four, two and one vector(s). */ \
486 LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
487 LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
490 out1 = - (s0 * x0); \
492 out3 = - (s0 * x1); \
494 out5 = - (s0 * x2); \
496 out7 = - (s0 * x3); \
498 ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
499 ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
503 LD_DP2_INC(px, inc_x2, x0, x1); \
504 LD_DP2_INC(py, inc_y2, y0, y1); \
507 out1 = - (s0 * x0); \
509 out3 = - (s0 * x1); \
511 ST_DP2_INC(out0, out2, x, inc_x2); \
512 ST_DP2_INC(out1, out3, y, inc_y2); \
516 x0 = LD_DP(px); px += inc_x2; \
517 y0 = LD_DP(py); py += inc_y2; \
520 out1 = - (s0 * x0); \
522 ST_DP(out0, x); x += inc_x2; \
523 ST_DP(out1, y); y += inc_y2; \
/* General section: both c and s are broadcast and the full formula used. */ \
528 c0 = COPY_DOUBLE_TO_VECTOR(c); \
529 s0 = COPY_DOUBLE_TO_VECTOR(s); \
533 BLASLONG pref_offsetx, pref_offsety; \
535 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1); \
536 if (pref_offsetx > 0) \
538 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx; \
539 pref_offsetx = pref_offsetx / sizeof(FLOAT); \
542 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1); \
543 if (pref_offsety > 0) \
545 pref_offsety = L1_DATA_LINESIZE - pref_offsety; \
546 pref_offsety = pref_offsety / sizeof(FLOAT); \
549 LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
550 LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
552 for (j = (n >> 3) - 1; j--;) \
554 PREFETCH(px + pref_offsetx + 16); \
555 PREFETCH(px + pref_offsetx + 20); \
556 PREFETCH(px + pref_offsetx + 24); \
557 PREFETCH(px + pref_offsetx + 28); \
558 PREFETCH(py + pref_offsety + 16); \
559 PREFETCH(py + pref_offsety + 20); \
560 PREFETCH(py + pref_offsety + 24); \
561 PREFETCH(py + pref_offsety + 28); \
563 x4 = LD_DP(px); px += inc_x2; \
565 x5 = LD_DP(px); px += inc_x2; \
567 x6 = LD_DP(px); px += inc_x2; \
569 x7 = LD_DP(px); px += inc_x2; \
571 y4 = LD_DP(py); py += inc_y2; \
573 y5 = LD_DP(py); py += inc_y2; \
575 y6 = LD_DP(py); py += inc_y2; \
577 y7 = LD_DP(py); py += inc_y2; \
589 ST_DP(out0, x); x += inc_x2; \
591 ST_DP(out2, x); x += inc_x2; \
593 ST_DP(out4, x); x += inc_x2; \
595 ST_DP(out6, x); x += inc_x2; \
597 ST_DP(out1, y); y += inc_y2; \
599 ST_DP(out3, y); y += inc_y2; \
601 ST_DP(out5, y); y += inc_y2; \
603 ST_DP(out7, y); y += inc_y2; \
/* Reload x0..x3 / y0..y3 for the next pass before the second store group. */ \
606 x0 = LD_DP(px); px += inc_x2; \
608 x1 = LD_DP(px); px += inc_x2; \
610 x2 = LD_DP(px); px += inc_x2; \
612 x3 = LD_DP(px); px += inc_x2; \
614 y0 = LD_DP(py); py += inc_y2; \
616 y1 = LD_DP(py); py += inc_y2; \
618 y2 = LD_DP(py); py += inc_y2; \
620 y3 = LD_DP(py); py += inc_y2; \
623 ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
624 ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
/* Epilogue: store the preloaded half, then load and store the last half. */ \
644 ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
645 ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
647 LD_DP4_INC(px, inc_x2, x4, x5, x6, x7); \
648 LD_DP4_INC(py, inc_y2, y4, y5, y6, y7); \
667 ST_DP4_INC(out8, out10, out12, out14, x, inc_x2); \
668 ST_DP4_INC(out9, out11, out13, out15, y, inc_y2); \
/* Remainder handling: groups of four, two and one vector(s). */ \
672 LD_DP4_INC(px, inc_x2, x0, x1, x2, x3); \
673 LD_DP4_INC(py, inc_y2, y0, y1, y2, y3); \
675 out0 = (c0 * x0) + (s0 * y0); \
676 out1 = (c0 * y0) - (s0 * x0); \
677 out2 = (c0 * x1) + (s0 * y1); \
678 out3 = (c0 * y1) - (s0 * x1); \
679 out4 = (c0 * x2) + (s0 * y2); \
680 out5 = (c0 * y2) - (s0 * x2); \
681 out6 = (c0 * x3) + (s0 * y3); \
682 out7 = (c0 * y3) - (s0 * x3); \
684 ST_DP4_INC(out0, out2, out4, out6, x, inc_x2); \
685 ST_DP4_INC(out1, out3, out5, out7, y, inc_y2); \
689 LD_DP2_INC(px, inc_x2, x0, x1); \
690 LD_DP2_INC(py, inc_y2, y0, y1); \
692 out0 = (c0 * x0) + (s0 * y0); \
693 out1 = (c0 * y0) - (s0 * x0); \
694 out2 = (c0 * x1) + (s0 * y1); \
695 out3 = (c0 * y1) - (s0 * x1); \
697 ST_DP2_INC(out0, out2, x, inc_x2); \
698 ST_DP2_INC(out1, out3, y, inc_y2); \
705 out0 = (c0 * x0) + (s0 * y0); \
706 out1 = (c0 * y0) - (s0 * x0); \
713 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
718 v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
719 v2f64 out0, out1, out2, out3, out4, out5, out6, out7, c0, s0;
720 v2f64 out8, out9, out10, out11, out12, out13, out14, out15;
725 if ((1 == inc_x) && (1 == inc_y))
734 PROCESS_ZROT(inc_x, inc_y);