1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
29 #include "macros_msa.h"
31 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
36 FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
37 FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
38 v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
39 v4f32 out0, out1, out2, out3, out4, out5, out6, out7;
40 v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
42 if (n <= 0) return (0);
47 if ((1 == inc_x) && (1 == inc_y))
49 if ((0 == c) && (0 == s))
51 v4f32 zero = __msa_cast_to_vector_float(0);
52 zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0);
53 zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
54 zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
55 zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
57 /* process 4 floats */
58 for (j = (n >> 2); j--;)
80 else if ((1 == c) && (1 == s))
84 BLASLONG pref_offsetx, pref_offsety;
86 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
89 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
90 pref_offsetx = pref_offsetx / sizeof(FLOAT);
93 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
96 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
97 pref_offsety = pref_offsety / sizeof(FLOAT);
100 x0 = LD_SP(px); px += 4;
101 x1 = LD_SP(px); px += 4;
102 x2 = LD_SP(px); px += 4;
103 x3 = LD_SP(px); px += 4;
104 y0 = LD_SP(py); py += 4;
105 y1 = LD_SP(py); py += 4;
106 y2 = LD_SP(py); py += 4;
107 y3 = LD_SP(py); py += 4;
109 for (j = (n >> 5) - 1; j--;)
111 PREFETCH(px + pref_offsetx + 32);
112 PREFETCH(px + pref_offsetx + 40);
113 PREFETCH(px + pref_offsetx + 48);
114 PREFETCH(px + pref_offsetx + 56);
115 PREFETCH(py + pref_offsety + 32);
116 PREFETCH(py + pref_offsety + 40);
117 PREFETCH(py + pref_offsety + 48);
118 PREFETCH(py + pref_offsety + 56);
121 x4 = LD_SP(px); px += 4;
123 x5 = LD_SP(px); px += 4;
125 x6 = LD_SP(px); px += 4;
127 x7 = LD_SP(px); px += 4;
129 y4 = LD_SP(py); py += 4;
131 y5 = LD_SP(py); py += 4;
133 y6 = LD_SP(py); py += 4;
135 y7 = LD_SP(py); py += 4;
137 ST_SP(out0, x); x += 4;
139 ST_SP(out1, y); y += 4;
141 ST_SP(out2, x); x += 4;
143 ST_SP(out3, y); y += 4;
145 ST_SP(out4, x); x += 4;
147 ST_SP(out5, y); y += 4;
149 ST_SP(out6, x); x += 4;
151 ST_SP(out7, y); y += 4;
154 x0 = LD_SP(px); px += 4;
155 ST_SP(out8, x); x += 4;
156 x1 = LD_SP(px); px += 4;
157 ST_SP(out10, x); x += 4;
158 x2 = LD_SP(px); px += 4;
159 ST_SP(out12, x); x += 4;
160 x3 = LD_SP(px); px += 4;
161 ST_SP(out14, x); x += 4;
162 y0 = LD_SP(py); py += 4;
163 ST_SP(out9, y); y += 4;
164 y1 = LD_SP(py); py += 4;
165 ST_SP(out11, y); y += 4;
166 y2 = LD_SP(py); py += 4;
167 ST_SP(out13, y); y += 4;
168 y3 = LD_SP(py); py += 4;
169 ST_SP(out15, y); y += 4;
172 x4 = LD_SP(px); px += 4;
173 x5 = LD_SP(px); px += 4;
174 x6 = LD_SP(px); px += 4;
175 x7 = LD_SP(px); px += 4;
176 y4 = LD_SP(py); py += 4;
177 y5 = LD_SP(py); py += 4;
178 y6 = LD_SP(py); py += 4;
179 y7 = LD_SP(py); py += 4;
198 ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4);
199 ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4);
203 LD_SP4_INC(px, 4, x0, x1, x2, x3);
204 LD_SP4_INC(py, 4, y0, y1, y2, y3);
215 ST_SP4_INC(out0, out2, out4, out6, x, 4);
216 ST_SP4_INC(out1, out3, out5, out7, y, 4);
220 LD_SP2_INC(px, 4, x0, x1);
221 LD_SP2_INC(py, 4, y0, y1);
228 ST_SP2_INC(out0, out2, x, 4);
229 ST_SP2_INC(out1, out3, y, 4);
248 LD_GP2_INC(px, 1, fx0, fx1);
249 LD_GP2_INC(py, 1, fy0, fy1);
256 ST_GP2_INC(tp0, tp2, x, 1);
257 ST_GP2_INC(tp1, tp3, y, 1);
273 c0 = COPY_FLOAT_TO_VECTOR(c);
277 BLASLONG pref_offsetx, pref_offsety;
279 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
280 if (pref_offsetx > 0)
282 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
283 pref_offsetx = pref_offsetx / sizeof(FLOAT);
286 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
287 if (pref_offsety > 0)
289 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
290 pref_offsety = pref_offsety / sizeof(FLOAT);
293 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
295 for (j = (n >> 5) - 1; j--;)
297 PREFETCH(px + pref_offsetx + 32);
298 PREFETCH(px + pref_offsetx + 40);
299 PREFETCH(px + pref_offsetx + 48);
300 PREFETCH(px + pref_offsetx + 56);
301 PREFETCH(py + pref_offsety + 32);
302 PREFETCH(py + pref_offsety + 40);
303 PREFETCH(py + pref_offsety + 48);
304 PREFETCH(py + pref_offsety + 56);
306 y0 = LD_SP(py); py += 4;
308 y1 = LD_SP(py); py += 4;
310 y2 = LD_SP(py); py += 4;
312 y3 = LD_SP(py); py += 4;
314 y4 = LD_SP(py); py += 4;
316 y5 = LD_SP(py); py += 4;
318 y6 = LD_SP(py); py += 4;
320 y7 = LD_SP(py); py += 4;
323 ST_SP(x0, x); x += 4;
325 ST_SP(x1, x); x += 4;
327 ST_SP(x2, x); x += 4;
329 ST_SP(x3, x); x += 4;
331 ST_SP(x4, x); x += 4;
333 ST_SP(x5, x); x += 4;
335 ST_SP(x6, x); x += 4;
337 ST_SP(x7, x); x += 4;
340 x0 = LD_SP(px); px += 4;
341 ST_SP(y0, y); y += 4;
342 x1 = LD_SP(px); px += 4;
343 ST_SP(y1, y); y += 4;
344 x2 = LD_SP(px); px += 4;
345 ST_SP(y2, y); y += 4;
346 x3 = LD_SP(px); px += 4;
347 ST_SP(y3, y); y += 4;
348 x4 = LD_SP(px); px += 4;
349 ST_SP(y4, y); y += 4;
350 x5 = LD_SP(px); px += 4;
351 ST_SP(y5, y); y += 4;
352 x6 = LD_SP(px); px += 4;
353 ST_SP(y6, y); y += 4;
354 x7 = LD_SP(px); px += 4;
355 ST_SP(y7, y); y += 4;
358 LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
377 ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
378 ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
382 LD_SP4_INC(px, 4, x0, x1, x2, x3);
383 LD_SP4_INC(py, 4, y0, y1, y2, y3);
394 ST_SP4_INC(x0, x1, x2, x3, x, 4);
395 ST_SP4_INC(y0, y1, y2, y3, y, 4);
399 LD_SP2_INC(px, 4, x0, x1);
400 LD_SP2_INC(py, 4, y0, y1);
407 ST_SP2_INC(x0, x1, x, 4);
408 ST_SP2_INC(y0, y1, y, 4);
427 LD_GP2_INC(px, 1, fx0, fx1);
428 LD_GP2_INC(py, 1, fy0, fy1);
435 ST_GP2_INC(tp0, tp2, x, 1);
436 ST_GP2_INC(tp1, tp3, y, 1);
452 s0 = COPY_FLOAT_TO_VECTOR(s);
454 /* process 16 floats */
457 BLASLONG pref_offsetx, pref_offsety;
459 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
460 if (pref_offsetx > 0)
462 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
463 pref_offsetx = pref_offsetx / sizeof(FLOAT);
466 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
467 if (pref_offsety > 0)
469 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
470 pref_offsety = pref_offsety / sizeof(FLOAT);
473 LD_SP4_INC(px, 4, x0, x1, x2, x3);
474 LD_SP4_INC(py, 4, y0, y1, y2, y3);
476 for (j = (n >> 5) - 1; j--;)
478 PREFETCH(px + pref_offsetx + 32);
479 PREFETCH(px + pref_offsetx + 40);
480 PREFETCH(px + pref_offsetx + 48);
481 PREFETCH(px + pref_offsetx + 56);
483 PREFETCH(py + pref_offsety + 32);
484 PREFETCH(py + pref_offsety + 40);
485 PREFETCH(py + pref_offsety + 48);
486 PREFETCH(py + pref_offsety + 56);
488 x4 = LD_SP(px); px += 4;
490 x5 = LD_SP(px); px += 4;
492 x6 = LD_SP(px); px += 4;
494 x7 = LD_SP(px); px += 4;
496 y4 = LD_SP(py); py += 4;
498 y5 = LD_SP(py); py += 4;
500 y6 = LD_SP(py); py += 4;
502 y7 = LD_SP(py); py += 4;
505 ST_SP(out0, x); x += 4;
507 ST_SP(out2, x); x += 4;
509 ST_SP(out4, x); x += 4;
511 ST_SP(out6, x); x += 4;
513 ST_SP(out1, y); y += 4;
515 ST_SP(out3, y); y += 4;
517 ST_SP(out5, y); y += 4;
519 ST_SP(out7, y); y += 4;
522 x0 = LD_SP(px); px += 4;
523 ST_SP(out0, x); x += 4;
524 x1 = LD_SP(px); px += 4;
525 ST_SP(out2, x); x += 4;
526 x2 = LD_SP(px); px += 4;
527 ST_SP(out4, x); x += 4;
528 x3 = LD_SP(px); px += 4;
529 ST_SP(out6, x); x += 4;
530 y0 = LD_SP(py); py += 4;
531 ST_SP(out1, y); y += 4;
532 y1 = LD_SP(py); py += 4;
533 ST_SP(out3, y); y += 4;
534 y2 = LD_SP(py); py += 4;
535 ST_SP(out5, y); y += 4;
536 y3 = LD_SP(py); py += 4;
537 ST_SP(out7, y); y += 4;
550 ST_SP4_INC(out0, out2, out4, out6, x, 4);
551 ST_SP4_INC(out1, out3, out5, out7, y, 4);
553 LD_SP4_INC(px, 4, x4, x5, x6, x7);
554 LD_SP4_INC(py, 4, y4, y5, y6, y7);
565 ST_SP4_INC(out0, out2, out4, out6, x, 4);
566 ST_SP4_INC(out1, out3, out5, out7, y, 4);
570 LD_SP4_INC(px, 4, x0, x1, x2, x3);
571 LD_SP4_INC(py, 4, y0, y1, y2, y3);
582 ST_SP4_INC(out0, out2, out4, out6, x, 4);
583 ST_SP4_INC(out1, out3, out5, out7, y, 4);
587 LD_SP2_INC(px, 4, x0, x1);
588 LD_SP2_INC(py, 4, y0, y1);
595 ST_SP2_INC(out0, out2, x, 4);
596 ST_SP2_INC(out1, out3, y, 4);
600 x0 = LD_SP(px); px += 4;
601 y0 = LD_SP(py); py += 4;
606 ST_SP(out0, x); x += 4;
607 ST_SP(out1, y); y += 4;
611 LD_GP2_INC(px, 1, fx0, fx1);
612 LD_GP2_INC(py, 1, fy0, fy1);
619 ST_GP2_INC(tp0, tp2, x, 1);
620 ST_GP2_INC(tp1, tp3, y, 1);
636 c0 = COPY_FLOAT_TO_VECTOR(c);
637 s0 = COPY_FLOAT_TO_VECTOR(s);
639 /* process 16 floats */
642 BLASLONG pref_offsetx, pref_offsety;
644 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
645 if (pref_offsetx > 0)
647 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
648 pref_offsetx = pref_offsetx / sizeof(FLOAT);
651 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
652 if (pref_offsety > 0)
654 pref_offsety = L1_DATA_LINESIZE - pref_offsety;
655 pref_offsety = pref_offsety / sizeof(FLOAT);
658 LD_SP4_INC(px, 4, x0, x1, x2, x3);
659 LD_SP4_INC(py, 4, y0, y1, y2, y3);
661 for (j = (n >> 5) - 1; j--;)
663 PREFETCH(px + pref_offsetx + 32);
664 PREFETCH(px + pref_offsetx + 40);
665 PREFETCH(px + pref_offsetx + 48);
666 PREFETCH(px + pref_offsetx + 56);
667 PREFETCH(py + pref_offsety + 32);
668 PREFETCH(py + pref_offsety + 40);
669 PREFETCH(py + pref_offsety + 48);
670 PREFETCH(py + pref_offsety + 56);
672 x4 = LD_SP(px); px += 4;
674 x5 = LD_SP(px); px += 4;
676 x6 = LD_SP(px); px += 4;
678 x7 = LD_SP(px); px += 4;
680 y4 = LD_SP(py); py += 4;
682 y5 = LD_SP(py); py += 4;
684 y6 = LD_SP(py); py += 4;
686 y7 = LD_SP(py); py += 4;
698 ST_SP(out0, x); x += 4;
700 ST_SP(out2, x); x += 4;
702 ST_SP(out4, x); x += 4;
704 ST_SP(out6, x); x += 4;
706 ST_SP(out1, y); y += 4;
708 ST_SP(out3, y); y += 4;
710 ST_SP(out5, y); y += 4;
712 ST_SP(out7, y); y += 4;
715 x0 = LD_SP(px); px += 4;
717 x1 = LD_SP(px); px += 4;
719 x2 = LD_SP(px); px += 4;
721 x3 = LD_SP(px); px += 4;
723 y0 = LD_SP(py); py += 4;
725 y1 = LD_SP(py); py += 4;
727 y2 = LD_SP(py); py += 4;
729 y3 = LD_SP(py); py += 4;
732 ST_SP4_INC(out0, out2, out4, out6, x, 4);
733 ST_SP4_INC(out1, out3, out5, out7, y, 4);
754 ST_SP4_INC(out0, out2, out4, out6, x, 4);
755 ST_SP4_INC(out1, out3, out5, out7, y, 4);
757 LD_SP4_INC(px, 4, x4, x5, x6, x7);
758 LD_SP4_INC(py, 4, y4, y5, y6, y7);
778 ST_SP4_INC(out0, out2, out4, out6, x, 4);
779 ST_SP4_INC(out1, out3, out5, out7, y, 4);
783 LD_SP4_INC(px, 4, x0, x1, x2, x3);
784 LD_SP4_INC(py, 4, y0, y1, y2, y3);
786 out0 = (c0 * x0) + (s0 * y0);
787 out1 = (c0 * y0) - (s0 * x0);
788 out2 = (c0 * x1) + (s0 * y1);
789 out3 = (c0 * y1) - (s0 * x1);
790 out4 = (c0 * x2) + (s0 * y2);
791 out5 = (c0 * y2) - (s0 * x2);
792 out6 = (c0 * x3) + (s0 * y3);
793 out7 = (c0 * y3) - (s0 * x3);
795 ST_SP4_INC(out0, out2, out4, out6, x, 4);
796 ST_SP4_INC(out1, out3, out5, out7, y, 4);
800 LD_SP2_INC(px, 4, x0, x1);
801 LD_SP2_INC(py, 4, y0, y1);
803 out0 = (c0 * x0) + (s0 * y0);
804 out1 = (c0 * y0) - (s0 * x0);
805 out2 = (c0 * x1) + (s0 * y1);
806 out3 = (c0 * y1) - (s0 * x1);
808 ST_SP2_INC(out0, out2, x, 4);
809 ST_SP2_INC(out1, out3, y, 4);
818 out0 = (c0 * x0) + (s0 * y0);
819 out1 = (c0 * y0) - (s0 * x0);
828 LD_GP2_INC(px, 1, fx0, fx1);
829 LD_GP2_INC(py, 1, fy0, fy1);
831 tp0 = (c * fx0) + (s * fy0);
832 tp1 = (c * fy0) - (s * fx0);
833 tp2 = (c * fx1) + (s * fy1);
834 tp3 = (c * fy1) - (s * fx1);
836 ST_GP2_INC(tp0, tp2, x, 1);
837 ST_GP2_INC(tp1, tp3, y, 1);
844 tp0 = (c * fx0) + (s * fy0);
845 tp1 = (c * fy0) - (s * fx0);
854 if ((0 == c) && (0 == s))
864 else if ((1 == c) && (1 == s))
868 fx0 = *px; px += inc_x;
869 fx1 = *px; px += inc_x;
870 fx2 = *px; px += inc_x;
871 fx3 = *px; px += inc_x;
872 fy0 = *py; py += inc_y;
873 fy1 = *py; py += inc_y;
874 fy2 = *py; py += inc_y;
875 fy3 = *py; py += inc_y;
877 for (i = (n >> 2) -1; i--;)
888 fx0 = *px; px += inc_x;
889 *x = tp0; x += inc_x;
890 fx1 = *px; px += inc_x;
891 *x = tp2; x += inc_x;
892 fx2 = *px; px += inc_x;
893 *x = tp4; x += inc_x;
894 fx3 = *px; px += inc_x;
895 *x = tp6; x += inc_x;
896 fy0 = *py; py += inc_y;
897 *y = tp1; y += inc_y;
898 fy1 = *py; py += inc_y;
899 *y = tp3; y += inc_y;
900 fy2 = *py; py += inc_y;
901 *y = tp5; y += inc_y;
902 fy3 = *py; py += inc_y;
903 *y = tp7; y += inc_y;
915 *x = tp0; x += inc_x;
916 *x = tp2; x += inc_x;
917 *x = tp4; x += inc_x;
918 *x = tp6; x += inc_x;
919 *y = tp1; y += inc_y;
920 *y = tp3; y += inc_y;
921 *y = tp5; y += inc_y;
922 *y = tp7; y += inc_y;
927 LD_GP2_INC(px, inc_x, fx0, fx1);
928 LD_GP2_INC(py, inc_y, fy0, fy1);
935 ST_GP2_INC(tp0, tp2, x, inc_x);
936 ST_GP2_INC(tp1, tp3, y, inc_y);
954 fx0 = *px; px += inc_x;
955 fx1 = *px; px += inc_x;
956 fx2 = *px; px += inc_x;
957 fx3 = *px; px += inc_x;
958 fy0 = *py; py += inc_y;
959 fy1 = *py; py += inc_y;
960 fy2 = *py; py += inc_y;
961 fy3 = *py; py += inc_y;
963 for (i = (n >> 2) - 1; i--;)
974 fx0 = *px; px += inc_x;
975 *x = tp0; x += inc_x;
976 fx1 = *px; px += inc_x;
977 *x = tp2; x += inc_x;
978 fx2 = *px; px += inc_x;
979 *x = tp4; x += inc_x;
980 fx3 = *px; px += inc_x;
981 *x = tp6; x += inc_x;
982 fy0 = *py; py += inc_y;
983 *y = tp1; y += inc_y;
984 fy1 = *py; py += inc_y;
985 *y = tp3; y += inc_y;
986 fy2 = *py; py += inc_y;
987 *y = tp5; y += inc_y;
988 fy3 = *py; py += inc_y;
989 *y = tp7; y += inc_y;
1001 *x = tp0; x += inc_x;
1002 *x = tp2; x += inc_x;
1003 *x = tp4; x += inc_x;
1004 *x = tp6; x += inc_x;
1005 *y = tp1; y += inc_y;
1006 *y = tp3; y += inc_y;
1007 *y = tp5; y += inc_y;
1008 *y = tp7; y += inc_y;
1012 LD_GP2_INC(px, inc_x, fx0, fx1);
1013 LD_GP2_INC(py, inc_y, fy0, fy1);
1020 ST_GP2_INC(tp0, tp2, x, inc_x);
1021 ST_GP2_INC(tp1, tp3, y, inc_y);
1039 fx0 = *px; px += inc_x;
1040 fx1 = *px; px += inc_x;
1041 fx2 = *px; px += inc_x;
1042 fx3 = *px; px += inc_x;
1043 fy0 = *py; py += inc_y;
1044 fy1 = *py; py += inc_y;
1045 fy2 = *py; py += inc_y;
1046 fy3 = *py; py += inc_y;
1048 for (i = (n >> 2) - 1; i--;)
1050 tp0 = c * fx0 + s * fy0;
1051 tp1 = c * fy0 - s * fx0;
1052 tp2 = c * fx1 + s * fy1;
1053 tp3 = c * fy1 - s * fx1;
1054 tp4 = c * fx2 + s * fy2;
1055 tp5 = c * fy2 - s * fx2;
1056 tp6 = c * fx3 + s * fy3;
1057 tp7 = c * fy3 - s * fx3;
1059 fx0 = *px; px += inc_x;
1060 *x = tp0; x += inc_x;
1061 fx1 = *px; px += inc_x;
1062 *x = tp2; x += inc_x;
1063 fx2 = *px; px += inc_x;
1064 *x = tp4; x += inc_x;
1065 fx3 = *px; px += inc_x;
1066 *x = tp6; x += inc_x;
1067 fy0 = *py; py += inc_y;
1068 *y = tp1; y += inc_y;
1069 fy1 = *py; py += inc_y;
1070 *y = tp3; y += inc_y;
1071 fy2 = *py; py += inc_y;
1072 *y = tp5; y += inc_y;
1073 fy3 = *py; py += inc_y;
1074 *y = tp7; y += inc_y;
1077 tp0 = c * fx0 + s * fy0;
1078 tp1 = c * fy0 - s * fx0;
1079 tp2 = c * fx1 + s * fy1;
1080 tp3 = c * fy1 - s * fx1;
1081 tp4 = c * fx2 + s * fy2;
1082 tp5 = c * fy2 - s * fx2;
1083 tp6 = c * fx3 + s * fy3;
1084 tp7 = c * fy3 - s * fx3;
1086 *x = tp0; x += inc_x;
1087 *x = tp2; x += inc_x;
1088 *x = tp4; x += inc_x;
1089 *x = tp6; x += inc_x;
1090 *y = tp1; y += inc_y;
1091 *y = tp3; y += inc_y;
1092 *y = tp5; y += inc_y;
1093 *y = tp7; y += inc_y;
1097 LD_GP2_INC(px, inc_x, fx0, fx1);
1098 LD_GP2_INC(py, inc_y, fy0, fy1);
1100 tp0 = c * fx0 + s * fy0;
1101 tp1 = c * fy0 - s * fx0;
1102 tp2 = c * fx1 + s * fy1;
1103 tp3 = c * fy1 - s * fx1;
1105 ST_GP2_INC(tp0, tp2, x, inc_x);
1106 ST_GP2_INC(tp1, tp3, y, inc_y);
1113 tp0 = c * fx0 + s * fy0;
1114 tp1 = c * fy0 - s * fx0;