fix build error
[platform/upstream/openblas.git] / kernel / mips / zrot_msa.c
1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
31 #define PROCESS_ZROT(inc_x2, inc_y2)                                                   \
32     if ((0 == c) && (0 == s))                                                          \
33     {                                                                                  \
34         v2f64 zero = {0, 0};                                                           \
35         zero = (v2f64) __msa_insert_d((v2i64) zero, 0, 0.0);                           \
36         zero = (v2f64) __msa_insert_d((v2i64) zero, 1, 0.0);                           \
37                                                                                        \
38         /* process 2 complex elements (4 doubles) per iteration */                     \
39         for (j = (n >> 1); j--;)                                                       \
40         {                                                                              \
41             ST_DP2_INC(zero, zero, px, inc_x2);                                        \
42             ST_DP2_INC(zero, zero, py, inc_y2);                                        \
43         }                                                                              \
44                                                                                        \
45         if (n & 1)                                                                     \
46         {                                                                              \
47             ST_DP(zero, px);                                                           \
48             ST_DP(zero, py);                                                           \
49         }                                                                              \
50     }                                                                                  \
51     else if ((1 == c) && (1 == s))                                                     \
52     {                                                                                  \
53         /* process 8 elements */                                                       \
54         if (n >> 3)                                                                    \
55         {                                                                              \
56             BLASLONG pref_offsetx, pref_offsety;                                           \
57                                                                                        \
58             pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);                          \
59             if (pref_offsetx > 0)                                                      \
60             {                                                                          \
61                 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;                        \
62                 pref_offsetx = pref_offsetx / sizeof(FLOAT);                           \
63             }                                                                          \
64                                                                                        \
65             pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);                          \
66             if (pref_offsety > 0)                                                      \
67             {                                                                          \
68                 pref_offsety = L1_DATA_LINESIZE - pref_offsety;                        \
69                 pref_offsety = pref_offsety / sizeof(FLOAT);                           \
70             }                                                                          \
71                                                                                        \
72             x0 = LD_DP(px); px += inc_x2;                                              \
73             x1 = LD_DP(px); px += inc_x2;                                              \
74             x2 = LD_DP(px); px += inc_x2;                                              \
75             x3 = LD_DP(px); px += inc_x2;                                              \
76             y0 = LD_DP(py); py += inc_y2;                                              \
77             y1 = LD_DP(py); py += inc_y2;                                              \
78             y2 = LD_DP(py); py += inc_y2;                                              \
79             y3 = LD_DP(py); py += inc_y2;                                              \
80                                                                                        \
81             for (j = (n >> 3) - 1; j--;)                                               \
82             {                                                                          \
83                 PREFETCH(px + pref_offsetx + 16);                                      \
84                 PREFETCH(px + pref_offsetx + 20);                                      \
85                 PREFETCH(px + pref_offsetx + 24);                                      \
86                 PREFETCH(px + pref_offsetx + 28);                                      \
87                 PREFETCH(py + pref_offsety + 16);                                      \
88                 PREFETCH(py + pref_offsety + 20);                                      \
89                 PREFETCH(py + pref_offsety + 24);                                      \
90                 PREFETCH(py + pref_offsety + 28);                                      \
91                                                                                        \
92                 out0 = x0 + y0;                                                        \
93                 x4 = LD_DP(px); px += inc_x2;                                          \
94                 out1 = y0 - x0;                                                        \
95                 x5 = LD_DP(px); px += inc_x2;                                          \
96                 out2 = x1 + y1;                                                        \
97                 x6 = LD_DP(px); px += inc_x2;                                          \
98                 out3 = y1 - x1;                                                        \
99                 x7 = LD_DP(px); px += inc_x2;                                          \
100                 out4 = x2 + y2;                                                        \
101                 y4 = LD_DP(py); py += inc_y2;                                          \
102                 out5 = y2 - x2;                                                        \
103                 y5 = LD_DP(py); py += inc_y2;                                          \
104                 out6 = x3 + y3;                                                        \
105                 y6 = LD_DP(py); py += inc_y2;                                          \
106                 out7 = y3 - x3;                                                        \
107                 y7 = LD_DP(py); py += inc_y2;                                          \
108                                                                                        \
109                 ST_DP(out0, x); x += inc_x2;                                           \
110                 out8 = x4 + y4;                                                        \
111                 ST_DP(out1, y); y += inc_y2;                                           \
112                 out9 = y4 - x4;                                                        \
113                 ST_DP(out2, x); x += inc_x2;                                           \
114                 out10 = x5 + y5;                                                       \
115                 ST_DP(out3, y); y += inc_y2;                                           \
116                 out11 = y5 - x5;                                                       \
117                 ST_DP(out4, x); x += inc_x2;                                           \
118                 out12 = x6 + y6;                                                       \
119                 ST_DP(out5, y); y += inc_y2;                                           \
120                 out13 = y6 - x6;                                                       \
121                 ST_DP(out6, x); x += inc_x2;                                           \
122                 out14 = x7 + y7;                                                       \
123                 ST_DP(out7, y); y += inc_y2;                                           \
124                 out15 = y7 - x7;                                                       \
125                                                                                        \
126                 x0 = LD_DP(px); px += inc_x2;                                          \
127                 ST_DP(out8, x); x += inc_x2;                                           \
128                 x1 = LD_DP(px); px += inc_x2;                                          \
129                 ST_DP(out10, x); x += inc_x2;                                          \
130                 x2 = LD_DP(px); px += inc_x2;                                          \
131                 ST_DP(out12, x); x += inc_x2;                                          \
132                 x3 = LD_DP(px); px += inc_x2;                                          \
133                 ST_DP(out14, x); x += inc_x2;                                          \
134                                                                                        \
135                 y0 = LD_DP(py); py += inc_y2;                                          \
136                 ST_DP(out9, y); y += inc_y2;                                           \
137                 y1 = LD_DP(py); py += inc_y2;                                          \
138                 ST_DP(out11, y); y += inc_y2;                                          \
139                 y2 = LD_DP(py); py += inc_y2;                                          \
140                 ST_DP(out13, y); y += inc_y2;                                          \
141                 y3 = LD_DP(py); py += inc_y2;                                          \
142                 ST_DP(out15, y); y += inc_y2;                                          \
143             }                                                                          \
144                                                                                        \
145             x4 = LD_DP(px); px += inc_x2;                                              \
146             x5 = LD_DP(px); px += inc_x2;                                              \
147             x6 = LD_DP(px); px += inc_x2;                                              \
148             x7 = LD_DP(px); px += inc_x2;                                              \
149             y4 = LD_DP(py); py += inc_y2;                                              \
150             y5 = LD_DP(py); py += inc_y2;                                              \
151             y6 = LD_DP(py); py += inc_y2;                                              \
152             y7 = LD_DP(py); py += inc_y2;                                              \
153                                                                                        \
154             out0 = x0 + y0;                                                            \
155             out1 = y0 - x0;                                                            \
156             out2 = x1 + y1;                                                            \
157             out3 = y1 - x1;                                                            \
158             out4 = x2 + y2;                                                            \
159             out5 = y2 - x2;                                                            \
160             out6 = x3 + y3;                                                            \
161             out7 = y3 - x3;                                                            \
162             out8 = x4 + y4;                                                            \
163             out9 = y4 - x4;                                                            \
164             out10 = x5 + y5;                                                           \
165             out11 = y5 - x5;                                                           \
166             out12 = x6 + y6;                                                           \
167             out13 = y6 - x6;                                                           \
168             out14 = x7 + y7;                                                           \
169             out15 = y7 - x7;                                                           \
170                                                                                        \
171             ST_DP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, inc_x2);  \
172             ST_DP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, inc_y2);  \
173         }                                                                              \
174         if (n & 4)                                                                     \
175         {                                                                              \
176             LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);                                    \
177             LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);                                    \
178                                                                                        \
179             out0 = x0 + y0;                                                            \
180             out1 = y0 - x0;                                                            \
181             out2 = x1 + y1;                                                            \
182             out3 = y1 - x1;                                                            \
183             out4 = x2 + y2;                                                            \
184             out5 = y2 - x2;                                                            \
185             out6 = x3 + y3;                                                            \
186             out7 = y3 - x3;                                                            \
187                                                                                        \
188             ST_DP4_INC(out0, out2, out4, out6, x, inc_x2);                             \
189             ST_DP4_INC(out1, out3, out5, out7, y, inc_y2);                             \
190         }                                                                              \
191         if (n & 2)                                                                     \
192         {                                                                              \
193             LD_DP2_INC(px, inc_x2, x0, x1);                                            \
194             LD_DP2_INC(py, inc_y2, y0, y1);                                            \
195                                                                                        \
196             out0 = x0 + y0;                                                            \
197             out1 = y0 - x0;                                                            \
198             out2 = x1 + y1;                                                            \
199             out3 = y1 - x1;                                                            \
200                                                                                        \
201             ST_DP2_INC(out0, out2, x, inc_x2);                                         \
202             ST_DP2_INC(out1, out3, y, inc_y2);                                         \
203         }                                                                              \
204         if (n & 1)                                                                     \
205         {                                                                              \
206             x0 = LD_DP(px);                                                            \
207             y0 = LD_DP(py);                                                            \
208                                                                                        \
209             out0 = x0 + y0;                                                            \
210             out1 = y0 - x0;                                                            \
211                                                                                        \
212             ST_DP(out0, px);                                                           \
213             ST_DP(out1, py);                                                           \
214         }                                                                              \
215     }                                                                                  \
216     else if (0 == s)                                                                   \
217     {                                                                                  \
218         c0 = COPY_DOUBLE_TO_VECTOR(c);                                                 \
219                                                                                        \
220         if (n >> 3)                                                                    \
221         {                                                                              \
222             BLASLONG pref_offsetx, pref_offsety;                                           \
223                                                                                        \
224             pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);                          \
225             if (pref_offsetx > 0)                                                      \
226             {                                                                          \
227                 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;                        \
228                 pref_offsetx = pref_offsetx / sizeof(FLOAT);                           \
229             }                                                                          \
230                                                                                        \
231             pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);                          \
232             if (pref_offsety > 0)                                                      \
233             {                                                                          \
234                 pref_offsety = L1_DATA_LINESIZE - pref_offsety;                        \
235                 pref_offsety = pref_offsety / sizeof(FLOAT);                           \
236             }                                                                          \
237                                                                                        \
238             LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);                    \
239                                                                                        \
240             for (j = (n >> 3) - 1; j--;)                                               \
241             {                                                                          \
242                 PREFETCH(px + pref_offsetx + 16);                                      \
243                 PREFETCH(px + pref_offsetx + 20);                                      \
244                 PREFETCH(px + pref_offsetx + 24);                                      \
245                 PREFETCH(px + pref_offsetx + 28);                                      \
246                 PREFETCH(py + pref_offsety + 16);                                      \
247                 PREFETCH(py + pref_offsety + 20);                                      \
248                 PREFETCH(py + pref_offsety + 24);                                      \
249                 PREFETCH(py + pref_offsety + 28);                                      \
250                                                                                        \
251                 y0 = LD_DP(py); py += inc_y2;                                          \
252                 x0 *= c0;                                                              \
253                 y1 = LD_DP(py); py += inc_y2;                                          \
254                 x1 *= c0;                                                              \
255                 y2 = LD_DP(py); py += inc_y2;                                          \
256                 x2 *= c0;                                                              \
257                 y3 = LD_DP(py); py += inc_y2;                                          \
258                 x3 *= c0;                                                              \
259                 y4 = LD_DP(py); py += inc_y2;                                          \
260                 x4 *= c0;                                                              \
261                 y5 = LD_DP(py); py += inc_y2;                                          \
262                 x5 *= c0;                                                              \
263                 y6 = LD_DP(py); py += inc_y2;                                          \
264                 x6 *= c0;                                                              \
265                 y7 = LD_DP(py); py += inc_y2;                                          \
266                 x7 *= c0;                                                              \
267                                                                                        \
268                 ST_DP(x0, x); x += inc_x2;                                             \
269                 y0 *= c0;                                                              \
270                 ST_DP(x1, x); x += inc_x2;                                             \
271                 y1 *= c0;                                                              \
272                 ST_DP(x2, x); x += inc_x2;                                             \
273                 y2 *= c0;                                                              \
274                 ST_DP(x3, x); x += inc_x2;                                             \
275                 y3 *= c0;                                                              \
276                 ST_DP(x4, x); x += inc_x2;                                             \
277                 y4 *= c0;                                                              \
278                 ST_DP(x5, x); x += inc_x2;                                             \
279                 y5 *= c0;                                                              \
280                 ST_DP(x6, x); x += inc_x2;                                             \
281                 y6 *= c0;                                                              \
282                 ST_DP(x7, x); x += inc_x2;                                             \
283                 y7 *= c0;                                                              \
284                                                                                        \
285                 x0 = LD_DP(px); px += inc_x2;                                          \
286                 ST_DP(y0, y); y += inc_y2;                                             \
287                 x1 = LD_DP(px); px += inc_x2;                                          \
288                 ST_DP(y1, y); y += inc_y2;                                             \
289                 x2 = LD_DP(px); px += inc_x2;                                          \
290                 ST_DP(y2, y); y += inc_y2;                                             \
291                 x3 = LD_DP(px); px += inc_x2;                                          \
292                 ST_DP(y3, y); y += inc_y2;                                             \
293                 x4 = LD_DP(px); px += inc_x2;                                          \
294                 ST_DP(y4, y); y += inc_y2;                                             \
295                 x5 = LD_DP(px); px += inc_x2;                                          \
296                 ST_DP(y5, y); y += inc_y2;                                             \
297                 x6 = LD_DP(px); px += inc_x2;                                          \
298                 ST_DP(y6, y); y += inc_y2;                                             \
299                 x7 = LD_DP(px); px += inc_x2;                                          \
300                 ST_DP(y7, y); y += inc_y2;                                             \
301             }                                                                          \
302                                                                                        \
303             LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);                    \
304                                                                                        \
305             x0 *= c0;                                                                  \
306             y0 *= c0;                                                                  \
307             x1 *= c0;                                                                  \
308             y1 *= c0;                                                                  \
309             x2 *= c0;                                                                  \
310             y2 *= c0;                                                                  \
311             x3 *= c0;                                                                  \
312             y3 *= c0;                                                                  \
313             x4 *= c0;                                                                  \
314             y4 *= c0;                                                                  \
315             x5 *= c0;                                                                  \
316             y5 *= c0;                                                                  \
317             x6 *= c0;                                                                  \
318             y6 *= c0;                                                                  \
319             x7 *= c0;                                                                  \
320             y7 *= c0;                                                                  \
321                                                                                        \
322             ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);                     \
323             ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);                     \
324         }                                                                              \
325                                                                                        \
326         if (n & 4)                                                                     \
327         {                                                                              \
328             LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);                                    \
329             LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);                                    \
330                                                                                        \
331             out0 = c0 * x0;                                                            \
332             out1 = c0 * y0;                                                            \
333             out2 = c0 * x1;                                                            \
334             out3 = c0 * y1;                                                            \
335             out4 = c0 * x2;                                                            \
336             out5 = c0 * y2;                                                            \
337             out6 = c0 * x3;                                                            \
338             out7 = c0 * y3;                                                            \
339                                                                                        \
340             ST_DP4_INC(out0, out2, out4, out6, x, inc_x2);                             \
341             ST_DP4_INC(out1, out3, out5, out7, y, inc_y2);                             \
342         }                                                                              \
343         if (n & 2)                                                                     \
344         {                                                                              \
345             LD_DP2_INC(px, inc_x2, x0, x1);                                            \
346             LD_DP2_INC(py, inc_y2, y0, y1);                                            \
347                                                                                        \
348             out0 = c0 * x0;                                                            \
349             out1 = c0 * y0;                                                            \
350             out2 = c0 * x1;                                                            \
351             out3 = c0 * y1;                                                            \
352                                                                                        \
353             ST_DP2_INC(out0, out2, x, inc_x2);                                         \
354             ST_DP2_INC(out1, out3, y, inc_y2);                                         \
355         }                                                                              \
356         if (n & 1)                                                                     \
357         {                                                                              \
358             x0 = LD_DP(px);                                                            \
359             y0 = LD_DP(py);                                                            \
360                                                                                        \
361             out0 = c0 * x0;                                                            \
362             out1 = c0 * y0;                                                            \
363                                                                                        \
364             ST_DP(out0, px);                                                           \
365             ST_DP(out1, py);                                                           \
366         }                                                                              \
367     }                                                                                  \
368     else if (0 == c)                                                                   \
369     {                                                                                  \
370         s0 = COPY_DOUBLE_TO_VECTOR(s);                                                 \
371                                                                                        \
372         /* process 16 floats */                                                        \
373         if (n >> 3)                                                                    \
374         {                                                                              \
375             BLASLONG pref_offsetx, pref_offsety;                                           \
376                                                                                        \
377             pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);                          \
378             if (pref_offsetx > 0)                                                      \
379             {                                                                          \
380                 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;                        \
381                 pref_offsetx = pref_offsetx / sizeof(FLOAT);                           \
382             }                                                                          \
383                                                                                        \
384             pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);                          \
385             if (pref_offsety > 0)                                                      \
386             {                                                                          \
387                 pref_offsety = L1_DATA_LINESIZE - pref_offsety;                        \
388                 pref_offsety = pref_offsety / sizeof(FLOAT);                           \
389             }                                                                          \
390                                                                                        \
391             LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);                                    \
392             LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);                                    \
393                                                                                        \
394             for (j = (n >> 3) - 1; j--;)                                               \
395             {                                                                          \
396                 PREFETCH(px + pref_offsetx + 16);                                      \
397                 PREFETCH(px + pref_offsetx + 20);                                      \
398                 PREFETCH(px + pref_offsetx + 24);                                      \
399                 PREFETCH(px + pref_offsetx + 28);                                      \
400                 PREFETCH(py + pref_offsety + 16);                                      \
401                 PREFETCH(py + pref_offsety + 20);                                      \
402                 PREFETCH(py + pref_offsety + 24);                                      \
403                 PREFETCH(py + pref_offsety + 28);                                      \
404                                                                                        \
405                 x4 = LD_DP(px); px += inc_x2;                                          \
406                 out0 = s0 * y0;                                                        \
407                 x5 = LD_DP(px); px += inc_x2;                                          \
408                 out2 = s0 * y1;                                                        \
409                 x6 = LD_DP(px); px += inc_x2;                                          \
410                 out4 = s0 * y2;                                                        \
411                 x7 = LD_DP(px); px += inc_x2;                                          \
412                 out6 = s0 * y3;                                                        \
413                 y4 = LD_DP(py); py += inc_y2;                                          \
414                 out1 = -(s0 * x0);                                                     \
415                 y5 = LD_DP(py); py += inc_y2;                                          \
416                 out3 = -(s0 * x1);                                                     \
417                 y6 = LD_DP(py); py += inc_y2;                                          \
418                 out5 = -(s0 * x2);                                                     \
419                 y7 = LD_DP(py); py += inc_y2;                                          \
420                 out7 = -(s0 * x3);                                                     \
421                                                                                        \
422                 ST_DP(out0, x); x += inc_y2;                                           \
423                 out0 = s0 * y4;                                                        \
424                 ST_DP(out2, x); x += inc_y2;                                           \
425                 out2 = s0 * y5;                                                        \
426                 ST_DP(out4, x); x += inc_y2;                                           \
427                 out4 = s0 * y6;                                                        \
428                 ST_DP(out6, x); x += inc_y2;                                           \
429                 out6 = s0 * y7;                                                        \
430                 ST_DP(out1, y); y += inc_y2;                                           \
431                 out1 = -(s0 * x4);                                                     \
432                 ST_DP(out3, y); y += inc_y2;                                           \
433                 out3 = -(s0 * x5);                                                     \
434                 ST_DP(out5, y); y += inc_y2;                                           \
435                 out5 = -(s0 * x6);                                                     \
436                 ST_DP(out7, y); y += inc_y2;                                           \
437                 out7 = -(s0 * x7);                                                     \
438                                                                                        \
439                 x0 = LD_DP(px); px += inc_x2;                                          \
440                 ST_DP(out0, x); x += inc_y2;                                           \
441                 x1 = LD_DP(px); px += inc_x2;                                          \
442                 ST_DP(out2, x); x += inc_y2;                                           \
443                 x2 = LD_DP(px); px += inc_x2;                                          \
444                 ST_DP(out4, x); x += inc_y2;                                           \
445                 x3 = LD_DP(px); px += inc_x2;                                          \
446                 ST_DP(out6, x); x += inc_y2;                                           \
447                 y0 = LD_DP(py); py += inc_y2;                                          \
448                 ST_DP(out1, y); y += inc_y2;                                           \
449                 y1 = LD_DP(py); py += inc_y2;                                          \
450                 ST_DP(out3, y); y += inc_y2;                                           \
451                 y2 = LD_DP(py); py += inc_y2;                                          \
452                 ST_DP(out5, y); y += inc_y2;                                           \
453                 y3 = LD_DP(py); py += inc_y2;                                          \
454                 ST_DP(out7, y); y += inc_y2;                                           \
455             }                                                                          \
456                                                                                        \
457             out0 = s0 * y0;                                                            \
458             out2 = s0 * y1;                                                            \
459             out4 = s0 * y2;                                                            \
460             out6 = s0 * y3;                                                            \
461             out1 = -(s0 * x0);                                                         \
462             out3 = -(s0 * x1);                                                         \
463             out5 = -(s0 * x2);                                                         \
464             out7 = -(s0 * x3);                                                         \
465                                                                                        \
466             ST_DP4_INC(out0, out2, out4, out6, x, inc_x2);                             \
467             ST_DP4_INC(out1, out3, out5, out7, y, inc_y2);                             \
468                                                                                        \
469             LD_DP4_INC(px, inc_x2, x4, x5, x6, x7);                                    \
470             LD_DP4_INC(py, inc_y2, y4, y5, y6, y7);                                    \
471                                                                                        \
472             out0 = s0 * y4;                                                            \
473             out2 = s0 * y5;                                                            \
474             out4 = s0 * y6;                                                            \
475             out6 = s0 * y7;                                                            \
476             out1 = -(s0 * x4);                                                         \
477             out3 = -(s0 * x5);                                                         \
478             out5 = -(s0 * x6);                                                         \
479             out7 = -(s0 * x7);                                                         \
480                                                                                        \
481             ST_DP4_INC(out0, out2, out4, out6, x, inc_x2);                             \
482             ST_DP4_INC(out1, out3, out5, out7, y, inc_y2);                             \
483         }                                                                              \
484         if (n & 4)                                                                     \
485         {                                                                              \
486             LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);                                    \
487             LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);                                    \
488                                                                                        \
489             out0 = s0 * y0;                                                            \
490             out1 = - (s0 * x0);                                                        \
491             out2 = s0 * y1;                                                            \
492             out3 = - (s0 * x1);                                                        \
493             out4 = s0 * y2;                                                            \
494             out5 = - (s0 * x2);                                                        \
495             out6 = s0 * y3;                                                            \
496             out7 = - (s0 * x3);                                                        \
497                                                                                        \
498             ST_DP4_INC(out0, out2, out4, out6, x, inc_x2);                             \
499             ST_DP4_INC(out1, out3, out5, out7, y, inc_y2);                             \
500         }                                                                              \
501         if (n & 2)                                                                     \
502         {                                                                              \
503             LD_DP2_INC(px, inc_x2, x0, x1);                                            \
504             LD_DP2_INC(py, inc_y2, y0, y1);                                            \
505                                                                                        \
506             out0 = s0 * y0;                                                            \
507             out1 = - (s0 * x0);                                                        \
508             out2 = s0 * y1;                                                            \
509             out3 = - (s0 * x1);                                                        \
510                                                                                        \
511             ST_DP2_INC(out0, out2, x, inc_x2);                                         \
512             ST_DP2_INC(out1, out3, y, inc_y2);                                         \
513         }                                                                              \
514         if (n & 1)                                                                     \
515         {                                                                              \
516             x0 = LD_DP(px); px += inc_x2;                                              \
517             y0 = LD_DP(py); py += inc_y2;                                              \
518                                                                                        \
519             out0 = s0 * y0;                                                            \
520             out1 = - (s0 * x0);                                                        \
521                                                                                        \
522             ST_DP(out0, x); x += inc_x2;                                               \
523             ST_DP(out1, y); y += inc_y2;                                               \
524         }                                                                              \
525     }                                                                                  \
526     else                                                                               \
527     {                                                                                  \
528         c0 = COPY_DOUBLE_TO_VECTOR(c);                                                 \
529         s0 = COPY_DOUBLE_TO_VECTOR(s);                                                 \
530                                                                                        \
531         if (n >> 3)                                                                    \
532         {                                                                              \
533             BLASLONG pref_offsetx, pref_offsety;                                           \
534                                                                                        \
535             pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);                          \
536             if (pref_offsetx > 0)                                                      \
537             {                                                                          \
538                 pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;                        \
539                 pref_offsetx = pref_offsetx / sizeof(FLOAT);                           \
540             }                                                                          \
541                                                                                        \
542             pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);                          \
543             if (pref_offsety > 0)                                                      \
544             {                                                                          \
545                 pref_offsety = L1_DATA_LINESIZE - pref_offsety;                        \
546                 pref_offsety = pref_offsety / sizeof(FLOAT);                           \
547             }                                                                          \
548                                                                                        \
549             LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);                                    \
550             LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);                                    \
551                                                                                        \
552             for (j = (n >> 3) - 1; j--;)                                               \
553             {                                                                          \
554                 PREFETCH(px + pref_offsetx + 16);                                      \
555                 PREFETCH(px + pref_offsetx + 20);                                      \
556                 PREFETCH(px + pref_offsetx + 24);                                      \
557                 PREFETCH(px + pref_offsetx + 28);                                      \
558                 PREFETCH(py + pref_offsety + 16);                                      \
559                 PREFETCH(py + pref_offsety + 20);                                      \
560                 PREFETCH(py + pref_offsety + 24);                                      \
561                 PREFETCH(py + pref_offsety + 28);                                      \
562                                                                                        \
563                 x4 = LD_DP(px); px += inc_x2;                                          \
564                 out0 = c0 * x0;                                                        \
565                 x5 = LD_DP(px); px += inc_x2;                                          \
566                 out2 = c0 * x1;                                                        \
567                 x6 = LD_DP(px); px += inc_x2;                                          \
568                 out4 = c0 * x2;                                                        \
569                 x7 = LD_DP(px); px += inc_x2;                                          \
570                 out6 = c0 * x3;                                                        \
571                 y4 = LD_DP(py); py += inc_y2;                                          \
572                 out1 = c0 * y0;                                                        \
573                 y5 = LD_DP(py); py += inc_y2;                                          \
574                 out3 = c0 * y1;                                                        \
575                 y6 = LD_DP(py); py += inc_y2;                                          \
576                 out5 = c0 * y2;                                                        \
577                 y7 = LD_DP(py); py += inc_y2;                                          \
578                 out7 = c0 * y3;                                                        \
579                                                                                        \
580                 out0 += s0 * y0;                                                       \
581                 out2 += s0 * y1;                                                       \
582                 out4 += s0 * y2;                                                       \
583                 out6 += s0 * y3;                                                       \
584                 out1 -= s0 * x0;                                                       \
585                 out3 -= s0 * x1;                                                       \
586                 out5 -= s0 * x2;                                                       \
587                 out7 -= s0 * x3;                                                       \
588                                                                                        \
589                 ST_DP(out0, x); x += inc_x2;                                           \
590                 out0 = c0 * x4;                                                        \
591                 ST_DP(out2, x); x += inc_x2;                                           \
592                 out2 = c0 * x5;                                                        \
593                 ST_DP(out4, x); x += inc_x2;                                           \
594                 out4 = c0 * x6;                                                        \
595                 ST_DP(out6, x); x += inc_x2;                                           \
596                 out6 = c0 * x7;                                                        \
597                 ST_DP(out1, y); y += inc_y2;                                           \
598                 out1 = c0 * y4;                                                        \
599                 ST_DP(out3, y); y += inc_y2;                                           \
600                 out3 = c0 * y5;                                                        \
601                 ST_DP(out5, y); y += inc_y2;                                           \
602                 out5 = c0 * y6;                                                        \
603                 ST_DP(out7, y); y += inc_y2;                                           \
604                 out7 = c0 * y7;                                                        \
605                                                                                        \
606                 x0 = LD_DP(px); px += inc_x2;                                          \
607                 out0 += s0 * y4;                                                       \
608                 x1 = LD_DP(px); px += inc_x2;                                          \
609                 out2 += s0 * y5;                                                       \
610                 x2 = LD_DP(px); px += inc_x2;                                          \
611                 out4 += s0 * y6;                                                       \
612                 x3 = LD_DP(px); px += inc_x2;                                          \
613                 out6 += s0 * y7;                                                       \
614                 y0 = LD_DP(py); py += inc_y2;                                          \
615                 out1 -= s0 * x4;                                                       \
616                 y1 = LD_DP(py); py += inc_y2;                                          \
617                 out3 -= s0 * x5;                                                       \
618                 y2 = LD_DP(py); py += inc_y2;                                          \
619                 out5 -= s0 * x6;                                                       \
620                 y3 = LD_DP(py); py += inc_y2;                                          \
621                 out7 -= s0 * x7;                                                       \
622                                                                                        \
623                 ST_DP4_INC(out0, out2, out4, out6, x, inc_x2);                         \
624                 ST_DP4_INC(out1, out3, out5, out7, y, inc_y2);                         \
625             }                                                                          \
626                                                                                        \
627             out0 = c0 * x0;                                                            \
628             out0 += s0 * y0;                                                           \
629             out1 = c0 * y0;                                                            \
630             out1 -= s0 * x0;                                                           \
631             out2 = c0 * x1;                                                            \
632             out2 += s0 * y1;                                                           \
633             out3 = c0 * y1;                                                            \
634             out3 -= s0 * x1;                                                           \
635             out4 = c0 * x2;                                                            \
636             out4 += s0 * y2;                                                           \
637             out5 = c0 * y2;                                                            \
638             out5 -= s0 * x2;                                                           \
639             out6 = c0 * x3;                                                            \
640             out6 += s0 * y3;                                                           \
641             out7 = c0 * y3;                                                            \
642             out7 -= s0 * x3;                                                           \
643                                                                                        \
644             ST_DP4_INC(out0, out2, out4, out6, x, inc_x2);                             \
645             ST_DP4_INC(out1, out3, out5, out7, y, inc_y2);                             \
646                                                                                        \
647             LD_DP4_INC(px, inc_x2, x4, x5, x6, x7);                                    \
648             LD_DP4_INC(py, inc_y2, y4, y5, y6, y7);                                    \
649                                                                                        \
650             out8 = c0 * x4;                                                            \
651             out8 += s0 * y4;                                                           \
652             out9 = c0 * y4;                                                            \
653             out9 -= s0 * x4;                                                           \
654             out10 = c0 * x5;                                                           \
655             out10 += s0 * y5;                                                          \
656             out11 = c0 * y5;                                                           \
657             out11 -= s0 * x5;                                                          \
658             out12 = c0 * x6;                                                           \
659             out12 += s0 * y6;                                                          \
660             out13 = c0 * y6;                                                           \
661             out13 -= s0 * x6;                                                          \
662             out14 = c0 * x7;                                                           \
663             out14 += s0 * y7;                                                          \
664             out15 = c0 * y7;                                                           \
665             out15 -= s0 * x7;                                                          \
666                                                                                        \
667             ST_DP4_INC(out8, out10, out12, out14, x, inc_x2);                          \
668             ST_DP4_INC(out9, out11, out13, out15, y, inc_y2);                          \
669         }                                                                              \
670         if (n & 4)                                                                     \
671         {                                                                              \
672             LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);                                    \
673             LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);                                    \
674                                                                                        \
675             out0 = (c0 * x0) + (s0 * y0);                                              \
676             out1 = (c0 * y0) - (s0 * x0);                                              \
677             out2 = (c0 * x1) + (s0 * y1);                                              \
678             out3 = (c0 * y1) - (s0 * x1);                                              \
679             out4 = (c0 * x2) + (s0 * y2);                                              \
680             out5 = (c0 * y2) - (s0 * x2);                                              \
681             out6 = (c0 * x3) + (s0 * y3);                                              \
682             out7 = (c0 * y3) - (s0 * x3);                                              \
683                                                                                        \
684             ST_DP4_INC(out0, out2, out4, out6, x, inc_x2);                             \
685             ST_DP4_INC(out1, out3, out5, out7, y, inc_y2);                             \
686         }                                                                              \
687         if (n & 2)                                                                     \
688         {                                                                              \
689             LD_DP2_INC(px, inc_x2, x0, x1);                                            \
690             LD_DP2_INC(py, inc_y2, y0, y1);                                            \
691                                                                                        \
692             out0 = (c0 * x0) + (s0 * y0);                                              \
693             out1 = (c0 * y0) - (s0 * x0);                                              \
694             out2 = (c0 * x1) + (s0 * y1);                                              \
695             out3 = (c0 * y1) - (s0 * x1);                                              \
696                                                                                        \
697             ST_DP2_INC(out0, out2, x, inc_x2);                                         \
698             ST_DP2_INC(out1, out3, y, inc_y2);                                         \
699         }                                                                              \
700         if (n & 1)                                                                     \
701         {                                                                              \
702             x0 = LD_DP(px);                                                            \
703             y0 = LD_DP(py);                                                            \
704                                                                                        \
705             out0 = (c0 * x0) + (s0 * y0);                                              \
706             out1 = (c0 * y0) - (s0 * x0);                                              \
707                                                                                        \
708             ST_DP(out0, px);                                                           \
709             ST_DP(out1, py);                                                           \
710         }                                                                              \
711     }
712
713 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
714           FLOAT c, FLOAT s)
715 {
716     BLASLONG j;
717     FLOAT *px, *py;
718     v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
719     v2f64 out0, out1, out2, out3, out4, out5, out6, out7, c0, s0;
720     v2f64 out8, out9, out10, out11, out12, out13, out14, out15;
721
722     px = x;
723     py = y;
724
725     if ((1 == inc_x) && (1 == inc_y))
726     {
727         PROCESS_ZROT(2, 2);
728     }
729     else
730     {
731         inc_x *= 2;
732         inc_y *= 2;
733
734         PROCESS_ZROT(inc_x, inc_y);
735     }
736
737     return 0;
738 }