fix build error
[platform/upstream/openblas.git] / kernel / mips / crot_msa.c
1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
31 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
32           FLOAT c, FLOAT s)
33 {
34     BLASLONG i, j;
35     FLOAT *px, *py;
36     FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
37     FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
38     BLASLONG inc_x2, inc_y2;
39     v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
40     v4f32 out0, out1, out2, out3, out4, out5, out6, out7;
41     v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
42
43     if (n <= 0)  return (0);
44
45     px = x;
46     py = y;
47
48     if ((1 == inc_x) && (1 == inc_y))
49     {
50         if ((0 == c) && (0 == s))
51         {
52             v4f32 zero = __msa_cast_to_vector_float(0);
53             zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0);
54             zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
55             zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
56             zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
57
58             /* process 2 elements */
59             for (j = (n >> 1); j--;)
60             {
61                 ST_SP(zero, px);
62                 ST_SP(zero, py);
63
64                 px += 4;
65                 py += 4;
66             }
67             if (n & 1)
68             {
69                 px[0] = 0;
70                 px[1] = 0;
71                 py[0] = 0;
72                 py[1] = 0;
73             }
74         }
75         else if ((1 == c) && (1 == s))
76         {
77             if (n >> 4)
78             {
79                 BLASLONG pref_offsetx, pref_offsety;
80
81                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
82                 if (pref_offsetx > 0)
83                 {
84                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
85                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
86                 }
87
88                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
89                 if (pref_offsety > 0)
90                 {
91                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
92                     pref_offsety = pref_offsety / sizeof(FLOAT);
93                 }
94
95                 x0 = LD_SP(px); px += 4;
96                 x1 = LD_SP(px); px += 4;
97                 x2 = LD_SP(px); px += 4;
98                 x3 = LD_SP(px); px += 4;
99                 y0 = LD_SP(py); py += 4;
100                 y1 = LD_SP(py); py += 4;
101                 y2 = LD_SP(py); py += 4;
102                 y3 = LD_SP(py); py += 4;
103
104                 for (j = (n >> 4) - 1; j--;)
105                 {
106                     PREFETCH(px + pref_offsetx + 32);
107                     PREFETCH(px + pref_offsetx + 40);
108                     PREFETCH(px + pref_offsetx + 48);
109                     PREFETCH(px + pref_offsetx + 56);
110                     PREFETCH(py + pref_offsety + 32);
111                     PREFETCH(py + pref_offsety + 40);
112                     PREFETCH(py + pref_offsety + 48);
113                     PREFETCH(py + pref_offsety + 56);
114
115                     out0 = x0 + y0;
116                     x4 = LD_SP(px); px += 4;
117                     out1 = y0 - x0;
118                     x5 = LD_SP(px); px += 4;
119                     out2 = x1 + y1;
120                     x6 = LD_SP(px); px += 4;
121                     out3 = y1 - x1;
122                     x7 = LD_SP(px); px += 4;
123                     out4 = x2 + y2;
124                     y4 = LD_SP(py); py += 4;
125                     out5 = y2 - x2;
126                     y5 = LD_SP(py); py += 4;
127                     out6 = x3 + y3;
128                     y6 = LD_SP(py); py += 4;
129                     out7 = y3 - x3;
130                     y7 = LD_SP(py); py += 4;
131
132                     ST_SP(out0, x); x += 4;
133                     out8 = x4 + y4;
134                     ST_SP(out1, y); y += 4;
135                     out9 = y4 - x4;
136                     ST_SP(out2, x); x += 4;
137                     out10 = x5 + y5;
138                     ST_SP(out3, y); y += 4;
139                     out11 = y5 - x5;
140                     ST_SP(out4, x); x += 4;
141                     out12 = x6 + y6;
142                     ST_SP(out5, y); y += 4;
143                     out13 = y6 - x6;
144                     ST_SP(out6, x); x += 4;
145                     out14 = x7 + y7;
146                     ST_SP(out7, y); y += 4;
147                     out15 = y7 - x7;
148
149                     x0 = LD_SP(px); px += 4;
150                     ST_SP(out8, x); x += 4;
151                     x1 = LD_SP(px); px += 4;
152                     ST_SP(out10, x); x += 4;
153                     x2 = LD_SP(px); px += 4;
154                     ST_SP(out12, x); x += 4;
155                     x3 = LD_SP(px); px += 4;
156                     ST_SP(out14, x); x += 4;
157
158                     y0 = LD_SP(py); py += 4;
159                     ST_SP(out9, y); y += 4;
160                     y1 = LD_SP(py); py += 4;
161                     ST_SP(out11, y); y += 4;
162                     y2 = LD_SP(py); py += 4;
163                     ST_SP(out13, y); y += 4;
164                     y3 = LD_SP(py); py += 4;
165                     ST_SP(out15, y); y += 4;
166                 }
167
168                 x4 = LD_SP(px); px += 4;
169                 x5 = LD_SP(px); px += 4;
170                 x6 = LD_SP(px); px += 4;
171                 x7 = LD_SP(px); px += 4;
172                 y4 = LD_SP(py); py += 4;
173                 y5 = LD_SP(py); py += 4;
174                 y6 = LD_SP(py); py += 4;
175                 y7 = LD_SP(py); py += 4;
176
177                 out0 = x0 + y0;
178                 out1 = y0 - x0;
179                 out2 = x1 + y1;
180                 out3 = y1 - x1;
181                 out4 = x2 + y2;
182                 out5 = y2 - x2;
183                 out6 = x3 + y3;
184                 out7 = y3 - x3;
185                 out8 = x4 + y4;
186                 out9 = y4 - x4;
187                 out10 = x5 + y5;
188                 out11 = y5 - x5;
189                 out12 = x6 + y6;
190                 out13 = y6 - x6;
191                 out14 = x7 + y7;
192                 out15 = y7 - x7;
193
194                 ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4);
195                 ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4);
196             }
197             if (n & 8)
198             {
199                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
200                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
201
202                 out0 = x0 + y0;
203                 out1 = y0 - x0;
204                 out2 = x1 + y1;
205                 out3 = y1 - x1;
206                 out4 = x2 + y2;
207                 out5 = y2 - x2;
208                 out6 = x3 + y3;
209                 out7 = y3 - x3;
210
211                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
212                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
213             }
214             if (n & 4)
215             {
216                 LD_SP2_INC(px, 4, x0, x1);
217                 LD_SP2_INC(py, 4, y0, y1);
218
219                 out0 = x0 + y0;
220                 out1 = y0 - x0;
221                 out2 = x1 + y1;
222                 out3 = y1 - x1;
223
224                 ST_SP2_INC(out0, out2, x, 4);
225                 ST_SP2_INC(out1, out3, y, 4);
226             }
227             if (n & 2)
228             {
229                 x0 = LD_SP(px);
230                 y0 = LD_SP(py);
231                 px += 4;
232                 py += 4;
233
234                 out0 = x0 + y0;
235                 out1 = y0 - x0;
236
237                 ST_SP(out0, x);
238                 ST_SP(out1, y);
239                 x += 4;
240                 y += 4;
241             }
242             if (n & 1)
243             {
244                 LD_GP2_INC(px, 1, fx0, fx1);
245                 LD_GP2_INC(py, 1, fy0, fy1);
246
247                 tp0 = fx0 + fy0;
248                 tp1 = fy0 - fx0;
249                 tp2 = fx1 + fy1;
250                 tp3 = fy1 - fx1;
251
252                 ST_GP2_INC(tp0, tp2, x, 1);
253                 ST_GP2_INC(tp1, tp3, y, 1);
254             }
255         }
256         else if (0 == s)
257         {
258
259             c0 = COPY_FLOAT_TO_VECTOR(c);
260
261             if (n >> 4)
262             {
263                 BLASLONG pref_offsetx, pref_offsety;
264
265                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
266                 if (pref_offsetx > 0)
267                 {
268                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
269                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
270                 }
271
272                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
273                 if (pref_offsety > 0)
274                 {
275                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
276                     pref_offsety = pref_offsety / sizeof(FLOAT);
277                 }
278
279                 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
280
281                 for (j = (n >> 4) - 1; j--;)
282                 {
283                     PREFETCH(px + pref_offsetx + 32);
284                     PREFETCH(px + pref_offsetx + 40);
285                     PREFETCH(px + pref_offsetx + 48);
286                     PREFETCH(px + pref_offsetx + 56);
287                     PREFETCH(py + pref_offsety + 32);
288                     PREFETCH(py + pref_offsety + 40);
289                     PREFETCH(py + pref_offsety + 48);
290                     PREFETCH(py + pref_offsety + 56);
291
292                     y0 = LD_SP(py); py += 4;
293                     x0 *= c0;
294                     y1 = LD_SP(py); py += 4;
295                     x1 *= c0;
296                     y2 = LD_SP(py); py += 4;
297                     x2 *= c0;
298                     y3 = LD_SP(py); py += 4;
299                     x3 *= c0;
300                     y4 = LD_SP(py); py += 4;
301                     x4 *= c0;
302                     y5 = LD_SP(py); py += 4;
303                     x5 *= c0;
304                     y6 = LD_SP(py); py += 4;
305                     x6 *= c0;
306                     y7 = LD_SP(py); py += 4;
307                     x7 *= c0;
308
309                     ST_SP(x0, x); x += 4;
310                     y0 *= c0;
311                     ST_SP(x1, x); x += 4;
312                     y1 *= c0;
313                     ST_SP(x2, x); x += 4;
314                     y2 *= c0;
315                     ST_SP(x3, x); x += 4;
316                     y3 *= c0;
317                     ST_SP(x4, x); x += 4;
318                     y4 *= c0;
319                     ST_SP(x5, x); x += 4;
320                     y5 *= c0;
321                     ST_SP(x6, x); x += 4;
322                     y6 *= c0;
323                     ST_SP(x7, x); x += 4;
324                     y7 *= c0;
325
326                     x0 = LD_SP(px); px += 4;
327                     ST_SP(y0, y); y += 4;
328                     x1 = LD_SP(px); px += 4;
329                     ST_SP(y1, y); y += 4;
330                     x2 = LD_SP(px); px += 4;
331                     ST_SP(y2, y); y += 4;
332                     x3 = LD_SP(px); px += 4;
333                     ST_SP(y3, y); y += 4;
334                     x4 = LD_SP(px); px += 4;
335                     ST_SP(y4, y); y += 4;
336                     x5 = LD_SP(px); px += 4;
337                     ST_SP(y5, y); y += 4;
338                     x6 = LD_SP(px); px += 4;
339                     ST_SP(y6, y); y += 4;
340                     x7 = LD_SP(px); px += 4;
341                     ST_SP(y7, y); y += 4;
342                 }
343
344                 LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
345
346                 x0 *= c0;
347                 y0 *= c0;
348                 x1 *= c0;
349                 y1 *= c0;
350                 x2 *= c0;
351                 y2 *= c0;
352                 x3 *= c0;
353                 y3 *= c0;
354                 x4 *= c0;
355                 y4 *= c0;
356                 x5 *= c0;
357                 y5 *= c0;
358                 x6 *= c0;
359                 y6 *= c0;
360                 x7 *= c0;
361                 y7 *= c0;
362
363                 ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
364                 ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
365             }
366             if (n & 8)
367             {
368                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
369                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
370
371                 x0 *= c0;
372                 y0 *= c0;
373                 x1 *= c0;
374                 y1 *= c0;
375                 x2 *= c0;
376                 y2 *= c0;
377                 x3 *= c0;
378                 y3 *= c0;
379
380                 ST_SP4_INC(x0, x1, x2, x3, x, 4);
381                 ST_SP4_INC(y0, y1, y2, y3, y, 4);
382             }
383             if (n & 4)
384             {
385                 LD_SP2_INC(px, 4, x0, x1);
386                 LD_SP2_INC(py, 4, y0, y1);
387
388                 x0 *= c0;
389                 y0 *= c0;
390                 x1 *= c0;
391                 y1 *= c0;
392
393                 ST_SP2_INC(x0, x1, x, 4);
394                 ST_SP2_INC(y0, y1, y, 4);
395             }
396             if (n & 2)
397             {
398                 x0 = LD_SP(px);
399                 y0 = LD_SP(py);
400                 px += 4;
401                 py += 4;
402
403                 x0 *= c0;
404                 y0 *= c0;
405
406                 ST_SP(x0, x);
407                 ST_SP(y0, y);
408                 x += 4;
409                 y += 4;
410             }
411             if (n & 1)
412             {
413                 LD_GP2_INC(px, 1, fx0, fx1);
414                 LD_GP2_INC(py, 1, fy0, fy1);
415
416                 tp0 = (c * fx0);
417                 tp1 = (c * fy0);
418                 tp2 = (c * fx1);
419                 tp3 = (c * fy1);
420
421                 ST_GP2_INC(tp0, tp2, x, 1);
422                 ST_GP2_INC(tp1, tp3, y, 1);
423             }
424         }
425         else if (0 == c)
426         {
427             s0 = COPY_FLOAT_TO_VECTOR(s);
428
429             /* process 16 complex elements (32 floats) each of x and y per pass */
430             if (n >> 4)
431             {
432                 BLASLONG pref_offsetx, pref_offsety;
433
434                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
435                 if (pref_offsetx > 0)
436                 {
437                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
438                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
439                 }
440
441                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
442                 if (pref_offsety > 0)
443                 {
444                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
445                     pref_offsety = pref_offsety / sizeof(FLOAT);
446                 }
447
448                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
449                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
450
451                 for (j = (n >> 4) - 1; j--;)
452                 {
453                     PREFETCH(px + pref_offsetx + 32);
454                     PREFETCH(px + pref_offsetx + 40);
455                     PREFETCH(px + pref_offsetx + 48);
456                     PREFETCH(px + pref_offsetx + 56);
457                     PREFETCH(py + pref_offsety + 32);
458                     PREFETCH(py + pref_offsety + 40);
459                     PREFETCH(py + pref_offsety + 48);
460                     PREFETCH(py + pref_offsety + 56);
461
462                     x4 = LD_SP(px); px += 4;
463                     out0 = s0 * y0;
464                     x5 = LD_SP(px); px += 4;
465                     out2 = s0 * y1;
466                     x6 = LD_SP(px); px += 4;
467                     out4 = s0 * y2;
468                     x7 = LD_SP(px); px += 4;
469                     out6 = s0 * y3;
470                     y4 = LD_SP(py); py += 4;
471                     out1 = -(s0 * x0);
472                     y5 = LD_SP(py); py += 4;
473                     out3 = -(s0 * x1);
474                     y6 = LD_SP(py); py += 4;
475                     out5 = -(s0 * x2);
476                     y7 = LD_SP(py); py += 4;
477                     out7 = -(s0 * x3);
478
479                     ST_SP(out0, x); x += 4;
480                     out0 = s0 * y4;
481                     ST_SP(out2, x); x += 4;
482                     out2 = s0 * y5;
483                     ST_SP(out4, x); x += 4;
484                     out4 = s0 * y6;
485                     ST_SP(out6, x); x += 4;
486                     out6 = s0 * y7;
487                     ST_SP(out1, y); y += 4;
488                     out1 = -(s0 * x4);
489                     ST_SP(out3, y); y += 4;
490                     out3 = -(s0 * x5);
491                     ST_SP(out5, y); y += 4;
492                     out5 = -(s0 * x6);
493                     ST_SP(out7, y); y += 4;
494                     out7 = -(s0 * x7);
495
496                     x0 = LD_SP(px); px += 4;
497                     ST_SP(out0, x); x += 4;
498                     x1 = LD_SP(px); px += 4;
499                     ST_SP(out2, x); x += 4;
500                     x2 = LD_SP(px); px += 4;
501                     ST_SP(out4, x); x += 4;
502                     x3 = LD_SP(px); px += 4;
503                     ST_SP(out6, x); x += 4;
504                     y0 = LD_SP(py); py += 4;
505                     ST_SP(out1, y); y += 4;
506                     y1 = LD_SP(py); py += 4;
507                     ST_SP(out3, y); y += 4;
508                     y2 = LD_SP(py); py += 4;
509                     ST_SP(out5, y); y += 4;
510                     y3 = LD_SP(py); py += 4;
511                     ST_SP(out7, y); y += 4;
512                 }
513
514                 out0 = s0 * y0;
515                 out2 = s0 * y1;
516                 out4 = s0 * y2;
517                 out6 = s0 * y3;
518                 out1 = -(s0 * x0);
519                 out3 = -(s0 * x1);
520                 out5 = -(s0 * x2);
521                 out7 = -(s0 * x3);
522
523                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
524                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
525
526                 LD_SP4_INC(px, 4, x4, x5, x6, x7);
527                 LD_SP4_INC(py, 4, y4, y5, y6, y7);
528
529                 out0 = s0 * y4;
530                 out2 = s0 * y5;
531                 out4 = s0 * y6;
532                 out6 = s0 * y7;
533                 out1 = -(s0 * x4);
534                 out3 = -(s0 * x5);
535                 out5 = -(s0 * x6);
536                 out7 = -(s0 * x7);
537
538                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
539                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
540             }
541             if (n & 8)
542             {
543                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
544                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
545
546                 out0 = s0 * y0;
547                 out1 = - (s0 * x0);
548                 out2 = s0 * y1;
549                 out3 = - (s0 * x1);
550                 out4 = s0 * y2;
551                 out5 = - (s0 * x2);
552                 out6 = s0 * y3;
553                 out7 = - (s0 * x3);
554
555                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
556                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
557             }
558             if (n & 4)
559             {
560                 LD_SP2_INC(px, 4, x0, x1);
561                 LD_SP2_INC(py, 4, y0, y1);
562
563                 out0 = s0 * y0;
564                 out1 = - (s0 * x0);
565                 out2 = s0 * y1;
566                 out3 = - (s0 * x1);
567
568                 ST_SP2_INC(out0, out2, x, 4);
569                 ST_SP2_INC(out1, out3, y, 4);
570             }
571             if (n & 2)
572             {
573                 x0 = LD_SP(px); px += 4;
574                 y0 = LD_SP(py); py += 4;
575
576                 out0 = s0 * y0;
577                 out1 = - (s0 * x0);
578
579                 ST_SP(out0, x); x += 4;
580                 ST_SP(out1, y); y += 4;
581             }
582             if (n & 1)
583             {
584                 LD_GP2_INC(px, 1, fx0, fx1);
585                 LD_GP2_INC(py, 1, fy0, fy1);
586
587                 tp0 = s * fy0;
588                 tp1 = - (s * fx0);
589                 tp2 = s * fy1;
590                 tp3 = - (s * fx1);
591
592                 ST_GP2_INC(tp0, tp2, x, 1);
593                 ST_GP2_INC(tp1, tp3, y, 1);
594             }
595         }
596         else
597         {
598             c0 = COPY_FLOAT_TO_VECTOR(c);
599             s0 = COPY_FLOAT_TO_VECTOR(s);
600
601             if (n >> 4)
602             {
603                 BLASLONG pref_offsetx, pref_offsety;
604
605                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
606                 if (pref_offsetx > 0)
607                 {
608                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
609                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
610                 }
611
612                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
613                 if (pref_offsety > 0)
614                 {
615                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
616                     pref_offsety = pref_offsety / sizeof(FLOAT);
617                 }
618
619                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
620                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
621
622                 for (j = (n >> 4) - 1; j--;)
623                 {
624                     PREFETCH(px + pref_offsetx + 32);
625                     PREFETCH(px + pref_offsetx + 40);
626                     PREFETCH(px + pref_offsetx + 48);
627                     PREFETCH(px + pref_offsetx + 56);
628                     PREFETCH(py + pref_offsety + 32);
629                     PREFETCH(py + pref_offsety + 40);
630                     PREFETCH(py + pref_offsety + 48);
631                     PREFETCH(py + pref_offsety + 56);
632
633                     x4 = LD_SP(px); px += 4;
634                     out0 = c0 * x0;
635                     x5 = LD_SP(px); px += 4;
636                     out1 = c0 * y0;
637                     x6 = LD_SP(px); px += 4;
638                     out2 = c0 * x1;
639                     x7 = LD_SP(px); px += 4;
640                     out3 = c0 * y1;
641                     y4 = LD_SP(py); py += 4;
642                     out4 = c0 * x2;
643                     y5 = LD_SP(py); py += 4;
644                     out5 = c0 * y2;
645                     y6 = LD_SP(py); py += 4;
646                     out6 = c0 * x3;
647                     y7 = LD_SP(py); py += 4;
648                     out7 = c0 * y3;
649
650                     out0 += s0 * y0;
651                     out1 -= s0 * x0;
652                     out2 += s0 * y1;
653                     out3 -= s0 * x1;
654                     out4 += s0 * y2;
655                     out5 -= s0 * x2;
656                     out6 += s0 * y3;
657                     out7 -= s0 * x3;
658
659                     ST_SP(out0, x); x += 4;
660                     out8 = c0 * x4;
661                     ST_SP(out2, x); x += 4;
662                     out9 = c0 * y4;
663                     ST_SP(out4, x); x += 4;
664                     out10 = c0 * x5;
665                     ST_SP(out6, x); x += 4;
666                     out11 = c0 * y5;
667                     ST_SP(out1, y); y += 4;
668                     out12 = c0 * x6;
669                     ST_SP(out3, y); y += 4;
670                     out13 = c0 * y6;
671                     ST_SP(out5, y); y += 4;
672                     out14 = c0 * x7;
673                     ST_SP(out7, y); y += 4;
674                     out15 = c0 * y7;
675
676                     x0 = LD_SP(px); px += 4;
677                     out8 += s0 * y4;
678                     x1 = LD_SP(px); px += 4;
679                     out9 -= s0 * x4;
680                     x2 = LD_SP(px); px += 4;
681                     out10 += s0 * y5;
682                     x3 = LD_SP(px); px += 4;
683                     out11 -= s0 * x5;
684                     y0 = LD_SP(py); py += 4;
685                     out12 += s0 * y6;
686                     y1 = LD_SP(py); py += 4;
687                     out13 -= s0 * x6;
688                     y2 = LD_SP(py); py += 4;
689                     out14 += s0 * y7;
690                     y3 = LD_SP(py); py += 4;
691                     out15 -= s0 * x7;
692
693                     ST_SP(out8,  x); x += 4;
694                     ST_SP(out10, x); x += 4;
695                     ST_SP(out12, x); x += 4;
696                     ST_SP(out14, x); x += 4;
697                     ST_SP(out9,  y); y += 4;
698                     ST_SP(out11, y); y += 4;
699                     ST_SP(out13, y); y += 4;
700                     ST_SP(out15, y); y += 4;
701                 }
702
703                 out0 = c0 * x0;
704                 out0 += s0 * y0;
705                 out1 = c0 * y0;
706                 out1 -= s0 * x0;
707                 out2 = c0 * x1;
708                 out2 += s0 * y1;
709                 out3 = c0 * y1;
710                 out3 -= s0 * x1;
711                 out4 = c0 * x2;
712                 out4 += s0 * y2;
713                 out5 = c0 * y2;
714                 out5 -= s0 * x2;
715                 out6 = c0 * x3;
716                 out6 += s0 * y3;
717                 out7 = c0 * y3;
718                 out7 -= s0 * x3;
719
720                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
721                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
722
723                 LD_SP4_INC(px, 4, x4, x5, x6, x7);
724                 LD_SP4_INC(py, 4, y4, y5, y6, y7);
725
726                 out8 = c0 * x4;
727                 out8 += s0 * y4;
728                 out9 = c0 * y4;
729                 out9 -= s0 * x4;
730                 out10 = c0 * x5;
731                 out10 += s0 * y5;
732                 out11 = c0 * y5;
733                 out11 -= s0 * x5;
734                 out12 = c0 * x6;
735                 out12 += s0 * y6;
736                 out13 = c0 * y6;
737                 out13 -= s0 * x6;
738                 out14 = c0 * x7;
739                 out14 += s0 * y7;
740                 out15 = c0 * y7;
741                 out15 -= s0 * x7;
742
743                 ST_SP4_INC(out8, out10, out12, out14, x, 4);
744                 ST_SP4_INC(out9, out11, out13, out15, y, 4);
745             }
746             if (n & 8)
747             {
748                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
749                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
750
751                 out0 = (c0 * x0) + (s0 * y0);
752                 out1 = (c0 * y0) - (s0 * x0);
753                 out2 = (c0 * x1) + (s0 * y1);
754                 out3 = (c0 * y1) - (s0 * x1);
755                 out4 = (c0 * x2) + (s0 * y2);
756                 out5 = (c0 * y2) - (s0 * x2);
757                 out6 = (c0 * x3) + (s0 * y3);
758                 out7 = (c0 * y3) - (s0 * x3);
759
760                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
761                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
762             }
763             if (n & 4)
764             {
765                 LD_SP2_INC(px, 4, x0, x1);
766                 LD_SP2_INC(py, 4, y0, y1);
767
768                 out0 = (c0 * x0) + (s0 * y0);
769                 out1 = (c0 * y0) - (s0 * x0);
770                 out2 = (c0 * x1) + (s0 * y1);
771                 out3 = (c0 * y1) - (s0 * x1);
772
773                 ST_SP2_INC(out0, out2, x, 4);
774                 ST_SP2_INC(out1, out3, y, 4);
775             }
776             if (n & 2)
777             {
778                 x0 = LD_SP(px);
779                 y0 = LD_SP(py);
780                 px += 4;
781                 py += 4;
782
783                 out0 = (c0 * x0) + (s0 * y0);
784                 out1 = (c0 * y0) - (s0 * x0);
785
786                 ST_SP(out0, x);
787                 ST_SP(out1, y);
788                 x += 4;
789                 y += 4;
790             }
791             if (n & 1)
792             {
793                 LD_GP2_INC(px, 1, fx0, fx1);
794                 LD_GP2_INC(py, 1, fy0, fy1);
795
796                 tp0 = (c * fx0) + (s * fy0);
797                 tp1 = (c * fy0) - (s * fx0);
798                 tp2 = (c * fx1) + (s * fy1);
799                 tp3 = (c * fy1) - (s * fx1);
800
801                 ST_GP2_INC(tp0, tp2, x, 1);
802                 ST_GP2_INC(tp1, tp3, y, 1);
803             }
804         }
805     }
806     else
807     {
808         inc_x2 = 2 * inc_x;
809         inc_y2 = 2 * inc_y;
810
811         if ((0 == c) && (0 == s))
812         {
813             for (i = n; i--;)
814             {
815                 *x = 0;
816                 *(x + 1) = 0;
817                 *y = 0;
818                 *(y + 1) = 0;
819
820                 x += inc_x2;
821                 y += inc_y2;
822             }
823         }
824         else if ((1 == c) && (1 == s))
825         {
826             if (n >> 1)
827             {
828                 fx0 = *px;
829                 fx1 = *(px+1); px += inc_x2;
830                 fx2 = *px;
831                 fx3 = *(px+1); px += inc_x2;
832
833                 fy0 = *py;
834                 fy1 = *(py+1); py += inc_y2;
835                 fy2 = *py;
836                 fy3 = *(py+1); py += inc_y2;
837
838                 for (i = (n >> 1) - 1; i--;)
839                 {
840                     tp0 = fx0 + fy0;
841                     tp1 = fx1 + fy1;
842                     tp2 = fy0 - fx0;
843                     tp3 = fy1 - fx1;
844                     tp4 = fx2 + fy2;
845                     tp5 = fx3 + fy3;
846                     tp6 = fy2 - fx2;
847                     tp7 = fy3 - fx3;
848
849                     fx0 = *px;
850                     *x = tp0;
851                     fx1 = *(px+1); px += inc_x2;
852                     *(x+1) = tp1; x += inc_x2;
853                     fx2 = *px;
854                     *x = tp4;
855                     fx3 = *(px+1); px += inc_x2;
856                     *(x+1) = tp5; x += inc_x2;
857
858                     fy0 = *py;
859                     *y = tp2;
860                     fy1 = *(py+1); py += inc_y2;
861                     *(y+1) = tp3; y += inc_y2;
862                     fy2 = *py;
863                     *y = tp6;
864                     fy3 = *(py+1); py += inc_y2;
865                     *(y+1) = tp7; y += inc_y2;
866                 }
867
868                 tp0 = fx0 + fy0;
869                 tp1 = fx1 + fy1;
870                 tp2 = fy0 - fx0;
871                 tp3 = fy1 - fx1;
872                 tp4 = fx2 + fy2;
873                 tp5 = fx3 + fy3;
874                 tp6 = fy2 - fx2;
875                 tp7 = fy3 - fx3;
876
877                 *x = tp0;
878                 *(x+1) = tp1; x += inc_x2;
879                 *x = tp4;
880                 *(x+1) = tp5; x += inc_x2;
881
882                 *y = tp2;
883                 *(y+1) = tp3; y += inc_y2;
884                 *y = tp6;
885                 *(y+1) = tp7; y += inc_y2;
886             }
887             if (n & 1)
888             {
889                 fx0 = *px;
890                 fx1 = *(px+1);
891
892                 fy0 = *py;
893                 fy1 = *(py+1);
894
895                 tp0 = fx0 + fy0;
896                 tp1 = fx1 + fy1;
897                 tp2 = fy0 - fx0;
898                 tp3 = fy1 - fx1;
899
900                 *x = tp0;
901                 *(x+1) = tp1;
902
903                 *y = tp2;
904                 *(y+1) = tp3;
905             }
906         }
907         else if (0 == s)
908         {
909             if (n >> 1)
910             {
911                 fx0 = *px;
912                 fx1 = *(px+1); px += inc_x2;
913                 fx2 = *px;
914                 fx3 = *(px+1); px += inc_x2;
915
916                 fy0 = *py;
917                 fy1 = *(py+1); py += inc_y2;
918                 fy2 = *py;
919                 fy3 = *(py+1); py += inc_y2;
920
921                 for (i = (n >> 1) - 1; i--;)
922                 {
923                     tp0 = c * fx0;
924                     tp1 = c * fx1;
925                     tp2 = c * fx2;
926                     tp3 = c * fx3;
927                     tp4 = c * fy0;
928                     tp5 = c * fy1;
929                     tp6 = c * fy2;
930                     tp7 = c * fy3;
931
932                     fx0 = *px;
933                     *x = tp0;
934                     fx1 = *(px+1); px += inc_x2;
935                     *(x+1) = tp1; x += inc_x2;
936                     fx2 = *px;
937                     *x = tp2;
938                     fx3 = *(px+1); px += inc_x2;
939                     *(x+1) = tp3; x += inc_x2;
940                     fy0 = *py;
941                     *y = tp4;
942                     fy1 = *(py+1); py += inc_y2;
943                     *(y+1) = tp5; y += inc_y2;
944                     fy2 = *py;
945                     *y = tp6;
946                     fy3 = *(py+1); py += inc_y2;
947                     *(y+1) = tp7; y += inc_y2;
948                 }
949
950                 tp0 = c * fx0;
951                 tp1 = c * fx1;
952                 tp2 = c * fx2;
953                 tp3 = c * fx3;
954                 tp4 = c * fy0;
955                 tp5 = c * fy1;
956                 tp6 = c * fy2;
957                 tp7 = c * fy3;
958
959                 *x = tp0;
960                 *(x+1) = tp1; x += inc_x2;
961                 *x = tp2;
962                 *(x+1) = tp3; x += inc_x2;
963
964                 *y = tp4;
965                 *(y+1) = tp5; y += inc_y2;
966                 *y = tp6;
967                 *(y+1) = tp7; y += inc_y2;
968             }
969             if (n & 1)
970             {
971                 fx0 = *px;
972                 fx1 = *(px+1);
973
974                 fy0 = *py;
975                 fy1 = *(py+1);
976
977                 tp0 = c * fx0;
978                 tp1 = c * fx1;
979                 tp2 = c * fy0;
980                 tp3 = c * fy1;
981
982                 *x = tp0;
983                 *(x+1) = tp1;
984
985                 *y = tp2;
986                 *(y+1) = tp3;
987             }
988         }
989         else
990         {
991             if (n >> 1)
992             {
993                 fx0 = *px;
994                 fx1 = *(px+1); px += inc_x2;
995                 fx2 = *px;
996                 fx3 = *(px+1); px += inc_x2;
997                 fy0 = *py;
998                 fy1 = *(py+1); py += inc_y2;
999                 fy2 = *py;
1000                 fy3 = *(py+1); py += inc_y2;
1001
1002                 for (i = (n >> 1) - 1; i--;)
1003                 {
1004                     tp0 = c * fx0 + s * fy0;
1005                     tp1 = c * fx1 + s * fy1;
1006                     tp2 = c * fy0 - s * fx0;
1007                     tp3 = c * fy1 - s * fx1;
1008                     tp4 = c * fx2 + s * fy2;
1009                     tp5 = c * fx3 + s * fy3;
1010                     tp6 = c * fy2 - s * fx2;
1011                     tp7 = c * fy3 - s * fx3;
1012
1013                     fx0 = *px;
1014                     *x = tp0;
1015                     fx1 = *(px+1); px += inc_x2;
1016                     *(x+1) = tp1; x += inc_x2;
1017                     fx2 = *px;
1018                     *x = tp4;
1019                     fx3 = *(px+1); px += inc_x2;
1020                     *(x+1) = tp5; x += inc_x2;
1021                     fy0 = *py;
1022                     *y = tp2;
1023                     fy1 = *(py+1); py += inc_y2;
1024                     *(y+1) = tp3; y += inc_y2;
1025                     fy2 = *py;
1026                     *y = tp6;
1027                     fy3 = *(py+1); py += inc_y2;
1028                     *(y+1) = tp7; y += inc_y2;
1029                 }
1030
1031                 tp0 = c * fx0 + s * fy0;
1032                 tp1 = c * fx1 + s * fy1;
1033                 tp2 = c * fy0 - s * fx0;
1034                 tp3 = c * fy1 - s * fx1;
1035                 tp4 = c * fx2 + s * fy2;
1036                 tp5 = c * fx3 + s * fy3;
1037                 tp6 = c * fy2 - s * fx2;
1038                 tp7 = c * fy3 - s * fx3;
1039
1040                 *x = tp0;
1041                 *(x+1) = tp1; x += inc_x2;
1042                 *x = tp4;
1043                 *(x+1) = tp5; x += inc_x2;
1044                 *y = tp2;
1045                 *(y+1) = tp3; y += inc_y2;
1046                 *y = tp6;
1047                 *(y+1) = tp7; y += inc_y2;
1048             }
1049             if (n & 1)
1050             {
1051                 fx0 = *px;
1052                 fx1 = *(px+1);
1053
1054                 fy0 = *py;
1055                 fy1 = *(py+1);
1056
1057                 tp0 = c * fx0 + s * fy0;
1058                 tp1 = c * fx1 + s * fy1;
1059                 tp2 = c * fy0 - s * fx0;
1060                 tp3 = c * fy1 - s * fx1;
1061
1062                 *x = tp0;
1063                 *(x+1) = tp1;
1064
1065                 *y = tp2;
1066                 *(y+1) = tp3;
1067             }
1068         }
1069     }
1070
1071     return 0;
1072 }