fix build error
[platform/upstream/openblas.git] / kernel / mips / srot_msa.c
1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #include "common.h"
29 #include "macros_msa.h"
30
31 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
32           FLOAT c, FLOAT s)
33 {
34     BLASLONG i, j;
35     FLOAT *px, *py;
36     FLOAT tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
37     FLOAT fx0, fx1, fx2, fx3, fy0, fy1, fy2, fy3;
38     v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
39     v4f32 out0, out1, out2, out3, out4, out5, out6, out7;
40     v4f32 out8, out9, out10, out11, out12, out13, out14, out15, c0, s0;
41
42     if (n <= 0)  return (0);
43
44     px = x;
45     py = y;
46
47     if ((1 == inc_x) && (1 == inc_y))
48     {
49         if ((0 == c) && (0 == s))
50         {
51             v4f32 zero = __msa_cast_to_vector_float(0);
52             zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0);
53             zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
54             zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
55             zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
56
57             /* process 4 floats */
58             for (j = (n >> 2); j--;)
59             {
60                 ST_SP(zero, px);
61                 ST_SP(zero, py);
62                 px += 4;
63                 py += 4;
64             }
65             if (n & 2)
66             {
67                 px[0] = 0;
68                 py[0] = 0;
69                 px[1] = 0;
70                 py[1] = 0;
71                 px += 2;
72                 py += 2;
73             }
74             if (n & 1)
75             {
76                 px[0] = 0;
77                 py[0] = 0;
78             }
79         }
80         else if ((1 == c) && (1 == s))
81         {
82             if (n >> 5)
83             {
84                 BLASLONG pref_offsetx, pref_offsety;
85
86                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
87                 if (pref_offsetx > 0)
88                 {
89                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
90                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
91                 }
92
93                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
94                 if (pref_offsety > 0)
95                 {
96                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
97                     pref_offsety = pref_offsety / sizeof(FLOAT);
98                 }
99
100                 x0 = LD_SP(px); px += 4;
101                 x1 = LD_SP(px); px += 4;
102                 x2 = LD_SP(px); px += 4;
103                 x3 = LD_SP(px); px += 4;
104                 y0 = LD_SP(py); py += 4;
105                 y1 = LD_SP(py); py += 4;
106                 y2 = LD_SP(py); py += 4;
107                 y3 = LD_SP(py); py += 4;
108
109                 for (j = (n >> 5) - 1; j--;)
110                 {
111                     PREFETCH(px + pref_offsetx + 32);
112                     PREFETCH(px + pref_offsetx + 40);
113                     PREFETCH(px + pref_offsetx + 48);
114                     PREFETCH(px + pref_offsetx + 56);
115                     PREFETCH(py + pref_offsety + 32);
116                     PREFETCH(py + pref_offsety + 40);
117                     PREFETCH(py + pref_offsety + 48);
118                     PREFETCH(py + pref_offsety + 56);
119
120                     out0 = x0 + y0;
121                     x4 = LD_SP(px); px += 4;
122                     out1 = y0 - x0;
123                     x5 = LD_SP(px); px += 4;
124                     out2 = x1 + y1;
125                     x6 = LD_SP(px); px += 4;
126                     out3 = y1 - x1;
127                     x7 = LD_SP(px); px += 4;
128                     out4 = x2 + y2;
129                     y4 = LD_SP(py); py += 4;
130                     out5 = y2 - x2;
131                     y5 = LD_SP(py); py += 4;
132                     out6 = x3 + y3;
133                     y6 = LD_SP(py); py += 4;
134                     out7 = y3 - x3;
135                     y7 = LD_SP(py); py += 4;
136
137                     ST_SP(out0, x); x += 4;
138                     out8 = x4 + y4;
139                     ST_SP(out1, y); y += 4;
140                     out9 = y4 - x4;
141                     ST_SP(out2, x); x += 4;
142                     out10 = x5 + y5;
143                     ST_SP(out3, y); y += 4;
144                     out11 = y5 - x5;
145                     ST_SP(out4, x); x += 4;
146                     out12 = x6 + y6;
147                     ST_SP(out5, y); y += 4;
148                     out13 = y6 - x6;
149                     ST_SP(out6, x); x += 4;
150                     out14 = x7 + y7;
151                     ST_SP(out7, y); y += 4;
152                     out15 = y7 - x7;
153
154                     x0 = LD_SP(px); px += 4;
155                     ST_SP(out8, x); x += 4;
156                     x1 = LD_SP(px); px += 4;
157                     ST_SP(out10, x); x += 4;
158                     x2 = LD_SP(px); px += 4;
159                     ST_SP(out12, x); x += 4;
160                     x3 = LD_SP(px); px += 4;
161                     ST_SP(out14, x); x += 4;
162                     y0 = LD_SP(py); py += 4;
163                     ST_SP(out9, y); y += 4;
164                     y1 = LD_SP(py); py += 4;
165                     ST_SP(out11, y); y += 4;
166                     y2 = LD_SP(py); py += 4;
167                     ST_SP(out13, y); y += 4;
168                     y3 = LD_SP(py); py += 4;
169                     ST_SP(out15, y); y += 4;
170                 }
171
172                 x4 = LD_SP(px); px += 4;
173                 x5 = LD_SP(px); px += 4;
174                 x6 = LD_SP(px); px += 4;
175                 x7 = LD_SP(px); px += 4;
176                 y4 = LD_SP(py); py += 4;
177                 y5 = LD_SP(py); py += 4;
178                 y6 = LD_SP(py); py += 4;
179                 y7 = LD_SP(py); py += 4;
180
181                 out0 = x0 + y0;
182                 out1 = y0 - x0;
183                 out2 = x1 + y1;
184                 out3 = y1 - x1;
185                 out4 = x2 + y2;
186                 out5 = y2 - x2;
187                 out6 = x3 + y3;
188                 out7 = y3 - x3;
189                 out8 = x4 + y4;
190                 out9 = y4 - x4;
191                 out10 = x5 + y5;
192                 out11 = y5 - x5;
193                 out12 = x6 + y6;
194                 out13 = y6 - x6;
195                 out14 = x7 + y7;
196                 out15 = y7 - x7;
197
198                 ST_SP8_INC(out0, out2, out4, out6, out8, out10, out12, out14, x, 4);
199                 ST_SP8_INC(out1, out3, out5, out7, out9, out11, out13, out15, y, 4);
200             }
201             if (n & 16)
202             {
203                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
204                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
205
206                 out0 = x0 + y0;
207                 out1 = y0 - x0;
208                 out2 = x1 + y1;
209                 out3 = y1 - x1;
210                 out4 = x2 + y2;
211                 out5 = y2 - x2;
212                 out6 = x3 + y3;
213                 out7 = y3 - x3;
214
215                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
216                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
217             }
218             if (n & 8)
219             {
220                 LD_SP2_INC(px, 4, x0, x1);
221                 LD_SP2_INC(py, 4, y0, y1);
222
223                 out0 = x0 + y0;
224                 out1 = y0 - x0;
225                 out2 = x1 + y1;
226                 out3 = y1 - x1;
227
228                 ST_SP2_INC(out0, out2, x, 4);
229                 ST_SP2_INC(out1, out3, y, 4);
230             }
231             if (n & 4)
232             {
233                 x0 = LD_SP(px);
234                 y0 = LD_SP(py);
235                 px += 4;
236                 py += 4;
237
238                 out0 = x0 + y0;
239                 out1 = y0 - x0;
240
241                 ST_SP(out0, x);
242                 ST_SP(out1, y);
243                 x += 4;
244                 y += 4;
245             }
246             if (n & 2)
247             {
248                 LD_GP2_INC(px, 1, fx0, fx1);
249                 LD_GP2_INC(py, 1, fy0, fy1);
250
251                 tp0 = fx0 + fy0;
252                 tp1 = fy0 - fx0;
253                 tp2 = fx1 + fy1;
254                 tp3 = fy1 - fx1;
255
256                 ST_GP2_INC(tp0, tp2, x, 1);
257                 ST_GP2_INC(tp1, tp3, y, 1);
258             }
259             if (n & 1)
260             {
261                 fx0 = *px;
262                 fy0 = *py;
263
264                 tp0 = fx0 + fy0;
265                 tp1 = fy0 - fx0;
266
267                 *x = tp0;
268                 *y = tp1;
269             }
270         }
271         else if (0 == s)
272         {
273             c0 = COPY_FLOAT_TO_VECTOR(c);
274
275             if (n >> 5)
276             {
277                 BLASLONG pref_offsetx, pref_offsety;
278
279                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
280                 if (pref_offsetx > 0)
281                 {
282                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
283                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
284                 }
285
286                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
287                 if (pref_offsety > 0)
288                 {
289                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
290                     pref_offsety = pref_offsety / sizeof(FLOAT);
291                 }
292
293                 LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
294
295                 for (j = (n >> 5) - 1; j--;)
296                 {
297                     PREFETCH(px + pref_offsetx + 32);
298                     PREFETCH(px + pref_offsetx + 40);
299                     PREFETCH(px + pref_offsetx + 48);
300                     PREFETCH(px + pref_offsetx + 56);
301                     PREFETCH(py + pref_offsety + 32);
302                     PREFETCH(py + pref_offsety + 40);
303                     PREFETCH(py + pref_offsety + 48);
304                     PREFETCH(py + pref_offsety + 56);
305
306                     y0 = LD_SP(py); py += 4;
307                     x0 *= c0;
308                     y1 = LD_SP(py); py += 4;
309                     x1 *= c0;
310                     y2 = LD_SP(py); py += 4;
311                     x2 *= c0;
312                     y3 = LD_SP(py); py += 4;
313                     x3 *= c0;
314                     y4 = LD_SP(py); py += 4;
315                     x4 *= c0;
316                     y5 = LD_SP(py); py += 4;
317                     x5 *= c0;
318                     y6 = LD_SP(py); py += 4;
319                     x6 *= c0;
320                     y7 = LD_SP(py); py += 4;
321                     x7 *= c0;
322
323                     ST_SP(x0, x); x += 4;
324                     y0 *= c0;
325                     ST_SP(x1, x); x += 4;
326                     y1 *= c0;
327                     ST_SP(x2, x); x += 4;
328                     y2 *= c0;
329                     ST_SP(x3, x); x += 4;
330                     y3 *= c0;
331                     ST_SP(x4, x); x += 4;
332                     y4 *= c0;
333                     ST_SP(x5, x); x += 4;
334                     y5 *= c0;
335                     ST_SP(x6, x); x += 4;
336                     y6 *= c0;
337                     ST_SP(x7, x); x += 4;
338                     y7 *= c0;
339
340                     x0 = LD_SP(px); px += 4;
341                     ST_SP(y0, y); y += 4;
342                     x1 = LD_SP(px); px += 4;
343                     ST_SP(y1, y); y += 4;
344                     x2 = LD_SP(px); px += 4;
345                     ST_SP(y2, y); y += 4;
346                     x3 = LD_SP(px); px += 4;
347                     ST_SP(y3, y); y += 4;
348                     x4 = LD_SP(px); px += 4;
349                     ST_SP(y4, y); y += 4;
350                     x5 = LD_SP(px); px += 4;
351                     ST_SP(y5, y); y += 4;
352                     x6 = LD_SP(px); px += 4;
353                     ST_SP(y6, y); y += 4;
354                     x7 = LD_SP(px); px += 4;
355                     ST_SP(y7, y); y += 4;
356                 }
357
358                 LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
359
360                 x0 *= c0;
361                 y0 *= c0;
362                 x1 *= c0;
363                 y1 *= c0;
364                 x2 *= c0;
365                 y2 *= c0;
366                 x3 *= c0;
367                 y3 *= c0;
368                 x4 *= c0;
369                 y4 *= c0;
370                 x5 *= c0;
371                 y5 *= c0;
372                 x6 *= c0;
373                 y6 *= c0;
374                 x7 *= c0;
375                 y7 *= c0;
376
377                 ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
378                 ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
379             }
380             if (n & 16)
381             {
382                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
383                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
384
385                 x0 *= c0;
386                 y0 *= c0;
387                 x1 *= c0;
388                 y1 *= c0;
389                 x2 *= c0;
390                 y2 *= c0;
391                 x3 *= c0;
392                 y3 *= c0;
393
394                 ST_SP4_INC(x0, x1, x2, x3, x, 4);
395                 ST_SP4_INC(y0, y1, y2, y3, y, 4);
396             }
397             if (n & 8)
398             {
399                 LD_SP2_INC(px, 4, x0, x1);
400                 LD_SP2_INC(py, 4, y0, y1);
401
402                 x0 *= c0;
403                 y0 *= c0;
404                 x1 *= c0;
405                 y1 *= c0;
406
407                 ST_SP2_INC(x0, x1, x, 4);
408                 ST_SP2_INC(y0, y1, y, 4);
409             }
410             if (n & 4)
411             {
412                 x0 = LD_SP(px);
413                 y0 = LD_SP(py);
414                 px += 4;
415                 py += 4;
416
417                 x0 *= c0;
418                 y0 *= c0;
419
420                 ST_SP(x0, x);
421                 ST_SP(y0, y);
422                 x += 4;
423                 y += 4;
424             }
425             if (n & 2)
426             {
427                 LD_GP2_INC(px, 1, fx0, fx1);
428                 LD_GP2_INC(py, 1, fy0, fy1);
429
430                 tp0 = (c * fx0);
431                 tp1 = (c * fy0);
432                 tp2 = (c * fx1);
433                 tp3 = (c * fy1);
434
435                 ST_GP2_INC(tp0, tp2, x, 1);
436                 ST_GP2_INC(tp1, tp3, y, 1);
437             }
438             if (n & 1)
439             {
440                 fx0 = *px;
441                 fy0 = *py;
442
443                 tp0 = (c * fx0);
444                 tp1 = (c * fy0);
445
446                 *x = tp0;
447                 *y = tp1;
448             }
449         }
450         else if (0 == c)
451         {
452             s0 = COPY_FLOAT_TO_VECTOR(s);
453
454             /* process 32 floats of x and of y per iteration (n >> 5 blocks) */
455             if (n >> 5)
456             {
457                 BLASLONG pref_offsetx, pref_offsety;
458
459                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
460                 if (pref_offsetx > 0)
461                 {
462                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
463                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
464                 }
465
466                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
467                 if (pref_offsety > 0)
468                 {
469                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
470                     pref_offsety = pref_offsety / sizeof(FLOAT);
471                 }
472
473                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
474                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
475
476                 for (j = (n >> 5) - 1; j--;)
477                 {
478                     PREFETCH(px + pref_offsetx + 32);
479                     PREFETCH(px + pref_offsetx + 40);
480                     PREFETCH(px + pref_offsetx + 48);
481                     PREFETCH(px + pref_offsetx + 56);
482
483                     PREFETCH(py + pref_offsety + 32);
484                     PREFETCH(py + pref_offsety + 40);
485                     PREFETCH(py + pref_offsety + 48);
486                     PREFETCH(py + pref_offsety + 56);
487
488                     x4 = LD_SP(px); px += 4;
489                     out0 = s0 * y0;
490                     x5 = LD_SP(px); px += 4;
491                     out2 = s0 * y1;
492                     x6 = LD_SP(px); px += 4;
493                     out4 = s0 * y2;
494                     x7 = LD_SP(px); px += 4;
495                     out6 = s0 * y3;
496                     y4 = LD_SP(py); py += 4;
497                     out1 = -(s0 * x0);
498                     y5 = LD_SP(py); py += 4;
499                     out3 = -(s0 * x1);
500                     y6 = LD_SP(py); py += 4;
501                     out5 = -(s0 * x2);
502                     y7 = LD_SP(py); py += 4;
503                     out7 = -(s0 * x3);
504
505                     ST_SP(out0, x); x += 4;
506                     out0 = s0 * y4;
507                     ST_SP(out2, x); x += 4;
508                     out2 = s0 * y5;
509                     ST_SP(out4, x); x += 4;
510                     out4 = s0 * y6;
511                     ST_SP(out6, x); x += 4;
512                     out6 = s0 * y7;
513                     ST_SP(out1, y); y += 4;
514                     out1 = -(s0 * x4);
515                     ST_SP(out3, y); y += 4;
516                     out3 = -(s0 * x5);
517                     ST_SP(out5, y); y += 4;
518                     out5 = -(s0 * x6);
519                     ST_SP(out7, y); y += 4;
520                     out7 = -(s0 * x7);
521
522                     x0 = LD_SP(px); px += 4;
523                     ST_SP(out0, x); x += 4;
524                     x1 = LD_SP(px); px += 4;
525                     ST_SP(out2, x); x += 4;
526                     x2 = LD_SP(px); px += 4;
527                     ST_SP(out4, x); x += 4;
528                     x3 = LD_SP(px); px += 4;
529                     ST_SP(out6, x); x += 4;
530                     y0 = LD_SP(py); py += 4;
531                     ST_SP(out1, y); y += 4;
532                     y1 = LD_SP(py); py += 4;
533                     ST_SP(out3, y); y += 4;
534                     y2 = LD_SP(py); py += 4;
535                     ST_SP(out5, y); y += 4;
536                     y3 = LD_SP(py); py += 4;
537                     ST_SP(out7, y); y += 4;
538
539                 }
540
541                 out0 = s0 * y0;
542                 out2 = s0 * y1;
543                 out4 = s0 * y2;
544                 out6 = s0 * y3;
545                 out1 = -(s0 * x0);
546                 out3 = -(s0 * x1);
547                 out5 = -(s0 * x2);
548                 out7 = -(s0 * x3);
549
550                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
551                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
552
553                 LD_SP4_INC(px, 4, x4, x5, x6, x7);
554                 LD_SP4_INC(py, 4, y4, y5, y6, y7);
555
556                 out0 = s0 * y4;
557                 out2 = s0 * y5;
558                 out4 = s0 * y6;
559                 out6 = s0 * y7;
560                 out1 = -(s0 * x4);
561                 out3 = -(s0 * x5);
562                 out5 = -(s0 * x6);
563                 out7 = -(s0 * x7);
564
565                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
566                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
567             }
568             if (n & 16)
569             {
570                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
571                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
572
573                 out0 = s0 * y0;
574                 out1 = - (s0 * x0);
575                 out2 = s0 * y1;
576                 out3 = - (s0 * x1);
577                 out4 = s0 * y2;
578                 out5 = - (s0 * x2);
579                 out6 = s0 * y3;
580                 out7 = - (s0 * x3);
581
582                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
583                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
584             }
585             if (n & 8)
586             {
587                 LD_SP2_INC(px, 4, x0, x1);
588                 LD_SP2_INC(py, 4, y0, y1);
589
590                 out0 = s0 * y0;
591                 out1 = - (s0 * x0);
592                 out2 = s0 * y1;
593                 out3 = - (s0 * x1);
594
595                 ST_SP2_INC(out0, out2, x, 4);
596                 ST_SP2_INC(out1, out3, y, 4);
597             }
598             if (n & 4)
599             {
600                 x0 = LD_SP(px); px += 4;
601                 y0 = LD_SP(py); py += 4;
602
603                 out0 = s0 * y0;
604                 out1 = - (s0 * x0);
605
606                 ST_SP(out0, x); x += 4;
607                 ST_SP(out1, y); y += 4;
608             }
609             if (n & 2)
610             {
611                 LD_GP2_INC(px, 1, fx0, fx1);
612                 LD_GP2_INC(py, 1, fy0, fy1);
613
614                 tp0 = s * fy0;
615                 tp1 = - (s * fx0);
616                 tp2 = s * fy1;
617                 tp3 = - (s * fx1);
618
619                 ST_GP2_INC(tp0, tp2, x, 1);
620                 ST_GP2_INC(tp1, tp3, y, 1);
621             }
622             if (n & 1)
623             {
624                 fx0 = *px;
625                 fy0 = *py;
626
627                 tp0 = s * fy0;
628                 tp1 = - (s * fx0);
629
630                 *x = tp0;
631                 *y = tp1;
632             }
633         }
634         else
635         {
636             c0 = COPY_FLOAT_TO_VECTOR(c);
637             s0 = COPY_FLOAT_TO_VECTOR(s);
638
639             /* process 32 floats of x and of y per iteration (n >> 5 blocks) */
640             if (n >> 5)
641             {
642                 BLASLONG pref_offsetx, pref_offsety;
643
644                 pref_offsetx = (BLASLONG)px & (L1_DATA_LINESIZE - 1);
645                 if (pref_offsetx > 0)
646                 {
647                     pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
648                     pref_offsetx = pref_offsetx / sizeof(FLOAT);
649                 }
650
651                 pref_offsety = (BLASLONG)py & (L1_DATA_LINESIZE - 1);
652                 if (pref_offsety > 0)
653                 {
654                     pref_offsety = L1_DATA_LINESIZE - pref_offsety;
655                     pref_offsety = pref_offsety / sizeof(FLOAT);
656                 }
657
658                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
659                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
660
661                 for (j = (n >> 5) - 1; j--;)
662                 {
663                     PREFETCH(px + pref_offsetx + 32);
664                     PREFETCH(px + pref_offsetx + 40);
665                     PREFETCH(px + pref_offsetx + 48);
666                     PREFETCH(px + pref_offsetx + 56);
667                     PREFETCH(py + pref_offsety + 32);
668                     PREFETCH(py + pref_offsety + 40);
669                     PREFETCH(py + pref_offsety + 48);
670                     PREFETCH(py + pref_offsety + 56);
671
672                     x4 = LD_SP(px); px += 4;
673                     out0 = c0 * x0;
674                     x5 = LD_SP(px); px += 4;
675                     out2 = c0 * x1;
676                     x6 = LD_SP(px); px += 4;
677                     out4 = c0 * x2;
678                     x7 = LD_SP(px); px += 4;
679                     out6 = c0 * x3;
680                     y4 = LD_SP(py); py += 4;
681                     out1 = c0 * y0;
682                     y5 = LD_SP(py); py += 4;
683                     out3 = c0 * y1;
684                     y6 = LD_SP(py); py += 4;
685                     out5 = c0 * y2;
686                     y7 = LD_SP(py); py += 4;
687                     out7 = c0 * y3;
688
689                     out0 += s0 * y0;
690                     out2 += s0 * y1;
691                     out4 += s0 * y2;
692                     out6 += s0 * y3;
693                     out1 -= s0 * x0;
694                     out3 -= s0 * x1;
695                     out5 -= s0 * x2;
696                     out7 -= s0 * x3;
697
698                     ST_SP(out0, x); x += 4;
699                     out0 = c0 * x4;
700                     ST_SP(out2, x); x += 4;
701                     out2 = c0 * x5;
702                     ST_SP(out4, x); x += 4;
703                     out4 = c0 * x6;
704                     ST_SP(out6, x); x += 4;
705                     out6 = c0 * x7;
706                     ST_SP(out1, y); y += 4;
707                     out1 = c0 * y4;
708                     ST_SP(out3, y); y += 4;
709                     out3 = c0 * y5;
710                     ST_SP(out5, y); y += 4;
711                     out5 = c0 * y6;
712                     ST_SP(out7, y); y += 4;
713                     out7 = c0 * y7;
714
715                     x0 = LD_SP(px); px += 4;
716                     out0 += s0 * y4;
717                     x1 = LD_SP(px); px += 4;
718                     out2 += s0 * y5;
719                     x2 = LD_SP(px); px += 4;
720                     out4 += s0 * y6;
721                     x3 = LD_SP(px); px += 4;
722                     out6 += s0 * y7;
723                     y0 = LD_SP(py); py += 4;
724                     out1 -= s0 * x4;
725                     y1 = LD_SP(py); py += 4;
726                     out3 -= s0 * x5;
727                     y2 = LD_SP(py); py += 4;
728                     out5 -= s0 * x6;
729                     y3 = LD_SP(py); py += 4;
730                     out7 -= s0 * x7;
731
732                     ST_SP4_INC(out0, out2, out4, out6, x, 4);
733                     ST_SP4_INC(out1, out3, out5, out7, y, 4);
734                 }
735
736                 out0 = c0 * x0;
737                 out2 = c0 * x1;
738                 out4 = c0 * x2;
739                 out6 = c0 * x3;
740                 out1 = c0 * y0;
741                 out3 = c0 * y1;
742                 out5 = c0 * y2;
743                 out7 = c0 * y3;
744
745                 out0 += s0 * y0;
746                 out2 += s0 * y1;
747                 out4 += s0 * y2;
748                 out6 += s0 * y3;
749                 out1 -= s0 * x0;
750                 out3 -= s0 * x1;
751                 out5 -= s0 * x2;
752                 out7 -= s0 * x3;
753
754                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
755                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
756
757                 LD_SP4_INC(px, 4, x4, x5, x6, x7);
758                 LD_SP4_INC(py, 4, y4, y5, y6, y7);
759
760                 out0 = c0 * x4;
761                 out2 = c0 * x5;
762                 out4 = c0 * x6;
763                 out6 = c0 * x7;
764                 out1 = c0 * y4;
765                 out3 = c0 * y5;
766                 out5 = c0 * y6;
767                 out7 = c0 * y7;
768
769                 out0 += s0 * y4;
770                 out2 += s0 * y5;
771                 out4 += s0 * y6;
772                 out6 += s0 * y7;
773                 out1 -= s0 * x4;
774                 out3 -= s0 * x5;
775                 out5 -= s0 * x6;
776                 out7 -= s0 * x7;
777
778                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
779                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
780             }
781             if (n & 16)
782             {
783                 LD_SP4_INC(px, 4, x0, x1, x2, x3);
784                 LD_SP4_INC(py, 4, y0, y1, y2, y3);
785
786                 out0 = (c0 * x0) + (s0 * y0);
787                 out1 = (c0 * y0) - (s0 * x0);
788                 out2 = (c0 * x1) + (s0 * y1);
789                 out3 = (c0 * y1) - (s0 * x1);
790                 out4 = (c0 * x2) + (s0 * y2);
791                 out5 = (c0 * y2) - (s0 * x2);
792                 out6 = (c0 * x3) + (s0 * y3);
793                 out7 = (c0 * y3) - (s0 * x3);
794
795                 ST_SP4_INC(out0, out2, out4, out6, x, 4);
796                 ST_SP4_INC(out1, out3, out5, out7, y, 4);
797             }
798             if (n & 8)
799             {
800                 LD_SP2_INC(px, 4, x0, x1);
801                 LD_SP2_INC(py, 4, y0, y1);
802
803                 out0 = (c0 * x0) + (s0 * y0);
804                 out1 = (c0 * y0) - (s0 * x0);
805                 out2 = (c0 * x1) + (s0 * y1);
806                 out3 = (c0 * y1) - (s0 * x1);
807
808                 ST_SP2_INC(out0, out2, x, 4);
809                 ST_SP2_INC(out1, out3, y, 4);
810             }
811             if (n & 4)
812             {
813                 x0 = LD_SP(px);
814                 y0 = LD_SP(py);
815                 px += 4;
816                 py += 4;
817
818                 out0 = (c0 * x0) + (s0 * y0);
819                 out1 = (c0 * y0) - (s0 * x0);
820
821                 ST_SP(out0, x);
822                 ST_SP(out1, y);
823                 x += 4;
824                 y += 4;
825             }
826             if (n & 2)
827             {
828                 LD_GP2_INC(px, 1, fx0, fx1);
829                 LD_GP2_INC(py, 1, fy0, fy1);
830
831                 tp0 = (c * fx0) + (s * fy0);
832                 tp1 = (c * fy0) - (s * fx0);
833                 tp2 = (c * fx1) + (s * fy1);
834                 tp3 = (c * fy1) - (s * fx1);
835
836                 ST_GP2_INC(tp0, tp2, x, 1);
837                 ST_GP2_INC(tp1, tp3, y, 1);
838             }
839             if (n & 1)
840             {
841                 fx0 = *px;
842                 fy0 = *py;
843
844                 tp0 = (c * fx0) + (s * fy0);
845                 tp1 = (c * fy0) - (s * fx0);
846
847                 *x = tp0;
848                 *y = tp1;
849             }
850         }
851     }
852     else
853     {
854         if ((0 == c) && (0 == s))
855         {
856             for (i = n; i--;)
857             {
858                 *x = 0;
859                 *y = 0;
860                 x += inc_x;
861                 y += inc_y;
862             }
863         }
864         else if ((1 == c) && (1 == s))
865         {
866             if (n >> 2)
867             {
868                 fx0 = *px; px += inc_x;
869                 fx1 = *px; px += inc_x;
870                 fx2 = *px; px += inc_x;
871                 fx3 = *px; px += inc_x;
872                 fy0 = *py; py += inc_y;
873                 fy1 = *py; py += inc_y;
874                 fy2 = *py; py += inc_y;
875                 fy3 = *py; py += inc_y;
876
877                 for (i = (n >> 2) -1; i--;)
878                 {
879                     tp0 = fx0 + fy0;
880                     tp1 = fy0 - fx0;
881                     tp2 = fx1 + fy1;
882                     tp3 = fy1 - fx1;
883                     tp4 = fx2 + fy2;
884                     tp5 = fy2 - fx2;
885                     tp6 = fx3 + fy3;
886                     tp7 = fy3 - fx3;
887
888                     fx0 = *px; px += inc_x;
889                     *x = tp0; x += inc_x;
890                     fx1 = *px; px += inc_x;
891                     *x = tp2; x += inc_x;
892                     fx2 = *px; px += inc_x;
893                     *x = tp4; x += inc_x;
894                     fx3 = *px; px += inc_x;
895                     *x = tp6; x += inc_x;
896                     fy0 = *py; py += inc_y;
897                     *y = tp1; y += inc_y;
898                     fy1 = *py; py += inc_y;
899                     *y = tp3; y += inc_y;
900                     fy2 = *py; py += inc_y;
901                     *y = tp5; y += inc_y;
902                     fy3 = *py; py += inc_y;
903                     *y = tp7; y += inc_y;
904                 }
905
906                 tp0 = fx0 + fy0;
907                 tp1 = fy0 - fx0;
908                 tp2 = fx1 + fy1;
909                 tp3 = fy1 - fx1;
910                 tp4 = fx2 + fy2;
911                 tp5 = fy2 - fx2;
912                 tp6 = fx3 + fy3;
913                 tp7 = fy3 - fx3;
914
915                 *x = tp0; x += inc_x;
916                 *x = tp2; x += inc_x;
917                 *x = tp4; x += inc_x;
918                 *x = tp6; x += inc_x;
919                 *y = tp1; y += inc_y;
920                 *y = tp3; y += inc_y;
921                 *y = tp5; y += inc_y;
922                 *y = tp7; y += inc_y;
923             }
924
925             if (n & 2)
926             {
927                 LD_GP2_INC(px, inc_x, fx0, fx1);
928                 LD_GP2_INC(py, inc_y, fy0, fy1);
929
930                 tp0 = fx0 + fy0;
931                 tp1 = fy0 - fx0;
932                 tp2 = fx1 + fy1;
933                 tp3 = fy1 - fx1;
934
935                 ST_GP2_INC(tp0, tp2, x, inc_x);
936                 ST_GP2_INC(tp1, tp3, y, inc_y);
937             }
938             if (n & 1)
939             {
940                 fx0 = *px;
941                 fy0 = *py;
942
943                 tp0 = fx0 + fy0;
944                 tp1 = fy0 - fx0;
945
946                 *x = tp0;
947                 *y = tp1;
948             }
949         }
950         else if (0 == s)
951         {
952             if (n >> 2)
953             {
954                 fx0 = *px; px += inc_x;
955                 fx1 = *px; px += inc_x;
956                 fx2 = *px; px += inc_x;
957                 fx3 = *px; px += inc_x;
958                 fy0 = *py; py += inc_y;
959                 fy1 = *py; py += inc_y;
960                 fy2 = *py; py += inc_y;
961                 fy3 = *py; py += inc_y;
962
963                 for (i = (n >> 2) - 1; i--;)
964                 {
965                     tp0 = c * fx0;
966                     tp1 = c * fy0;
967                     tp2 = c * fx1;
968                     tp3 = c * fy1;
969                     tp4 = c * fx2;
970                     tp5 = c * fy2;
971                     tp6 = c * fx3;
972                     tp7 = c * fy3;
973
974                     fx0 = *px; px += inc_x;
975                     *x = tp0; x += inc_x;
976                     fx1 = *px; px += inc_x;
977                     *x = tp2; x += inc_x;
978                     fx2 = *px; px += inc_x;
979                     *x = tp4; x += inc_x;
980                     fx3 = *px; px += inc_x;
981                     *x = tp6; x += inc_x;
982                     fy0 = *py; py += inc_y;
983                     *y = tp1; y += inc_y;
984                     fy1 = *py; py += inc_y;
985                     *y = tp3; y += inc_y;
986                     fy2 = *py; py += inc_y;
987                     *y = tp5; y += inc_y;
988                     fy3 = *py; py += inc_y;
989                     *y = tp7; y += inc_y;
990                 }
991
992                 tp0 = c * fx0;
993                 tp1 = c * fy0;
994                 tp2 = c * fx1;
995                 tp3 = c * fy1;
996                 tp4 = c * fx2;
997                 tp5 = c * fy2;
998                 tp6 = c * fx3;
999                 tp7 = c * fy3;
1000
1001                 *x = tp0; x += inc_x;
1002                 *x = tp2; x += inc_x;
1003                 *x = tp4; x += inc_x;
1004                 *x = tp6; x += inc_x;
1005                 *y = tp1; y += inc_y;
1006                 *y = tp3; y += inc_y;
1007                 *y = tp5; y += inc_y;
1008                 *y = tp7; y += inc_y;
1009             }
1010             if (n & 2)
1011             {
1012                 LD_GP2_INC(px, inc_x, fx0, fx1);
1013                 LD_GP2_INC(py, inc_y, fy0, fy1);
1014
1015                 tp0 = c * fx0;
1016                 tp1 = c * fy0;
1017                 tp2 = c * fx1;
1018                 tp3 = c * fy1;
1019
1020                 ST_GP2_INC(tp0, tp2, x, inc_x);
1021                 ST_GP2_INC(tp1, tp3, y, inc_y);
1022             }
1023             if (n & 1)
1024             {
1025                 fx0 = *px;
1026                 fy0 = *py;
1027
1028                 tp0 = c * fx0;
1029                 tp1 = c * fy0;
1030
1031                 *x = tp0;
1032                 *y = tp1;
1033             }
1034         }
1035         else
1036         {
1037             if (n >> 2)
1038             {
1039                 fx0 = *px; px += inc_x;
1040                 fx1 = *px; px += inc_x;
1041                 fx2 = *px; px += inc_x;
1042                 fx3 = *px; px += inc_x;
1043                 fy0 = *py; py += inc_y;
1044                 fy1 = *py; py += inc_y;
1045                 fy2 = *py; py += inc_y;
1046                 fy3 = *py; py += inc_y;
1047
1048                 for (i = (n >> 2) - 1; i--;)
1049                 {
1050                     tp0 = c * fx0 + s * fy0;
1051                     tp1 = c * fy0 - s * fx0;
1052                     tp2 = c * fx1 + s * fy1;
1053                     tp3 = c * fy1 - s * fx1;
1054                     tp4 = c * fx2 + s * fy2;
1055                     tp5 = c * fy2 - s * fx2;
1056                     tp6 = c * fx3 + s * fy3;
1057                     tp7 = c * fy3 - s * fx3;
1058
1059                     fx0 = *px; px += inc_x;
1060                     *x = tp0; x += inc_x;
1061                     fx1 = *px; px += inc_x;
1062                     *x = tp2; x += inc_x;
1063                     fx2 = *px; px += inc_x;
1064                     *x = tp4; x += inc_x;
1065                     fx3 = *px; px += inc_x;
1066                     *x = tp6; x += inc_x;
1067                     fy0 = *py; py += inc_y;
1068                     *y = tp1; y += inc_y;
1069                     fy1 = *py; py += inc_y;
1070                     *y = tp3; y += inc_y;
1071                     fy2 = *py; py += inc_y;
1072                     *y = tp5; y += inc_y;
1073                     fy3 = *py; py += inc_y;
1074                     *y = tp7; y += inc_y;
1075                 }
1076
1077                 tp0 = c * fx0 + s * fy0;
1078                 tp1 = c * fy0 - s * fx0;
1079                 tp2 = c * fx1 + s * fy1;
1080                 tp3 = c * fy1 - s * fx1;
1081                 tp4 = c * fx2 + s * fy2;
1082                 tp5 = c * fy2 - s * fx2;
1083                 tp6 = c * fx3 + s * fy3;
1084                 tp7 = c * fy3 - s * fx3;
1085
1086                 *x = tp0; x += inc_x;
1087                 *x = tp2; x += inc_x;
1088                 *x = tp4; x += inc_x;
1089                 *x = tp6; x += inc_x;
1090                 *y = tp1; y += inc_y;
1091                 *y = tp3; y += inc_y;
1092                 *y = tp5; y += inc_y;
1093                 *y = tp7; y += inc_y;
1094             }
1095             if (n & 2)
1096             {
1097                 LD_GP2_INC(px, inc_x, fx0, fx1);
1098                 LD_GP2_INC(py, inc_y, fy0, fy1);
1099
1100                 tp0 = c * fx0 + s * fy0;
1101                 tp1 = c * fy0 - s * fx0;
1102                 tp2 = c * fx1 + s * fy1;
1103                 tp3 = c * fy1 - s * fx1;
1104
1105                 ST_GP2_INC(tp0, tp2, x, inc_x);
1106                 ST_GP2_INC(tp1, tp3, y, inc_y);
1107             }
1108             if (n & 1)
1109             {
1110                 fx0 = *px;
1111                 fy0 = *py;
1112
1113                 tp0 = c * fx0 + s * fy0;
1114                 tp1 = c * fy0 - s * fx0;
1115
1116                 *x = tp0;
1117                 *y = tp1;
1118             }
1119         }
1120     }
1121
1122     return 0;
1123 }