fix build error
[platform/upstream/openblas.git] / kernel / mips / macros_msa.h
1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
3 All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
6 met:
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
12 distribution.
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
27
28 #ifndef __MACROS_MSA_H__
29 #define __MACROS_MSA_H__
30
31 #include <stdint.h>
32 #include <msa.h>
33
34 #define ENABLE_PREFETCH
35
#ifdef ENABLE_PREFETCH

/* Issue a "pref" instruction with hint 0 on the address held in 'src'.
   The asm statement has no outputs; 'src' is passed in a register. */
static inline void prefetch_load_lf(unsigned char *src)
{
    __asm__ __volatile__(
        "pref   0,  0(%[src])   \n\t"
        :
        : [src] "r" (src)
    );
}

/* Prefetch the location PTR points to.
   NOTE: the expansion already ends in ';'. */
#define PREFETCH(PTR)   prefetch_load_lf((unsigned char *)(PTR));

/* Stringify the argument. */
#define STRNG(X) #X

/* Prefetch the address 'offset(src_ptr)'. 'offset' is pasted into the
   instruction text, so it has to be an integer constant once the
   preprocessor has expanded it.
   NOTE: the expansion already ends in ';'. */
#define PREF_OFFSET(src_ptr, offset)                                \
    __asm__ __volatile__("pref 0, " STRNG(offset) "(%[src]) \n\t"   \
                         : : [src] "r" (src_ptr));

#else  /* !ENABLE_PREFETCH */

/* Prefetching disabled: both macros expand to nothing. */
#define PREFETCH(PTR)
#define PREF_OFFSET(src_ptr, offset)

#endif  /* ENABLE_PREFETCH */
52
/* Description : Load / store one object of type RTYPE through a pointer cast
   Arguments   : RTYPE       - type used for the access
                 psrc / pdst - address to read from / write to
                 in          - value to store
   Details     : LD_W / LD_D yield the RTYPE object located at 'psrc'.
                 ST_W / ST_D assign 'in' to the RTYPE object located at 'pdst'.
                 LD_SP / ST_SP fix RTYPE to v4f32, LD_DP / ST_DP to v2f64.
                 Every expansion is wrapped in parentheses so that it behaves
                 as a single operand when combined with other operators
                 (e.g. a subscript applied to the result of LD_W).
*/
#define LD_W(RTYPE, psrc) (*((RTYPE *)(psrc)))
#define LD_SP(...) LD_W(v4f32, __VA_ARGS__)

#define LD_D(RTYPE, psrc) (*((RTYPE *)(psrc)))
#define LD_DP(...) LD_D(v2f64, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SP(...) ST_W(v4f32, __VA_ARGS__)

#define ST_D(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
64
/* Description : Build a v4f32 from a scalar float
   Arguments   : Input  - a
                 Return - v4f32
   Details     : 'a' is converted with __msa_cast_to_vector_float and element 0
                 of that vector is then replicated with __msa_splati_w.
                 The temporary uses an '_m' suffix so that an argument that
                 happens to be spelled 'out' is not captured by the macro's
                 own local variable.
*/
#define COPY_FLOAT_TO_VECTOR(a) ( {                          \
    v4f32 splat_m;                                           \
    splat_m = __msa_cast_to_vector_float(a);                 \
    splat_m = (v4f32) __msa_splati_w((v4i32) splat_m, 0);    \
    splat_m;                                                 \
} )
71
/* Description : Build a v2f64 from a scalar double
   Arguments   : Input  - a
                 Return - v2f64
   Details     : 'a' is converted with __msa_cast_to_vector_double and element 0
                 of that vector is then replicated with __msa_splati_d.
                 The temporary uses an '_m' suffix so that an argument that
                 happens to be spelled 'out' is not captured by the macro's
                 own local variable.
*/
#define COPY_DOUBLE_TO_VECTOR(a) ( {                         \
    v2f64 splat_m;                                           \
    splat_m = __msa_cast_to_vector_double(a);                \
    splat_m = (v2f64) __msa_splati_d((v2i64) splat_m, 0);    \
    splat_m;                                                 \
} )
78
/* Description : Load N values through a pointer, advancing it after each load
   Arguments   : Inputs  - psrc (modifiable pointer lvalue), stride
                 Outputs - out0 .. out(N-1)
   Details     : For every output, in order: '*psrc' is assigned to it and
                 'stride' is then added to 'psrc'. LD_GPn_INC therefore
                 leaves 'psrc' advanced by n strides.
*/
#define LD_GP2_INC(psrc, stride, out0, out1)   \
{                                              \
    out0 = *(psrc);  (psrc) += stride;         \
    out1 = *(psrc);  (psrc) += stride;         \
}

#define LD_GP3_INC(psrc, stride, out0, out1, out2)   \
{                                                    \
    out0 = *(psrc);  (psrc) += stride;               \
    out1 = *(psrc);  (psrc) += stride;               \
    out2 = *(psrc);  (psrc) += stride;               \
}

#define LD_GP4_INC(psrc, stride, out0, out1, out2, out3)   \
{                                                          \
    LD_GP3_INC(psrc, stride, out0, out1, out2);            \
    out3 = *(psrc);  (psrc) += stride;                     \
}

#define LD_GP5_INC(psrc, stride, out0, out1, out2, out3, out4)   \
{                                                                \
    LD_GP4_INC(psrc, stride, out0, out1, out2, out3);            \
    out4 = *(psrc);  (psrc) += stride;                           \
}

#define LD_GP6_INC(psrc, stride, out0, out1, out2,   \
                   out3, out4, out5)                 \
{                                                    \
    LD_GP3_INC(psrc, stride, out0, out1, out2);      \
    LD_GP3_INC(psrc, stride, out3, out4, out5);      \
}

#define LD_GP7_INC(psrc, stride, out0, out1, out2,       \
                   out3, out4, out5, out6)               \
{                                                        \
    LD_GP4_INC(psrc, stride, out0, out1, out2, out3);    \
    LD_GP3_INC(psrc, stride, out4, out5, out6);          \
}

#define LD_GP8_INC(psrc, stride, out0, out1, out2, out3,             \
                   out4, out5, out6, out7)                           \
{                                                                    \
    LD_GP6_INC(psrc, stride, out0, out1, out2, out3, out4, out5);    \
    LD_GP2_INC(psrc, stride, out6, out7);                            \
}
141
/* Description : Load vectors of single precision floating point elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD_SP2); out0 .. out3 (LD_SP4)
                 Return Type - single precision floating point
   Details     : 'outK' is loaded with LD_SP from (psrc + K * stride).
                 'psrc' itself is not modified.
                 'psrc' and 'stride' are parenthesized wherever they take part
                 in address arithmetic, so expression arguments (for example a
                 stride written as 'n + 1') are evaluated as a unit.
*/
#define LD_SP2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SP((psrc));                 \
    out1 = LD_SP((psrc) + (stride));      \
}

#define LD_SP4(psrc, stride, out0, out1, out2, out3)        \
{                                                           \
    LD_SP2((psrc), (stride), out0, out1);                   \
    LD_SP2((psrc) + 2 * (stride), (stride), out2, out3);    \
}
158
/* Description : Load N single precision vectors, advancing the pointer after
                 each load
   Arguments   : Inputs  - psrc (modifiable pointer lvalue), stride
                 Outputs - out0 .. out(N-1)
   Details     : For every output, in order: it is loaded with LD_SP from the
                 current 'psrc', then 'stride' is added to 'psrc'.
*/
#define LD_SP2_INC(psrc, stride, out0, out1)    \
{                                               \
    out0 = LD_SP((psrc));  (psrc) += stride;    \
    out1 = LD_SP((psrc));  (psrc) += stride;    \
}

#define LD_SP3_INC(psrc, stride, out0, out1, out2)   \
{                                                    \
    out0 = LD_SP((psrc));  (psrc) += stride;         \
    out1 = LD_SP((psrc));  (psrc) += stride;         \
    out2 = LD_SP((psrc));  (psrc) += stride;         \
}

#define LD_SP4_INC(psrc, stride, out0, out1, out2, out3)   \
{                                                          \
    LD_SP3_INC(psrc, stride, out0, out1, out2);            \
    out3 = LD_SP((psrc));  (psrc) += stride;               \
}

#define LD_SP5_INC(psrc, stride, out0, out1, out2, out3, out4)   \
{                                                                \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);            \
    out4 = LD_SP((psrc));  (psrc) += stride;                     \
}

#define LD_SP6_INC(psrc, stride, out0, out1, out2,   \
                   out3, out4, out5)                 \
{                                                    \
    LD_SP3_INC(psrc, stride, out0, out1, out2);      \
    LD_SP3_INC(psrc, stride, out3, out4, out5);      \
}

#define LD_SP7_INC(psrc, stride, out0, out1, out2,       \
                   out3, out4, out5, out6)               \
{                                                        \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);    \
    LD_SP3_INC(psrc, stride, out4, out5, out6);          \
}

#define LD_SP8_INC(psrc, stride, out0, out1, out2, out3,             \
                   out4, out5, out6, out7)                           \
{                                                                    \
    LD_SP6_INC(psrc, stride, out0, out1, out2, out3, out4, out5);    \
    LD_SP2_INC(psrc, stride, out6, out7);                            \
}

#define LD_SP16_INC(psrc, stride, out0, out1, out2, out3,        \
                    out4, out5, out6, out7, out8, out9,          \
                    out10, out11, out12, out13, out14, out15)    \
{                                                                \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);            \
    LD_SP4_INC(psrc, stride, out4, out5, out6, out7);            \
    LD_SP4_INC(psrc, stride, out8, out9, out10, out11);          \
    LD_SP4_INC(psrc, stride, out12, out13, out14, out15);        \
}
228
/* Description : Load vectors of double precision floating point elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD_DP2); out0 .. out3 (LD_DP4)
                 Return Type - double precision floating point
   Details     : 'outK' is loaded with LD_DP from (psrc + K * stride).
                 'psrc' itself is not modified.
                 'psrc' and 'stride' are parenthesized wherever they take part
                 in address arithmetic, so expression arguments (for example a
                 stride written as 'n + 1') are evaluated as a unit.
*/
#define LD_DP2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_DP((psrc));                 \
    out1 = LD_DP((psrc) + (stride));      \
}

#define LD_DP4(psrc, stride, out0, out1, out2, out3)        \
{                                                           \
    LD_DP2((psrc), (stride), out0, out1);                   \
    LD_DP2((psrc) + 2 * (stride), (stride), out2, out3);    \
}
245
/* Description : Load N double precision vectors, advancing the pointer after
                 each load
   Arguments   : Inputs  - psrc (modifiable pointer lvalue), stride
                 Outputs - out0 .. out(N-1)
   Details     : For every output, in order: it is loaded with LD_DP from the
                 current 'psrc', then 'stride' is added to 'psrc'.
*/
#define LD_DP2_INC(psrc, stride, out0, out1)    \
{                                               \
    out0 = LD_DP((psrc));  (psrc) += stride;    \
    out1 = LD_DP((psrc));  (psrc) += stride;    \
}

#define LD_DP3_INC(psrc, stride, out0, out1, out2)   \
{                                                    \
    out0 = LD_DP((psrc));  (psrc) += stride;         \
    out1 = LD_DP((psrc));  (psrc) += stride;         \
    out2 = LD_DP((psrc));  (psrc) += stride;         \
}

#define LD_DP4_INC(psrc, stride, out0, out1, out2, out3)   \
{                                                          \
    LD_DP3_INC(psrc, stride, out0, out1, out2);            \
    out3 = LD_DP((psrc));  (psrc) += stride;               \
}

#define LD_DP5_INC(psrc, stride, out0, out1, out2, out3, out4)   \
{                                                                \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);            \
    out4 = LD_DP((psrc));  (psrc) += stride;                     \
}

#define LD_DP6_INC(psrc, stride, out0, out1, out2,   \
                   out3, out4, out5)                 \
{                                                    \
    LD_DP3_INC(psrc, stride, out0, out1, out2);      \
    LD_DP3_INC(psrc, stride, out3, out4, out5);      \
}

#define LD_DP7_INC(psrc, stride, out0, out1, out2,       \
                   out3, out4, out5, out6)               \
{                                                        \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);    \
    LD_DP3_INC(psrc, stride, out4, out5, out6);          \
}

#define LD_DP8_INC(psrc, stride, out0, out1, out2, out3,             \
                   out4, out5, out6, out7)                           \
{                                                                    \
    LD_DP6_INC(psrc, stride, out0, out1, out2, out3, out4, out5);    \
    LD_DP2_INC(psrc, stride, out6, out7);                            \
}

#define LD_DP16_INC(psrc, stride, out0, out1, out2, out3,        \
                    out4, out5, out6, out7, out8, out9,          \
                    out10, out11, out12, out13, out14, out15)    \
{                                                                \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);            \
    LD_DP4_INC(psrc, stride, out4, out5, out6, out7);            \
    LD_DP4_INC(psrc, stride, out8, out9, out10, out11);          \
    LD_DP4_INC(psrc, stride, out12, out13, out14, out15);        \
}
315
/* Description : Store N values through a pointer, advancing it after each store
   Arguments   : Inputs - in0 .. in(N-1), pdst (modifiable pointer lvalue),
                          stride
   Details     : For every input, in order: it is assigned to '*pdst' and
                 'stride' is then added to 'pdst'. ST_GPn_INC therefore
                 leaves 'pdst' advanced by n strides.
*/
#define ST_GP2_INC(in0, in1, pdst, stride)   \
{                                            \
    *(pdst) = in0;  (pdst) += stride;        \
    *(pdst) = in1;  (pdst) += stride;        \
}

#define ST_GP3_INC(in0, in1, in2, pdst, stride)   \
{                                                 \
    *(pdst) = in0;  (pdst) += stride;             \
    *(pdst) = in1;  (pdst) += stride;             \
    *(pdst) = in2;  (pdst) += stride;             \
}

#define ST_GP4_INC(in0, in1, in2, in3, pdst, stride)   \
{                                                      \
    ST_GP3_INC(in0, in1, in2, pdst, stride);           \
    *(pdst) = in3;  (pdst) += stride;                  \
}

#define ST_GP5_INC(in0, in1, in2, in3, in4, pdst, stride)   \
{                                                           \
    ST_GP4_INC(in0, in1, in2, in3, pdst, stride);           \
    *(pdst) = in4;  (pdst) += stride;                       \
}

#define ST_GP6_INC(in0, in1, in2, in3, in4, in5,   \
                   pdst, stride)                   \
{                                                  \
    ST_GP3_INC(in0, in1, in2, pdst, stride);       \
    ST_GP3_INC(in3, in4, in5, pdst, stride);       \
}

#define ST_GP7_INC(in0, in1, in2, in3, in4, in5, in6,   \
                   pdst, stride)                        \
{                                                       \
    ST_GP4_INC(in0, in1, in2, in3, pdst, stride);       \
    ST_GP3_INC(in4, in5, in6, pdst, stride);            \
}

#define ST_GP8_INC(in0, in1, in2, in3, in4, in5, in6, in7,   \
                   pdst, stride)                             \
{                                                            \
    ST_GP6_INC(in0, in1, in2, in3, in4, in5, pdst, stride);  \
    ST_GP2_INC(in6, in7, pdst, stride);                      \
}
378
/* Description : Store vectors of single precision floating point elements with stride
   Arguments   : Inputs - in0 .. in(N-1), pdst, stride
   Details     : 'inK' is stored with ST_SP to (pdst + K * stride).
                 'pdst' itself is not modified.
                 'pdst' and 'stride' are parenthesized wherever they take part
                 in address arithmetic, so expression arguments (for example a
                 stride written as 'n + 1') are evaluated as a unit.
*/
#define ST_SP2(in0, in1, pdst, stride)  \
{                                       \
    ST_SP(in0, (pdst));                 \
    ST_SP(in1, (pdst) + (stride));      \
}

#define ST_SP4(in0, in1, in2, in3, pdst, stride)          \
{                                                         \
    ST_SP2(in0, in1, (pdst), (stride));                   \
    ST_SP2(in2, in3, (pdst) + 2 * (stride), (stride));    \
}

#define ST_SP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                     \
    ST_SP4(in0, in1, in2, in3, (pdst), (stride));                     \
    ST_SP4(in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));      \
}
401
/* Description : Store N single precision vectors, advancing the pointer after
                 each store
   Arguments   : Inputs - in0 .. in(N-1), pdst (modifiable pointer lvalue),
                          stride
   Details     : For every input, in order: it is stored with ST_SP to the
                 current 'pdst', then 'stride' is added to 'pdst'.
*/
#define ST_SP2_INC(in0, in1, pdst, stride)     \
{                                              \
    ST_SP(in0, (pdst));  (pdst) += stride;     \
    ST_SP(in1, (pdst));  (pdst) += stride;     \
}

#define ST_SP3_INC(in0, in1, in2, pdst, stride)   \
{                                                 \
    ST_SP(in0, (pdst));  (pdst) += stride;        \
    ST_SP(in1, (pdst));  (pdst) += stride;        \
    ST_SP(in2, (pdst));  (pdst) += stride;        \
}

#define ST_SP4_INC(in0, in1, in2, in3, pdst, stride)   \
{                                                      \
    ST_SP3_INC(in0, in1, in2, pdst, stride);           \
    ST_SP(in3, (pdst));  (pdst) += stride;             \
}

#define ST_SP5_INC(in0, in1, in2, in3, in4, pdst, stride)   \
{                                                           \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);           \
    ST_SP(in4, (pdst));  (pdst) += stride;                  \
}

#define ST_SP6_INC(in0, in1, in2, in3, in4, in5,   \
                   pdst, stride)                   \
{                                                  \
    ST_SP3_INC(in0, in1, in2, pdst, stride);       \
    ST_SP3_INC(in3, in4, in5, pdst, stride);       \
}

#define ST_SP7_INC(in0, in1, in2, in3, in4, in5, in6,   \
                   pdst, stride)                        \
{                                                       \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);       \
    ST_SP3_INC(in4, in5, in6, pdst, stride);            \
}

#define ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6, in7,   \
                   pdst, stride)                             \
{                                                            \
    ST_SP6_INC(in0, in1, in2, in3, in4, in5, pdst, stride);  \
    ST_SP2_INC(in6, in7, pdst, stride);                      \
}

#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6, in7,    \
                    in8, in9, in10, in11, in12, in13, in14,    \
                    in15, pdst, stride)                        \
{                                                              \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);              \
    ST_SP4_INC(in4, in5, in6, in7, pdst, stride);              \
    ST_SP4_INC(in8, in9, in10, in11, pdst, stride);            \
    ST_SP4_INC(in12, in13, in14, in15, pdst, stride);          \
}
468
/* Description : Store vectors of double precision floating point elements with stride
   Arguments   : Inputs - in0 .. in(N-1), pdst, stride
   Details     : 'inK' is stored with ST_DP to (pdst + K * stride).
                 'pdst' itself is not modified.
                 'pdst' and 'stride' are parenthesized wherever they take part
                 in address arithmetic, so expression arguments (for example a
                 stride written as 'n + 1') are evaluated as a unit.
*/
#define ST_DP2(in0, in1, pdst, stride)  \
{                                       \
    ST_DP(in0, (pdst));                 \
    ST_DP(in1, (pdst) + (stride));      \
}

#define ST_DP4(in0, in1, in2, in3, pdst, stride)          \
{                                                         \
    ST_DP2(in0, in1, (pdst), (stride));                   \
    ST_DP2(in2, in3, (pdst) + 2 * (stride), (stride));    \
}

#define ST_DP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                     \
    ST_DP4(in0, in1, in2, in3, (pdst), (stride));                     \
    ST_DP4(in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));      \
}
491
/* Description : Store N double precision vectors, advancing the pointer after
                 each store
   Arguments   : Inputs - in0 .. in(N-1), pdst (modifiable pointer lvalue),
                          stride
   Details     : For every input, in order: it is stored with ST_DP to the
                 current 'pdst', then 'stride' is added to 'pdst'.
*/
#define ST_DP2_INC(in0, in1, pdst, stride)     \
{                                              \
    ST_DP(in0, (pdst));  (pdst) += stride;     \
    ST_DP(in1, (pdst));  (pdst) += stride;     \
}

#define ST_DP3_INC(in0, in1, in2, pdst, stride)   \
{                                                 \
    ST_DP(in0, (pdst));  (pdst) += stride;        \
    ST_DP(in1, (pdst));  (pdst) += stride;        \
    ST_DP(in2, (pdst));  (pdst) += stride;        \
}

#define ST_DP4_INC(in0, in1, in2, in3, pdst, stride)   \
{                                                      \
    ST_DP3_INC(in0, in1, in2, pdst, stride);           \
    ST_DP(in3, (pdst));  (pdst) += stride;             \
}

#define ST_DP5_INC(in0, in1, in2, in3, in4, pdst, stride)   \
{                                                           \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);           \
    ST_DP(in4, (pdst));  (pdst) += stride;                  \
}

#define ST_DP6_INC(in0, in1, in2, in3, in4, in5,   \
                   pdst, stride)                   \
{                                                  \
    ST_DP3_INC(in0, in1, in2, pdst, stride);       \
    ST_DP3_INC(in3, in4, in5, pdst, stride);       \
}

#define ST_DP7_INC(in0, in1, in2, in3, in4, in5, in6,   \
                   pdst, stride)                        \
{                                                       \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);       \
    ST_DP3_INC(in4, in5, in6, pdst, stride);            \
}

#define ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6, in7,   \
                   pdst, stride)                             \
{                                                            \
    ST_DP6_INC(in0, in1, in2, in3, in4, in5, pdst, stride);  \
    ST_DP2_INC(in6, in7, pdst, stride);                      \
}

#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6, in7,    \
                    in8, in9, in10, in11, in12, in13, in14,    \
                    in15, pdst, stride)                        \
{                                                              \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);              \
    ST_DP4_INC(in4, in5, in6, in7, pdst, stride);              \
    ST_DP4_INC(in8, in9, in10, in11, pdst, stride);            \
    ST_DP4_INC(in12, in13, in14, in15, pdst, stride);          \
}
558
/* Description : Shuffle word elements of each input vector as per shf_val
   Arguments   : Inputs  - in0 .. in(N-1), shf_val
                 Outputs - out0 .. out(N-1)
                 Return Type - as per RTYPE
   Details     : 'outK' = (RTYPE) __msa_shf_w((v4i32) inK, shf_val).
                 Each input is parenthesized before the v4i32 cast so the cast
                 applies to the whole argument expression, not just to its
                 first operand.
*/
#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val)     \
{                                                        \
    out0 = (RTYPE) __msa_shf_w((v4i32) (in0), shf_val);  \
    out1 = (RTYPE) __msa_shf_w((v4i32) (in1), shf_val);  \
}
#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)

#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2,   \
               shf_val)                                  \
{                                                        \
    out0 = (RTYPE) __msa_shf_w((v4i32) (in0), shf_val);  \
    out1 = (RTYPE) __msa_shf_w((v4i32) (in1), shf_val);  \
    out2 = (RTYPE) __msa_shf_w((v4i32) (in2), shf_val);  \
}
#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)

#define SHF_W4(RTYPE, in0, in1, in2, in3,               \
               out0, out1, out2, out3, shf_val)         \
{                                                       \
    SHF_W2(RTYPE, (in0), (in1), out0, out1, shf_val);   \
    SHF_W2(RTYPE, (in2), (in3), out2, out3, shf_val);   \
}
#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
589
/* Description : Interleave both right and left halves of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : ILVRL_W2 works on word elements, ILVRL_D2 on double word
                 elements.
                 'out0' = interleave-right (__msa_ilvr_*) of 'in0' and 'in1'
                 'out1' = interleave-left  (__msa_ilvl_*) of 'in0' and 'in1'
                 Inputs are parenthesized before being cast so the cast
                 applies to the whole argument expression.
*/
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__)

#define ILVRL_D2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) (in0), (v2i64) (in1));  \
}
#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
612
/* Description : Indexed element values are replicated to all elements in
                 output vector
   Arguments   : Inputs  - in, stidx (SPLATI_W2 only)
                 Outputs - out0, out1 (out0 .. out3 for SPLATI_W4)
                 Return Type - as per RTYPE
   Details     : SPLATI_W2 - word element 'stidx' of 'in' is replicated to all
                 elements of 'out0' and word element 'stidx + 1' to all
                 elements of 'out1'. Valid index range for word operation
                 is 0-3.
                 SPLATI_W4 - word elements 0, 1, 2, 3 of 'in' are replicated
                 into 'out0' .. 'out3' respectively.
                 SPLATI_D2 - double word elements 0 and 1 of 'in' are
                 replicated into 'out0' and 'out1' respectively.
                 'in' and 'stidx' are parenthesized so expression arguments
                 are cast / incremented as a unit.
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));        \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));  \
}
#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, (in), 0, out0, out1);            \
    SPLATI_W2(RTYPE, (in), 2, out2, out3);            \
}
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)

#define SPLATI_D2(RTYPE, in, out0, out1)             \
{                                                    \
    out0 = (RTYPE) __msa_splati_d((v2i64) (in), 0);  \
    out1 = (RTYPE) __msa_splati_d((v2i64) (in), 1);  \
}
#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
644
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (plus in4 .. in7 for the
                           3- and 4-pair variants)
                 Outputs - out0, out1 (plus out2, out3)
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left half
                 of 'out0' & even double word elements of 'in1' are copied to
                 the right half of 'out0'.
                 The following input pairs (in2, in3), (in4, in5), (in6, in7)
                 produce 'out1', 'out2', 'out3' in the same way.
                 Inputs are parenthesized before being cast so the cast
                 applies to the whole argument expression.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
}
#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)

#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5,            \
                 out0, out1, out2)                               \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));  \
    out2 = (RTYPE) __msa_pckev_d((v2i64) (in4), (v2i64) (in5));  \
}
#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, (in0), (in1), (in2), (in3), out0, out1);     \
    PCKEV_D2(RTYPE, (in4), (in5), (in6), (in7), out2, out3);     \
}
#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)
677
/* Description : Pack both even and odd elements of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : PCKEVOD_W2 - even word elements of 'in0' and 'in1' are packed
                 into 'out0' (__msa_pckev_w) & odd word elements of 'in0' and
                 'in1' are packed into 'out1' (__msa_pckod_w).
                 PCKEVOD_D2 - same operation on double word elements
                 (__msa_pckev_d / __msa_pckod_d).
                 Inputs are parenthesized before being cast so the cast
                 applies to the whole argument expression.
*/
#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_pckev_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_pckod_w((v4i32) (in0), (v4i32) (in1));  \
}
#define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__)

#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) (in0), (v2i64) (in1));  \
}
#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__)
699
/* Description : Multiplication of pairs of operands
   Arguments   : Inputs  - in0, in1, in2, in3 (plus in4 .. in7 for the
                           3- and 4-pair variants)
                 Outputs - out0, out1 (plus out2, out3)
   Details     : 'out0' = in0 * in1, 'out1' = in2 * in3, and likewise for the
                 following pairs.
                 Every input is parenthesized, so an argument such as 'a + b'
                 is multiplied as a whole instead of being split by operator
                 precedence.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
#define MUL3(in0, in1, in2, in3, in4, in5,  \
             out0, out1, out2)              \
{                                           \
    out0 = (in0) * (in1);                   \
    out1 = (in2) * (in3);                   \
    out2 = (in4) * (in5);                   \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
{                                                     \
    MUL2((in0), (in1), (in2), (in3), out0, out1);     \
    MUL2((in4), (in5), (in6), (in7), out2, out3);     \
}
724
/* Description : Multiply inputs by a common operand and accumulate
   Arguments   : Inputs  - in0, in1 (plus in2, in3), vec
                 In/Out  - inout0, inout1 (plus inout2, inout3)
   Details     : 'inoutK' += inK * vec for every K.
                 Inputs are parenthesized, so arguments such as 'a + b' are
                 multiplied as a whole instead of being split by operator
                 precedence.
*/
#define FMADD2(in0, in1, vec, inout0, inout1)  \
{                                              \
    inout0 += (in0) * (vec);                   \
    inout1 += (in1) * (vec);                   \
}
#define FMADD3(in0, in1, in2, vec,      \
               inout0, inout1, inout2)  \
{                                       \
    inout0 += (in0) * (vec);            \
    inout1 += (in1) * (vec);            \
    inout2 += (in2) * (vec);            \
}
#define FMADD4(in0, in1, in2, in3, vec,             \
               inout0, inout1, inout2, inout3)      \
{                                                   \
    FMADD2((in0), (in1), (vec), inout0, inout1);    \
    FMADD2((in2), (in3), (vec), inout2, inout3);    \
}
749
/* Description : Addition of pairs of operands
   Arguments   : Inputs  - in0, in1, in2, in3 (plus in4 .. in7 for the
                           3- and 4-pair variants)
                 Outputs - out0, out1 (plus out2, out3)
   Details     : 'out0' = in0 + in1, 'out1' = in2 + in3, and likewise for the
                 following pairs.
                 Every input is parenthesized, so arguments built from
                 operators that bind more loosely than '+' (shifts, bitwise
                 operators, '?:') are added as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
#define ADD3(in0, in1, in2, in3, in4, in5,  \
             out0, out1, out2)              \
{                                           \
    out0 = (in0) + (in1);                   \
    out1 = (in2) + (in3);                   \
    out2 = (in4) + (in5);                   \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
{                                                     \
    ADD2((in0), (in1), (in2), (in3), out0, out1);     \
    ADD2((in4), (in5), (in6), (in7), out2, out3);     \
}
774
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Step 1 interleaves the word elements of (in1, in0) and of
                 (in3, in2) into four v4i32 temporaries.
                 Step 2 interleaves the double word elements of those
                 temporaries to form 'out0' .. 'out3'.
*/
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,         \
                       out0, out1, out2, out3)            \
{                                                         \
    v4i32 r01_m, l01_m;                                   \
    v4i32 r23_m, l23_m;                                   \
                                                          \
    ILVRL_W2_SW(in1, in0, r01_m, l01_m);                  \
    ILVRL_W2_SW(in3, in2, r23_m, l23_m);                  \
    ILVRL_D2(RTYPE, r23_m, r01_m, out0, out1);            \
    ILVRL_D2(RTYPE, l23_m, l01_m, out2, out3);            \
}
#define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
791
792 #endif  /* __MACROS_MSA_H__ */