1 /*******************************************************************************
2 Copyright (c) 2016, The OpenBLAS Project
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are
7 1. Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 2. Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in
11 the documentation and/or other materials provided with the
13 3. Neither the name of the OpenBLAS project nor the names of
14 its contributors may be used to endorse or promote products
15 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 *******************************************************************************/
28 #ifndef __MACROS_MSA_H__
29 #define __MACROS_MSA_H__
34 #define ENABLE_PREFETCH
36 #ifdef ENABLE_PREFETCH
37 inline static void prefetch_load_lf(unsigned char *src)
39 __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r" (src));
42 #define PREFETCH(PTR) prefetch_load_lf((unsigned char *)(PTR));
45 #define PREF_OFFSET(src_ptr, offset) \
46 __asm__ __volatile__("pref 0, " STRNG(offset) "(%[src]) \n\t" : : [src] "r" (src_ptr));
50 #define PREF_OFFSET(src_ptr, offset)
/* Scalar load/store helpers: reinterpret the address as a pointer to
   RTYPE and access exactly one RTYPE value there.  The *_SP / *_DP
   wrappers fix the vector element type (v4f32 / v2f64). */
#define LD_W(RTYPE, psrc) (((RTYPE *)(psrc))[0])
#define LD_SP(...) LD_W(v4f32, __VA_ARGS__)

#define LD_D(RTYPE, psrc) (((RTYPE *)(psrc))[0])
#define LD_DP(...) LD_D(v2f64, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) (((RTYPE *)(pdst))[0] = (in))
#define ST_SP(...) ST_W(v4f32, __VA_ARGS__)

#define ST_D(RTYPE, in, pdst) (((RTYPE *)(pdst))[0] = (in))
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
/* Broadcast a scalar float across all 4 lanes of a v4f32 vector.
   Uses a GCC statement expression so the macro can be used as an rvalue. */
#define COPY_FLOAT_TO_VECTOR(a) ( {                \
    v4f32  out;                                    \
    out = __msa_cast_to_vector_float(a);           \
    out = (v4f32) __msa_splati_w((v4i32) out, 0);  \
    out;                                           \
} )

/* Broadcast a scalar double across both lanes of a v2f64 vector. */
#define COPY_DOUBLE_TO_VECTOR(a) ( {               \
    v2f64  out;                                    \
    out = __msa_cast_to_vector_double(a);          \
    out = (v2f64) __msa_splati_d((v2i64) out, 0);  \
    out;                                           \
} )
/* Description : Load 2 variables with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
*/
/* LD_GPn_INC: load n scalars through 'psrc', advancing the pointer by
   'stride' elements after every load (psrc is modified). */
#define LD_GP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = *(psrc);                           \
    (psrc) += stride;                         \
                                              \
    out1 = *(psrc);                           \
    (psrc) += stride;                         \
}

#define LD_GP3_INC(psrc, stride, out0,     \
                   out1, out2)             \
{                                          \
    LD_GP2_INC(psrc, stride, out0, out1);  \
                                           \
    out2 = *(psrc);                        \
    (psrc) += stride;                      \
}

#define LD_GP4_INC(psrc, stride, out0,     \
                   out1, out2, out3)       \
{                                          \
    LD_GP2_INC(psrc, stride, out0, out1);  \
    LD_GP2_INC(psrc, stride, out2, out3);  \
}

#define LD_GP5_INC(psrc, stride, out0,     \
                   out1, out2, out3, out4) \
{                                          \
    LD_GP2_INC(psrc, stride, out0, out1);  \
    LD_GP2_INC(psrc, stride, out2, out3);  \
                                           \
    out4 = *(psrc);                        \
    (psrc) += stride;                      \
}

#define LD_GP6_INC(psrc, stride, out0,     \
                   out1, out2, out3,       \
                   out4, out5)             \
{                                          \
    LD_GP2_INC(psrc, stride, out0, out1);  \
    LD_GP2_INC(psrc, stride, out2, out3);  \
    LD_GP2_INC(psrc, stride, out4, out5);  \
}

#define LD_GP7_INC(psrc, stride, out0,     \
                   out1, out2, out3,       \
                   out4, out5, out6)       \
{                                          \
    LD_GP2_INC(psrc, stride, out0, out1);  \
    LD_GP2_INC(psrc, stride, out2, out3);  \
    LD_GP2_INC(psrc, stride, out4, out5);  \
                                           \
    out6 = *(psrc);                        \
    (psrc) += stride;                      \
}

#define LD_GP8_INC(psrc, stride, out0, out1, out2,     \
                   out3, out4, out5, out6, out7)       \
{                                                      \
    LD_GP4_INC(psrc, stride, out0, out1, out2, out3);  \
    LD_GP4_INC(psrc, stride, out4, out5, out6, out7);  \
}
/* Description : Load 2 vectors of single precision floating point elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - single precision floating point
*/
/* LD_SPn: load n v4f32 vectors from (psrc), (psrc + stride), ...
   The *_INC variants advance 'psrc' by 'stride' after every load. */
#define LD_SP2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SP((psrc));                 \
    out1 = LD_SP((psrc) + stride);        \
}

#define LD_SP4(psrc, stride, out0, out1, out2, out3)  \
{                                                     \
    LD_SP2(psrc, stride, out0, out1)                  \
    LD_SP2(psrc + 2 * stride, stride, out2, out3)     \
}

#define LD_SP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = LD_SP((psrc));                     \
    (psrc) += stride;                         \
                                              \
    out1 = LD_SP((psrc));                     \
    (psrc) += stride;                         \
}

#define LD_SP3_INC(psrc, stride, out0,     \
                   out1, out2)             \
{                                          \
    LD_SP2_INC(psrc, stride, out0, out1);  \
                                           \
    out2 = LD_SP((psrc));                  \
    (psrc) += stride;                      \
}

#define LD_SP4_INC(psrc, stride, out0,     \
                   out1, out2, out3)       \
{                                          \
    LD_SP2_INC(psrc, stride, out0, out1);  \
    LD_SP2_INC(psrc, stride, out2, out3);  \
}

#define LD_SP5_INC(psrc, stride, out0,     \
                   out1, out2, out3, out4) \
{                                          \
    LD_SP2_INC(psrc, stride, out0, out1);  \
    LD_SP2_INC(psrc, stride, out2, out3);  \
                                           \
    out4 = LD_SP((psrc));                  \
    (psrc) += stride;                      \
}

#define LD_SP6_INC(psrc, stride, out0,     \
                   out1, out2, out3,       \
                   out4, out5)             \
{                                          \
    LD_SP2_INC(psrc, stride, out0, out1);  \
    LD_SP2_INC(psrc, stride, out2, out3);  \
    LD_SP2_INC(psrc, stride, out4, out5);  \
}

#define LD_SP7_INC(psrc, stride, out0,     \
                   out1, out2, out3,       \
                   out4, out5, out6)       \
{                                          \
    LD_SP2_INC(psrc, stride, out0, out1);  \
    LD_SP2_INC(psrc, stride, out2, out3);  \
    LD_SP2_INC(psrc, stride, out4, out5);  \
                                           \
    out6 = LD_SP((psrc));                  \
    (psrc) += stride;                      \
}

#define LD_SP8_INC(psrc, stride, out0, out1, out2,     \
                   out3, out4, out5, out6, out7)       \
{                                                      \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);  \
    LD_SP4_INC(psrc, stride, out4, out5, out6, out7);  \
}

#define LD_SP16_INC(psrc, stride, out0, out1, out2,   \
                    out3, out4, out5, out6, out7,     \
                    out8, out9, out10, out11, out12,  \
                    out13, out14, out15)              \
{                                                     \
    LD_SP8_INC(psrc, stride, out0, out1, out2,        \
               out3, out4, out5, out6, out7);         \
    LD_SP8_INC(psrc, stride, out8, out9, out10,       \
               out11, out12, out13, out14, out15);    \
}
/* Description : Load 2 vectors of double precision floating point elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - double precision floating point
*/
/* LD_DPn: load n v2f64 vectors from (psrc), (psrc + stride), ...
   The *_INC variants advance 'psrc' by 'stride' after every load. */
#define LD_DP2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_DP((psrc));                 \
    out1 = LD_DP((psrc) + stride);        \
}

#define LD_DP4(psrc, stride, out0, out1, out2, out3)  \
{                                                     \
    LD_DP2(psrc, stride, out0, out1)                  \
    LD_DP2(psrc + 2 * stride, stride, out2, out3)     \
}

#define LD_DP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = LD_DP(psrc);                       \
    (psrc) += stride;                         \
                                              \
    out1 = LD_DP(psrc);                       \
    (psrc) += stride;                         \
}

#define LD_DP3_INC(psrc, stride, out0,     \
                   out1, out2)             \
{                                          \
    LD_DP2_INC(psrc, stride, out0, out1);  \
                                           \
    out2 = LD_DP((psrc));                  \
    (psrc) += stride;                      \
}

#define LD_DP4_INC(psrc, stride, out0,     \
                   out1, out2, out3)       \
{                                          \
    LD_DP2_INC(psrc, stride, out0, out1);  \
    LD_DP2_INC(psrc, stride, out2, out3);  \
}

#define LD_DP5_INC(psrc, stride, out0,     \
                   out1, out2, out3, out4) \
{                                          \
    LD_DP2_INC(psrc, stride, out0, out1);  \
    LD_DP2_INC(psrc, stride, out2, out3);  \
                                           \
    out4 = LD_DP((psrc));                  \
    (psrc) += stride;                      \
}

#define LD_DP6_INC(psrc, stride, out0,     \
                   out1, out2, out3,       \
                   out4, out5)             \
{                                          \
    LD_DP2_INC(psrc, stride, out0, out1);  \
    LD_DP2_INC(psrc, stride, out2, out3);  \
    LD_DP2_INC(psrc, stride, out4, out5);  \
}

#define LD_DP7_INC(psrc, stride, out0,     \
                   out1, out2, out3,       \
                   out4, out5, out6)       \
{                                          \
    LD_DP2_INC(psrc, stride, out0, out1);  \
    LD_DP2_INC(psrc, stride, out2, out3);  \
    LD_DP2_INC(psrc, stride, out4, out5);  \
                                           \
    out6 = LD_DP((psrc));                  \
    (psrc) += stride;                      \
}

#define LD_DP8_INC(psrc, stride, out0, out1, out2,     \
                   out3, out4, out5, out6, out7)       \
{                                                      \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);  \
    LD_DP4_INC(psrc, stride, out4, out5, out6, out7);  \
}

#define LD_DP16_INC(psrc, stride, out0, out1, out2,   \
                    out3, out4, out5, out6, out7,     \
                    out8, out9, out10, out11, out12,  \
                    out13, out14, out15)              \
{                                                     \
    LD_DP8_INC(psrc, stride, out0, out1, out2,        \
               out3, out4, out5, out6, out7);         \
    LD_DP8_INC(psrc, stride, out8, out9, out10,       \
               out11, out12, out13, out14, out15);    \
}
/* Description : Store GP variables with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store variable 'in0' to (pdst)
                 Store variable 'in1' to (pdst + stride)
*/
/* ST_GPn_INC: store n scalars through 'pdst', advancing the pointer by
   'stride' elements after every store (pdst is modified). */
#define ST_GP2_INC(in0, in1,      \
                   pdst, stride)  \
{                                 \
    *(pdst) = in0;                \
    (pdst) += stride;             \
                                  \
    *(pdst) = in1;                \
    (pdst) += stride;             \
}

#define ST_GP3_INC(in0, in1, in2,        \
                   pdst, stride)         \
{                                        \
    ST_GP2_INC(in0, in1, pdst, stride);  \
                                         \
    *(pdst) = in2;                       \
    (pdst) += stride;                    \
}

#define ST_GP4_INC(in0, in1, in2, in3,   \
                   pdst, stride)         \
{                                        \
    ST_GP2_INC(in0, in1, pdst, stride);  \
    ST_GP2_INC(in2, in3, pdst, stride);  \
}

#define ST_GP5_INC(in0, in1, in2, in3,   \
                   in4, pdst, stride)    \
{                                        \
    ST_GP2_INC(in0, in1, pdst, stride);  \
    ST_GP2_INC(in2, in3, pdst, stride);  \
                                         \
    *(pdst) = in4;                       \
    (pdst) += stride;                    \
}

#define ST_GP6_INC(in0, in1, in2, in3,     \
                   in4, in5, pdst, stride) \
{                                          \
    ST_GP2_INC(in0, in1, pdst, stride);    \
    ST_GP2_INC(in2, in3, pdst, stride);    \
    ST_GP2_INC(in4, in5, pdst, stride);    \
}

#define ST_GP7_INC(in0, in1, in2, in3, in4, \
                   in5, in6, pdst, stride)  \
{                                           \
    ST_GP2_INC(in0, in1, pdst, stride);     \
    ST_GP2_INC(in2, in3, pdst, stride);     \
    ST_GP2_INC(in4, in5, pdst, stride);     \
                                            \
    *(pdst) = in6;                          \
    (pdst) += stride;                       \
}

#define ST_GP8_INC(in0, in1, in2, in3, in4, in5,   \
                   in6, in7, pdst, stride)         \
{                                                  \
    ST_GP4_INC(in0, in1, in2, in3, pdst, stride);  \
    ST_GP4_INC(in4, in5, in6, in7, pdst, stride);  \
}
/* Description : Store vectors of single precision floating point elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 single precision floating point elements from 'in0' to (pdst)
                 Store 4 single precision floating point elements from 'in1' to (pdst + stride)
*/
/* ST_SPn: store n v4f32 vectors to (pdst), (pdst + stride), ...
   The *_INC variants advance 'pdst' by 'stride' after every store. */
#define ST_SP2(in0, in1, pdst, stride)  \
{                                       \
    ST_SP(in0, (pdst));                 \
    ST_SP(in1, (pdst) + stride);        \
}

#define ST_SP4(in0, in1, in2, in3, pdst, stride)    \
{                                                   \
    ST_SP2(in0, in1, (pdst), stride);               \
    ST_SP2(in2, in3, (pdst + 2 * stride), stride);  \
}

#define ST_SP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                     \
    ST_SP4(in0, in1, in2, in3, (pdst), stride);                       \
    ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride);          \
}

#define ST_SP2_INC(in0, in1, pdst, stride)  \
{                                           \
    ST_SP(in0, (pdst));                     \
    (pdst) += stride;                       \
                                            \
    ST_SP(in1, (pdst));                     \
    (pdst) += stride;                       \
}

#define ST_SP3_INC(in0, in1, in2,        \
                   pdst, stride)         \
{                                        \
    ST_SP2_INC(in0, in1, pdst, stride);  \
                                         \
    ST_SP(in2, (pdst));                  \
    (pdst) += stride;                    \
}

#define ST_SP4_INC(in0, in1, in2, in3,   \
                   pdst, stride)         \
{                                        \
    ST_SP2_INC(in0, in1, pdst, stride);  \
    ST_SP2_INC(in2, in3, pdst, stride);  \
}

#define ST_SP5_INC(in0, in1, in2, in3,   \
                   in4, pdst, stride)    \
{                                        \
    ST_SP2_INC(in0, in1, pdst, stride);  \
    ST_SP2_INC(in2, in3, pdst, stride);  \
                                         \
    ST_SP(in4, (pdst));                  \
    (pdst) += stride;                    \
}

#define ST_SP6_INC(in0, in1, in2, in3,     \
                   in4, in5, pdst, stride) \
{                                          \
    ST_SP2_INC(in0, in1, pdst, stride);    \
    ST_SP2_INC(in2, in3, pdst, stride);    \
    ST_SP2_INC(in4, in5, pdst, stride);    \
}

#define ST_SP7_INC(in0, in1, in2, in3, in4, \
                   in5, in6, pdst, stride)  \
{                                           \
    ST_SP2_INC(in0, in1, pdst, stride);     \
    ST_SP2_INC(in2, in3, pdst, stride);     \
    ST_SP2_INC(in4, in5, pdst, stride);     \
                                            \
    ST_SP(in6, (pdst));                     \
    (pdst) += stride;                       \
}

#define ST_SP8_INC(in0, in1, in2, in3, in4, in5,   \
                   in6, in7, pdst, stride)         \
{                                                  \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);  \
    ST_SP4_INC(in4, in5, in6, in7, pdst, stride);  \
}

#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6,  \
                    in7, in8, in9, in10, in11, in12,    \
                    in13, in14, in15, pdst, stride)     \
{                                                       \
    ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6,       \
               in7, pdst, stride);                      \
    ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14,  \
               in15, pdst, stride);                     \
}
/* Description : Store vectors of double precision floating point elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 2 double precision floating point elements from 'in0' to (pdst)
                 Store 2 double precision floating point elements from 'in1' to (pdst + stride)
*/
/* ST_DPn: store n v2f64 vectors to (pdst), (pdst + stride), ...
   The *_INC variants advance 'pdst' by 'stride' after every store. */
#define ST_DP2(in0, in1, pdst, stride)  \
{                                       \
    ST_DP(in0, (pdst));                 \
    ST_DP(in1, (pdst) + stride);        \
}

#define ST_DP4(in0, in1, in2, in3, pdst, stride)    \
{                                                   \
    ST_DP2(in0, in1, (pdst), stride);               \
    ST_DP2(in2, in3, (pdst) + 2 * stride, stride);  \
}

#define ST_DP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                     \
    ST_DP4(in0, in1, in2, in3, (pdst), stride);                       \
    ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}

#define ST_DP2_INC(in0, in1, pdst, stride)  \
{                                           \
    ST_DP(in0, (pdst));                     \
    (pdst) += stride;                       \
                                            \
    ST_DP(in1, (pdst));                     \
    (pdst) += stride;                       \
}

#define ST_DP3_INC(in0, in1, in2,        \
                   pdst, stride)         \
{                                        \
    ST_DP2_INC(in0, in1, pdst, stride);  \
                                         \
    ST_DP(in2, (pdst));                  \
    (pdst) += stride;                    \
}

#define ST_DP4_INC(in0, in1, in2, in3,   \
                   pdst, stride)         \
{                                        \
    ST_DP2_INC(in0, in1, pdst, stride);  \
    ST_DP2_INC(in2, in3, pdst, stride);  \
}

#define ST_DP5_INC(in0, in1, in2, in3,   \
                   in4, pdst, stride)    \
{                                        \
    ST_DP2_INC(in0, in1, pdst, stride);  \
    ST_DP2_INC(in2, in3, pdst, stride);  \
                                         \
    ST_DP(in4, (pdst));                  \
    (pdst) += stride;                    \
}

#define ST_DP6_INC(in0, in1, in2, in3,     \
                   in4, in5, pdst, stride) \
{                                          \
    ST_DP2_INC(in0, in1, pdst, stride);    \
    ST_DP2_INC(in2, in3, pdst, stride);    \
    ST_DP2_INC(in4, in5, pdst, stride);    \
}

#define ST_DP7_INC(in0, in1, in2, in3, in4, \
                   in5, in6, pdst, stride)  \
{                                           \
    ST_DP2_INC(in0, in1, pdst, stride);     \
    ST_DP2_INC(in2, in3, pdst, stride);     \
    ST_DP2_INC(in4, in5, pdst, stride);     \
                                            \
    ST_DP(in6, (pdst));                     \
    (pdst) += stride;                       \
}

#define ST_DP8_INC(in0, in1, in2, in3, in4, in5,   \
                   in6, in7, pdst, stride)         \
{                                                  \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);  \
    ST_DP4_INC(in4, in5, in6, in7, pdst, stride);  \
}

#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6,  \
                    in7, in8, in9, in10, in11, in12,    \
                    in13, in14, in15, pdst, stride)     \
{                                                       \
    ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6,       \
               in7, pdst, stride);                      \
    ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14,  \
               in15, pdst, stride);                     \
}
/* Description : Shuffle word elements in vector as specified by shf_val
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
*/
/* SHF_Wn: apply the same word shuffle pattern 'shf_val' to n vectors. */
#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val)   \
{                                                      \
    out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val);  \
    out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val);  \
}
#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)

#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2,  \
               shf_val)                                 \
{                                                       \
    out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val);   \
    out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val);   \
    out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val);   \
}
#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)

#define SHF_W4(RTYPE, in0, in1, in2, in3,          \
               out0, out1, out2, out3, shf_val)    \
{                                                  \
    SHF_W2(RTYPE, in0, in1, out0, out1, shf_val);  \
    SHF_W2(RTYPE, in2, in3, out2, out3, shf_val);  \
}
#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of word elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; left halves to 'out1'
*/
/* Interleave right/left halves of two vectors (word / double word). */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__)

#define ILVRL_D2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1);  \
}
#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for word operation is 0-3
*/
/* Replicate indexed elements of 'in' across whole output vectors. */
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)

#define SPLATI_D2(RTYPE, in, out0, out1)           \
{                                                  \
    out0 = (RTYPE) __msa_splati_d((v2i64) in, 0);  \
    out1 = (RTYPE) __msa_splati_d((v2i64) in, 1);  \
}
#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left half
                 of 'out0' & even double word elements of 'in1' are copied to
                 the right half of 'out0'.
*/
/* Pack even double word elements of input vector pairs. */
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)

#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5,        \
                 out0, out1, out2)                           \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
    out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5);  \
}
#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)
/* Description : Pack both even and odd half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are copied to the
                 'out0' & odd double word elements of 'in0' and 'in1' are
                 copied to the 'out1'.
*/
/* Pack even elements into out0 and odd elements into out1. */
#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1)              \
{                                                            \
    out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1);  \
}
#define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__)

#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1)              \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
}
#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__)
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'
*/
/* MULn: elementwise products of n input pairs (type-generic). */
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}

#define MUL3(in0, in1, in2, in3, in4, in5,  \
             out0, out1, out2)              \
{                                           \
    out0 = in0 * in1;                       \
    out1 = in2 * in3;                       \
    out2 = in4 * in5;                       \
}

#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
              out0, out1, out2, out3)                 \
{                                                     \
    MUL2(in0, in1, in2, in3, out0, out1);             \
    MUL2(in4, in5, in6, in7, out2, out3);             \
}
/* Description : Multiplication of pairs of vectors and added in output
   Arguments   : Inputs - in0, in1, vec, inout0, inout1
   Details     : Each element from 'in0' is multiplied with elements from 'vec'
                 and the result is added to 'inout0'
*/
/* FMADDn: multiply n inputs by 'vec' and accumulate into inoutN. */
#define FMADD2(in0, in1, vec, inout0, inout1)  \
{                                              \
    inout0 += in0 * vec;                       \
    inout1 += in1 * vec;                       \
}

#define FMADD3(in0, in1, in2, vec,     \
               inout0, inout1, inout2) \
{                                      \
    inout0 += in0 * vec;               \
    inout1 += in1 * vec;               \
    inout2 += in2 * vec;               \
}

#define FMADD4(in0, in1, in2, in3, vec,        \
               inout0, inout1, inout2, inout3) \
{                                              \
    FMADD2(in0, in1, vec, inout0, inout1);     \
    FMADD2(in2, in3, vec, inout2, inout3);     \
}
/* Description : Addition of 2 pairs of variables
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'
*/
/* ADDn: elementwise sums of n input pairs (type-generic). */
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}

#define ADD3(in0, in1, in2, in3, in4, in5,  \
             out0, out1, out2)              \
{                                           \
    out0 = in0 + in1;                       \
    out1 = in2 + in3;                       \
    out2 = in4 + in5;                       \
}

#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
              out0, out1, out2, out3)                 \
{                                                     \
    ADD2(in0, in1, in2, in3, out0, out1);             \
    ADD2(in4, in5, in6, in7, out2, out3);             \
}
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
*/
780 #define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \
781 out0, out1, out2, out3) \
783 v4i32 s0_m, s1_m, s2_m, s3_m; \
785 ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
786 ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
787 ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1); \
788 ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3); \
790 #define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
792 #endif /* __MACROS_MSA_H__ */