1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
22 // * Redistribution's of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
25 // * Redistribution's in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
45 #ifndef OPENCV_HAL_VSX_UTILS_HPP
46 #define OPENCV_HAL_VSX_UTILS_HPP
48 #include "opencv2/core/cvdef.h"
50 //! @addtogroup core_utils_vsx
54 #define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
55 #define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v}
56 #define __VSX_S4__(c, v) (c){v, v, v, v}
57 #define __VSX_S2__(c, v) (c){v, v}
59 typedef __vector unsigned char vec_uchar16;
60 #define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
61 #define vec_uchar16_sp(c) (__VSX_S16__(vec_uchar16, c))
62 #define vec_uchar16_c(v) ((vec_uchar16)(v))
63 #define vec_uchar16_mx vec_uchar16_sp(0xFF)
64 #define vec_uchar16_mn vec_uchar16_sp(0)
65 #define vec_uchar16_z vec_uchar16_mn
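/*
 * Illustrative usage of the per-type helpers (a sketch, assuming a VSX-enabled build):
 *   vec_uchar16 a = vec_uchar16_set(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 *   vec_uchar16 b = vec_uchar16_sp(7);   // splat 7 into all 16 lanes
 *   vec_uchar16 m = vec_uchar16_mx;      // all lanes set to the type maximum (0xFF)
 *   // vec_uchar16_c(v) reinterprets any other vector type as vec_uchar16
 * the same _set/_sp/_c/_mx/_mn/_z pattern applies to every vector type in this header.
*/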
67 typedef __vector signed char vec_char16;
68 #define vec_char16_set(...) (vec_char16){__VA_ARGS__}
69 #define vec_char16_sp(c) (__VSX_S16__(vec_char16, c))
70 #define vec_char16_c(v) ((vec_char16)(v))
71 #define vec_char16_mx vec_char16_sp(0x7F)
72 #define vec_char16_mn vec_char16_sp(-0x7F-1)
73 #define vec_char16_z vec_char16_sp(0)
75 typedef __vector unsigned short vec_ushort8;
76 #define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
77 #define vec_ushort8_sp(c) (__VSX_S8__(vec_ushort8, c))
78 #define vec_ushort8_c(v) ((vec_ushort8)(v))
79 #define vec_ushort8_mx vec_ushort8_sp(0xFFFF)
80 #define vec_ushort8_mn vec_ushort8_sp(0)
81 #define vec_ushort8_z vec_ushort8_mn
83 typedef __vector signed short vec_short8;
84 #define vec_short8_set(...) (vec_short8){__VA_ARGS__}
85 #define vec_short8_sp(c) (__VSX_S8__(vec_short8, c))
86 #define vec_short8_c(v) ((vec_short8)(v))
87 #define vec_short8_mx vec_short8_sp(0x7FFF)
88 #define vec_short8_mn vec_short8_sp(-0x7FFF-1)
89 #define vec_short8_z vec_short8_sp(0)
91 typedef __vector unsigned int vec_uint4;
92 #define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
93 #define vec_uint4_sp(c) (__VSX_S4__(vec_uint4, c))
94 #define vec_uint4_c(v) ((vec_uint4)(v))
95 #define vec_uint4_mx vec_uint4_sp(0xFFFFFFFFU)
96 #define vec_uint4_mn vec_uint4_sp(0)
97 #define vec_uint4_z vec_uint4_mn
99 typedef __vector signed int vec_int4;
100 #define vec_int4_set(...) (vec_int4){__VA_ARGS__}
101 #define vec_int4_sp(c) (__VSX_S4__(vec_int4, c))
102 #define vec_int4_c(v) ((vec_int4)(v))
103 #define vec_int4_mx vec_int4_sp(0x7FFFFFFF)
104 #define vec_int4_mn vec_int4_sp(-0x7FFFFFFF-1)
105 #define vec_int4_z vec_int4_sp(0)
107 typedef __vector float vec_float4;
108 #define vec_float4_set(...) (vec_float4){__VA_ARGS__}
109 #define vec_float4_sp(c) (__VSX_S4__(vec_float4, c))
110 #define vec_float4_c(v) ((vec_float4)(v))
111 #define vec_float4_mx vec_float4_sp(3.40282347E+38F)
112 #define vec_float4_mn vec_float4_sp(1.17549435E-38F)
113 #define vec_float4_z vec_float4_sp(0)
115 typedef __vector unsigned long long vec_udword2;
116 #define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
117 #define vec_udword2_sp(c) (__VSX_S2__(vec_udword2, c))
118 #define vec_udword2_c(v) ((vec_udword2)(v))
119 #define vec_udword2_mx vec_udword2_sp(18446744073709551615ULL)
120 #define vec_udword2_mn vec_udword2_sp(0)
121 #define vec_udword2_z vec_udword2_mn
123 typedef __vector signed long long vec_dword2;
124 #define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
125 #define vec_dword2_sp(c) (__VSX_S2__(vec_dword2, c))
126 #define vec_dword2_c(v) ((vec_dword2)(v))
127 #define vec_dword2_mx vec_dword2_sp(9223372036854775807LL)
128 #define vec_dword2_mn vec_dword2_sp(-9223372036854775807LL-1)
129 #define vec_dword2_z vec_dword2_sp(0)
131 typedef __vector double vec_double2;
132 #define vec_double2_set(...) (vec_double2){__VA_ARGS__}
133 #define vec_double2_c(v) ((vec_double2)(v))
134 #define vec_double2_sp(c) (__VSX_S2__(vec_double2, c))
135 #define vec_double2_mx vec_double2_sp(1.7976931348623157E+308)
136 #define vec_double2_mn vec_double2_sp(2.2250738585072014E-308)
137 #define vec_double2_z vec_double2_sp(0)
139 #define vec_bchar16 __vector __bool char
140 #define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
141 #define vec_bchar16_c(v) ((vec_bchar16)(v))
142 #define vec_bchar16_f (__VSX_S16__(vec_bchar16, 0))
143 #define vec_bchar16_t (__VSX_S16__(vec_bchar16, 1))
145 #define vec_bshort8 __vector __bool short
146 #define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
147 #define vec_bshort8_c(v) ((vec_bshort8)(v))
148 #define vec_bshort8_f (__VSX_S8__(vec_bshort8, 0))
149 #define vec_bshort8_t (__VSX_S8__(vec_bshort8, 1))
151 #define vec_bint4 __vector __bool int
152 #define vec_bint4_set(...) (vec_bint4){__VA_ARGS__}
153 #define vec_bint4_c(v) ((vec_bint4)(v))
154 #define vec_bint4_f (__VSX_S4__(vec_bint4, 0))
155 #define vec_bint4_t (__VSX_S4__(vec_bint4, 1))
157 #define vec_bdword2 __vector __bool long long
158 #define vec_bdword2_set(...) (vec_bdword2){__VA_ARGS__}
159 #define vec_bdword2_c(v) ((vec_bdword2)(v))
160 #define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0))
161 #define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1))
164 #define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))
166 #define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
167 VSX_FINLINE(rt) fnm(const rg& a) { return fn2(a); }
169 #define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
170 VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
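/*
 * For example (illustrative expansion, not generated code),
 *   VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
 * defines
 *   VSX_FINLINE(vec_float4) vec_mul(const vec_float4& a, const vec_float4& b)
 *   { return __builtin_vec_mul(a, b); }
*/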
/*
 * GCC VSX compatibility
*/
175 #if defined(__GNUG__) && !defined(__clang__)
178 #define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
179 VSX_FINLINE(rt) fnm(const rg& a) \
180 { rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "="#rto (rs) : #rgo (a)); return rs; }
182 #define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
183 VSX_FINLINE(rt) fnm(const rg& a) \
184 { rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }
186 #define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm) \
187 VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
188 { rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }
190 #define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
// up to GCC 6, vec_mul only supports single/double precision and long long
#   if __GNUG__ < 7
/*
 * there is no direct instruction for 16-bit multiplication in ISA 2.07;
 * XLC implements it using the "multiply even", "multiply odd" and "permute" instructions
 * todo: do we need to support 8-bit?
*/
# define VSX_IMPL_MULH(Tvec, Tcast)                                           \
VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b)                       \
{                                                                             \
    static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21,         \
                                          8, 9, 24, 25, 12, 13, 28, 29};      \
    return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \
}
209 VSX_IMPL_MULH(vec_short8, vec_short8_c)
210 VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
// vmuluwm can be used for unsigned or signed integers, according to the ISA
212 VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
213 VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
// redirect to the GCC builtin vec_mul, since it already supports single/double precision and long long
215 VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
216 VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
217 VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul)
218 VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
219 #endif // __GNUG__ < 7
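/*
 * Illustrative usage of vec_mul (a sketch, assuming a VSX-enabled build):
 *   vec_short8 a = vec_short8_sp(3), b = vec_short8_sp(-7);
 *   vec_short8 p = vec_mul(a, b);   // every lane == -21
*/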
/*
 * the "compare greater than or equal" instruction in ISA 2.07 only supports single
 * and double precision;
 * XLC and newer versions of GCC implement the integer variants using "compare greater than" followed by NOR
*/
233 # define vec_cmple(a, b) vec_cmpge(b, a)
234 # define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
235 VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
237 VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge)
238 VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
239 VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge)
240 VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
241 VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge)
242 VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge)
243 VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge)
244 VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
// redirect to the GCC builtin vec_cmpge, since it already supports single and double precision
247 VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
248 VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
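/*
 * Illustrative usage of vec_cmpge (a sketch): per-lane compare and select
 *   vec_int4 a = vec_int4_set(1, 5, 3, 9);
 *   vec_int4 b = vec_int4_sp(4);
 *   vec_bint4 m = vec_cmpge(a, b);   // {false, true, false, true}
 *   vec_int4  r = vec_sel(b, a, m);  // per-lane max(a, b) == {4, 5, 4, 9}
*/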
// up to GCC 5, vec_nor doesn't support bool long long
template<typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
255 VSX_FINLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
256 { return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
// vec_packs doesn't support doublewords in GCC 4 and older versions of GCC 5
260 VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs)
261 VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
262 VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs)
263 VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs)
265 VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs)
266 VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
267 #endif // __GNUG__ < 6
// vec_xxpermdi in GCC 4 lacks little-endian support, just like clang
#if __GNUG__ < 5
#   define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
#else
#   define vec_permi vec_xxpermdi
#endif // __GNUG__ < 5
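/*
 * As used in this header, vec_permi(a, b, c) selects one element from each 2-lane operand:
 * bit 1 of the 2-bit immediate picks the element of 'a', bit 0 picks the element of 'b'.
 * Illustrative sketch:
 *   vec_double2 r = vec_permi(a, b, 2);   // {a[1], b[0]}
*/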
276 // shift left double by word immediate
278 # define vec_sldw __builtin_vsx_xxsldwi
281 // vector population count
282 VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
283 VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcntu)
284 VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
285 VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcntu)
286 VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcntu)
287 VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcntu)
288 VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
289 VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcntu)
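/*
 * Illustrative usage (a sketch): vec_popcntu always returns the unsigned counterpart,
 * e.g. counting the bits of a signed byte vector yields vec_uchar16:
 *   vec_uchar16 cnt = vec_popcntu(vec_char16_sp(-1));   // every lane == 8
*/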
291 // converts between single and double-precision
292 VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
293 VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)
295 // converts word and doubleword to double-precision
299 VSX_IMPL_1RG(vec_double2, wd, vec_int4, wa, xvcvsxwdp, vec_ctdo)
300 VSX_IMPL_1RG(vec_double2, wd, vec_uint4, wa, xvcvuxwdp, vec_ctdo)
301 VSX_IMPL_1RG(vec_double2, wd, vec_dword2, wi, xvcvsxddp, vec_ctd)
302 VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, vec_ctd)
304 // converts word and doubleword to single-precision
306 VSX_IMPL_1RG(vec_float4, wf, vec_int4, wa, xvcvsxwsp, vec_ctf)
307 VSX_IMPL_1RG(vec_float4, wf, vec_uint4, wa, xvcvuxwsp, vec_ctf)
308 VSX_IMPL_1RG(vec_float4, wf, vec_dword2, wi, xvcvsxdsp, vec_ctfo)
309 VSX_IMPL_1RG(vec_float4, wf, vec_udword2, wi, xvcvuxdsp, vec_ctfo)
311 // converts single and double precision to signed word
313 VSX_IMPL_1RG(vec_int4, wa, vec_double2, wd, xvcvdpsxws, vec_ctso)
314 VSX_IMPL_1RG(vec_int4, wa, vec_float4, wf, xvcvspsxws, vec_cts)
316 // converts single and double precision to unsigned word
318 VSX_IMPL_1RG(vec_uint4, wa, vec_double2, wd, xvcvdpuxws, vec_ctuo)
319 VSX_IMPL_1RG(vec_uint4, wa, vec_float4, wf, xvcvspuxws, vec_ctu)
321 // converts single and double precision to signed doubleword
325 VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, vec_ctsl)
326 VSX_IMPL_1RG(vec_dword2, wi, vec_float4, wf, xvcvspsxds, vec_ctslo)
328 // converts single and double precision to unsigned doubleword
332 VSX_IMPL_1RG(vec_udword2, wi, vec_double2, wd, xvcvdpuxds, vec_ctul)
333 VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo)
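/*
 * Naming note (informal): for conversions that change the lane count, the trailing 'o'
 * (vec_ctdo, vec_ctfo, vec_ctso, vec_ctuo, ...) marks the variants that read the odd-numbered
 * lanes; the plain names read the even-numbered lanes and are composed in the common section below.
 * Same-width conversions keep lanes in place, e.g. (illustrative sketch):
 *   vec_double2 d = vec_ctd(vec_dword2_sp(3));   // {3.0, 3.0}
*/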
// just in case GCC doesn't define them
337 # define vec_xl vec_vsx_ld
338 # define vec_xst vec_vsx_st
341 #endif // GCC VSX compatibility
/*
 * CLANG VSX compatibility
*/
346 #if defined(__clang__) && !defined(__IBMCPP__)
/*
 * CLANG doesn't support %x<n> in inline asm templates, which is needed to fix the register number
 * when using any of the register constraints wa, wd, wf.
 *
 * For more explanation see the PowerPC and IBM RS6000 sections in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
 * There is also an open bug: https://bugs.llvm.org/show_bug.cgi?id=31837
 *
 * So we can't use inline asm; instead we rely on the built-in functions that CLANG supports
 * and fall back to __builtin_convertvector when clang is missing any of the vector conversion built-ins.
*/
359 // convert vector helper
360 #define VSX_IMPL_CONVERT(rt, rg, fnm) \
361 VSX_FINLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
363 #if __clang_major__ < 5
364 // implement vec_permi in a dirty way
# define VSX_IMPL_CLANG_4_PERMI(Tvec)                                             \
VSX_FINLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c)  \
{                                                                                 \
    switch (c)                                                                    \
    {                                                                             \
    case 0:  return vec_mergeh(a, b);                                             \
    case 1:  return vec_mergel(vec_mergeh(a, a), b);                              \
    case 2:  return vec_mergeh(vec_mergel(a, a), b);                              \
    default: return vec_mergel(a, b);                                             \
    }                                                                             \
}
380 VSX_IMPL_CLANG_4_PERMI(vec_udword2)
381 VSX_IMPL_CLANG_4_PERMI(vec_dword2)
382 VSX_IMPL_CLANG_4_PERMI(vec_double2)
384 // vec_xxsldwi is missing in clang 4
385 # define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
// vec_xxpermdi lacks little-endian support in clang 4, just like GCC 4
388 # define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
389 #endif // __clang_major__ < 5
391 // shift left double by word immediate
393 # define vec_sldw vec_xxsldwi
396 // Implement vec_rsqrt since clang only supports vec_rsqrte
398 VSX_FINLINE(vec_float4) vec_rsqrt(const vec_float4& a)
399 { return vec_div(vec_float4_sp(1), vec_sqrt(a)); }
401 VSX_FINLINE(vec_double2) vec_rsqrt(const vec_double2& a)
402 { return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
// vec_promote is missing support for doublewords
VSX_FINLINE(vec_dword2) vec_promote(long long a, int b)
{ vec_dword2 ret = vec_dword2_z; ret[b & 1] = a; return ret; }

VSX_FINLINE(vec_udword2) vec_promote(unsigned long long a, int b)
{ vec_udword2 ret = vec_udword2_z; ret[b & 1] = a; return ret; }
// vec_popcnt should return an unsigned type, but clang thinks otherwise, just like GCC does with vec_vpopcnt
421 #define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast) \
422 VSX_FINLINE(Tvec) vec_popcntu(const Tvec2& a) \
423 { return ucast(vec_popcnt(a)); }
424 VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
425 VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
426 VSX_IMPL_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
427 // redirect unsigned types
428 VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
429 VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
430 VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)
432 // converts between single and double precision
433 VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
434 VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)
436 // converts word and doubleword to double-precision
440 VSX_REDIRECT_1RG(vec_double2, vec_int4, vec_ctdo, __builtin_vsx_xvcvsxwdp)
441 VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
443 VSX_IMPL_CONVERT(vec_double2, vec_dword2, vec_ctd)
444 VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)
446 // converts word and doubleword to single-precision
447 #if __clang_major__ > 4
450 VSX_IMPL_CONVERT(vec_float4, vec_int4, vec_ctf)
451 VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
452 VSX_REDIRECT_1RG(vec_float4, vec_dword2, vec_ctfo, __builtin_vsx_xvcvsxdsp)
453 VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctfo, __builtin_vsx_xvcvuxdsp)
455 // converts single and double precision to signed word
456 #if __clang_major__ > 4
459 VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_ctso, __builtin_vsx_xvcvdpsxws)
460 VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts)
462 // converts single and double precision to unsigned word
463 #if __clang_major__ > 4
466 VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctuo, __builtin_vsx_xvcvdpuxws)
467 VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu)
469 // converts single and double precision to signed doubleword
473 VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
// __builtin_convertvector is unable to do this conversion, and xvcvspsxds is missing as well
475 VSX_FINLINE(vec_dword2) vec_ctslo(const vec_float4& a)
476 { return vec_ctsl(vec_cvfo(a)); }
478 // converts single and double precision to unsigned doubleword
482 VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
// __builtin_convertvector is unable to do this conversion, and xvcvspuxds is missing as well
484 VSX_FINLINE(vec_udword2) vec_ctulo(const vec_float4& a)
485 { return vec_ctul(vec_cvfo(a)); }
487 #endif // CLANG VSX compatibility
/*
 * Common GCC, CLANG compatibility
*/
492 #if defined(__GNUG__) && !defined(__IBMCPP__)
498 #define VSX_IMPL_CONV_EVEN_4_2(rt, rg, fnm, fn2) \
499 VSX_FINLINE(rt) fnm(const rg& a) \
500 { return fn2(vec_sldw(a, a, 1)); }
502 VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_float4, vec_cvf, vec_cvfo)
503 VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_int4, vec_ctd, vec_ctdo)
504 VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_uint4, vec_ctd, vec_ctdo)
506 VSX_IMPL_CONV_EVEN_4_2(vec_dword2, vec_float4, vec_ctsl, vec_ctslo)
507 VSX_IMPL_CONV_EVEN_4_2(vec_udword2, vec_float4, vec_ctul, vec_ctulo)
#define VSX_IMPL_CONV_EVEN_2_4(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a)                 \
{                                                \
    rt v4 = fn2(a);                              \
    return vec_sldw(v4, v4, 3);                  \
}
516 VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_double2, vec_cvf, vec_cvfo)
517 VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_dword2, vec_ctf, vec_ctfo)
518 VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_udword2, vec_ctf, vec_ctfo)
520 VSX_IMPL_CONV_EVEN_2_4(vec_int4, vec_double2, vec_cts, vec_ctso)
521 VSX_IMPL_CONV_EVEN_2_4(vec_uint4, vec_double2, vec_ctu, vec_ctuo)
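/*
 * Illustrative usage (a sketch): widen the even-numbered float lanes to double precision
 *   vec_float4  f = vec_float4_set(1.0f, 2.0f, 3.0f, 4.0f);
 *   vec_double2 d = vec_cvf(f);   // {1.0, 3.0}, taken from lanes 0 and 2
*/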
523 #endif // Common GCC, CLANG compatibility
/*
 * XLC VSX compatibility
*/
528 #if defined(__IBMCPP__)
530 // vector population count
531 #define vec_popcntu vec_popcnt
// overload and redirect with the second arg set to zero
534 // since we only support conversions without the second arg
535 #define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
536 VSX_FINLINE(rt) fnm(const rg& a) { return fnm(a, 0); }
538 VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4, vec_ctd)
539 VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4, vec_ctd)
540 VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2, vec_ctd)
541 VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
543 VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4, vec_ctf)
544 VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4, vec_ctf)
545 VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2, vec_ctf)
546 VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf)
548 VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts)
549 VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4, vec_cts)
551 VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu)
552 VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4, vec_ctu)
554 VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl)
555 VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4, vec_ctsl)
557 VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
558 VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4, vec_ctul)
560 // fixme: implement conversions of odd-numbered elements in a dirty way
// since XLC doesn't support VSX register operands in inline asm.
562 #define VSX_IMPL_CONV_ODD_4_2(rt, rg, fnm, fn2) \
563 VSX_FINLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }
565 VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_float4, vec_cvfo, vec_cvf)
566 VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_int4, vec_ctdo, vec_ctd)
567 VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_uint4, vec_ctdo, vec_ctd)
569 VSX_IMPL_CONV_ODD_4_2(vec_dword2, vec_float4, vec_ctslo, vec_ctsl)
570 VSX_IMPL_CONV_ODD_4_2(vec_udword2, vec_float4, vec_ctulo, vec_ctul)
#define VSX_IMPL_CONV_ODD_2_4(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a)                \
{                                               \
    rt v4 = fn2(a);                             \
    return vec_sldw(v4, v4, 1);                 \
}
579 VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_double2, vec_cvfo, vec_cvf)
580 VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_dword2, vec_ctfo, vec_ctf)
581 VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_udword2, vec_ctfo, vec_ctf)
583 VSX_IMPL_CONV_ODD_2_4(vec_int4, vec_double2, vec_ctso, vec_cts)
584 VSX_IMPL_CONV_ODD_2_4(vec_uint4, vec_double2, vec_ctuo, vec_ctu)
586 #endif // XLC VSX compatibility
// ignore the GCC warning caused by -Wunused-but-set-variable in rare cases
589 #if defined(__GNUG__) && !defined(__clang__)
590 # define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
592 # define VSX_UNUSED(Tvec) Tvec
// GCC can figure out the cast of long int on its own, while XLC and CLANG find it ambiguous
596 #if defined(__clang__) || defined(__IBMCPP__)
597 VSX_FINLINE(vec_udword2) vec_splats(uint64 v)
598 { return vec_splats((unsigned long long) v); }
600 VSX_FINLINE(vec_dword2) vec_splats(int64 v)
601 { return vec_splats((long long) v); }
603 VSX_FINLINE(vec_udword2) vec_promote(uint64 a, int b)
604 { return vec_promote((unsigned long long) a, b); }
606 VSX_FINLINE(vec_dword2) vec_promote(int64 a, int b)
607 { return vec_promote((long long) a, b); }
/*
 * implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer):
 * load and store using an offset that depends on the pointer type
 *
 * implement vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer):
 * load and store using an offset in fixed byte units
 *
 * Note: in clang, vec_xl and vec_xst fail to load from unaligned addresses,
 * so we use vec_vsx_ld and vec_vsx_st instead
*/
621 #if defined(__clang__) && !defined(__IBMCPP__)
622 # define vsx_ldf vec_vsx_ld
623 # define vsx_stf vec_vsx_st
625 # define vsx_ldf vec_xl
626 # define vsx_stf vec_xst
629 #define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
630 #define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
631 #define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
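/*
 * Illustrative usage (a sketch): the vsx_ld/vsx_st offset is counted in elements of the
 * pointed-to type, so consecutive vectors of float are 4 elements apart:
 *   float buf[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 *   vec_float4 lo = vsx_ld(0, buf);     // buf[0..3]
 *   vec_float4 hi = vsx_ld(4, buf);     // buf[4..7]
 *   vsx_st(vec_add(lo, hi), 0, buf);    // store the sums back to buf[0..3]
*/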
/*
 * implement vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer) to load and store doublewords
 *
 * In GCC, vec_xl and vec_xst map to vec_vsx_ld and vec_vsx_st, which don't support long long,
 * and in CLANG we use vec_vsx_ld and vec_vsx_st because vec_xl and vec_xst fail to load from unaligned addresses.
 *
 * In XLC, vec_xl and vec_xst fail to cast int64 (long int) to long long.
*/
640 #if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
641 VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
642 { return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }
644 VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
645 { return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }
647 VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
648 { vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }
650 VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
651 { vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
653 VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
654 { return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }
656 VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
657 { return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }
659 VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
660 { vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }
662 VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
663 { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
666 // load 4 unsigned bytes into uint4 vector
667 #define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])
669 // load 4 signed bytes into int4 vector
670 #define vec_ld_bsw(p) vec_int4_set((p)[0], (p)[1], (p)[2], (p)[3])
// load 4 unsigned bytes and convert them into a float vector
673 #define vec_ld_bps(p) vec_ctf(vec_ld_buw(p), 0)
// store the lower 8 bytes
676 #define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)
// store the upper 8 bytes
679 #define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)
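/*
 * Illustrative usage (a sketch, assuming a VSX-enabled build):
 *   const uchar src[4] = {1, 2, 3, 4};
 *   vec_uint4  w = vec_ld_buw(src);   // {1, 2, 3, 4} as 32-bit lanes
 *   vec_float4 f = vec_ld_bps(src);   // {1.0f, 2.0f, 3.0f, 4.0f}
 *   uint64 dst[1];
 *   vec_st_l8(w, dst);                // store the lower 64 bits of 'w'
*/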
/*
 * vec_ld_l8(ptr)  -> load 64 bits of integer data into the lower part
 * vec_ldz_l8(ptr) -> load 64 bits of integer data into the lower part and zero the upper part
*/
#define VSX_IMPL_LOAD_L8(Tvec, Tp)                                            \
VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p)                                      \
{ return ((Tvec)vec_promote(*((uint64*)p), 0)); }                             \
VSX_FINLINE(Tvec) vec_ldz_l8(const Tp *p)                                     \
{                                                                             \
    /* TODO: try (Tvec)(vec_udword2{*((uint64*)p), 0}) */                     \
    static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000}; \
    return vec_and(vec_ld_l8(p), (Tvec)mask);                                 \
}
694 VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
695 VSX_IMPL_LOAD_L8(vec_char16, schar)
696 VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
697 VSX_IMPL_LOAD_L8(vec_short8, short)
698 VSX_IMPL_LOAD_L8(vec_uint4, uint)
699 VSX_IMPL_LOAD_L8(vec_int4, int)
700 VSX_IMPL_LOAD_L8(vec_float4, float)
701 VSX_IMPL_LOAD_L8(vec_udword2, uint64)
702 VSX_IMPL_LOAD_L8(vec_dword2, int64)
703 VSX_IMPL_LOAD_L8(vec_double2, double)
706 #define vec_not(a) vec_nor(a, a)
711 # define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
// absolute difference
716 # define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
/*
 * Implement vec_unpacklu and vec_unpackhu
 * since vec_unpackl and vec_unpackh only support signed integers
*/
723 #define VSX_IMPL_UNPACKU(rt, rg, zero) \
724 VSX_FINLINE(rt) vec_unpacklu(const rg& a) \
725 { return reinterpret_cast<rt>(vec_mergel(a, zero)); } \
726 VSX_FINLINE(rt) vec_unpackhu(const rg& a) \
727 { return reinterpret_cast<rt>(vec_mergeh(a, zero)); }
729 VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
730 VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z)
731 VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z)
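/*
 * Illustrative usage (a sketch): zero-extend the two halves of an unsigned byte vector
 *   vec_uchar16 v  = vec_uchar16_sp(200);
 *   vec_ushort8 lo = vec_unpacklu(v);   // eight 16-bit lanes, each == 200
 *   vec_ushort8 hi = vec_unpackhu(v);   // the other eight lanes, each == 200
*/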
/*
 * Implement vec_mergesqe and vec_mergesqo
 * Merge the even-indexed and odd-indexed elements, respectively, of two vectors in sequence
*/
737 #define VSX_IMPL_PERM(rt, fnm, ...) \
738 VSX_FINLINE(rt) fnm(const rt& a, const rt& b) \
739 { static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
742 #define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
743 #define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
744 VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
745 VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
746 VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe)
747 VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo)
749 #define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
750 #define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
751 VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
752 VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
753 VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe)
754 VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo)
756 #define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
757 #define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
758 VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe)
759 VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo)
760 VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe)
761 VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo)
762 VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
763 VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
765 VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
766 VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
767 VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh)
768 VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel)
769 VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
770 VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
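/*
 * Illustrative usage (a sketch): split two vectors of interleaved pairs into even/odd streams
 *   vec_ushort8 v0 = vec_ushort8_set(0, 1, 2,  3,  4,  5,  6,  7);
 *   vec_ushort8 v1 = vec_ushort8_set(8, 9, 10, 11, 12, 13, 14, 15);
 *   vec_ushort8 e  = vec_mergesqe(v0, v1);   // {0, 2, 4, 6, 8, 10, 12, 14}
 *   vec_ushort8 o  = vec_mergesqo(v0, v1);   // {1, 3, 5, 7, 9, 11, 13, 15}
*/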
/*
 * Implement vec_mergesqh and vec_mergesql
 * Merge the most and least significant halves, respectively, of two vectors
*/
776 #define VSX_IMPL_MERGESQHL(Tvec) \
777 VSX_FINLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b) \
778 { return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); } \
779 VSX_FINLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b) \
780 { return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
781 VSX_IMPL_MERGESQHL(vec_uchar16)
782 VSX_IMPL_MERGESQHL(vec_char16)
783 VSX_IMPL_MERGESQHL(vec_ushort8)
784 VSX_IMPL_MERGESQHL(vec_short8)
785 VSX_IMPL_MERGESQHL(vec_uint4)
786 VSX_IMPL_MERGESQHL(vec_int4)
787 VSX_IMPL_MERGESQHL(vec_float4)
788 VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
789 VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
790 VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh)
791 VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel)
792 VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
793 VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
// 2- and 4-channel interleave for all types except 2-lane vectors
797 #define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec) \
798 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
800 vsx_stf(vec_mergeh(a, b), 0, ptr); \
801 vsx_stf(vec_mergel(a, b), 16, ptr); \
803 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
804 const Tvec& c, const Tvec& d, Tp* ptr) \
806 Tvec ac = vec_mergeh(a, c); \
807 Tvec bd = vec_mergeh(b, d); \
808 vsx_stf(vec_mergeh(ac, bd), 0, ptr); \
809 vsx_stf(vec_mergel(ac, bd), 16, ptr); \
810 ac = vec_mergel(a, c); \
811 bd = vec_mergel(b, d); \
812 vsx_stf(vec_mergeh(ac, bd), 32, ptr); \
813 vsx_stf(vec_mergel(ac, bd), 48, ptr); \
815 VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16)
816 VSX_IMPL_ST_INTERLEAVE(schar, vec_char16)
817 VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
818 VSX_IMPL_ST_INTERLEAVE(short, vec_short8)
819 VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4)
820 VSX_IMPL_ST_INTERLEAVE(int, vec_int4)
821 VSX_IMPL_ST_INTERLEAVE(float, vec_float4)
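/*
 * Illustrative usage (a sketch): interleave two float planes into dst = {a0, b0, a1, b1, ...}
 *   float plane_a[4] = {0, 2, 4, 6}, plane_b[4] = {1, 3, 5, 7};
 *   float dst[8];
 *   vec_st_interleave(vsx_ld(0, plane_a), vsx_ld(0, plane_b), dst);
 *   // dst == {0, 1, 2, 3, 4, 5, 6, 7}
*/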
// 2- and 4-channel deinterleave for 16-lane vectors
824 #define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec) \
825 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
827 Tvec v0 = vsx_ld(0, ptr); \
828 Tvec v1 = vsx_ld(16, ptr); \
829 a = vec_mergesqe(v0, v1); \
830 b = vec_mergesqo(v0, v1); \
832 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
835 Tvec v0 = vsx_ld(0, ptr); \
836 Tvec v1 = vsx_ld(16, ptr); \
837 Tvec v2 = vsx_ld(32, ptr); \
838 Tvec v3 = vsx_ld(48, ptr); \
839 Tvec m0 = vec_mergesqe(v0, v1); \
840 Tvec m1 = vec_mergesqe(v2, v3); \
841 a = vec_mergesqe(m0, m1); \
842 c = vec_mergesqo(m0, m1); \
843 m0 = vec_mergesqo(v0, v1); \
844 m1 = vec_mergesqo(v2, v3); \
845 b = vec_mergesqe(m0, m1); \
846 d = vec_mergesqo(m0, m1); \
848 VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
849 VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)
// 2- and 4-channel deinterleave for 8-lane vectors
852 #define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec) \
853 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
855 Tvec v0 = vsx_ld(0, ptr); \
856 Tvec v1 = vsx_ld(8, ptr); \
857 a = vec_mergesqe(v0, v1); \
858 b = vec_mergesqo(v0, v1); \
860 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
863 Tvec v0 = vsx_ld(0, ptr); \
864 Tvec v1 = vsx_ld(8, ptr); \
865 Tvec m0 = vec_mergeh(v0, v1); \
866 Tvec m1 = vec_mergel(v0, v1); \
867 Tvec ab0 = vec_mergeh(m0, m1); \
868 Tvec cd0 = vec_mergel(m0, m1); \
869 v0 = vsx_ld(16, ptr); \
870 v1 = vsx_ld(24, ptr); \
871 m0 = vec_mergeh(v0, v1); \
872 m1 = vec_mergel(v0, v1); \
873 Tvec ab1 = vec_mergeh(m0, m1); \
874 Tvec cd1 = vec_mergel(m0, m1); \
875 a = vec_mergesqh(ab0, ab1); \
876 b = vec_mergesql(ab0, ab1); \
877 c = vec_mergesqh(cd0, cd1); \
878 d = vec_mergesql(cd0, cd1); \
880 VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
881 VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8)
// 2- and 4-channel deinterleave for 4-lane vectors
884 #define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec) \
885 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
887 a = vsx_ld(0, ptr); \
888 b = vsx_ld(4, ptr); \
889 Tvec m0 = vec_mergeh(a, b); \
890 Tvec m1 = vec_mergel(a, b); \
891 a = vec_mergeh(m0, m1); \
892 b = vec_mergel(m0, m1); \
894 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
897 Tvec v0 = vsx_ld(0, ptr); \
898 Tvec v1 = vsx_ld(4, ptr); \
899 Tvec v2 = vsx_ld(8, ptr); \
900 Tvec v3 = vsx_ld(12, ptr); \
901 Tvec m0 = vec_mergeh(v0, v2); \
902 Tvec m1 = vec_mergeh(v1, v3); \
903 a = vec_mergeh(m0, m1); \
904 b = vec_mergel(m0, m1); \
905 m0 = vec_mergel(v0, v2); \
906 m1 = vec_mergel(v1, v3); \
907 c = vec_mergeh(m0, m1); \
908 d = vec_mergel(m0, m1); \
910 VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4)
911 VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4)
912 VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
// 2- and 4-channel interleave and deinterleave for 2-lane vectors
915 #define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func) \
916 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
918 st_func(vec_mergeh(a, b), 0, ptr); \
919 st_func(vec_mergel(a, b), 2, ptr); \
921 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
922 const Tvec& c, const Tvec& d, Tp* ptr) \
924 st_func(vec_mergeh(a, b), 0, ptr); \
925 st_func(vec_mergeh(c, d), 2, ptr); \
926 st_func(vec_mergel(a, b), 4, ptr); \
927 st_func(vec_mergel(c, d), 6, ptr); \
929 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
931 Tvec m0 = ld_func(0, ptr); \
932 Tvec m1 = ld_func(2, ptr); \
933 a = vec_mergeh(m0, m1); \
934 b = vec_mergel(m0, m1); \
936 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
939 Tvec v0 = ld_func(0, ptr); \
940 Tvec v1 = ld_func(2, ptr); \
941 Tvec v2 = ld_func(4, ptr); \
942 Tvec v3 = ld_func(6, ptr); \
943 a = vec_mergeh(v0, v2); \
944 b = vec_mergel(v0, v2); \
945 c = vec_mergeh(v1, v3); \
946 d = vec_mergel(v1, v3); \
948 VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2)
949 VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
950 VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st)
953 #define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec) \
954 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
955 const Tvec& c, Tp* ptr) \
957 static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5}; \
958 static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15}; \
959 vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
960 static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26}; \
961 static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15}; \
962 vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr); \
963 static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0}; \
964 static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31}; \
965 vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr); \
967 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
969 Tvec v1 = vsx_ld(0, ptr); \
970 Tvec v2 = vsx_ld(16, ptr); \
971 Tvec v3 = vsx_ld(32, ptr); \
972 static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0}; \
973 static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}; \
974 a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
975 static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
976 static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}; \
977 b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
978 static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0}; \
979 static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31}; \
980 c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
982 VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
983 VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)
985 #define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec) \
986 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
987 const Tvec& c, Tp* ptr) \
989 static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21}; \
990 static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15}; \
991 vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
992 static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11}; \
993 static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15}; \
994 vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr); \
995 static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0}; \
996 static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31}; \
997 vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr); \
999 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
1001 Tvec v1 = vsx_ld(0, ptr); \
1002 Tvec v2 = vsx_ld(8, ptr); \
1003 Tvec v3 = vsx_ld(16, ptr); \
1004 static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
1005 static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27}; \
1006 a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
1007 static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0}; \
1008 static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29}; \
1009 b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
1010 static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
1011 static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31}; \
1012 c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
1014 VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
1015 VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8)
1017 #define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) \
1018 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
1019 const Tvec& c, Tp* ptr) \
1021 Tvec hbc = vec_mergeh(b, c); \
1022 static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7}; \
1023 vsx_st(vec_perm(a, hbc, ahbc), 0, ptr); \
1024 Tvec lab = vec_mergel(a, b); \
1025 vsx_st(vec_sld(lab, hbc, 8), 4, ptr); \
1026 static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
1027 vsx_st(vec_perm(c, lab, clab), 8, ptr); \
1029 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
1031 Tvec v1 = vsx_ld(0, ptr); \
1032 Tvec v2 = vsx_ld(4, ptr); \
1033 Tvec v3 = vsx_ld(8, ptr); \
1034 static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31}; \
1035 a = vec_perm(v1, vec_sld(v3, v2, 8), flp); \
1036 static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19}; \
1037 b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); \
1038 c = vec_perm(vec_sld(v2, v1, 8), v3, flp); \
1040 VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4)
1041 VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4)
1042 VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
1044 #define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) \
1045 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
1046 const Tvec& c, Tp* ptr) \
1048 st_func(vec_mergeh(a, b), 0, ptr); \
1049 st_func(vec_permi(c, a, 1), 2, ptr); \
1050 st_func(vec_mergel(b, c), 4, ptr); \
1052 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, \
1055 Tvec v1 = ld_func(0, ptr); \
1056 Tvec v2 = ld_func(2, ptr); \
1057 Tvec v3 = ld_func(4, ptr); \
1058 a = vec_permi(v1, v2, 1); \
1059 b = vec_permi(v1, v3, 2); \
1060 c = vec_permi(v2, v3, 1); \
1062 VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2)
1063 VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
1064 VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st)
1070 #endif // OPENCV_HAL_VSX_UTILS_HPP