modules/core/include/opencv2/core/vsx_utils.hpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                          License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
  17 // Third party copyrights are property of their respective owners.
  18 //
  19 // Redistribution and use in source and binary forms, with or without modification,
  20 // are permitted provided that the following conditions are met:
  21 //
  22 //   * Redistribution's of source code must retain the above copyright notice,
  23 //     this list of conditions and the following disclaimer.
  24 //
  25 //   * Redistribution's in binary form must reproduce the above copyright notice,
  26 //     this list of conditions and the following disclaimer in the documentation
  27 //     and/or other materials provided with the distribution.
  28 //
  29 //   * The name of the copyright holders may not be used to endorse or promote products
  30 //     derived from this software without specific prior written permission.
  31 //
  32 // This software is provided by the copyright holders and contributors "as is" and
  33 // any express or implied warranties, including, but not limited to, the implied
  34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  35 // In no event shall the Intel Corporation or contributors be liable for any direct,
  36 // indirect, incidental, special, exemplary, or consequential damages
  37 // (including, but not limited to, procurement of substitute goods or services;
  38 // loss of use, data, or profits; or business interruption) however caused
  39 // and on any theory of liability, whether in contract, strict liability,
  40 // or tort (including negligence or otherwise) arising in any way out of
  41 // the use of this software, even if advised of the possibility of such damage.
  42 //
  43 //M*/
  44
  45 #ifndef OPENCV_HAL_VSX_UTILS_HPP
  46 #define OPENCV_HAL_VSX_UTILS_HPP
  47
  48 #include "opencv2/core/cvdef.h"
  49
  50 //! @addtogroup core_utils_vsx
  51 //! @{
  52 #if CV_VSX
  53
     // Splat helpers: each expands to a compound literal of vector type `c` whose
     // 16, 8, 4 or 2 lanes are all initialised from `v`. The text of `v` is repeated
     // once per lane, so an argument with side effects would be evaluated per lane.
  54 #define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
  55 #define __VSX_S8__(c, v)  (c){v, v, v, v, v, v, v, v}
  56 #define __VSX_S4__(c, v)  (c){v, v, v, v}
  57 #define __VSX_S2__(c, v)  (c){v, v}
  58
     // Vector type aliases and per-type helper macros. For every type vec_<T>:
     //   vec_<T>_set(...)  compound literal built from the listed lane values
     //   vec_<T>_sp(c)     every lane set to c (via the __VSX_S<N>__ helpers)
     //   vec_<T>_c(v)      C-style cast of v to vec_<T>
     //   vec_<T>_mx / _mn  every lane set to the limit literal shown
     //   vec_<T>_z         every lane set to zero
  59 typedef __vector unsigned char vec_uchar16;
  60 #define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
  61 #define vec_uchar16_sp(c)    (__VSX_S16__(vec_uchar16, c))
  62 #define vec_uchar16_c(v)     ((vec_uchar16)(v))
  63 #define vec_uchar16_mx       vec_uchar16_sp(0xFF)
  64 #define vec_uchar16_mn       vec_uchar16_sp(0)
  65 #define vec_uchar16_z        vec_uchar16_mn
  66
  67 typedef __vector signed char vec_char16;
  68 #define vec_char16_set(...) (vec_char16){__VA_ARGS__}
  69 #define vec_char16_sp(c)    (__VSX_S16__(vec_char16, c))
  70 #define vec_char16_c(v)     ((vec_char16)(v))
  71 #define vec_char16_mx       vec_char16_sp(0x7F)
  72 #define vec_char16_mn       vec_char16_sp(-0x7F-1)
  73 #define vec_char16_z        vec_char16_sp(0)
  74
  75 typedef __vector unsigned short vec_ushort8;
  76 #define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
  77 #define vec_ushort8_sp(c)    (__VSX_S8__(vec_ushort8, c))
  78 #define vec_ushort8_c(v)     ((vec_ushort8)(v))
  79 #define vec_ushort8_mx       vec_ushort8_sp(0xFFFF)
  80 #define vec_ushort8_mn       vec_ushort8_sp(0)
  81 #define vec_ushort8_z        vec_ushort8_mn
  82
  83 typedef __vector signed short vec_short8;
  84 #define vec_short8_set(...) (vec_short8){__VA_ARGS__}
  85 #define vec_short8_sp(c)    (__VSX_S8__(vec_short8, c))
  86 #define vec_short8_c(v)     ((vec_short8)(v))
  87 #define vec_short8_mx       vec_short8_sp(0x7FFF)
  88 #define vec_short8_mn       vec_short8_sp(-0x7FFF-1)
  89 #define vec_short8_z        vec_short8_sp(0)
  90
  91 typedef __vector unsigned int vec_uint4;
  92 #define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
  93 #define vec_uint4_sp(c)    (__VSX_S4__(vec_uint4, c))
  94 #define vec_uint4_c(v)     ((vec_uint4)(v))
  95 #define vec_uint4_mx       vec_uint4_sp(0xFFFFFFFFU)
  96 #define vec_uint4_mn       vec_uint4_sp(0)
  97 #define vec_uint4_z        vec_uint4_mn
  98
  99 typedef __vector signed int vec_int4;
 100 #define vec_int4_set(...)  (vec_int4){__VA_ARGS__}
 101 #define vec_int4_sp(c)     (__VSX_S4__(vec_int4, c))
 102 #define vec_int4_c(v)      ((vec_int4)(v))
 103 #define vec_int4_mx        vec_int4_sp(0x7FFFFFFF)
 104 #define vec_int4_mn        vec_int4_sp(-0x7FFFFFFF-1)
 105 #define vec_int4_z         vec_int4_sp(0)
 106
 107 typedef __vector float vec_float4;
 108 #define vec_float4_set(...)  (vec_float4){__VA_ARGS__}
 109 #define vec_float4_sp(c)     (__VSX_S4__(vec_float4, c))
 110 #define vec_float4_c(v)      ((vec_float4)(v))
 111 #define vec_float4_mx        vec_float4_sp(3.40282347E+38F)
     // note: 1.17549435E-38F is the smallest positive normalized float,
     // not the most negative representable value
 112 #define vec_float4_mn        vec_float4_sp(1.17549435E-38F)
 113 #define vec_float4_z         vec_float4_sp(0)
 114
 115 typedef __vector unsigned long long vec_udword2;
 116 #define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
 117 #define vec_udword2_sp(c)    (__VSX_S2__(vec_udword2, c))
 118 #define vec_udword2_c(v)     ((vec_udword2)(v))
 119 #define vec_udword2_mx       vec_udword2_sp(18446744073709551615ULL)
 120 #define vec_udword2_mn       vec_udword2_sp(0)
 121 #define vec_udword2_z        vec_udword2_mn
 122
 123 typedef __vector signed long long vec_dword2;
 124 #define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
 125 #define vec_dword2_sp(c)    (__VSX_S2__(vec_dword2, c))
 126 #define vec_dword2_c(v)     ((vec_dword2)(v))
 127 #define vec_dword2_mx       vec_dword2_sp(9223372036854775807LL)
 128 #define vec_dword2_mn       vec_dword2_sp(-9223372036854775807LL-1)
 129 #define vec_dword2_z        vec_dword2_sp(0)
 130
 131 typedef  __vector double vec_double2;
 132 #define vec_double2_set(...) (vec_double2){__VA_ARGS__}
 133 #define vec_double2_c(v)     ((vec_double2)(v))
 134 #define vec_double2_sp(c)    (__VSX_S2__(vec_double2, c))
 135 #define vec_double2_mx       vec_double2_sp(1.7976931348623157E+308)
     // note: 2.2250738585072014E-308 is the smallest positive normalized double,
     // not the most negative representable value
 136 #define vec_double2_mn       vec_double2_sp(2.2250738585072014E-308)
 137 #define vec_double2_z        vec_double2_sp(0)
 138
     // Boolean vector types. Unlike the types above these are macros, not typedefs.
     //   vec_b<T>_set(...)  compound literal built from the listed lane values
     //   vec_b<T>_c(v)      C-style cast of v to the boolean vector type
     //   vec_b<T>_f / _t    every lane set to 0 / to 1
     // NOTE(review): _t stores the value 1 in each lane, not an all-bits-set mask;
     // confirm this is what callers expect.
 139 #define vec_bchar16           __vector __bool char
 140 #define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
 141 #define vec_bchar16_c(v)     ((vec_bchar16)(v))
 142 #define vec_bchar16_f        (__VSX_S16__(vec_bchar16, 0))
 143 #define vec_bchar16_t        (__VSX_S16__(vec_bchar16, 1))
 144
 145 #define vec_bshort8           __vector __bool short
 146 #define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
 147 #define vec_bshort8_c(v)     ((vec_bshort8)(v))
 148 #define vec_bshort8_f        (__VSX_S8__(vec_bshort8, 0))
 149 #define vec_bshort8_t        (__VSX_S8__(vec_bshort8, 1))
 150
 151 #define vec_bint4             __vector __bool int
 152 #define vec_bint4_set(...)   (vec_bint4){__VA_ARGS__}
 153 #define vec_bint4_c(v)       ((vec_bint4)(v))
 154 #define vec_bint4_f          (__VSX_S4__(vec_bint4, 0))
 155 #define vec_bint4_t          (__VSX_S4__(vec_bint4, 1))
 156
 157 #define vec_bdword2            __vector __bool long long
 158 #define vec_bdword2_set(...)  (vec_bdword2){__VA_ARGS__}
 159 #define vec_bdword2_c(v)      ((vec_bdword2)(v))
 160 #define vec_bdword2_f         (__VSX_S2__(vec_bdword2, 0))
 161 #define vec_bdword2_t         (__VSX_S2__(vec_bdword2, 1))
 162
 163
     // Function-definition prefix used throughout this header: `extern inline`,
     // return type `tp`, marked always_inline.
 164 #define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))
 165
     // Define `fnm` as a forwarding wrapper around `fn2`: argument(s) of type
     // `const rg&`, return type `rt`. The _1RG form takes one argument, _2RG two.
 166 #define VSX_REDIRECT_1RG(rt, rg, fnm, fn2)   \
 167 VSX_FINLINE(rt) fnm(const rg& a) { return fn2(a); }
 168
 169 #define VSX_REDIRECT_2RG(rt, rg, fnm, fn2)   \
 170 VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
 171
 172 /*
 173  * GCC VSX compatibility
 174 **/
 175 #if defined(__GNUG__) && !defined(__clang__)
 176
 177 // Inline asm helpers. Each macro defines an always-inline function `fnm` that
     // wraps one instruction (or, for the _F form, a caller-supplied asm template).
     //   VSX_IMPL_1RG   : one input; `rto` / `rgo` are stringized into the output /
     //                    input constraints and operands are printed as %x0 / %x1.
     //   VSX_IMPL_1VRG  : one input; output and input both use the "v" constraint.
     //   VSX_IMPL_2VRG_F: two inputs with "v" constraints; `fopc` is the whole template.
     //   VSX_IMPL_2VRG  : as _F, with the template built as "<opc> %0,%1,%2".
 178 #define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
 179 VSX_FINLINE(rt) fnm(const rg& a)                 \
 180 { rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "="#rto (rs) : #rgo (a)); return rs; }
 181
 182 #define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
 183 VSX_FINLINE(rt) fnm(const rg& a)        \
 184 { rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }
 185
 186 #define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm)     \
 187 VSX_FINLINE(rt) fnm(const rg& a, const rg& b)  \
 188 { rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }
 189
 190 #define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
 191
 192 #if __GNUG__ < 7
 193 // up to GCC 6, vec_mul only supports single/double precision and long long
 194 #   ifdef vec_mul
 195 #       undef vec_mul
 196 #   endif
 197 /*
 198  * There is no direct instruction for 16-bit multiplication in ISA 2.07;
 199  * XLC implements it with the instructions "multiply even", "multiply odd" and "permute".
 200  * TODO: is 8-bit support needed?
 201 **/
     // vec_mul for 16-bit lanes: the results of vec_mule and vec_mulo are cast with
     // `Tcast` and combined by vec_perm using the byte pattern in `even_perm`.
 202 #   define VSX_IMPL_MULH(Tvec, Tcast)                                               \
 203     VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b)                         \
 204     {                                                                               \
 205         static const vec_uchar16 even_perm = {0, 1, 16, 17, 4, 5, 20, 21,           \
 206                                               8, 9, 24, 25, 12, 13, 28, 29};        \
 207         return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm);   \
 208     }
 209     VSX_IMPL_MULH(vec_short8,  vec_short8_c)
 210     VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
 211     // the same vmuluwm instruction is bound to both the signed and the unsigned overload
 212     VSX_IMPL_2VRG(vec_int4,  vec_int4,  vmuluwm, vec_mul)
 213     VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
 214     // redirect to the GCC builtin, which already handles float, double and long long
 215     VSX_REDIRECT_2RG(vec_float4,  vec_float4,  vec_mul, __builtin_vec_mul)
 216     VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
 217     VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mul, __builtin_vec_mul)
 218     VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
 219 #endif // __GNUG__ < 7
 220
 221 #if __GNUG__ < 6
 222 /*
 223  * The instruction "compare greater than or equal" in ISA 2.07 only supports single
 224  * and double precision.
 225  * XLC and newer versions of GCC implement the integer forms with "greater than" and NOR.
 226 **/
 227 #   ifdef vec_cmpge
 228 #       undef vec_cmpge
 229 #   endif
 230 #   ifdef vec_cmple
 231 #       undef vec_cmple
 232 #   endif
     // a <= b is expressed as b >= a
 233 #   define vec_cmple(a, b) vec_cmpge(b, a)
     // emits "<opc> %0,%2,%1" (operands swapped, i.e. b > a) and then NORs the
     // result with itself: NOT(b > a), which is a >= b
 234 #   define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
 235     VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
 236
 237     VSX_IMPL_CMPGE(vec_bchar16, vec_char16,  vcmpgtsb, vec_cmpge)
 238     VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
 239     VSX_IMPL_CMPGE(vec_bshort8, vec_short8,  vcmpgtsh, vec_cmpge)
 240     VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
 241     VSX_IMPL_CMPGE(vec_bint4,   vec_int4,    vcmpgtsw, vec_cmpge)
 242     VSX_IMPL_CMPGE(vec_bint4,   vec_uint4,   vcmpgtuw, vec_cmpge)
 243     VSX_IMPL_CMPGE(vec_bdword2, vec_dword2,  vcmpgtsd, vec_cmpge)
 244     VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
 245
 246 // redirect to the GCC builtin cmpge, which already supports float and double
 247     VSX_REDIRECT_2RG(vec_bint4,   vec_float4,  vec_cmpge, __builtin_vec_cmpge)
 248     VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
 249
 250 // up to gcc5 vec_nor doesn't support bool long long: a generic template forwards
     // to the builtin, and the bool-long-long overload goes through vec_dword2 casts
 251 #   undef vec_nor
 252     template<typename T>
 253     VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
 254
 255     VSX_FINLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
 256     { return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
 257
 258 // vec_packs doesn't support double words in gcc4 and old versions of gcc5;
     // the narrower overloads forward to the builtin, the doubleword ones use inline
     // asm with the two inputs passed in swapped order (%2 before %1)
 259 #   undef vec_packs
 260     VSX_REDIRECT_2RG(vec_char16,  vec_short8,  vec_packs, __builtin_vec_packs)
 261     VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
 262     VSX_REDIRECT_2RG(vec_short8,  vec_int4,    vec_packs, __builtin_vec_packs)
 263     VSX_REDIRECT_2RG(vec_ushort8, vec_uint4,   vec_packs, __builtin_vec_packs)
 264
 265     VSX_IMPL_2VRG_F(vec_int4,  vec_dword2,  "vpksdss %0,%2,%1", vec_packs)
 266     VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
 267 #endif // __GNUG__ < 6
 268
 269 #if __GNUG__ < 5
 270 // vec_xxpermdi in gcc4 lacks little-endian support (just like clang), so the two
     // inputs are swapped and the 2-bit selector is remapped (bits swapped, then
     // inverted). Every use of `c` is parenthesised so that an expression argument
     // such as `x | 1` keeps its meaning after expansion.
 271 #   define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((((c) & 1) << 1) | ((c) >> 1))))
 272 #else
 273 #   define vec_permi vec_xxpermdi
 274 #endif // __GNUG__ < 5
 275
 276 // shift left double by word immediate (maps to the xxsldwi builtin unless already defined)
 277 #ifndef vec_sldw
 278 #   define vec_sldw __builtin_vsx_xxsldwi
 279 #endif
 280
 281 // vector population count; every overload returns the unsigned vector type,
     // including the overloads that take a signed input
 282 VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
 283 VSX_IMPL_1VRG(vec_uchar16, vec_char16,  vpopcntb, vec_popcntu)
 284 VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
 285 VSX_IMPL_1VRG(vec_ushort8, vec_short8,  vpopcnth, vec_popcntu)
 286 VSX_IMPL_1VRG(vec_uint4,   vec_uint4,   vpopcntw, vec_popcntu)
 287 VSX_IMPL_1VRG(vec_uint4,   vec_int4,    vpopcntw, vec_popcntu)
 288 VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
 289 VSX_IMPL_1VRG(vec_udword2, vec_dword2,  vpopcntd, vec_popcntu)
 290
 291 // converts between single and double precision
     // (one-argument wrappers; existing vec_ct* definitions are #undef'd first)
 292 VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
 293 VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
 294
 295 // converts word and doubleword to double precision
 296 #ifdef vec_ctd
 297 #   undef vec_ctd
 298 #endif
 299 VSX_IMPL_1RG(vec_double2, wd, vec_int4,    wa, xvcvsxwdp, vec_ctdo)
 300 VSX_IMPL_1RG(vec_double2, wd, vec_uint4,   wa, xvcvuxwdp, vec_ctdo)
 301 VSX_IMPL_1RG(vec_double2, wd, vec_dword2,  wi, xvcvsxddp, vec_ctd)
 302 VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, vec_ctd)
 303
 304 // converts word and doubleword to single precision
 305 #undef vec_ctf
 306 VSX_IMPL_1RG(vec_float4, wf, vec_int4,    wa, xvcvsxwsp, vec_ctf)
 307 VSX_IMPL_1RG(vec_float4, wf, vec_uint4,   wa, xvcvuxwsp, vec_ctf)
 308 VSX_IMPL_1RG(vec_float4, wf, vec_dword2,  wi, xvcvsxdsp, vec_ctfo)
 309 VSX_IMPL_1RG(vec_float4, wf, vec_udword2, wi, xvcvuxdsp, vec_ctfo)
 310
 311 // converts single and double precision to signed word
 312 #undef vec_cts
 313 VSX_IMPL_1RG(vec_int4,  wa, vec_double2, wd, xvcvdpsxws, vec_ctso)
 314 VSX_IMPL_1RG(vec_int4,  wa, vec_float4,  wf, xvcvspsxws, vec_cts)
 315
 316 // converts single and double precision to unsigned word
 317 #undef vec_ctu
 318 VSX_IMPL_1RG(vec_uint4, wa, vec_double2, wd, xvcvdpuxws, vec_ctuo)
 319 VSX_IMPL_1RG(vec_uint4, wa, vec_float4,  wf, xvcvspuxws, vec_ctu)
 320
 321 // converts single and double precision to signed doubleword
 322 #ifdef vec_ctsl
 323 #   undef vec_ctsl
 324 #endif
 325 VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, vec_ctsl)
 326 VSX_IMPL_1RG(vec_dword2, wi, vec_float4,  wf, xvcvspsxds, vec_ctslo)
 327
 328 // converts single and double precision to unsigned doubleword
 329 #ifdef vec_ctul
 330 #   undef vec_ctul
 331 #endif
 332 VSX_IMPL_1RG(vec_udword2, wi, vec_double2, wd, xvcvdpuxds, vec_ctul)
 333 VSX_IMPL_1RG(vec_udword2, wi, vec_float4,  wf, xvcvspuxds, vec_ctulo)
 334
 335 // fallback in case GCC doesn't define vec_xl
     // (vec_xst is defined under the same check, it is not tested separately)
 336 #ifndef vec_xl
 337 #   define vec_xl vec_vsx_ld
 338 #   define vec_xst vec_vsx_st
 339 #endif
 340
 341 #endif // GCC VSX compatibility
 342
 343 /*
 344  * CLANG VSX compatibility
 345 **/
 346 #if defined(__clang__) && !defined(__IBMCPP__)
 347
 348 /*
 349  * CLANG doesn't support %x<n> in the inline asm template, which fixes the register number
 350  * when using any of the register constraints wa, wd, wf.
 351  *
 352  * For more explanation see "PowerPC and IBM RS6000" in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
 353  * There is also an open bug: https://bugs.llvm.org/show_bug.cgi?id=31837
 354  *
 355  * So inline asm cannot be used here; only built-in functions that CLANG supports are used,
 356  * with __builtin_convertvector wherever clang is missing a vector conversion built-in.
 357 */
 358
 359 // convert vector helper: defines `fnm(const rg&)` returning `rt` via __builtin_convertvector
 360 #define VSX_IMPL_CONVERT(rt, rg, fnm) \
 361 VSX_FINLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
 362
 363 #if __clang_major__ < 5
 364 // implement vec_permi in a dirty way
 365 #   define VSX_IMPL_CLANG_4_PERMI(Tvec)                                                 \
 366     VSX_FINLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c)    \
 367     {                                                                                   \
 368         switch (c)                                                                      \
 369         {                                                                               \
 370         case 0:                                                                         \
 371             return vec_mergeh(a, b);                                                    \
 372         case 1:                                                                         \
 373             return vec_mergel(vec_mergeh(a, a), b);                                     \
 374         case 2:                                                                         \
 375             return vec_mergeh(vec_mergel(a, a), b);                                     \
 376         default:                                                                        \
 377             return vec_mergel(a, b);                                                    \
 378         }                                                                               \
 379     }
 380     VSX_IMPL_CLANG_4_PERMI(vec_udword2)
 381     VSX_IMPL_CLANG_4_PERMI(vec_dword2)
 382     VSX_IMPL_CLANG_4_PERMI(vec_double2)
 383
 384 // vec_xxsldwi is missing in clang 4
 385 #   define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
 386 #else
 387 // vec_xxpermdi is missing little-endian supports in clang 4 just like gcc4
 388 #   define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
 389 #endif // __clang_major__ < 5
 390
 391 // shift left double by word immediate (maps to vec_xxsldwi unless already defined)
 392 #ifndef vec_sldw
 393 #   define vec_sldw vec_xxsldwi
 394 #endif
 395
 396 // clang only supports vec_rsqrte, so vec_rsqrt is provided here as 1 / sqrt(a)
 397 #ifndef vec_rsqrt
 398     VSX_FINLINE(vec_float4) vec_rsqrt(const vec_float4& a)
 399     {
             const vec_float4 root = vec_sqrt(a);
             return vec_div(vec_float4_sp(1), root);
         }
 400
 401     VSX_FINLINE(vec_double2) vec_rsqrt(const vec_double2& a)
 402     {
             const vec_double2 root = vec_sqrt(a);
             return vec_div(vec_double2_sp(1), root);
         }
 403 #endif
 404
 405 // clang's vec_promote has no doubleword overloads: start from an all-zero vector
     // and write `a` into the lane selected by the lowest bit of `b`
 406 VSX_FINLINE(vec_dword2) vec_promote(long long a, int b)
 407 {
 408     const int lane = b & 1;
 409     vec_dword2 promoted = vec_dword2_z;
 410     promoted[lane] = a;
 411     return promoted;
     }
 412
 413 VSX_FINLINE(vec_udword2) vec_promote(unsigned long long a, int b)
 414 {
 415     const int lane = b & 1;
 416     vec_udword2 promoted = vec_udword2_z;
 417     promoted[lane] = a;
 418     return promoted;
     }
 419
 420 // vec_popcntu always returns the unsigned vector type: for signed inputs the
     // result of clang's vec_popcnt is cast with `ucast` (same idea as gcc's vec_vpopcnt)
 421 #define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast)   \
 422 VSX_FINLINE(Tvec) vec_popcntu(const Tvec2& a)  \
 423 { return ucast(vec_popcnt(a)); }
 424 VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
 425 VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
 426 VSX_IMPL_POPCNTU(vec_uint4,   vec_int4,   vec_uint4_c);
 427 // unsigned inputs are forwarded to vec_popcnt directly
 428 VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
 429 VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
 430 VSX_REDIRECT_1RG(vec_uint4,   vec_uint4,   vec_popcntu, vec_popcnt)
 431
 432 // converts between single and double precision
 433 VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
 434 VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
 435
 436 // converts word and doubleword to double precision
     // (words go through VSX builtins, doublewords through __builtin_convertvector)
 437 #ifdef vec_ctd
 438 #   undef vec_ctd
 439 #endif
 440 VSX_REDIRECT_1RG(vec_double2, vec_int4,  vec_ctdo, __builtin_vsx_xvcvsxwdp)
 441 VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
 442
 443 VSX_IMPL_CONVERT(vec_double2, vec_dword2,  vec_ctd)
 444 VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)
 445
 446 // converts word and doubleword to single precision
     // (vec_ctf is only #undef'd for clang 5 and later)
 447 #if __clang_major__ > 4
 448 #   undef vec_ctf
 449 #endif
 450 VSX_IMPL_CONVERT(vec_float4, vec_int4,    vec_ctf)
 451 VSX_IMPL_CONVERT(vec_float4, vec_uint4,   vec_ctf)
 452 VSX_REDIRECT_1RG(vec_float4, vec_dword2,  vec_ctfo, __builtin_vsx_xvcvsxdsp)
 453 VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctfo, __builtin_vsx_xvcvuxdsp)
 454
 455 // converts single and double precision to signed word
 456 #if __clang_major__ > 4
 457 #   undef vec_cts
 458 #endif
 459 VSX_REDIRECT_1RG(vec_int4,  vec_double2, vec_ctso, __builtin_vsx_xvcvdpsxws)
 460 VSX_IMPL_CONVERT(vec_int4,  vec_float4,  vec_cts)
 461
 462 // converts single and double precision to unsigned word
 463 #if __clang_major__ > 4
 464 #   undef vec_ctu
 465 #endif
 466 VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctuo, __builtin_vsx_xvcvdpuxws)
 467 VSX_IMPL_CONVERT(vec_uint4, vec_float4,  vec_ctu)
 468
 469 // converts single and double precision to signed doubleword
 470 #ifdef vec_ctsl
 471 #   undef vec_ctsl
 472 #endif
 473 VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
 474 // __builtin_convertvector cannot do this conversion (no xvcvspsxds equivalent),
     // so go float -> double (vec_cvfo) -> signed doubleword (vec_ctsl)
 475 VSX_FINLINE(vec_dword2) vec_ctslo(const vec_float4& a)
 476 { return vec_ctsl(vec_cvfo(a)); }
 477
 478 // converts single and double precision to unsigned doubleword
 479 #ifdef vec_ctul
 480 #   undef vec_ctul
 481 #endif
 482 VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
 483 // __builtin_convertvector cannot do this conversion (no xvcvspuxds equivalent),
     // so go float -> double (vec_cvfo) -> unsigned doubleword (vec_ctul)
 484 VSX_FINLINE(vec_udword2) vec_ctulo(const vec_float4& a)
 485 { return vec_ctul(vec_cvfo(a)); }
 486
 487 #endif // CLANG VSX compatibility
 488
 489 /*
 490  * Common GCC, CLANG compatibility
 491 **/
 492 #if defined(__GNUG__) && !defined(__IBMCPP__)
 493
 494 #ifdef vec_cvf
 495 #   undef vec_cvf
 496 #endif
 497
     // Defines `fnm` in terms of its `o`-suffixed counterpart `fn2`: the input is
     // first rotated by one word (vec_sldw(a, a, 1)) and then passed to `fn2`.
 498 #define VSX_IMPL_CONV_EVEN_4_2(rt, rg, fnm, fn2) \
 499 VSX_FINLINE(rt) fnm(const rg& a)                 \
 500 { return fn2(vec_sldw(a, a, 1)); }
 501
 502 VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_float4, vec_cvf,  vec_cvfo)
 503 VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_int4,   vec_ctd,  vec_ctdo)
 504 VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_uint4,  vec_ctd,  vec_ctdo)
 505
 506 VSX_IMPL_CONV_EVEN_4_2(vec_dword2,  vec_float4, vec_ctsl, vec_ctslo)
 507 VSX_IMPL_CONV_EVEN_4_2(vec_udword2, vec_float4, vec_ctul, vec_ctulo)
 508
     // Defines `fnm` in terms of its `o`-suffixed counterpart `fn2`: `fn2` is applied
     // first and its result is then rotated by three words (vec_sldw(v4, v4, 3)).
 509 #define VSX_IMPL_CONV_EVEN_2_4(rt, rg, fnm, fn2) \
 510 VSX_FINLINE(rt) fnm(const rg& a)                 \
 511 {                                                \
 512     rt v4 = fn2(a);                              \
 513     return vec_sldw(v4, v4, 3);                  \
 514 }
 515
 516 VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_double2, vec_cvf, vec_cvfo)
 517 VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_dword2,  vec_ctf, vec_ctfo)
 518 VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_udword2, vec_ctf, vec_ctfo)
 519
 520 VSX_IMPL_CONV_EVEN_2_4(vec_int4,   vec_double2, vec_cts, vec_ctso)
 521 VSX_IMPL_CONV_EVEN_2_4(vec_uint4,  vec_double2, vec_ctu, vec_ctuo)
 522
 523 #endif // Common GCC, CLANG compatibility
 524
 525 /*
 526  * XLC VSX compatibility
 527 **/
 528 #if defined(__IBMCPP__)
 529
 530 // vector population count
 531 #define vec_popcntu vec_popcnt
 532
 533 // overload and redirect with the second argument set to zero,
 534 // since only conversions without the second argument are supported here
 535 #define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
 536 VSX_FINLINE(rt) fnm(const rg& a) { return fnm(a, 0); }
 537
 538 VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4,    vec_ctd)
 539 VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4,   vec_ctd)
 540 VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2,  vec_ctd)
 541 VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
 542
 543 VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_int4,    vec_ctf)
 544 VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_uint4,   vec_ctf)
 545 VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_dword2,  vec_ctf)
 546 VSX_IMPL_OVERLOAD_Z2(vec_float4,  vec_udword2, vec_ctf)
 547
 548 VSX_IMPL_OVERLOAD_Z2(vec_int4,    vec_double2, vec_cts)
 549 VSX_IMPL_OVERLOAD_Z2(vec_int4,    vec_float4,  vec_cts)
 550
 551 VSX_IMPL_OVERLOAD_Z2(vec_uint4,   vec_double2, vec_ctu)
 552 VSX_IMPL_OVERLOAD_Z2(vec_uint4,   vec_float4,  vec_ctu)
 553
 554 VSX_IMPL_OVERLOAD_Z2(vec_dword2,  vec_double2, vec_ctsl)
 555 VSX_IMPL_OVERLOAD_Z2(vec_dword2,  vec_float4,  vec_ctsl)
 556
 557 VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
 558 VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4,  vec_ctul)
 559
 560 // FIXME: conversions of odd-numbered elements are implemented in a roundabout way,
 561 // since xlc doesn't support VSX register operands in inline asm.
     // _4_2 form: rotate the input by three words (vec_sldw(a, a, 3)), then apply `fn2`.
 562 #define VSX_IMPL_CONV_ODD_4_2(rt, rg, fnm, fn2) \
 563 VSX_FINLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }
 564
 565 VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_float4, vec_cvfo,  vec_cvf)
 566 VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_int4,   vec_ctdo,  vec_ctd)
 567 VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_uint4,  vec_ctdo,  vec_ctd)
 568
 569 VSX_IMPL_CONV_ODD_4_2(vec_dword2,  vec_float4, vec_ctslo, vec_ctsl)
 570 VSX_IMPL_CONV_ODD_4_2(vec_udword2, vec_float4, vec_ctulo, vec_ctul)
 571
     // _2_4 form: apply `fn2` first, then rotate its result by one word (vec_sldw(v4, v4, 1)).
 572 #define VSX_IMPL_CONV_ODD_2_4(rt, rg, fnm, fn2)  \
 573 VSX_FINLINE(rt) fnm(const rg& a)                 \
 574 {                                                \
 575     rt v4 = fn2(a);                              \
 576     return vec_sldw(v4, v4, 1);                  \
 577 }
 578
 579 VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_double2, vec_cvfo, vec_cvf)
 580 VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_dword2,  vec_ctfo, vec_ctf)
 581 VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_udword2, vec_ctfo, vec_ctf)
 582
 583 VSX_IMPL_CONV_ODD_2_4(vec_int4,   vec_double2, vec_ctso, vec_cts)
 584 VSX_IMPL_CONV_ODD_2_4(vec_uint4,  vec_double2, vec_ctuo, vec_ctu)
 585
 586 #endif // XLC VSX compatibility
 587
 588 // silence the GCC warning caused by -Wunused-but-set-variable in rare cases:
     // on GCC the declared type gets the __unused__ attribute, elsewhere it is left as is
 589 #if defined(__GNUG__) && !defined(__clang__)
 590 #   define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
 591 #else // CLANG, XLC
 592 #   define VSX_UNUSED(Tvec) Tvec
 593 #endif
 594
 595 // gcc can find his way in casting log int and XLC, CLANG ambiguous
 596 #if defined(__clang__) || defined(__IBMCPP__)
 597     VSX_FINLINE(vec_udword2) vec_splats(uint64 v)
 598     { return vec_splats((unsigned long long) v); }
 599
 600     VSX_FINLINE(vec_dword2) vec_splats(int64 v)
 601     { return vec_splats((long long) v); }
 602
 603     VSX_FINLINE(vec_udword2) vec_promote(uint64 a, int b)
 604     { return vec_promote((unsigned long long) a, b); }
 605
 606     VSX_FINLINE(vec_dword2) vec_promote(int64 a, int b)
 607     { return vec_promote((long long) a, b); }
 608 #endif
 609
 610 /*
 611  * vsx_ld(offset, pointer), vsx_st(vector, offset, pointer):
 612  * load and store with the offset counted in elements of the pointer type
 613  *
 614  * vsx_ldf(offset, pointer), vsx_stf(vector, offset, pointer):
 615  * load and store with the offset counted in bytes
 616  *
 617  * Note: in clang, vec_xl and vec_xst fail to load unaligned addresses,
 618  * so vec_vsx_ld and vec_vsx_st are used instead
 619 */
 620
 621 #if defined(__clang__) && !defined(__IBMCPP__)
 622 #   define vsx_ldf  vec_vsx_ld
 623 #   define vsx_stf  vec_vsx_st
 624 #else // GCC , XLC
 625 #   define vsx_ldf  vec_xl
 626 #   define vsx_stf  vec_xst
 627 #endif
 628
     // element offset -> byte offset: multiplies `o` by the size of what `p` points to
 629 #define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
 630 #define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
 631 #define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
 632
 633 /*
 634  * vsx_ld2(offset, pointer), vsx_st2(vector, offset, pointer): load and store double words.
 635  * In GCC, vec_xl and vec_xst map to vec_vsx_ld, vec_vsx_st, which don't support long long,
 636  * and in CLANG vec_vsx_ld, vec_vsx_st are used because vec_xl, vec_xst fail on unaligned addresses.
 637  *
 638  * In XLC, vec_xl and vec_xst fail to cast int64 (long int) to long long.
 639 */
 640 #if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
     // GCC / CLANG: access the memory through (unsigned) int pointers and cast the
     // vector to or from the 32-bit lane type; the offset is still computed from
     // the 64-bit element size of `p`. The pointer casts also drop the const qualifier.
 641     VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
 642     { return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }
 643
 644     VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
 645     { return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }
 646
 647     VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
 648     { vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }
 649
 650     VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
 651     { vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
 652 #else // XLC
     // XLC: only the pointer is cast, to (unsigned) long long
 653     VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
 654     { return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }
 655
 656     VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
 657     { return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }
 658
 659     VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
 660     { vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }
 661
 662     VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
 663     { vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
 664 #endif
 665
 666 // build a uint4 vector from the four elements p[0]..p[3]
     // (the comment upstream describes them as unsigned bytes); `p` is expanded four times
 667 #define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])
 668
 669 // build an int4 vector from the four elements p[0]..p[3]
     // (described upstream as signed bytes); `p` is expanded four times
 670 #define vec_ld_bsw(p) vec_int4_set((p)[0], (p)[1], (p)[2], (p)[3])
 671
 672 // load 4 unsigned bytes into float vector
     // Uses the one-argument vec_ctf overload, which this header provides for GCC,
     // CLANG and XLC alike. The GCC branch #undef's vec_ctf and defines only
     // one-argument overloads, so a two-argument `vec_ctf(x, 0)` call would not
     // compile there.
 673 #define vec_ld_bps(p) vec_ctf(vec_ld_buw(p))
 674
 675 // Store the lower 8 bytes: writes element 0 of `v` (viewed as vec_udword2) to *p.
     // The expansion is a bare assignment expression, so use it as a full statement.
 676 #define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)
 677
 678 // Store the higher 8 bytes: writes element 1 of `v` (viewed as vec_udword2) to *p
 679 #define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)
 680
 681 /*
 682  * vec_ld_l8(ptr)  -> load 64 bits of data into the lower part
 683  * vec_ldz_l8(ptr) -> load 64 bits of data into the lower part and zero the upper part
 684 **/
     // vec_ld_l8 reads *p as one uint64 and places it in element 0 through vec_promote;
     // vec_ldz_l8 additionally ANDs the result with a mask whose second doubleword is 0.
 685 #define VSX_IMPL_LOAD_L8(Tvec, Tp)                                              \
 686 VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p)                                        \
 687 { return ((Tvec)vec_promote(*((uint64*)p), 0)); }                               \
 688 VSX_FINLINE(Tvec) vec_ldz_l8(const Tp *p)                                       \
 689 {                                                                               \
 690     /* TODO: try (Tvec)(vec_udword2{*((uint64*)p), 0}) */                       \
 691     static const vec_bdword2 mask = {0xFFFFFFFFFFFFFFFF, 0x0000000000000000};   \
 692     return vec_and(vec_ld_l8(p), (Tvec)mask);                                   \
 693 }
 694 VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
 695 VSX_IMPL_LOAD_L8(vec_char16,  schar)
 696 VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
 697 VSX_IMPL_LOAD_L8(vec_short8,  short)
 698 VSX_IMPL_LOAD_L8(vec_uint4,   uint)
 699 VSX_IMPL_LOAD_L8(vec_int4,    int)
 700 VSX_IMPL_LOAD_L8(vec_float4,  float)
 701 VSX_IMPL_LOAD_L8(vec_udword2, uint64)
 702 VSX_IMPL_LOAD_L8(vec_dword2,  int64)
 703 VSX_IMPL_LOAD_L8(vec_double2, double)
 704
 705 // logical not
 706 #define vec_not(a) vec_nor(a, a)
 707
 708 // power9 yaya
 709 // not equal
 710 #ifndef vec_cmpne
 711 #   define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
 712 #endif
 713
 714 // absoulte difference
 715 #ifndef vec_absd
 716 #   define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
 717 #endif
 718
 719 /*
 720  * Implement vec_unpacklu and vec_unpackhu
 721  * since vec_unpackl, vec_unpackh only support signed integers
 722 **/
 723 #define VSX_IMPL_UNPACKU(rt, rg, zero)                 \
 724 VSX_FINLINE(rt) vec_unpacklu(const rg& a)              \
 725 { return reinterpret_cast<rt>(vec_mergel(a, zero)); }  \
 726 VSX_FINLINE(rt) vec_unpackhu(const rg& a)              \
 727 { return reinterpret_cast<rt>(vec_mergeh(a, zero));  }
 728
 729 VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
 730 VSX_IMPL_UNPACKU(vec_uint4,   vec_ushort8, vec_ushort8_z)
 731 VSX_IMPL_UNPACKU(vec_udword2, vec_uint4,   vec_uint4_z)
 732
 733 /*
 734  * Implement vec_mergesqe and vec_mergesqo
 735  * Merges the sequence values of even and odd elements of two vectors
 736 */
 737 #define VSX_IMPL_PERM(rt, fnm, ...)            \
 738 VSX_FINLINE(rt) fnm(const rt& a, const rt& b)  \
 739 { static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
 740
// Byte-select tables for vec_mergesqe (even elements) and vec_mergesqo (odd
// elements). Indices 0..15 address bytes of the first operand and 16..31 bytes
// of the second, so each table lists the bytes of every other element across
// the concatenation of both inputs.

// 16 lanes (1-byte elements): every other byte
#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16,  vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16,  vec_mergesqo, perm16_mergesqo)
// 8 lanes (2-byte elements): every other byte pair
#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8,  vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8,  vec_mergesqo, perm8_mergesqo)
// 4 lanes (4-byte elements): every other group of four bytes
#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
VSX_IMPL_PERM(vec_uint4,  vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4,  vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4,   vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4,   vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
// 2 lanes (8-byte elements): no permute table is needed; the even/odd merges
// are mapped onto vec_mergeh / vec_mergel through VSX_REDIRECT_2RG
// (defined elsewhere in this header; presumably a plain forwarding wrapper).
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
 771
 772 /*
 773  * Implement vec_mergesqh and vec_mergesql
 774  * Merges the sequence most and least significant halves of two vectors
 775 */
 776 #define VSX_IMPL_MERGESQHL(Tvec)                                    \
 777 VSX_FINLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b)        \
 778 { return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); }    \
 779 VSX_FINLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b)        \
 780 { return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
 781 VSX_IMPL_MERGESQHL(vec_uchar16)
 782 VSX_IMPL_MERGESQHL(vec_char16)
 783 VSX_IMPL_MERGESQHL(vec_ushort8)
 784 VSX_IMPL_MERGESQHL(vec_short8)
 785 VSX_IMPL_MERGESQHL(vec_uint4)
 786 VSX_IMPL_MERGESQHL(vec_int4)
 787 VSX_IMPL_MERGESQHL(vec_float4)
 788 VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
 789 VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
 790 VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqh, vec_mergeh)
 791 VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesql, vec_mergel)
 792 VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
 793 VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
 794
 795
 796 // 2 and 4 channels interleave for all types except 2 lanes
 797 #define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec)                                    \
 798 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr)  \
 799 {                                                                           \
 800     vsx_stf(vec_mergeh(a, b), 0, ptr);                                      \
 801     vsx_stf(vec_mergel(a, b), 16, ptr);                                     \
 802 }                                                                           \
 803 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,           \
 804                                      const Tvec& c, const Tvec& d, Tp* ptr) \
 805 {                                                                           \
 806     Tvec ac = vec_mergeh(a, c);                                             \
 807     Tvec bd = vec_mergeh(b, d);                                             \
 808     vsx_stf(vec_mergeh(ac, bd), 0, ptr);                                    \
 809     vsx_stf(vec_mergel(ac, bd), 16, ptr);                                   \
 810     ac = vec_mergel(a, c);                                                  \
 811     bd = vec_mergel(b, d);                                                  \
 812     vsx_stf(vec_mergeh(ac, bd), 32, ptr);                                   \
 813     vsx_stf(vec_mergel(ac, bd), 48, ptr);                                   \
 814 }
 815 VSX_IMPL_ST_INTERLEAVE(uchar,  vec_uchar16)
 816 VSX_IMPL_ST_INTERLEAVE(schar,  vec_char16)
 817 VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
 818 VSX_IMPL_ST_INTERLEAVE(short,  vec_short8)
 819 VSX_IMPL_ST_INTERLEAVE(uint,   vec_uint4)
 820 VSX_IMPL_ST_INTERLEAVE(int,    vec_int4)
 821 VSX_IMPL_ST_INTERLEAVE(float,  vec_float4)
 822
 823 // 2 and 4 channels deinterleave for 16 lanes
 824 #define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec)                                 \
 825 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
 826 {                                                                           \
 827     Tvec v0 = vsx_ld(0, ptr);                                               \
 828     Tvec v1 = vsx_ld(16, ptr);                                              \
 829     a = vec_mergesqe(v0, v1);                                               \
 830     b = vec_mergesqo(v0, v1);                                               \
 831 }                                                                           \
 832 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
 833                                        Tvec& c, Tvec& d)                    \
 834 {                                                                           \
 835     Tvec v0 = vsx_ld(0, ptr);                                               \
 836     Tvec v1 = vsx_ld(16, ptr);                                              \
 837     Tvec v2 = vsx_ld(32, ptr);                                              \
 838     Tvec v3 = vsx_ld(48, ptr);                                              \
 839     Tvec m0 = vec_mergesqe(v0, v1);                                         \
 840     Tvec m1 = vec_mergesqe(v2, v3);                                         \
 841     a = vec_mergesqe(m0, m1);                                               \
 842     c = vec_mergesqo(m0, m1);                                               \
 843     m0 = vec_mergesqo(v0, v1);                                              \
 844     m1 = vec_mergesqo(v2, v3);                                              \
 845     b = vec_mergesqe(m0, m1);                                               \
 846     d = vec_mergesqo(m0, m1);                                               \
 847 }
 848 VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
 849 VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)
 850
 851 // 2 and 4 channels deinterleave for 8 lanes
 852 #define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec)                                \
 853 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
 854 {                                                                           \
 855     Tvec v0 = vsx_ld(0, ptr);                                               \
 856     Tvec v1 = vsx_ld(8, ptr);                                               \
 857     a = vec_mergesqe(v0, v1);                                               \
 858     b = vec_mergesqo(v0, v1);                                               \
 859 }                                                                           \
 860 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
 861                                        Tvec& c, Tvec& d)                    \
 862 {                                                                           \
 863     Tvec v0 = vsx_ld(0, ptr);                                               \
 864     Tvec v1 = vsx_ld(8, ptr);                                               \
 865     Tvec m0 = vec_mergeh(v0, v1);                                           \
 866     Tvec m1 = vec_mergel(v0, v1);                                           \
 867     Tvec ab0 = vec_mergeh(m0, m1);                                          \
 868     Tvec cd0 = vec_mergel(m0, m1);                                          \
 869     v0 = vsx_ld(16, ptr);                                                   \
 870     v1 = vsx_ld(24, ptr);                                                   \
 871     m0 = vec_mergeh(v0, v1);                                                \
 872     m1 = vec_mergel(v0, v1);                                                \
 873     Tvec ab1 = vec_mergeh(m0, m1);                                          \
 874     Tvec cd1 = vec_mergel(m0, m1);                                          \
 875     a = vec_mergesqh(ab0, ab1);                                             \
 876     b = vec_mergesql(ab0, ab1);                                             \
 877     c = vec_mergesqh(cd0, cd1);                                             \
 878     d = vec_mergesql(cd0, cd1);                                             \
 879 }
 880 VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
 881 VSX_IMPL_ST_DINTERLEAVE_16(short,  vec_short8)
 882
 883 // 2 and 4 channels deinterleave for 4 lanes
 884 #define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec)                                \
 885 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
 886 {                                                                           \
 887     a = vsx_ld(0, ptr);                                                     \
 888     b = vsx_ld(4, ptr);                                                     \
 889     Tvec m0 = vec_mergeh(a, b);                                             \
 890     Tvec m1 = vec_mergel(a, b);                                             \
 891     a = vec_mergeh(m0, m1);                                                 \
 892     b = vec_mergel(m0, m1);                                                 \
 893 }                                                                           \
 894 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
 895                                        Tvec& c, Tvec& d)                    \
 896 {                                                                           \
 897     Tvec v0 = vsx_ld(0, ptr);                                               \
 898     Tvec v1 = vsx_ld(4, ptr);                                               \
 899     Tvec v2 = vsx_ld(8, ptr);                                               \
 900     Tvec v3 = vsx_ld(12, ptr);                                              \
 901     Tvec m0 = vec_mergeh(v0, v2);                                           \
 902     Tvec m1 = vec_mergeh(v1, v3);                                           \
 903     a = vec_mergeh(m0, m1);                                                 \
 904     b = vec_mergel(m0, m1);                                                 \
 905     m0 = vec_mergel(v0, v2);                                                \
 906     m1 = vec_mergel(v1, v3);                                                \
 907     c = vec_mergeh(m0, m1);                                                 \
 908     d = vec_mergel(m0, m1);                                                 \
 909 }
 910 VSX_IMPL_ST_DINTERLEAVE_32(uint,  vec_uint4)
 911 VSX_IMPL_ST_DINTERLEAVE_32(int,   vec_int4)
 912 VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
 913
 914 // 2 and 4 channels interleave and deinterleave for 2 lanes
 915 #define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func)             \
 916 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr)  \
 917 {                                                                           \
 918     st_func(vec_mergeh(a, b), 0, ptr);                                      \
 919     st_func(vec_mergel(a, b), 2, ptr);                                      \
 920 }                                                                           \
 921 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,           \
 922                                      const Tvec& c, const Tvec& d, Tp* ptr) \
 923 {                                                                           \
 924     st_func(vec_mergeh(a, b), 0, ptr);                                      \
 925     st_func(vec_mergeh(c, d), 2, ptr);                                      \
 926     st_func(vec_mergel(a, b), 4, ptr);                                      \
 927     st_func(vec_mergel(c, d), 6, ptr);                                      \
 928 }                                                                           \
 929 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b)      \
 930 {                                                                           \
 931     Tvec m0 = ld_func(0, ptr);                                              \
 932     Tvec m1 = ld_func(2, ptr);                                              \
 933     a = vec_mergeh(m0, m1);                                                 \
 934     b = vec_mergel(m0, m1);                                                 \
 935 }                                                                           \
 936 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b,      \
 937                                        Tvec& c, Tvec& d)                    \
 938 {                                                                           \
 939     Tvec v0 = ld_func(0, ptr);                                              \
 940     Tvec v1 = ld_func(2, ptr);                                              \
 941     Tvec v2 = ld_func(4, ptr);                                              \
 942     Tvec v3 = ld_func(6, ptr);                                              \
 943     a = vec_mergeh(v0, v2);                                                 \
 944     b = vec_mergel(v0, v2);                                                 \
 945     c = vec_mergeh(v1, v3);                                                 \
 946     d = vec_mergel(v1, v3);                                                 \
 947 }
 948 VSX_IMPL_ST_D_INTERLEAVE_64(int64,  vec_dword2,  vsx_ld2, vsx_st2)
 949 VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
 950 VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld,  vsx_st)
 951
/* 3 channels */
// 3-channel interleave/deinterleave for 16-lane (1-byte element) vectors.
//
// vec_st_interleave(a, b, c, ptr): writes three vectors at vsx_st offsets
// 0, 16 and 32. Each output vector is built with two vec_perm passes:
//   1. the *12 table merges bytes of a (indices 0..15) and b (16..31) into
//      their final positions; slots reserved for c are filled with a dummy
//      index 0,
//   2. the *123 table keeps the bytes from step 1 (indices 0..15) and drops
//      the bytes of c (indices 16..31) into the reserved slots.
//
// vec_ld_deinterleave(ptr, a, b, c): loads three vectors from vsx_ld offsets
// 0, 16 and 32. Each channel is gathered with two vec_perm passes:
//   1. the *12_perm table collects the channel's bytes found in v1/v2 into the
//      leading positions (trailing positions get a dummy index 0),
//   2. the *123_perm table keeps those leading bytes and appends the channel's
//      bytes taken from v3 (indices 16..31).
#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec)                                                   \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                 \
                                     const Tvec& c, Tp* ptr)                                      \
{                                                                                                 \
    static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5};         \
    static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15};    \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
    static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26};       \
    static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15};    \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr);                                      \
    static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0};    \
    static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31};   \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr);                                      \
}                                                                                                 \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                   \
{                                                                                                 \
    Tvec v1 = vsx_ld(0, ptr);                                                                     \
    Tvec v2 = vsx_ld(16, ptr);                                                                    \
    Tvec v3 = vsx_ld(32, ptr);                                                                    \
    static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0};  \
    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29};  \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
    static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30};  \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
    static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0};  \
    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31};  \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
}
// Instantiate for unsigned and signed 8-bit elements.
VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)
 984
// 3-channel interleave/deinterleave for 8-lane (2-byte element) vectors.
//
// Same two-pass vec_perm scheme as the 16-lane variant, but every table entry
// comes in byte pairs because one element spans two bytes:
//   - store: the *12 table places the byte pairs of a (indices 0..15) and
//     b (16..31), leaving dummy {0, 0} pairs where c belongs; the *123 table
//     then fills those pairs from c (indices 16..31).
//   - load: the *12_perm table gathers the channel's byte pairs from v1/v2
//     into the leading positions; the *123_perm table keeps them and appends
//     the remaining pairs from v3 (indices 16..31).
// The three vectors are addressed with vsx_st / vsx_ld offsets 0, 8 and 16.
#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec)                                                    \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                 \
                                     const Tvec& c, Tp* ptr)                                      \
{                                                                                                 \
    static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21};        \
    static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15};     \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr);                                       \
    static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11};        \
    static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15};   \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr);                                       \
    static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0};    \
    static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31};   \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr);                                      \
}                                                                                                 \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                   \
{                                                                                                 \
    Tvec v1 = vsx_ld(0, ptr);                                                                     \
    Tvec v2 = vsx_ld(8, ptr);                                                                     \
    Tvec v3 = vsx_ld(16, ptr);                                                                    \
    static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27};  \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);                                      \
    static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0};   \
    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29};  \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);                                      \
    static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31};  \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);                                      \
}
// Instantiate for unsigned and signed 16-bit elements.
VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short,  vec_short8)
1016
// 3-channel interleave/deinterleave for 4-lane (4-byte element) vectors.
//
// vec_st_interleave(a, b, c, ptr): writes three vectors at vsx_st offsets
// 0, 4 and 8.
//   - hbc = vec_mergeh(b, c) and lab = vec_mergel(a, b) pre-pair the channels,
//   - the first and third outputs are picked with the byte tables ahbc / clab
//     (indices 0..15 address the first vec_perm operand, 16..31 the second),
//   - the middle output splices lab and hbc with vec_sld(..., 8).
//
// vec_ld_deinterleave(ptr, a, b, c): loads three vectors from vsx_ld offsets
// 0, 4 and 8 and rebuilds each channel with one vec_sld(..., 8) splice of two
// inputs plus one vec_perm using the tables flp / flp2.
// NOTE(review): which halves vec_sld joins depends on how vec_sld is defined
// for this target elsewhere in the header - confirm before editing the tables.
#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec)                                                     \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,                                  \
                                     const Tvec& c, Tp* ptr)                                       \
{                                                                                                  \
    Tvec hbc = vec_mergeh(b, c);                                                                   \
    static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7};      \
    vsx_st(vec_perm(a, hbc, ahbc), 0, ptr);                                                        \
    Tvec lab = vec_mergel(a, b);                                                                   \
    vsx_st(vec_sld(lab, hbc, 8), 4, ptr);                                                          \
    static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};\
    vsx_st(vec_perm(c, lab, clab), 8, ptr);                                                        \
}                                                                                                  \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)                    \
{                                                                                                  \
    Tvec v1 = vsx_ld(0, ptr);                                                                      \
    Tvec v2 = vsx_ld(4, ptr);                                                                      \
    Tvec v3 = vsx_ld(8, ptr);                                                                      \
    static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31};   \
    a = vec_perm(v1, vec_sld(v3, v2, 8), flp);                                                     \
    static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19};  \
    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);                                                    \
    c = vec_perm(vec_sld(v2, v1, 8), v3, flp);                                                     \
}
// Instantiate for unsigned/signed 32-bit integers and single-precision floats.
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint,  vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int,   vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
1043
1044 #define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func)     \
1045 VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b,    \
1046                                      const Tvec& c, Tp* ptr)         \
1047 {                                                                    \
1048     st_func(vec_mergeh(a, b), 0, ptr);                               \
1049     st_func(vec_permi(c, a, 1), 2, ptr);                             \
1050     st_func(vec_mergel(b, c), 4, ptr);                               \
1051 }                                                                    \
1052 VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a,        \
1053                                        Tvec& b, Tvec& c)             \
1054 {                                                                    \
1055     Tvec v1 = ld_func(0, ptr);                                       \
1056     Tvec v2 = ld_func(2, ptr);                                       \
1057     Tvec v3 = ld_func(4, ptr);                                       \
1058     a = vec_permi(v1, v2, 1);                                        \
1059     b = vec_permi(v1, v3, 2);                                        \
1060     c = vec_permi(v2, v3, 1);                                        \
1061 }
1062 VSX_IMPL_ST_INTERLEAVE_3CH_2(int64,  vec_dword2,  vsx_ld2, vsx_st2)
1063 VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
1064 VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld,  vsx_st)
1065
1066 #endif // CV_VSX
1067
1068 //! @}
1069
1070 #endif // OPENCV_HAL_VSX_UTILS_HPP