modules/core/include/opencv2/core/hal/intrin_avx.hpp

   1 // This file is part of OpenCV project.
   2 // It is subject to the license terms in the LICENSE file found in the top-level directory
   3 // of this distribution and at http://opencv.org/license.html
   4
   5 #ifndef OPENCV_HAL_INTRIN_AVX_HPP
   6 #define OPENCV_HAL_INTRIN_AVX_HPP
   7
   8 #define CV_SIMD256 1
   9 #define CV_SIMD256_64F 1
  10
  11 namespace cv
  12 {
  13
  14 //! @cond IGNORED
  15
  16 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
  17
  18 ///////// Utils ////////////
  19
  20 inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
  21 { return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }
  22
  23 inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
  24 { return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
  25
  26 inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
  27 { return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }
  28
  29 inline int _v_cvtsi256_si32(const __m256i& a)
  30 { return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }
  31
  32 inline __m256i _v256_shuffle_odd_64(const __m256i& v)
  33 { return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
  34
  35 inline __m256d _v256_shuffle_odd_64(const __m256d& v)
  36 { return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
  37
  38 template<int imm>
  39 inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
  40 { return _mm256_permute2x128_si256(a, b, imm); }
  41
  42 template<int imm>
  43 inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
  44 { return _mm256_permute2f128_ps(a, b, imm); }
  45
  46 template<int imm>
  47 inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
  48 { return _mm256_permute2f128_pd(a, b, imm); }
  49
  50 template<int imm, typename _Tpvec>
  51 inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
  52 { return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
  53
  54 template<int imm>
  55 inline __m256i _v256_permute4x64(const __m256i& a)
  56 { return _mm256_permute4x64_epi64(a, imm); }
  57
  58 template<int imm>
  59 inline __m256d _v256_permute4x64(const __m256d& a)
  60 { return _mm256_permute4x64_pd(a, imm); }
  61
  62 template<int imm, typename _Tpvec>
  63 inline _Tpvec v256_permute4x64(const _Tpvec& a)
  64 { return _Tpvec(_v256_permute4x64<imm>(a.val)); }
  65
  66 inline __m128i _v256_extract_high(const __m256i& v)
  67 { return _mm256_extracti128_si256(v, 1); }
  68
  69 inline __m128  _v256_extract_high(const __m256& v)
  70 { return _mm256_extractf128_ps(v, 1); }
  71
  72 inline __m128d _v256_extract_high(const __m256d& v)
  73 { return _mm256_extractf128_pd(v, 1); }
  74
  75 inline __m128i _v256_extract_low(const __m256i& v)
  76 { return _mm256_castsi256_si128(v); }
  77
  78 inline __m128  _v256_extract_low(const __m256& v)
  79 { return _mm256_castps256_ps128(v); }
  80
  81 inline __m128d _v256_extract_low(const __m256d& v)
  82 { return _mm256_castpd256_pd128(v); }
  83
  84 ///////// Types ////////////
  85
  86 struct v_uint8x32
  87 {
  88     typedef uchar lane_type;
  89     enum { nlanes = 32 };
  90     __m256i val;
  91
  92     explicit v_uint8x32(__m256i v) : val(v) {}
  93     v_uint8x32(uchar v0,  uchar v1,  uchar v2,  uchar v3,
  94                uchar v4,  uchar v5,  uchar v6,  uchar v7,
  95                uchar v8,  uchar v9,  uchar v10, uchar v11,
  96                uchar v12, uchar v13, uchar v14, uchar v15,
  97                uchar v16, uchar v17, uchar v18, uchar v19,
  98                uchar v20, uchar v21, uchar v22, uchar v23,
  99                uchar v24, uchar v25, uchar v26, uchar v27,
 100                uchar v28, uchar v29, uchar v30, uchar v31)
 101     {
 102         val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
 103             (char)v4,  (char)v5,  (char)v6 , (char)v7,  (char)v8,  (char)v9,
 104             (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
 105             (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
 106             (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
 107             (char)v28, (char)v29, (char)v30, (char)v31);
 108     }
 109     v_uint8x32() : val(_mm256_setzero_si256()) {}
 110     uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
 111 };
 112
 113 struct v_int8x32
 114 {
 115     typedef schar lane_type;
 116     enum { nlanes = 32 };
 117     __m256i val;
 118
 119     explicit v_int8x32(__m256i v) : val(v) {}
 120     v_int8x32(schar v0,  schar v1,  schar v2,  schar v3,
 121               schar v4,  schar v5,  schar v6,  schar v7,
 122               schar v8,  schar v9,  schar v10, schar v11,
 123               schar v12, schar v13, schar v14, schar v15,
 124               schar v16, schar v17, schar v18, schar v19,
 125               schar v20, schar v21, schar v22, schar v23,
 126               schar v24, schar v25, schar v26, schar v27,
 127               schar v28, schar v29, schar v30, schar v31)
 128     {
 129         val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
 130             v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
 131             v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
 132     }
 133     v_int8x32() : val(_mm256_setzero_si256()) {}
 134     schar get0() const { return (schar)_v_cvtsi256_si32(val); }
 135 };
 136
 137 struct v_uint16x16
 138 {
 139     typedef ushort lane_type;
 140     enum { nlanes = 16 };
 141     __m256i val;
 142
 143     explicit v_uint16x16(__m256i v) : val(v) {}
 144     v_uint16x16(ushort v0,  ushort v1,  ushort v2,  ushort v3,
 145                 ushort v4,  ushort v5,  ushort v6,  ushort v7,
 146                 ushort v8,  ushort v9,  ushort v10, ushort v11,
 147                 ushort v12, ushort v13, ushort v14, ushort v15)
 148     {
 149         val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
 150             (short)v4,  (short)v5,  (short)v6,  (short)v7,  (short)v8,  (short)v9,
 151             (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
 152     }
 153     v_uint16x16() : val(_mm256_setzero_si256()) {}
 154     ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
 155 };
 156
 157 struct v_int16x16
 158 {
 159     typedef short lane_type;
 160     enum { nlanes = 16 };
 161     __m256i val;
 162
 163     explicit v_int16x16(__m256i v) : val(v) {}
 164     v_int16x16(short v0,  short v1,  short v2,  short v3,
 165                short v4,  short v5,  short v6,  short v7,
 166                short v8,  short v9,  short v10, short v11,
 167                short v12, short v13, short v14, short v15)
 168     {
 169         val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
 170             v8, v9, v10, v11, v12, v13, v14, v15);
 171     }
 172     v_int16x16() : val(_mm256_setzero_si256()) {}
 173     short get0() const { return (short)_v_cvtsi256_si32(val); }
 174 };
 175
 176 struct v_uint32x8
 177 {
 178     typedef unsigned lane_type;
 179     enum { nlanes = 8 };
 180     __m256i val;
 181
 182     explicit v_uint32x8(__m256i v) : val(v) {}
 183     v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
 184                unsigned v4, unsigned v5, unsigned v6, unsigned v7)
 185     {
 186         val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
 187             (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
 188     }
 189     v_uint32x8() : val(_mm256_setzero_si256()) {}
 190     unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
 191 };
 192
 193 struct v_int32x8
 194 {
 195     typedef int lane_type;
 196     enum { nlanes = 8 };
 197     __m256i val;
 198
 199     explicit v_int32x8(__m256i v) : val(v) {}
 200     v_int32x8(int v0, int v1, int v2, int v3,
 201               int v4, int v5, int v6, int v7)
 202     {
 203         val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
 204     }
 205     v_int32x8() : val(_mm256_setzero_si256()) {}
 206     int get0() const { return _v_cvtsi256_si32(val); }
 207 };
 208
 209 struct v_float32x8
 210 {
 211     typedef float lane_type;
 212     enum { nlanes = 8 };
 213     __m256 val;
 214
 215     explicit v_float32x8(__m256 v) : val(v) {}
 216     v_float32x8(float v0, float v1, float v2, float v3,
 217                 float v4, float v5, float v6, float v7)
 218     {
 219         val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
 220     }
 221     v_float32x8() : val(_mm256_setzero_ps()) {}
 222     float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
 223 };
 224
 225 struct v_uint64x4
 226 {
 227     typedef uint64 lane_type;
 228     enum { nlanes = 4 };
 229     __m256i val;
 230
 231     explicit v_uint64x4(__m256i v) : val(v) {}
 232     v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
 233     { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
 234     v_uint64x4() : val(_mm256_setzero_si256()) {}
 235     uint64 get0() const
 236     { return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); }
 237 };
 238
 239 struct v_int64x4
 240 {
 241     typedef int64 lane_type;
 242     enum { nlanes = 4 };
 243     __m256i val;
 244
 245     explicit v_int64x4(__m256i v) : val(v) {}
 246     v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
 247     { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
 248     v_int64x4() : val(_mm256_setzero_si256()) {}
 249     int64 get0() const { return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); }
 250 };
 251
 252 struct v_float64x4
 253 {
 254     typedef double lane_type;
 255     enum { nlanes = 4 };
 256     __m256d val;
 257
 258     explicit v_float64x4(__m256d v) : val(v) {}
 259     v_float64x4(double v0, double v1, double v2, double v3)
 260     { val = _mm256_setr_pd(v0, v1, v2, v3); }
 261     v_float64x4() : val(_mm256_setzero_pd()) {}
 262     double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
 263 };
 264
 265 struct v_float16x16
 266 {
 267     typedef short lane_type;
 268     enum { nlanes = 16 };
 269     __m256i val;
 270
 271     explicit v_float16x16(__m256i v) : val(v) {}
 272     v_float16x16(short v0, short v1, short v2, short v3,
 273                  short v4, short v5, short v6, short v7,
 274                  short v8, short v9, short v10, short v11,
 275                  short v12, short v13, short v14, short v15)
 276     {
 277         val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
 278     }
 279     v_float16x16() : val(_mm256_setzero_si256()) {}
 280     short get0() const { return (short)_v_cvtsi256_si32(val); }
 281 };
 282 inline v_float16x16 v256_setzero_f16() { return v_float16x16(_mm256_setzero_si256()); }
 283 inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1_epi16(val)); }
 284
 285 //////////////// Load and store operations ///////////////
 286
 287 #define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp)                    \
 288     inline _Tpvec v256_load(const _Tp* ptr)                           \
 289     { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); }       \
 290     inline _Tpvec v256_load_aligned(const _Tp* ptr)                   \
 291     { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); }        \
 292     inline _Tpvec v256_load_low(const _Tp* ptr)                       \
 293     {                                                                 \
 294         __m128i v128 = _mm_loadu_si128((const __m128i*)ptr);          \
 295         return _Tpvec(_mm256_castsi128_si256(v128));                  \
 296     }                                                                 \
 297     inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)  \
 298     {                                                                 \
 299         __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0);          \
 300         __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1);          \
 301         return _Tpvec(_v256_combine(vlo, vhi));                       \
 302     }                                                                 \
 303     inline void v_store(_Tp* ptr, const _Tpvec& a)                    \
 304     { _mm256_storeu_si256((__m256i*)ptr, a.val); }                    \
 305     inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)            \
 306     { _mm256_store_si256((__m256i*)ptr, a.val); }                     \
 307     inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)    \
 308     { _mm256_stream_si256((__m256i*)ptr, a.val); }                    \
 309     inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
 310     { \
 311         if( mode == hal::STORE_UNALIGNED ) \
 312             _mm256_storeu_si256((__m256i*)ptr, a.val); \
 313         else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
 314             _mm256_stream_si256((__m256i*)ptr, a.val); \
 315         else \
 316             _mm256_store_si256((__m256i*)ptr, a.val); \
 317     } \
 318     inline void v_store_low(_Tp* ptr, const _Tpvec& a)                \
 319     { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); }    \
 320     inline void v_store_high(_Tp* ptr, const _Tpvec& a)               \
 321     { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
 322
 323 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32,  uchar)
 324 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32,   schar)
 325 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
 326 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16,  short)
 327 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8,  unsigned)
 328 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8,   int)
 329 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4,  uint64)
 330 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4,   int64)
 331
 332 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg)   \
 333     inline _Tpvec v256_load(const _Tp* ptr)                               \
 334     { return _Tpvec(_mm256_loadu_##suffix(ptr)); }                        \
 335     inline _Tpvec v256_load_aligned(const _Tp* ptr)                       \
 336     { return _Tpvec(_mm256_load_##suffix(ptr)); }                         \
 337     inline _Tpvec v256_load_low(const _Tp* ptr)                           \
 338     {                                                                     \
 339         return _Tpvec(_mm256_cast##suffix##128_##suffix##256              \
 340                      (_mm_loadu_##suffix(ptr)));                          \
 341     }                                                                     \
 342     inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1)      \
 343     {                                                                     \
 344         halfreg vlo = _mm_loadu_##suffix(ptr0);                           \
 345         halfreg vhi = _mm_loadu_##suffix(ptr1);                           \
 346         return _Tpvec(_v256_combine(vlo, vhi));                           \
 347     }                                                                     \
 348     inline void v_store(_Tp* ptr, const _Tpvec& a)                        \
 349     { _mm256_storeu_##suffix(ptr, a.val); }                               \
 350     inline void v_store_aligned(_Tp* ptr, const _Tpvec& a)                \
 351     { _mm256_store_##suffix(ptr, a.val); }                                \
 352     inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a)        \
 353     { _mm256_stream_##suffix(ptr, a.val); }                               \
 354     inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
 355     { \
 356         if( mode == hal::STORE_UNALIGNED ) \
 357             _mm256_storeu_##suffix(ptr, a.val); \
 358         else if( mode == hal::STORE_ALIGNED_NOCACHE )  \
 359             _mm256_stream_##suffix(ptr, a.val); \
 360         else \
 361             _mm256_store_##suffix(ptr, a.val); \
 362     } \
 363     inline void v_store_low(_Tp* ptr, const _Tpvec& a)                    \
 364     { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); }               \
 365     inline void v_store_high(_Tp* ptr, const _Tpvec& a)                   \
 366     { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
 367
 368 OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float,  ps, __m128)
 369 OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
 370
 371 #define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
 372     inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a)   \
 373     { return _Tpvec(cast(a.val)); }
 374
 375 #define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s)          \
 376     inline _Tpvec v256_setzero_##suffix()                                        \
 377     { return _Tpvec(_mm256_setzero_si256()); }                                   \
 378     inline _Tpvec v256_setall_##suffix(_Tp v)                                    \
 379     { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); }                        \
 380     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, OPENCV_HAL_NOP)        \
 381     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, OPENCV_HAL_NOP)        \
 382     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP)        \
 383     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, OPENCV_HAL_NOP)        \
 384     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, OPENCV_HAL_NOP)        \
 385     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, OPENCV_HAL_NOP)        \
 386     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, OPENCV_HAL_NOP)        \
 387     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, OPENCV_HAL_NOP)        \
 388     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256)   \
 389     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
 390
 391 OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32,  uchar,    u8,  epi8,   char)
 392 OPENCV_HAL_IMPL_AVX_INIT(v_int8x32,   schar,    s8,  epi8,   char)
 393 OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort,   u16, epi16,  short)
 394 OPENCV_HAL_IMPL_AVX_INIT(v_int16x16,  short,    s16, epi16,  short)
 395 OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8,  unsigned, u32, epi32,  int)
 396 OPENCV_HAL_IMPL_AVX_INIT(v_int32x8,   int,      s32, epi32,  int)
 397 OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4,  uint64,   u64, epi64x, int64)
 398 OPENCV_HAL_IMPL_AVX_INIT(v_int64x4,   int64,    s64, epi64x, int64)
 399
 400 #define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
 401     inline _Tpvec v256_setzero_##suffix()                                \
 402     { return _Tpvec(_mm256_setzero_##zsuffix()); }                       \
 403     inline _Tpvec v256_setall_##suffix(_Tp v)                            \
 404     { return _Tpvec(_mm256_set1_##zsuffix(v)); }                         \
 405     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32,  suffix, cast)          \
 406     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32,   suffix, cast)          \
 407     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast)          \
 408     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16,  suffix, cast)          \
 409     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8,  suffix, cast)          \
 410     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8,   suffix, cast)          \
 411     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4,  suffix, cast)          \
 412     OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4,   suffix, cast)
 413
 414 OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float,  f32, ps, _mm256_castsi256_ps)
 415 OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
 416
 417 inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
 418 { return a; }
 419 inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
 420 { return v_float32x8(_mm256_castpd_ps(a.val)); }
 421
 422 inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
 423 { return a; }
 424 inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
 425 { return v_float64x4(_mm256_castps_pd(a.val)); }
 426
 427 inline v_float16x16 v256_load_f16(const short* ptr)
 428 { return v_float16x16(_mm256_loadu_si256((const __m256i*)ptr)); }
 429 inline v_float16x16 v256_load_f16_aligned(const short* ptr)
 430 { return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); }
 431
 432 inline void v_store(short* ptr, const v_float16x16& a)
 433 { _mm256_storeu_si256((__m256i*)ptr, a.val); }
 434 inline void v_store_aligned(short* ptr, const v_float16x16& a)
 435 { _mm256_store_si256((__m256i*)ptr, a.val); }
 436
 437 /* Recombine */
 438 /*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm)                    \
 439     inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)    \
 440     { return _Tpvec(perm(a.val, b.val, 0x20)); }                     \
 441     inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)   \
 442     { return _Tpvec(perm(a.val, b.val, 0x31)); }                     \
 443     inline void v_recombine(const _Tpvec& a, const _Tpvec& b,        \
 444                              _Tpvec& c, _Tpvec& d)                   \
 445     { c = v_combine_low(a, b); d = v_combine_high(a, b); }
 446
 447 #define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix)                  \
 448     OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256)   \
 449     inline void v_zip(const _Tpvec& a0, const _Tpvec& a1,            \
 450                              _Tpvec& b0, _Tpvec& b1)                 \
 451     {                                                                \
 452         __m256i v0 = _v256_shuffle_odd_64(a0.val);                   \
 453         __m256i v1 = _v256_shuffle_odd_64(a1.val);                   \
 454         b0.val = _mm256_unpacklo_##suffix(v0, v1);                   \
 455         b1.val = _mm256_unpackhi_##suffix(v0, v1);                   \
 456     }
 457
 458 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32,  epi8)
 459 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32,   epi8)
 460 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
 461 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16,  epi16)
 462 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8,  epi32)
 463 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8,   epi32)
 464 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4,  epi64)
 465 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4,   epi64)
 466 OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
 467 OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
 468
 469 inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
 470 {
 471     __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
 472     __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
 473     v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
 474 }
 475
 476 inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
 477 {
 478     __m256d v0 = _v_shuffle_odd_64(a0.val);
 479     __m256d v1 = _v_shuffle_odd_64(a1.val);
 480     b0.val = _mm256_unpacklo_pd(v0, v1);
 481     b1.val = _mm256_unpackhi_pd(v0, v1);
 482 }*/
 483
 484 //////////////// Variant Value reordering ///////////////
 485
 486 // unpacks
 487 #define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix)                 \
 488     inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b)  \
 489     { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); }     \
 490     inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b)  \
 491     { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
 492
 493 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32,  epi8)
 494 OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32,   epi8)
 495 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
 496 OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16,  epi16)
 497 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8,  epi32)
 498 OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8,   epi32)
 499 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4,  epi64)
 500 OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4,   epi64)
 501 OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
 502 OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
 503
 504 // blend
 505 #define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix)               \
 506     template<int m>                                             \
 507     inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b)  \
 508     { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
 509
 510 OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
 511 OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16,  epi16)
 512 OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8,  epi32)
 513 OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8,   epi32)
 514 OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
 515 OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
 516
 517 template<int m>
 518 inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
 519 {
 520     enum {M0 = m};
 521     enum {M1 = (M0 | (M0 << 2)) & 0x33};
 522     enum {M2 = (M1 | (M1 << 1)) & 0x55};
 523     enum {MM =  M2 | (M2 << 1)};
 524     return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
 525 }
 526 template<int m>
 527 inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
 528 { return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
 529
 530 // shuffle
// todo: emulate 64bit
 532 #define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin)  \
 533     template<int m>                                  \
 534     inline _Tpvec v256_shuffle(const _Tpvec& a)      \
 535     { return _Tpvec(_mm256_##intrin(a.val, m)); }
 536
 537 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8,  shuffle_epi32)
 538 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8,   shuffle_epi32)
 539 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
 540 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
 541
 542 template<typename _Tpvec>
 543 inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
 544 {
 545     ab0 = v256_unpacklo(a, b);
 546     ab1 = v256_unpackhi(a, b);
 547 }
 548
 549 template<typename _Tpvec>
 550 inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
 551 { return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }
 552
 553 inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
 554 { return v256_blend<0xf0>(a, b); }
 555
 556 inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
 557 { return v256_blend<0xc>(a, b); }
 558
 559 template<typename _Tpvec>
 560 inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
 561 { return v256_permute2x128<0x21>(a, b); }
 562
 563 template<typename _Tpvec>
 564 inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
 565 { return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
 566 inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
 567 { return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }
 568 // todo: emulate float32
 569
 570 template<typename _Tpvec>
 571 inline _Tpvec v256_swap_halves(const _Tpvec& a)
 572 { return v256_permute2x128<1>(a, a); }
 573
 574 template<typename _Tpvec>
 575 inline _Tpvec v256_reverse_64(const _Tpvec& a)
 576 { return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
 577
 578 // ZIP
 579 #define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec)                              \
 580     inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b)    \
 581     { return v256_permute2x128<0x20>(a, b); }                        \
 582     inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b)   \
 583     { return v256_permute2x128<0x31>(a, b); }                        \
 584     inline void v_recombine(const _Tpvec& a, const _Tpvec& b,        \
 585                              _Tpvec& c, _Tpvec& d)                   \
 586     {                                                                \
 587         _Tpvec a1b0 = v256_alignr_128(a, b);                         \
 588         c = v256_combine_diagonal(a, a1b0);                          \
 589         d = v256_combine_diagonal(a1b0, b);                          \
 590     }                                                                \
 591     inline void v_zip(const _Tpvec& a, const _Tpvec& b,              \
 592                       _Tpvec& ab0, _Tpvec& ab1)                      \
 593     {                                                                \
 594         _Tpvec ab0ab2, ab1ab3;                                       \
 595         v256_zip(a, b, ab0ab2, ab1ab3);                              \
 596         v_recombine(ab0ab2, ab1ab3, ab0, ab1);                       \
 597     }
 598
 599 OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
 600 OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
 601 OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
 602 OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
 603 OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
 604 OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
 605 OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
 606 OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
 607 OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
 608 OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
 609
 610 ////////// Arithmetic, bitwise and comparison operations /////////
 611
 612 /* Element-wise binary and unary operations */
 613
 614 /** Arithmetics **/
 615 #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin)            \
 616     inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b)  \
 617     { return _Tpvec(intrin(a.val, b.val)); }                          \
 618     inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b)    \
 619     { a.val = intrin(a.val, b.val); return a; }
 620
 621 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32,  _mm256_adds_epu8)
 622 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32,  _mm256_subs_epu8)
 623 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32,   _mm256_adds_epi8)
 624 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32,   _mm256_subs_epi8)
 625 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
 626 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
 627 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16)
 628 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16,  _mm256_adds_epi16)
 629 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16,  _mm256_subs_epi16)
 630 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16,  _mm256_mullo_epi16)
 631 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8,  _mm256_add_epi32)
 632 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8,  _mm256_sub_epi32)
 633 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8,  _mm256_mullo_epi32)
 634 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8,   _mm256_add_epi32)
 635 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8,   _mm256_sub_epi32)
 636 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8,   _mm256_mullo_epi32)
 637 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4,  _mm256_add_epi64)
 638 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4,  _mm256_sub_epi64)
 639 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4,   _mm256_add_epi64)
 640 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4,   _mm256_sub_epi64)
 641
 642 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
 643 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
 644 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
 645 OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
 646 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
 647 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
 648 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
 649 OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
 650
 651 inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
 652                          v_int32x8& c, v_int32x8& d)
 653 {
 654     v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
 655
 656     v_int16x16 v0, v1;
 657     v_zip(a * b, vhi, v0, v1);
 658
 659     c = v_reinterpret_as_s32(v0);
 660     d = v_reinterpret_as_s32(v1);
 661 }
 662
 663 inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
 664                          v_uint32x8& c, v_uint32x8& d)
 665 {
 666     v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
 667
 668     v_uint16x16 v0, v1;
 669     v_zip(a * b, vhi, v0, v1);
 670
 671     c = v_reinterpret_as_u32(v0);
 672     d = v_reinterpret_as_u32(v1);
 673 }
 674
 675 inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
 676                          v_uint64x4& c, v_uint64x4& d)
 677 {
 678     __m256i v0 = _mm256_mul_epu32(a.val, b.val);
 679     __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
 680     v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
 681 }
 682
 683
/** Non-saturating arithmetic **/
// OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) defines
//     _Tpvec func(const _Tpvec& a, const _Tpvec& b)
// as a thin wrapper that applies `intrin` to the two underlying registers.
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
    inline _Tpvec func(const _Tpvec& a, const _Tpvec& b)   \
    { return _Tpvec(intrin(a.val, b.val)); }

// v_add_wrap / v_sub_wrap: 8- and 16-bit add/sub using the plain (wrapping)
// intrinsics, as opposed to the saturating +/- operators.
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32,  _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32,   _mm256_add_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16,  _mm256_add_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32,  _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32,   _mm256_sub_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16,  _mm256_sub_epi16)
 697
/** Bitwise shifts **/
// OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) generates, for an
// unsigned/signed vector pair of the same lane width:
//   - operator<< / operator>> taking a run-time shift count, and
//   - v_shl<imm> / v_shr<imm> taking a compile-time shift count.
// Left shifts (both types) use _mm256_slli_<suffix>; unsigned right shifts use
// the logical _mm256_srli_<suffix>; signed right shifts call `srai`, the
// arithmetic-shift function supplied by the instantiation.
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai)  \
    inline _Tpuvec operator << (const _Tpuvec& a, int imm)            \
    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
    inline _Tpsvec operator << (const _Tpsvec& a, int imm)            \
    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
    inline _Tpuvec operator >> (const _Tpuvec& a, int imm)            \
    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
    inline _Tpsvec operator >> (const _Tpsvec& a, int imm)            \
    { return _Tpsvec(srai(a.val, imm)); }                             \
    template<int imm>                                                 \
    inline _Tpuvec v_shl(const _Tpuvec& a)                            \
    { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); }             \
    template<int imm>                                                 \
    inline _Tpsvec v_shl(const _Tpsvec& a)                            \
    { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); }             \
    template<int imm>                                                 \
    inline _Tpuvec v_shr(const _Tpuvec& a)                            \
    { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); }             \
    template<int imm>                                                 \
    inline _Tpsvec v_shr(const _Tpsvec& a)                            \
    { return _Tpsvec(srai(a.val, imm)); }

// 16- and 32-bit lanes have native arithmetic right shifts.
OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8,  v_int32x8,  epi32, _mm256_srai_epi32)
 723
 724 inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
 725 {
 726     __m256i d = _mm256_set1_epi64x((int64)1 << 63);
 727     __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
 728     return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
 729 }
 730 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4,  v_int64x4,  epi64, _mm256_srai_epi64xx)
 731
 732
/** Bitwise logic **/
// OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) generates the binary
// operators &, |, ^ (via OPENCV_HAL_IMPL_AVX_BIN_OP) and the unary operator ~.
// `~a` is implemented as `a XOR not_const`, so `not_const` must be an
// expression producing an all-ones register of the matching register type.
#define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const)  \
    OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix)   \
    OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix)    \
    OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix)   \
    inline _Tpvec operator ~ (const _Tpvec& a)                   \
    { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }

// Integer types share the si256 intrinsics; float types use the ps/pd variants
// with an all-ones integer pattern cast to the floating-point register type.
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32,   si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32,    si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16,  si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16,   si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8,   si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8,    si256, _mm256_set1_epi32(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4,   si256, _mm256_set1_epi64x(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4,    si256, _mm256_set1_epi64x(-1))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8,  ps,    _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4,  pd,    _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
 751
/** Select **/
// v_select(mask, a, b): takes `a` where the mask is set and `b` elsewhere,
// implemented with _mm256_blendv_<suffix>(b, a, mask).
// The integer variants all blend per byte (epi8), keyed on the top bit of each
// mask byte, so for 16/32-bit lanes the mask is presumably expected to be
// uniform within a lane (all-ones or all-zeros, as comparison operators
// produce) -- NOTE(review): confirm against callers.
// No 64-bit integer variants are instantiated here.
#define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix)                               \
    inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }

OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32,  epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32,   epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16,  epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8,  epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8,   epi8)
OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
 765
/** Comparison **/
// OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) derives !=, <, >= and <= from the
// primitive == and > operators, which must already be declared for _Tpvec:
//   a != b  ->  ~(a == b)
//   a <  b  ->  b > a
//   a >= b  ->  ~(a < b)
//   a <= b  ->  b >= a
#define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec)                     \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b)  \
    { return ~(a == b); }                                         \
    inline _Tpvec operator <  (const _Tpvec& a, const _Tpvec& b)  \
    { return b > a; }                                             \
    inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b)  \
    { return ~(a < b); }                                          \
    inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b)  \
    { return b >= a; }
 776
// OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) defines == and >
// for an unsigned/signed pair and then derives the remaining comparisons via
// OPENCV_HAL_IMPL_AVX_CMP_OP_OV.
// Only a signed greater-than intrinsic is used, so the unsigned '>' XORs both
// operands with `sbit` (the lane's sign-bit pattern) before the signed
// compare; flipping the top bit maps unsigned order onto signed order.
#define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit)   \
    inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b)      \
    { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); }             \
    inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b)       \
    {                                                                    \
        __m256i smask = _mm256_set1_##suffix(sbit);                      \
        return _Tpuvec(_mm256_cmpgt_##suffix(                            \
                       _mm256_xor_si256(a.val, smask),                   \
                       _mm256_xor_si256(b.val, smask)));                 \
    }                                                                    \
    inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b)      \
    { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); }             \
    inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b)       \
    { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); }             \
    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec)                               \
    OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)

// sbit is the most-significant-bit pattern for each lane width.
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32,  v_int8x32,  epi8,  (char)-128)
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8,  v_int32x8,  epi32, (int)0x80000000)
 797
// 64-bit integer lanes only get == and != here (via _mm256_cmpeq_epi64 and its
// bitwise complement); no ordering comparisons are defined by this macro.
#define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec)                 \
    inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); }         \
    inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    { return ~(a == b); }

OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
 806
// OPENCV_HAL_IMPL_AVX_CMP_FLT defines one floating-point comparison operator
// as _mm256_cmp_<suffix> with the given predicate `imm8`.
#define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix)    \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }

// All six operators use the ordered, quiet (_OQ) predicates, so every
// comparison yields "false" when either operand is NaN.
// NOTE(review): this includes '!=' (_CMP_NEQ_OQ), which therefore differs from
// scalar C++ where NaN != x is true (that would be _CMP_NEQ_UQ) -- confirm
// this is intended and consistent with the other SIMD backends.
#define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix)               \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ,  _Tpvec, suffix)     \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix)     \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(<,  _CMP_LT_OQ,  _Tpvec, suffix)     \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(>,  _CMP_GT_OQ,  _Tpvec, suffix)     \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ,  _Tpvec, suffix)     \
    OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ,  _Tpvec, suffix)

OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
 821
/** min/max **/
// Element-wise v_min / v_max for 8/16/32-bit integers (signed and unsigned)
// and for float/double, each a direct wrapper over the matching intrinsic.
// No 64-bit integer variants are defined here.
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32,  _mm256_min_epu8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32,  _mm256_max_epu8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32,   _mm256_min_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32,   _mm256_max_epi8)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16,  _mm256_min_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16,  _mm256_max_epi16)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8,  _mm256_min_epu32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8,  _mm256_max_epu32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8,   _mm256_min_epi32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8,   _mm256_max_epi32)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
 839
 840 /** Rotate **/
 841 template<int imm>
 842 inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
 843 {
 844     __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
 845
 846     switch(imm)
 847     {
 848         case 0:  return a;
 849         case 32: return b;
 850         case 16: return v_uint8x32(swap);
 851     }
 852
 853     if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm));
 854     if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm));
 855
 856     return v_uint8x32();
 857 }
 858
 859 template<int imm>
 860 inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
 861 {
 862     __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
 863
 864     switch(imm)
 865     {
 866         case 0:  return a;
 867         case 32: return b;
 868         case 16: return v_uint8x32(swap);
 869     }
 870
 871     if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
 872     if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16));
 873
 874     return v_uint8x32();
 875 }
 876
 877 template<int imm>
 878 inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
 879 {
 880     v_uint8x32 res;
 881     // ESAC control[3] ? [127:0] = 0
 882     __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
 883
 884     if (imm == 0)
 885         return a;
 886     if (imm == 16)
 887         res.val = swapz;
 888     else if (imm < 16)
 889         res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm);
 890     else if (imm < 32)
 891         res.val = _mm256_slli_si256(swapz, imm - 16);
 892     else
 893         return v_uint8x32();
 894     return res;
 895 }
 896
 897 template<int imm>
 898 inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
 899 {
 900     v_uint8x32 res;
 901     // ESAC control[3] ? [127:0] = 0
 902     __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
 903
 904     if (imm == 0)
 905         return a;
 906     if (imm == 16)
 907         res.val = swapz;
 908     else if (imm < 16)
 909         res.val = _mm256_alignr_epi8(swapz, a.val, imm);
 910     else if (imm < 32)
 911         res.val = _mm256_srli_si256(swapz, imm - 16);
 912     else
 913         return v_uint8x32();
 914     return res;
 915 }
 916
// OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) forwards the lane-wise
// rotate `intrin<imm>` for _Tpvec to the byte-wise v_uint8x32 implementation:
// the lane count `imm` is scaled by sizeof(lane_type) to a byte count, the
// operands are reinterpreted as u8, and `cast` converts the resulting __m256i
// back to the register type of _Tpvec. Both the two-operand and the
// one-operand overloads are generated.
#define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast)   \
    template<int imm>                                           \
    inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b)      \
    {                                                           \
        const int w = sizeof(typename _Tpvec::lane_type);       \
        v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a),  \
                                       v_reinterpret_as_u8(b)); \
        return _Tpvec(cast(ret.val));                           \
    }                                                           \
    template<int imm>                                           \
    inline _Tpvec intrin(const _Tpvec& a)                       \
    {                                                           \
        const int w = sizeof(typename _Tpvec::lane_type);       \
        v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a)); \
        return _Tpvec(cast(ret.val));                           \
    }
 933
// Integer vector types need no register cast (OPENCV_HAL_NOP is defined
// elsewhere; presumably an identity macro), so one helper instantiates both
// rotate directions for them.
#define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec)                                  \
    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  _Tpvec, OPENCV_HAL_NOP) \
    OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)

OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)

// Floating-point vectors cast the integer result back to __m256 / __m256d.
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float32x8, _mm256_castsi256_ps)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left,  v_float64x4, _mm256_castsi256_pd)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
 950
 951 ////////// Reduce and mask /////////
 952
/** Reduce **/
// OPENCV_HAL_IMPL_AVX_REDUCE_16 defines v_reduce_<func> for 16-lane, 16-bit
// vectors. The two 128-bit halves are combined with `intrin`, then the
// register is repeatedly combined with itself shifted right by 8, 4 and 2
// bytes, so element 0 ends up holding the reduction over all 16 lanes.
// The result is read from the low 32 bits and cast to `sctype`.
#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a)                  \
    {                                                               \
        __m128i v0 = _v256_extract_low(a.val);                      \
        __m128i v1 = _v256_extract_high(a.val);                     \
        v0 = intrin(v0, v1);                                        \
        v0 = intrin(v0, _mm_srli_si128(v0, 8));                     \
        v0 = intrin(v0, _mm_srli_si128(v0, 4));                     \
        v0 = intrin(v0, _mm_srli_si128(v0, 2));                     \
        return (sctype) _mm_cvtsi128_si32(v0);                      \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  min, _mm_min_epi16)
OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16,  short,  max, _mm_max_epi16)
 970
// OPENCV_HAL_IMPL_AVX_REDUCE_8 defines v_reduce_<func> for 8-lane, 32-bit
// vectors: combine the two 128-bit halves, then combine with the register
// shifted right by 8 and by 4 bytes so element 0 holds the reduction over all
// eight lanes. The result is read from the low 32 bits and cast to `sctype`.
#define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
    inline sctype v_reduce_##func(const _Tpvec& a)                 \
    {                                                              \
        __m128i v0 = _v256_extract_low(a.val);                     \
        __m128i v1 = _v256_extract_high(a.val);                    \
        v0 = intrin(v0, v1);                                       \
        v0 = intrin(v0, _mm_srli_si128(v0, 8));                    \
        v0 = intrin(v0, _mm_srli_si128(v0, 4));                    \
        return (sctype) _mm_cvtsi128_si32(v0);                     \
    }

OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      min, _mm_min_epi32)
OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8,  int,      max, _mm_max_epi32)
 986
 987 #define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin)                  \
 988     inline float v_reduce_##func(const v_float32x8& a)                \
 989     {                                                                 \
 990         __m128 v0 = _v256_extract_low(a.val);                         \
 991         __m128 v1 = _v256_extract_high(a.val);                        \
 992         v0 = intrin(v0, v1);                                          \
 993         v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
 994         v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 3))); \
 995         return _mm_cvtss_f32(v0);                                     \
 996     }
 997
 998 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
 999 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
1000
1001 inline ushort v_reduce_sum(const v_uint16x16& a)
1002 {
1003     __m128i a0 = _v256_extract_low(a.val);
1004     __m128i a1 = _v256_extract_high(a.val);
1005
1006     __m128i s0 = _mm_adds_epu16(a0, a1);
1007             s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
1008             s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
1009             s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));
1010
1011     return (ushort)_mm_cvtsi128_si32(s0);
1012 }
1013
1014 inline short v_reduce_sum(const v_int16x16& a)
1015 {
1016     __m256i s0 = _mm256_hadds_epi16(a.val, a.val);
1017             s0 = _mm256_hadds_epi16(s0, s0);
1018             s0 = _mm256_hadds_epi16(s0, s0);
1019
1020     __m128i s1 = _v256_extract_high(s0);
1021             s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);
1022
1023     return (short)_mm_cvtsi128_si32(s1);
1024 }
1025
1026 inline int v_reduce_sum(const v_int32x8& a)
1027 {
1028     __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
1029             s0 = _mm256_hadd_epi32(s0, s0);
1030
1031     __m128i s1 = _v256_extract_high(s0);
1032             s1 = _mm_add_epi32(_v256_extract_low(s0), s1);
1033
1034     return _mm_cvtsi128_si32(s1);
1035 }
1036
1037 inline unsigned v_reduce_sum(const v_uint32x8& a)
1038 { return v_reduce_sum(v_reinterpret_as_s32(a)); }
1039
1040 inline float v_reduce_sum(const v_float32x8& a)
1041 {
1042     __m256 s0 = _mm256_hadd_ps(a.val, a.val);
1043            s0 = _mm256_hadd_ps(s0, s0);
1044
1045     __m128 s1 = _v256_extract_high(s0);
1046            s1 = _mm_add_ps(_v256_extract_low(s0), s1);
1047
1048     return _mm_cvtss_f32(s1);
1049 }
1050
1051 inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
1052                                  const v_float32x8& c, const v_float32x8& d)
1053 {
1054     __m256 ab = _mm256_hadd_ps(a.val, b.val);
1055     __m256 cd = _mm256_hadd_ps(c.val, d.val);
1056     return v_float32x8(_mm256_hadd_ps(ab, cd));
1057 }
1058
/** Popcount **/
// OPENCV_HAL_IMPL_AVX_POPCOUNT defines v_popcount for an integer vector type.
// The first three steps are the classic bit-parallel count: set bits are
// summed in 2-bit, 4-bit and then 8-bit fields, leaving a per-byte count.
// _mm256_sad_epu8 against zero then adds the eight byte counts of every
// 64-bit group, so each 64-bit group of the result holds one total: as a
// v_uint32x8 the even-indexed lanes carry the counts and the odd-indexed
// lanes are zero.
// NOTE(review): the counts are therefore per 64 bits, not per 32-bit lane --
// confirm callers only rely on the overall sum.
#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec)                     \
    inline v_uint32x8 v_popcount(const _Tpvec& a)                \
    {                                                            \
        const v_uint32x8 m1 = v256_setall_u32(0x55555555);       \
        const v_uint32x8 m2 = v256_setall_u32(0x33333333);       \
        const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f);       \
        v_uint32x8 p  = v_reinterpret_as_u32(a);                 \
        p = ((p >> 1) & m1) + (p & m1);                          \
        p = ((p >> 2) & m2) + (p & m2);                          \
        p = ((p >> 4) & m4) + (p & m4);                          \
        p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256());  \
        return p;                                                \
    }

OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
1080
1081 /** Mask **/
1082 inline int v_signmask(const v_int8x32& a)
1083 { return _mm256_movemask_epi8(a.val); }
1084 inline int v_signmask(const v_uint8x32& a)
1085 { return v_signmask(v_reinterpret_as_s8(a)); }
1086
1087 inline int v_signmask(const v_int16x16& a)
1088 {
1089     v_int8x32 v = v_int8x32(_mm256_packs_epi16(a.val, a.val));
1090     return v_signmask(v) & 255;
1091 }
1092 inline int v_signmask(const v_uint16x16& a)
1093 { return v_signmask(v_reinterpret_as_s16(a)); }
1094
1095 inline int v_signmask(const v_int32x8& a)
1096 {
1097     __m256i a16 = _mm256_packs_epi32(a.val, a.val);
1098     v_int8x32 v = v_int8x32(_mm256_packs_epi16(a16, a16));
1099     return v_signmask(v) & 15;
1100 }
1101 inline int v_signmask(const v_uint32x8& a)
1102 { return v_signmask(v_reinterpret_as_s32(a)); }
1103
1104 inline int v_signmask(const v_float32x8& a)
1105 { return _mm256_movemask_ps(a.val); }
1106 inline int v_signmask(const v_float64x4& a)
1107 { return _mm256_movemask_pd(a.val); }
1108
1109 /** Checks **/
1110 #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask)  \
1111     inline bool v_check_all(const _Tpvec& a)                \
1112     {                                                       \
1113         int mask = v_signmask(v_reinterpret_as_s8(a));      \
1114         return and_op(mask, allmask) == allmask;            \
1115     }                                                       \
1116     inline bool v_check_any(const _Tpvec& a)                \
1117     {                                                       \
1118         int mask = v_signmask(v_reinterpret_as_s8(a));      \
1119         return and_op(mask, allmask) != 0;                  \
1120     }
1121
1122 OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32,  OPENCV_HAL_1ST, -1)
1123 OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32,   OPENCV_HAL_1ST, -1)
1124 OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaa)
1125 OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16,  OPENCV_HAL_AND, (int)0xaaaa)
1126 OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8,  OPENCV_HAL_AND, (int)0x8888)
1127 OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8,   OPENCV_HAL_AND, (int)0x8888)
1128
// OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) defines v_check_all /
// v_check_any for floating-point vectors from their per-lane sign mask:
// "all" compares the mask with `allmask` (one bit per lane), "any" tests for
// a non-zero mask.
#define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \
    inline bool v_check_all(const _Tpvec& a)           \
    {                                                  \
        int mask = v_signmask(a);                      \
        return mask == allmask;                        \
    }                                                  \
    inline bool v_check_any(const _Tpvec& a)           \
    {                                                  \
        int mask = v_signmask(a);                      \
        return mask != 0;                              \
    }

// 8 float lanes -> 8 mask bits (255); 4 double lanes -> 4 mask bits (15).
OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255)
OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15)
1143
1144
1145 ////////// Other math /////////
1146
/** Some frequent operations **/
// OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) defines for a floating-point
// vector type:
//   v_fma(a, b, c), v_muladd(a, b, c) : a * b + c, both via _mm256_fmadd
//   v_sqrt(x)                         : element-wise square root
//   v_sqr_magnitude(a, b)             : a*a + b*b (b*b is computed separately,
//                                       then fused with a*a)
//   v_magnitude(a, b)                 : sqrt(a*a + b*b)
#define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix)                            \
    inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c)    \
    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }            \
    inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); }            \
    inline _Tpvec v_sqrt(const _Tpvec& x)                                     \
    { return _Tpvec(_mm256_sqrt_##suffix(x.val)); }                           \
    inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b)           \
    { return v_fma(a, a, b * b); }                                            \
    inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b)               \
    { return v_sqrt(v_fma(a, a, b*b)); }

OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
1162
1163 inline v_float32x8 v_invsqrt(const v_float32x8& x)
1164 {
1165     v_float32x8 half = x * v256_setall_f32(0.5);
1166     v_float32x8 t  = v_float32x8(_mm256_rsqrt_ps(x.val));
1167     // todo: _mm256_fnmsub_ps
1168     t *= v256_setall_f32(1.5) - ((t * t) * half);
1169     return t;
1170 }
1171
1172 inline v_float64x4 v_invsqrt(const v_float64x4& x)
1173 {
1174     return v256_setall_f64(1.) / v_sqrt(x);
1175 }
1176
/** Absolute values **/
// OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) defines v_abs for the signed vector
// type v_<_Tpvec>; the result is returned as the unsigned counterpart
// v_u<_Tpvec>, wrapping _mm256_abs_<suffix>.
#define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix)         \
    inline v_u##_Tpvec v_abs(const v_##_Tpvec& x)       \
    { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }

OPENCV_HAL_IMPL_AVX_ABS(int8x32,  epi8)
OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
OPENCV_HAL_IMPL_AVX_ABS(int32x8,  epi32)
1185
1186 inline v_float32x8 v_abs(const v_float32x8& x)
1187 { return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
1188 inline v_float64x4 v_abs(const v_float64x4& x)
1189 { return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
1190
1191 /** Absolute difference **/
1192 inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
1193 { return v_add_wrap(a - b,  b - a); }
1194 inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
1195 { return v_add_wrap(a - b,  b - a); }
1196 inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
1197 { return v_max(a, b) - v_min(a, b); }
1198
1199 inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
1200 {
1201     v_int8x32 d = v_sub_wrap(a, b);
1202     v_int8x32 m = a < b;
1203     return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1204 }
1205
1206 inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
1207 { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1208
1209 inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
1210 {
1211     v_int32x8 d = a - b;
1212     v_int32x8 m = a < b;
1213     return v_reinterpret_as_u32((d ^ m) - m);
1214 }
1215
1216 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
1217 { return v_abs(a - b); }
1218
1219 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
1220 { return v_abs(a - b); }
1221
1222 ////////// Conversions /////////
1223
1224 /** Rounding **/
1225 inline v_int32x8 v_round(const v_float32x8& a)
1226 { return v_int32x8(_mm256_cvtps_epi32(a.val)); }
1227
1228 inline v_int32x8 v_round(const v_float64x4& a)
1229 { return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
1230
1231 inline v_int32x8 v_trunc(const v_float32x8& a)
1232 { return v_int32x8(_mm256_cvttps_epi32(a.val)); }
1233
1234 inline v_int32x8 v_trunc(const v_float64x4& a)
1235 { return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }
1236
1237 inline v_int32x8 v_floor(const v_float32x8& a)
1238 { return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }
1239
1240 inline v_int32x8 v_floor(const v_float64x4& a)
1241 { return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }
1242
1243 inline v_int32x8 v_ceil(const v_float32x8& a)
1244 { return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }
1245
1246 inline v_int32x8 v_ceil(const v_float64x4& a)
1247 { return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
1248
1249 /** To float **/
1250 inline v_float32x8 v_cvt_f32(const v_int32x8& a)
1251 { return v_float32x8(_mm256_cvtepi32_ps(a.val)); }
1252
1253 inline v_float32x8 v_cvt_f32(const v_float64x4& a)
1254 { return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }
1255
1256 inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
1257 {
1258     __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
1259     return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1));
1260 }
1261
1262 inline v_float64x4 v_cvt_f64(const v_int32x8& a)
1263 { return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }
1264
1265 inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
1266 { return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }
1267
1268 inline v_float64x4 v_cvt_f64(const v_float32x8& a)
1269 { return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }
1270
1271 inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
1272 { return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
1273
#if CV_FP16
// half -> float, using the low eight half-precision lanes of `a`.
inline v_float32x8 v_cvt_f32(const v_float16x16& a)
{
    const __m128i low_halves = _v256_extract_low(a.val);
    return v_float32x8(_mm256_cvtph_ps(low_halves));
}

// half -> float, using the high eight half-precision lanes of `a`.
inline v_float32x8 v_cvt_f32_high(const v_float16x16& a)
{
    const __m128i high_halves = _v256_extract_high(a.val);
    return v_float32x8(_mm256_cvtph_ps(high_halves));
}

// Two float vectors -> one half-precision vector (`a` low, `b` high), using
// rounding-control immediate 0.
inline v_float16x16 v_cvt_f16(const v_float32x8& a, const v_float32x8& b)
{
    const __m128i low_halves  = _mm256_cvtps_ph(a.val, 0);
    const __m128i high_halves = _mm256_cvtps_ph(b.val, 0);
    return v_float16x16(_v256_combine(low_halves, high_halves));
}
#endif
1287
1288 ////////////// Lookup table access ////////////////////
1289
1290 inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
1291 {
1292     int CV_DECL_ALIGNED(32) idx[8];
1293     v_store_aligned(idx, idxvec);
1294     return v_int32x8(_mm256_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1295                                        tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1296 }
1297
1298 inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
1299 {
1300     int CV_DECL_ALIGNED(32) idx[8];
1301     v_store_aligned(idx, idxvec);
1302     return v_float32x8(_mm256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1303                                       tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1304 }
1305
1306 inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
1307 {
1308     int CV_DECL_ALIGNED(32) idx[8];
1309     v_store_aligned(idx, idxvec);
1310     return v_float64x4(_mm256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
1311 }
1312
1313 inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
1314 {
1315     int CV_DECL_ALIGNED(32) idx[8];
1316     v_store_aligned(idx, idxvec);
1317     __m128 z = _mm_setzero_ps();
1318     __m128 xy01, xy45, xy23, xy67;
1319     xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
1320     xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
1321     xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
1322     xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
1323     __m256 xy0145 = _v256_combine(xy01, xy45);
1324     xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
1325     xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
1326     xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
1327     xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
1328     __m256 xy2367 = _v256_combine(xy23, xy67);
1329
1330     __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
1331     __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
1332
1333     x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
1334     y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
1335 }
1336
1337 inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
1338 {
1339     int CV_DECL_ALIGNED(32) idx[4];
1340     v_store_low(idx, idxvec);
1341     __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
1342     __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
1343     __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
1344     __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
1345     __m256d xy02 = _v256_combine(xy0, xy2);
1346     __m256d xy13 = _v256_combine(xy1, xy3);
1347
1348     x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
1349     y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
1350 }
1351
1352 ////////// Matrix operations /////////
1353
1354 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
1355 { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
1356
1357 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1358 { return v_dotprod(a, b) + c; }
1359
// Builds a v_float32x8 in which, inside each 128-bit half of `a`, the float at position `im`
// (0..3) is replicated to all four positions of that half.
// `a` must expose a `.val` member of type __m256; `im` must be a compile-time constant.
#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
    v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1362
1363 inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
1364                             const v_float32x8& m1, const v_float32x8& m2,
1365                             const v_float32x8& m3)
1366 {
1367     v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1368     v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1369     v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1370     v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
1371     return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1372 }
1373
1374 inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
1375                                const v_float32x8& m1, const v_float32x8& m2,
1376                                const v_float32x8& a)
1377 {
1378     v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1379     v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1380     v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1381     return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1382 }
1383
1384 #define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to)    \
1385     inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1,              \
1386                                const _Tpvec& a2, const _Tpvec& a3,              \
1387                                _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3)  \
1388     {                                                                           \
1389         __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val));       \
1390         __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val));       \
1391         __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val));       \
1392         __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val));       \
1393         b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1));                        \
1394         b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1));                        \
1395         b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3));                        \
1396         b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3));                        \
1397     }
1398
1399 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8,  epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1400 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8,   epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1401 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
1402
1403 //////////////// Value reordering ///////////////
1404
1405 /* Expand */
1406 #define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)    \
1407     inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1408     {                                                               \
1409         b0.val = intrin(_v256_extract_low(a.val));                  \
1410         b1.val = intrin(_v256_extract_high(a.val));                 \
1411     }                                                               \
1412     inline _Tpwvec v256_load_expand(const _Tp* ptr)                 \
1413     {                                                               \
1414         __m128i a = _mm_loadu_si128((const __m128i*)ptr);           \
1415         return _Tpwvec(intrin(a));                                  \
1416     }
1417
1418 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32,  v_uint16x16, uchar,    _mm256_cvtepu8_epi16)
1419 OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32,   v_int16x16,  schar,    _mm256_cvtepi8_epi16)
1420 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8,  ushort,   _mm256_cvtepu16_epi32)
1421 OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16,  v_int32x8,   short,    _mm256_cvtepi16_epi32)
1422 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8,  v_uint64x4,  unsigned, _mm256_cvtepu32_epi64)
1423 OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8,   v_int64x4,   int,      _mm256_cvtepi32_epi64)
1424
1425 #define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin)   \
1426     inline _Tpvec v256_load_expand_q(const _Tp* ptr)        \
1427     {                                                       \
1428         __m128i a = _mm_loadl_epi64((const __m128i*)ptr);   \
1429         return _Tpvec(intrin(a));                           \
1430     }
1431
1432 OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
1433 OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8,  schar, _mm256_cvtepi8_epi32)
1434
1435 /* pack */
1436 // 16
1437 inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
1438 { return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
1439
1440 inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
1441 { return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); }
1442
1443 inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
1444 { return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); }
1445
1446 inline void v_pack_store(schar* ptr, const v_int16x16& a)
1447 { v_store_low(ptr, v_pack(a, a)); }
1448
1449 inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
1450 { v_store_low(ptr, v_pack(a, a)); }
1451
1452 inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
1453 { v_store_low(ptr, v_pack_u(a, a)); }
1454
1455 template<int n> inline
1456 v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
1457 {
1458     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
1459     v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
1460     return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
1461                     v_reinterpret_as_s16((b + delta) >> n));
1462 }
1463
1464 template<int n> inline
1465 void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
1466 {
1467     v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
1468     v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
1469 }
1470
1471 template<int n> inline
1472 v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
1473 {
1474     v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1475     return v_pack_u((a + delta) >> n, (b + delta) >> n);
1476 }
1477
1478 template<int n> inline
1479 void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
1480 {
1481     v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1482     v_pack_u_store(ptr, (a + delta) >> n);
1483 }
1484
1485 template<int n> inline
1486 v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
1487 {
1488     v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1489     return v_pack((a + delta) >> n, (b + delta) >> n);
1490 }
1491
1492 template<int n> inline
1493 void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
1494 {
1495     v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1496     v_pack_store(ptr, (a + delta) >> n);
1497 }
1498
1499 // 32
1500 inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
1501 { return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }
1502
1503 inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
1504 { return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
1505
1506 inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
1507 { return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); }
1508
1509 inline void v_pack_store(short* ptr, const v_int32x8& a)
1510 { v_store_low(ptr, v_pack(a, a)); }
1511
1512 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
1513 { v_store_low(ptr, v_pack(a, a)); }
1514
1515 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
1516 { v_store_low(ptr, v_pack_u(a, a)); }
1517
1518
1519 template<int n> inline
1520 v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
1521 {
1522     // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
1523     v_uint32x8 delta = v256_setall_u32(1 << (n-1));
1524     return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
1525                     v_reinterpret_as_s32((b + delta) >> n));
1526 }
1527
1528 template<int n> inline
1529 void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
1530 {
1531     v_uint32x8 delta = v256_setall_u32(1 << (n-1));
1532     v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
1533 }
1534
1535 template<int n> inline
1536 v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
1537 {
1538     v_int32x8 delta = v256_setall_s32(1 << (n-1));
1539     return v_pack_u((a + delta) >> n, (b + delta) >> n);
1540 }
1541
1542 template<int n> inline
1543 void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
1544 {
1545     v_int32x8 delta = v256_setall_s32(1 << (n-1));
1546     v_pack_u_store(ptr, (a + delta) >> n);
1547 }
1548
1549 template<int n> inline
1550 v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
1551 {
1552     v_int32x8 delta = v256_setall_s32(1 << (n-1));
1553     return v_pack((a + delta) >> n, (b + delta) >> n);
1554 }
1555
1556 template<int n> inline
1557 void v_rshr_pack_store(short* ptr, const v_int32x8& a)
1558 {
1559     v_int32x8 delta = v256_setall_s32(1 << (n-1));
1560     v_pack_store(ptr, (a + delta) >> n);
1561 }
1562
1563 // 64
1564 // Non-saturating pack
1565 inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
1566 {
1567     __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
1568     __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
1569     __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3
1570     return v_uint32x8(_v256_shuffle_odd_64(ab));
1571 }
1572
1573 inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
1574 { return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
1575
1576 inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
1577 {
1578     __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
1579     v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
1580 }
1581
1582 inline void v_pack_store(int* ptr, const v_int64x4& b)
1583 { v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
1584
1585 template<int n> inline
1586 v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
1587 {
1588     v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
1589     return v_pack((a + delta) >> n, (b + delta) >> n);
1590 }
1591
1592 template<int n> inline
1593 void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
1594 {
1595     v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
1596     v_pack_store(ptr, (a + delta) >> n);
1597 }
1598
1599 template<int n> inline
1600 v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
1601 {
1602     v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
1603     return v_pack((a + delta) >> n, (b + delta) >> n);
1604 }
1605
1606 template<int n> inline
1607 void v_rshr_pack_store(int* ptr, const v_int64x4& a)
1608 {
1609     v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
1610     v_pack_store(ptr, (a + delta) >> n);
1611 }
1612
1613 /* Recombine */
// These are defined above, together with the load and store operations.
1615
1616 /* Extract */
// Generates v_extract<s>(a, b) for the vector type _Tpvec.
// The generated function is a thin forwarder: it returns v_rotate_right<s>(a, b), where the
// shift amount `s` is a compile-time template argument.
#define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec)                    \
    template<int s>                                            \
    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)  \
    { return v_rotate_right<s>(a, b); }

// Instantiate v_extract for every 256-bit vector type.
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
1632
1633
1634 ///////////////////// load deinterleave /////////////////////////////
1635
1636 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
1637 {
1638     __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
1639     __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
1640
1641     const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
1642                                                0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
1643     __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
1644     __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
1645     __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
1646     __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
1647     __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
1648     __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
1649     a = v_uint8x32(a0);
1650     b = v_uint8x32(b0);
1651 }
1652
1653 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
1654 {
1655     __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
1656     __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
1657
1658     const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1659                                                0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1660     __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
1661     __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
1662     __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
1663     __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
1664     __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
1665     __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
1666     a = v_uint16x16(a0);
1667     b = v_uint16x16(b0);
1668 }
1669
1670 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
1671 {
1672     __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
1673     __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1674
1675     const int sh = 0+2*4+1*16+3*64;
1676     __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
1677     __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
1678     __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
1679     __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
1680     __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
1681     __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
1682     a = v_uint32x8(a0);
1683     b = v_uint32x8(b0);
1684 }
1685
1686 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
1687 {
1688     __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
1689     __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
1690
1691     __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
1692     __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
1693     __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
1694     __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
1695     a = v_uint64x4(a0);
1696     b = v_uint64x4(b0);
1697 }
1698
// Reads 96 bytes of 3-channel interleaved data from ptr (three unaligned 256-bit loads) and
// splits them into the channel vectors b, g and r.
inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r )
{
    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));

    // s02_low  = { bgr0.lane0, bgr2.lane0 }, s02_high = { bgr0.lane1, bgr2.lane1 }
    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

    // Byte-select masks for _mm256_blendv_epi8: a -1 byte picks the second operand.
    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                               0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                               -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);

    // Two blends per channel collect that channel's bytes from the three source registers.
    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);

    // Per-lane byte shuffles applied to each channel after blending
    // (presumably restoring ascending element order - the tables step through the lane in strides of 3).
    const __m256i
    sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
                            0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
    sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
                            1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
    sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
                            2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
    b0 = _mm256_shuffle_epi8(b0, sh_b);
    g0 = _mm256_shuffle_epi8(g0, sh_g);
    r0 = _mm256_shuffle_epi8(r0, sh_r);

    b = v_uint8x32(b0);
    g = v_uint8x32(g0);
    r = v_uint8x32(r0);
}
1732
// Reads 48 16-bit values of 3-channel interleaved data from ptr (three unaligned 256-bit loads)
// and splits them into the channel vectors b, g and r.
inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r )
{
    __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
    __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
    __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));

    // s02_low  = { bgr0.lane0, bgr2.lane0 }, s02_high = { bgr0.lane1, bgr2.lane1 }
    __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
    __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);

    // Byte-select masks for _mm256_blendv_epi8 (a -1 byte picks the second operand);
    // the -1 bytes come in pairs so that whole 16-bit elements are selected.
    const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
                                               0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
    const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
                                               -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
    // Two blends per channel collect that channel's elements from the three source registers.
    __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
    __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
    __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
    // Per-lane byte shuffles applied to each channel after blending
    // (presumably restoring ascending element order).
    const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
                                                 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
                                                 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
    const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
                                                 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    b0 = _mm256_shuffle_epi8(b0, sh_b);
    g0 = _mm256_shuffle_epi8(g0, sh_g);
    r0 = _mm256_shuffle_epi8(r0, sh_r);

    b = v_uint16x16(b0);
    g = v_uint16x16(g0);
    r = v_uint16x16(r0);
}
1763
1764 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r )
1765 {
1766     __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1767     __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1768     __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
1769
1770     __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
1771     __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
1772
1773     __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
1774     __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
1775     __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
1776
1777     b0 = _mm256_shuffle_epi32(b0, 0x6c);
1778     g0 = _mm256_shuffle_epi32(g0, 0xb1);
1779     r0 = _mm256_shuffle_epi32(r0, 0xc6);
1780
1781     b = v_uint32x8(b0);
1782     g = v_uint32x8(g0);
1783     r = v_uint32x8(r0);
1784 }
1785
1786 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r )
1787 {
1788     __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1789     __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
1790     __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1791
1792     __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
1793     __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
1794     __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
1795     __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
1796     __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
1797     __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
1798
1799     b = v_uint64x4(b0);
1800     g = v_uint64x4(g0);
1801     r = v_uint64x4(r0);
1802 }
1803
1804 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a )
1805 {
1806     __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1807     __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
1808     __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
1809     __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
1810     const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1811                                                0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
1812
1813     __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
1814     __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
1815     __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
1816     __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
1817
1818     __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
1819     __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
1820     __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
1821     __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
1822
1823     __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
1824     __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
1825     __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
1826     __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
1827
1828     __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
1829     __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
1830     __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
1831     __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
1832
1833     b = v_uint8x32(b0);
1834     g = v_uint8x32(g0);
1835     r = v_uint8x32(r0);
1836     a = v_uint8x32(a0);
1837 }
1838
1839 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r, v_uint16x16& a )
1840 {
1841     __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1842     __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
1843     __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
1844     __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
1845     const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
1846                                                0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
1847     __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
1848     __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
1849     __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
1850     __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
1851
1852     __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
1853     __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
1854     __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
1855     __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
1856
1857     __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
1858     __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
1859     __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
1860     __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
1861
1862     __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
1863     __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
1864     __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
1865     __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
1866
1867     b = v_uint16x16(b0);
1868     g = v_uint16x16(g0);
1869     r = v_uint16x16(r0);
1870     a = v_uint16x16(a0);
1871 }
1872
1873 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r, v_uint32x8& a )
1874 {
1875     __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
1876     __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1877     __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
1878     __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
1879
1880     __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
1881     __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
1882     __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
1883     __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
1884
1885     __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
1886     __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
1887     __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
1888     __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
1889
1890     __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
1891     __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
1892     __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
1893     __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
1894
1895     b = v_uint32x8(b0);
1896     g = v_uint32x8(g0);
1897     r = v_uint32x8(r0);
1898     a = v_uint32x8(a0);
1899 }
1900
1901 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r, v_uint64x4& a )
1902 {
1903     __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
1904     __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
1905     __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1906     __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
1907
1908     __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
1909     __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
1910     __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
1911     __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
1912
1913     __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
1914     __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
1915     __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
1916     __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
1917
1918     b = v_uint64x4(b0);
1919     g = v_uint64x4(g0);
1920     r = v_uint64x4(r0);
1921     a = v_uint64x4(a0);
1922 }
1923
1924 ///////////////////////////// store interleave /////////////////////////////////////
1925
1926 inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
1927                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
1928 {
1929     __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
1930     __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
1931
1932     __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
1933     __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
1934
1935     if( mode == hal::STORE_ALIGNED_NOCACHE )
1936     {
1937         _mm256_stream_si256((__m256i*)ptr, xy0);
1938         _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
1939     }
1940     else if( mode == hal::STORE_ALIGNED )
1941     {
1942         _mm256_store_si256((__m256i*)ptr, xy0);
1943         _mm256_store_si256((__m256i*)(ptr + 32), xy1);
1944     }
1945     else
1946     {
1947         _mm256_storeu_si256((__m256i*)ptr, xy0);
1948         _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
1949     }
1950 }
1951
1952 inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
1953                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
1954 {
1955     __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
1956     __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
1957
1958     __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
1959     __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
1960
1961     if( mode == hal::STORE_ALIGNED_NOCACHE )
1962     {
1963         _mm256_stream_si256((__m256i*)ptr, xy0);
1964         _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
1965     }
1966     else if( mode == hal::STORE_ALIGNED )
1967     {
1968         _mm256_store_si256((__m256i*)ptr, xy0);
1969         _mm256_store_si256((__m256i*)(ptr + 16), xy1);
1970     }
1971     else
1972     {
1973         _mm256_storeu_si256((__m256i*)ptr, xy0);
1974         _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
1975     }
1976 }
1977
1978 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
1979                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
1980 {
1981     __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
1982     __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
1983
1984     __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
1985     __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
1986
1987     if( mode == hal::STORE_ALIGNED_NOCACHE )
1988     {
1989         _mm256_stream_si256((__m256i*)ptr, xy0);
1990         _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
1991     }
1992     else if( mode == hal::STORE_ALIGNED )
1993     {
1994         _mm256_store_si256((__m256i*)ptr, xy0);
1995         _mm256_store_si256((__m256i*)(ptr + 8), xy1);
1996     }
1997     else
1998     {
1999         _mm256_storeu_si256((__m256i*)ptr, xy0);
2000         _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
2001     }
2002 }
2003
2004 inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
2005                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2006 {
2007     __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
2008     __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
2009
2010     __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2011     __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2012
2013     if( mode == hal::STORE_ALIGNED_NOCACHE )
2014     {
2015         _mm256_stream_si256((__m256i*)ptr, xy0);
2016         _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
2017     }
2018     else if( mode == hal::STORE_ALIGNED )
2019     {
2020         _mm256_store_si256((__m256i*)ptr, xy0);
2021         _mm256_store_si256((__m256i*)(ptr + 4), xy1);
2022     }
2023     else
2024     {
2025         _mm256_storeu_si256((__m256i*)ptr, xy0);
2026         _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
2027     }
2028 }
2029
// Interleaves the three 8-bit channel vectors b, g and r and writes the resulting 96 bytes
// to ptr as three 256-bit stores.
// `mode` selects the store instruction: non-temporal aligned, aligned, or (default) unaligned.
inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r,
                                hal::StoreMode mode=hal::STORE_UNALIGNED )
{
    // Per-lane byte shuffles that pre-arrange each channel before blending; the same
    // 16-entry table is repeated for both 128-bit lanes.
    const __m256i sh_b = _mm256_setr_epi8(
            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
            0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
    const __m256i sh_g = _mm256_setr_epi8(
            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
            5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
    const __m256i sh_r = _mm256_setr_epi8(
            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
            10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);

    __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
    __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
    __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);

    // Byte-select masks for _mm256_blendv_epi8: a -1 byte picks the second operand.
    // m0 selects byte positions 1, 4, 7, ... and m1 positions 2, 5, 8, ... of each lane.
    const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
                                               0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
                                               0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);

    // Each pN mixes the three shuffled channels byte by byte, with the channel rotation
    // differing between p0, p1 and p2.
    __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
    __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
    __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);

    // Regroup 128-bit lanes into output order:
    //   bgr0 = { p0.lane0, p1.lane0 }, bgr1 = { p2.lane0, p0.lane1 }, bgr2 = { p1.lane1, p2.lane1 }
    __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
    __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
    __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);

    if( mode == hal::STORE_ALIGNED_NOCACHE )
    {
        // Non-temporal (streaming) stores.
        _mm256_stream_si256((__m256i*)ptr, bgr0);
        _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
        _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
    }
    else if( mode == hal::STORE_ALIGNED )
    {
        // Regular aligned stores.
        _mm256_store_si256((__m256i*)ptr, bgr0);
        _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
        _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
    }
    else
    {
        // Any other mode: unaligned stores.
        _mm256_storeu_si256((__m256i*)ptr, bgr0);
        _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
        _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
    }
}
2079
2080 inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r,
2081                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2082 {
2083     const __m256i sh_b = _mm256_setr_epi8(
2084          0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2085          0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2086     const __m256i sh_g = _mm256_setr_epi8(
2087          10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2088          10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2089     const __m256i sh_r = _mm256_setr_epi8(
2090          4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2091          4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2092
2093     __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
2094     __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
2095     __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
2096
2097     const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2098                                                0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2099     const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2100                                                -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2101
2102     __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2103     __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2104     __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2105
2106     __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
2107     //__m256i bgr1 = p1;
2108     __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
2109
2110     if( mode == hal::STORE_ALIGNED_NOCACHE )
2111     {
2112         _mm256_stream_si256((__m256i*)ptr, bgr0);
2113         _mm256_stream_si256((__m256i*)(ptr + 16), p1);
2114         _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
2115     }
2116     else if( mode == hal::STORE_ALIGNED )
2117     {
2118         _mm256_store_si256((__m256i*)ptr, bgr0);
2119         _mm256_store_si256((__m256i*)(ptr + 16), p1);
2120         _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
2121     }
2122     else
2123     {
2124         _mm256_storeu_si256((__m256i*)ptr, bgr0);
2125         _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
2126         _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
2127     }
2128 }
2129
2130 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r,
2131                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2132 {
2133     __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
2134     __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);
2135     __m256i r0 = _mm256_shuffle_epi32(r.val, 0xc6);
2136
2137     __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
2138     __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
2139     __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
2140
2141     __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2142     //__m256i bgr1 = p2;
2143     __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2144
2145     if( mode == hal::STORE_ALIGNED_NOCACHE )
2146     {
2147         _mm256_stream_si256((__m256i*)ptr, bgr0);
2148         _mm256_stream_si256((__m256i*)(ptr + 8), p2);
2149         _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
2150     }
2151     else if( mode == hal::STORE_ALIGNED )
2152     {
2153         _mm256_store_si256((__m256i*)ptr, bgr0);
2154         _mm256_store_si256((__m256i*)(ptr + 8), p2);
2155         _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
2156     }
2157     else
2158     {
2159         _mm256_stream_si256((__m256i*)ptr, bgr0);
2160         _mm256_stream_si256((__m256i*)(ptr + 8), p2);
2161         _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
2162     }
2163 }
2164
2165 inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r,
2166                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2167 {
2168     __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
2169     __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);
2170     __m256i s20 = _mm256_blend_epi32(r.val, b.val, 0xcc);
2171
2172     __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
2173     __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
2174     __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
2175
2176     if( mode == hal::STORE_ALIGNED_NOCACHE )
2177     {
2178         _mm256_stream_si256((__m256i*)ptr, bgr0);
2179         _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
2180         _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
2181     }
2182     else if( mode == hal::STORE_ALIGNED )
2183     {
2184         _mm256_store_si256((__m256i*)ptr, bgr0);
2185         _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
2186         _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
2187     }
2188     else
2189     {
2190         _mm256_storeu_si256((__m256i*)ptr, bgr0);
2191         _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
2192         _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
2193     }
2194 }
2195
2196 inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
2197                                 const v_uint8x32& r, const v_uint8x32& a,
2198                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2199 {
2200     __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val);
2201     __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val);
2202     __m256i ra0 = _mm256_unpacklo_epi8(r.val, a.val);
2203     __m256i ra1 = _mm256_unpackhi_epi8(r.val, a.val);
2204
2205     __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
2206     __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
2207     __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
2208     __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
2209
2210     __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2211     __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2212     __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2213     __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2214
2215     if( mode == hal::STORE_ALIGNED_NOCACHE )
2216     {
2217         _mm256_stream_si256((__m256i*)ptr, bgra0);
2218         _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
2219         _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
2220         _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
2221     }
2222     else if( mode == hal::STORE_ALIGNED )
2223     {
2224         _mm256_store_si256((__m256i*)ptr, bgra0);
2225         _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
2226         _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
2227         _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
2228     }
2229     else
2230     {
2231         _mm256_storeu_si256((__m256i*)ptr, bgra0);
2232         _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
2233         _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
2234         _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
2235     }
2236 }
2237
2238 inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
2239                                 const v_uint16x16& r, const v_uint16x16& a,
2240                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2241 {
2242     __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val);
2243     __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val);
2244     __m256i ra0 = _mm256_unpacklo_epi16(r.val, a.val);
2245     __m256i ra1 = _mm256_unpackhi_epi16(r.val, a.val);
2246
2247     __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
2248     __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
2249     __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
2250     __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
2251
2252     __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2253     __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2254     __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2255     __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2256
2257     if( mode == hal::STORE_ALIGNED_NOCACHE )
2258     {
2259         _mm256_stream_si256((__m256i*)ptr, bgra0);
2260         _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
2261         _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
2262         _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
2263     }
2264     else if( mode == hal::STORE_ALIGNED )
2265     {
2266         _mm256_store_si256((__m256i*)ptr, bgra0);
2267         _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
2268         _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
2269         _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
2270     }
2271     else
2272     {
2273         _mm256_storeu_si256((__m256i*)ptr, bgra0);
2274         _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
2275         _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
2276         _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
2277     }
2278 }
2279
2280 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
2281                                 const v_uint32x8& r, const v_uint32x8& a,
2282                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2283 {
2284     __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val);
2285     __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val);
2286     __m256i ra0 = _mm256_unpacklo_epi32(r.val, a.val);
2287     __m256i ra1 = _mm256_unpackhi_epi32(r.val, a.val);
2288
2289     __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
2290     __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
2291     __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
2292     __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
2293
2294     __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2295     __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2296     __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2297     __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2298
2299     if( mode == hal::STORE_ALIGNED_NOCACHE )
2300     {
2301         _mm256_stream_si256((__m256i*)ptr, bgra0);
2302         _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
2303         _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
2304         _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
2305     }
2306     else if( mode == hal::STORE_ALIGNED )
2307     {
2308         _mm256_store_si256((__m256i*)ptr, bgra0);
2309         _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
2310         _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
2311         _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
2312     }
2313     else
2314     {
2315         _mm256_storeu_si256((__m256i*)ptr, bgra0);
2316         _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
2317         _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
2318         _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
2319     }
2320 }
2321
2322 inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
2323                                 const v_uint64x4& r, const v_uint64x4& a,
2324                                 hal::StoreMode mode=hal::STORE_UNALIGNED )
2325 {
2326     __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val);
2327     __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val);
2328     __m256i ra0 = _mm256_unpacklo_epi64(r.val, a.val);
2329     __m256i ra1 = _mm256_unpackhi_epi64(r.val, a.val);
2330
2331     __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
2332     __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
2333     __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
2334     __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
2335
2336     if( mode == hal::STORE_ALIGNED_NOCACHE )
2337     {
2338         _mm256_stream_si256((__m256i*)ptr, bgra0);
2339         _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
2340         _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
2341         _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
2342     }
2343     else if( mode == hal::STORE_ALIGNED )
2344     {
2345         _mm256_store_si256((__m256i*)ptr, bgra0);
2346         _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
2347         _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
2348         _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
2349     }
2350     else
2351     {
2352         _mm256_storeu_si256((__m256i*)ptr, bgra0);
2353         _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
2354         _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
2355         _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
2356     }
2357 }
2358
2359 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2360 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2361 { \
2362     _Tpvec1 a1, b1; \
2363     v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2364     a0 = v_reinterpret_as_##suffix0(a1); \
2365     b0 = v_reinterpret_as_##suffix0(b1); \
2366 } \
2367 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2368 { \
2369     _Tpvec1 a1, b1, c1; \
2370     v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2371     a0 = v_reinterpret_as_##suffix0(a1); \
2372     b0 = v_reinterpret_as_##suffix0(b1); \
2373     c0 = v_reinterpret_as_##suffix0(c1); \
2374 } \
2375 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2376 { \
2377     _Tpvec1 a1, b1, c1, d1; \
2378     v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2379     a0 = v_reinterpret_as_##suffix0(a1); \
2380     b0 = v_reinterpret_as_##suffix0(b1); \
2381     c0 = v_reinterpret_as_##suffix0(c1); \
2382     d0 = v_reinterpret_as_##suffix0(d1); \
2383 } \
2384 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2385                                 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2386 { \
2387     _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2388     _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2389     v_store_interleave((_Tp1*)ptr, a1, b1, mode);      \
2390 } \
2391 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
2392                                 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2393 { \
2394     _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2395     _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2396     _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2397     v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode);  \
2398 } \
2399 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2400                                 const _Tpvec0& c0, const _Tpvec0& d0, \
2401                                 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2402 { \
2403     _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2404     _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2405     _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2406     _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2407     v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2408 }
2409
2410 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
2411 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
2412 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
2413 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
2414 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
2415 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
2416
2417 inline void v256_cleanup() { _mm256_zeroupper(); }
2418
2419 //! @name Check SIMD256 support
2420 //! @{
2421 //! @brief Check CPU capability of SIMD operation
2422 static inline bool hasSIMD256()
2423 {
2424     return (CV_CPU_HAS_SUPPORT_AVX2) ? true : false;
2425 }
2426 //! @}
2427
2428 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2429
2430 //! @endcond
2431
2432 } // cv::
2433
2434 #endif // OPENCV_HAL_INTRIN_AVX_HPP