modules/core/include/opencv2/core/hal/intrin_sse.hpp

   1 /*M///////////////////////////////////////////////////////////////////////////////////////
   2 //
   3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
   4 //
   5 //  By downloading, copying, installing or using the software you agree to this license.
   6 //  If you do not agree to this license, do not download, install,
   7 //  copy or use the software.
   8 //
   9 //
  10 //                          License Agreement
  11 //                For Open Source Computer Vision Library
  12 //
  13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
  17 // Third party copyrights are property of their respective owners.
  18 //
  19 // Redistribution and use in source and binary forms, with or without modification,
  20 // are permitted provided that the following conditions are met:
  21 //
  22 //   * Redistribution's of source code must retain the above copyright notice,
  23 //     this list of conditions and the following disclaimer.
  24 //
  25 //   * Redistribution's in binary form must reproduce the above copyright notice,
  26 //     this list of conditions and the following disclaimer in the documentation
  27 //     and/or other materials provided with the distribution.
  28 //
  29 //   * The name of the copyright holders may not be used to endorse or promote products
  30 //     derived from this software without specific prior written permission.
  31 //
  32 // This software is provided by the copyright holders and contributors "as is" and
  33 // any express or implied warranties, including, but not limited to, the implied
  34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
  35 // In no event shall the Intel Corporation or contributors be liable for any direct,
  36 // indirect, incidental, special, exemplary, or consequential damages
  37 // (including, but not limited to, procurement of substitute goods or services;
  38 // loss of use, data, or profits; or business interruption) however caused
  39 // and on any theory of liability, whether in contract, strict liability,
  40 // or tort (including negligence or otherwise) arising in any way out of
  41 // the use of this software, even if advised of the possibility of such damage.
  42 //
  43 //M*/
  44
  45 #ifndef OPENCV_HAL_SSE_HPP
  46 #define OPENCV_HAL_SSE_HPP
  47
  48 #include <algorithm>
  49 #include "opencv2/core/utility.hpp"
  50
  51 #define CV_SIMD128 1
  52 #define CV_SIMD128_64F 1
  53
  54 namespace cv
  55 {
  56
  57 //! @cond IGNORED
  58
  59 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
  60
  61 struct v_uint8x16
  62 {
  63     typedef uchar lane_type;
  64     enum { nlanes = 16 };
  65
  66     v_uint8x16() : val(_mm_setzero_si128()) {}
  67     explicit v_uint8x16(__m128i v) : val(v) {}
  68     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
  69                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
  70     {
  71         val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
  72                             (char)v4, (char)v5, (char)v6, (char)v7,
  73                             (char)v8, (char)v9, (char)v10, (char)v11,
  74                             (char)v12, (char)v13, (char)v14, (char)v15);
  75     }
  76     uchar get0() const
  77     {
  78         return (uchar)_mm_cvtsi128_si32(val);
  79     }
  80
  81     __m128i val;
  82 };
  83
  84 struct v_int8x16
  85 {
  86     typedef schar lane_type;
  87     enum { nlanes = 16 };
  88
  89     v_int8x16() : val(_mm_setzero_si128()) {}
  90     explicit v_int8x16(__m128i v) : val(v) {}
  91     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
  92               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
  93     {
  94         val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
  95                             (char)v4, (char)v5, (char)v6, (char)v7,
  96                             (char)v8, (char)v9, (char)v10, (char)v11,
  97                             (char)v12, (char)v13, (char)v14, (char)v15);
  98     }
  99     schar get0() const
 100     {
 101         return (schar)_mm_cvtsi128_si32(val);
 102     }
 103
 104     __m128i val;
 105 };
 106
 107 struct v_uint16x8
 108 {
 109     typedef ushort lane_type;
 110     enum { nlanes = 8 };
 111
 112     v_uint16x8() : val(_mm_setzero_si128()) {}
 113     explicit v_uint16x8(__m128i v) : val(v) {}
 114     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
 115     {
 116         val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
 117                              (short)v4, (short)v5, (short)v6, (short)v7);
 118     }
 119     ushort get0() const
 120     {
 121         return (ushort)_mm_cvtsi128_si32(val);
 122     }
 123
 124     __m128i val;
 125 };
 126
 127 struct v_int16x8
 128 {
 129     typedef short lane_type;
 130     enum { nlanes = 8 };
 131
 132     v_int16x8() : val(_mm_setzero_si128()) {}
 133     explicit v_int16x8(__m128i v) : val(v) {}
 134     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
 135     {
 136         val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
 137                              (short)v4, (short)v5, (short)v6, (short)v7);
 138     }
 139     short get0() const
 140     {
 141         return (short)_mm_cvtsi128_si32(val);
 142     }
 143     __m128i val;
 144 };
 145
 146 struct v_uint32x4
 147 {
 148     typedef unsigned lane_type;
 149     enum { nlanes = 4 };
 150
 151     v_uint32x4() : val(_mm_setzero_si128()) {}
 152     explicit v_uint32x4(__m128i v) : val(v) {}
 153     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
 154     {
 155         val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
 156     }
 157     unsigned get0() const
 158     {
 159         return (unsigned)_mm_cvtsi128_si32(val);
 160     }
 161     __m128i val;
 162 };
 163
 164 struct v_int32x4
 165 {
 166     typedef int lane_type;
 167     enum { nlanes = 4 };
 168
 169     v_int32x4() : val(_mm_setzero_si128()) {}
 170     explicit v_int32x4(__m128i v) : val(v) {}
 171     v_int32x4(int v0, int v1, int v2, int v3)
 172     {
 173         val = _mm_setr_epi32(v0, v1, v2, v3);
 174     }
 175     int get0() const
 176     {
 177         return _mm_cvtsi128_si32(val);
 178     }
 179     __m128i val;
 180 };
 181
 182 struct v_float32x4
 183 {
 184     typedef float lane_type;
 185     enum { nlanes = 4 };
 186
 187     v_float32x4() : val(_mm_setzero_ps()) {}
 188     explicit v_float32x4(__m128 v) : val(v) {}
 189     v_float32x4(float v0, float v1, float v2, float v3)
 190     {
 191         val = _mm_setr_ps(v0, v1, v2, v3);
 192     }
 193     float get0() const
 194     {
 195         return _mm_cvtss_f32(val);
 196     }
 197     __m128 val;
 198 };
 199
 200 struct v_uint64x2
 201 {
 202     typedef uint64 lane_type;
 203     enum { nlanes = 2 };
 204
 205     v_uint64x2() : val(_mm_setzero_si128()) {}
 206     explicit v_uint64x2(__m128i v) : val(v) {}
 207     v_uint64x2(uint64 v0, uint64 v1)
 208     {
 209         val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
 210     }
 211     uint64 get0() const
 212     {
 213         int a = _mm_cvtsi128_si32(val);
 214         int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
 215         return (unsigned)a | ((uint64)(unsigned)b << 32);
 216     }
 217     __m128i val;
 218 };
 219
 220 struct v_int64x2
 221 {
 222     typedef int64 lane_type;
 223     enum { nlanes = 2 };
 224
 225     v_int64x2() : val(_mm_setzero_si128()) {}
 226     explicit v_int64x2(__m128i v) : val(v) {}
 227     v_int64x2(int64 v0, int64 v1)
 228     {
 229         val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
 230     }
 231     int64 get0() const
 232     {
 233         int a = _mm_cvtsi128_si32(val);
 234         int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
 235         return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
 236     }
 237     __m128i val;
 238 };
 239
 240 struct v_float64x2
 241 {
 242     typedef double lane_type;
 243     enum { nlanes = 2 };
 244
 245     v_float64x2() : val(_mm_setzero_pd()) {}
 246     explicit v_float64x2(__m128d v) : val(v) {}
 247     v_float64x2(double v0, double v1)
 248     {
 249         val = _mm_setr_pd(v0, v1);
 250     }
 251     double get0() const
 252     {
 253         return _mm_cvtsd_f64(val);
 254     }
 255     __m128d val;
 256 };
 257
 258 #if CV_FP16
 259 struct v_float16x4
 260 {
 261     typedef short lane_type;
 262     enum { nlanes = 4 };
 263
 264     v_float16x4() : val(_mm_setzero_si128()) {}
 265     explicit v_float16x4(__m128i v) : val(v) {}
 266     v_float16x4(short v0, short v1, short v2, short v3)
 267     {
 268         val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
 269     }
 270     short get0() const
 271     {
 272         return (short)_mm_cvtsi128_si32(val);
 273     }
 274     __m128i val;
 275 };
 276 #endif
 277
 278 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
 279 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
 280 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
 281 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
 282 { return _Tpvec(cast(a.val)); }
 283
 284 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
 285 OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
 286 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
 287 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
 288 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
 289 OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
 290 OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
 291 OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
 292
 293 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
 294 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
 295 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
 296 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
 297
 298 template<typename _Tpvec> inline
 299 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
 300 template<typename _Tpvec> inline
 301 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
 302 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
 303 { return v_float32x4(_mm_castsi128_ps(a.val)); }
 304 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
 305 { return v_float32x4(_mm_castsi128_ps(a.val)); }
 306 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
 307 { return v_float64x2(_mm_castsi128_pd(a.val)); }
 308 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
 309 { return v_float64x2(_mm_castsi128_pd(a.val)); }
 310
 311 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
 312 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
 313 { return _Tpvec(_mm_castps_si128(a.val)); } \
 314 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
 315 { return _Tpvec(_mm_castpd_si128(a.val)); }
 316
 317 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
 318 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
 319 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
 320 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
 321 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
 322 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
 323 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
 324 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
 325
 326 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
 327 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
 328 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
 329 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
 330
 331 //////////////// PACK ///////////////
 332 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
 333 {
 334     __m128i delta = _mm_set1_epi16(255);
 335     return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
 336                                        _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
 337 }
 338
 339 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
 340 {
 341     __m128i delta = _mm_set1_epi16(255);
 342     __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
 343     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
 344 }
 345
 346 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
 347 { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
 348
 349 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
 350 { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
 351
 352 template<int n> inline
 353 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
 354 {
 355     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
 356     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
 357     return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
 358                                        _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
 359 }
 360
 361 template<int n> inline
 362 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
 363 {
 364     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
 365     __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
 366     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
 367 }
 368
 369 template<int n> inline
 370 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
 371 {
 372     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
 373     return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
 374                                        _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
 375 }
 376
 377 template<int n> inline
 378 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
 379 {
 380     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
 381     __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
 382     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
 383 }
 384
 385 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
 386 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
 387
 388 inline void v_pack_store(schar* ptr, v_int16x8& a)
 389 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
 390
 391 template<int n> inline
 392 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
 393 {
 394     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
 395     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
 396     return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
 397                                      _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
 398 }
 399 template<int n> inline
 400 void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
 401 {
 402     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
 403     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
 404     __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
 405     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
 406 }
 407
 408
 409 // bit-wise "mask ? a : b"
 410 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
 411 {
 412     return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
 413 }
 414
 415 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
 416 {
 417     __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
 418     __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
 419     __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
 420     __m128i r = _mm_packs_epi32(a1, b1);
 421     return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
 422 }
 423
 424 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
 425 {
 426     __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
 427     __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
 428     __m128i r = _mm_packs_epi32(a1, a1);
 429     _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
 430 }
 431
 432 template<int n> inline
 433 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
 434 {
 435     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
 436     __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
 437     __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
 438     return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
 439 }
 440
 441 template<int n> inline
 442 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
 443 {
 444     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
 445     __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
 446     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
 447     _mm_storel_epi64((__m128i*)ptr, a2);
 448 }
 449
 450 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
 451 {
 452     __m128i delta32 = _mm_set1_epi32(32768);
 453     __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
 454     return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
 455 }
 456
 457 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
 458 {
 459     __m128i delta32 = _mm_set1_epi32(32768);
 460     __m128i a1 = _mm_sub_epi32(a.val, delta32);
 461     __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
 462     _mm_storel_epi64((__m128i*)ptr, r);
 463 }
 464
 465 template<int n> inline
 466 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
 467 {
 468     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
 469     __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
 470     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
 471     __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
 472     __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
 473     return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
 474 }
 475
 476 template<int n> inline
 477 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
 478 {
 479     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
 480     __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
 481     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
 482     _mm_storel_epi64((__m128i*)ptr, a2);
 483 }
 484
 485 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
 486 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
 487
 488 inline void v_pack_store(short* ptr, const v_int32x4& a)
 489 {
 490     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
 491 }
 492
 493 template<int n> inline
 494 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
 495 {
 496     __m128i delta = _mm_set1_epi32(1 << (n-1));
 497     return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
 498                                      _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
 499 }
 500
 501 template<int n> inline
 502 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
 503 {
 504     __m128i delta = _mm_set1_epi32(1 << (n-1));
 505     __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
 506     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
 507 }
 508
 509
 510 // [a0 0 | b0 0]  [a1 0 | b1 0]
 511 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
 512 {
 513     __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
 514     __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
 515     return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
 516 }
 517
 518 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
 519 {
 520     __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
 521     _mm_storel_epi64((__m128i*)ptr, a1);
 522 }
 523
 524 // [a0 0 | b0 0]  [a1 0 | b1 0]
 525 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
 526 {
 527     __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
 528     __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
 529     return v_int32x4(_mm_unpacklo_epi32(v0, v1));
 530 }
 531
 532 inline void v_pack_store(int* ptr, const v_int64x2& a)
 533 {
 534     __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
 535     _mm_storel_epi64((__m128i*)ptr, a1);
 536 }
 537
 538 template<int n> inline
 539 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
 540 {
 541     uint64 delta = (uint64)1 << (n-1);
 542     v_uint64x2 delta2(delta, delta);
 543     __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
 544     __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
 545     __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
 546     __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
 547     return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
 548 }
 549
 550 template<int n> inline
 551 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
 552 {
 553     uint64 delta = (uint64)1 << (n-1);
 554     v_uint64x2 delta2(delta, delta);
 555     __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
 556     __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
 557     _mm_storel_epi64((__m128i*)ptr, a2);
 558 }
 559
 560 inline __m128i v_sign_epi64(__m128i a)
 561 {
 562     return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
 563 }
 564
 565 inline __m128i v_srai_epi64(__m128i a, int imm)
 566 {
 567     __m128i smask = v_sign_epi64(a);
 568     return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
 569 }
 570
 571 template<int n> inline
 572 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
 573 {
 574     int64 delta = (int64)1 << (n-1);
 575     v_int64x2 delta2(delta, delta);
 576     __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
 577     __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
 578     __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
 579     __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
 580     return v_int32x4(_mm_unpacklo_epi32(v0, v1));
 581 }
 582
 583 template<int n> inline
 584 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
 585 {
 586     int64 delta = (int64)1 << (n-1);
 587     v_int64x2 delta2(delta, delta);
 588     __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
 589     __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
 590     _mm_storel_epi64((__m128i*)ptr, a2);
 591 }
 592
 593 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
 594                             const v_float32x4& m1, const v_float32x4& m2,
 595                             const v_float32x4& m3)
 596 {
 597     __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
 598     __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
 599     __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
 600     __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
 601
 602     return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
 603 }
 604
 605 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
 606                                const v_float32x4& m1, const v_float32x4& m2,
 607                                const v_float32x4& a)
 608 {
 609     __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
 610     __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
 611     __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
 612
 613     return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
 614 }
 615
 616 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
 617     inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
 618     { \
 619         return _Tpvec(intrin(a.val, b.val)); \
 620     } \
 621     inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
 622     { \
 623         a.val = intrin(a.val, b.val); \
 624         return a; \
 625     }
 626
 627 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
 628 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
 629 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
 630 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
 631 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
 632 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
 633 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
 634 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
 635 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
 636 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
 637 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
 638 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
 639 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
 640 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
 641 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
 642 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
 643 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
 644 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
 645 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
 646 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
 647 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
 648 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
 649 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
 650 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
 651 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
 652 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
 653
 654 inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
 655 {
 656     __m128i c0 = _mm_mul_epu32(a.val, b.val);
 657     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
 658     __m128i d0 = _mm_unpacklo_epi32(c0, c1);
 659     __m128i d1 = _mm_unpackhi_epi32(c0, c1);
 660     return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
 661 }
 662 inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
 663 {
 664     __m128i c0 = _mm_mul_epu32(a.val, b.val);
 665     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
 666     __m128i d0 = _mm_unpacklo_epi32(c0, c1);
 667     __m128i d1 = _mm_unpackhi_epi32(c0, c1);
 668     return v_int32x4(_mm_unpacklo_epi64(d0, d1));
 669 }
 670 inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
 671 {
 672     a = a * b;
 673     return a;
 674 }
 675 inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
 676 {
 677     a = a * b;
 678     return a;
 679 }
 680
 681 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
 682                          v_int32x4& c, v_int32x4& d)
 683 {
 684     __m128i v0 = _mm_mullo_epi16(a.val, b.val);
 685     __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
 686     c.val = _mm_unpacklo_epi16(v0, v1);
 687     d.val = _mm_unpackhi_epi16(v0, v1);
 688 }
 689
 690 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
 691                          v_uint32x4& c, v_uint32x4& d)
 692 {
 693     __m128i v0 = _mm_mullo_epi16(a.val, b.val);
 694     __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
 695     c.val = _mm_unpacklo_epi16(v0, v1);
 696     d.val = _mm_unpackhi_epi16(v0, v1);
 697 }
 698
 699 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
 700                          v_uint64x2& c, v_uint64x2& d)
 701 {
 702     __m128i c0 = _mm_mul_epu32(a.val, b.val);
 703     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
 704     c.val = _mm_unpacklo_epi64(c0, c1);
 705     d.val = _mm_unpackhi_epi64(c0, c1);
 706 }
 707
 708 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
 709 {
 710     return v_int32x4(_mm_madd_epi16(a.val, b.val));
 711 }
 712
 713 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
 714     OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
 715     OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
 716     OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
 717     inline _Tpvec operator ~ (const _Tpvec& a) \
 718     { \
 719         return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
 720     }
 721
 722 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
 723 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
 724 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
 725 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
 726 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
 727 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
 728 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
 729 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
 730 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
 731 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
 732
 733 inline v_float32x4 v_sqrt(const v_float32x4& x)
 734 { return v_float32x4(_mm_sqrt_ps(x.val)); }
 735
 736 inline v_float32x4 v_invsqrt(const v_float32x4& x)
 737 {
 738     static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
 739     __m128 t = x.val;
 740     __m128 h = _mm_mul_ps(t, _0_5);
 741     t = _mm_rsqrt_ps(t);
 742     t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
 743     return v_float32x4(t);
 744 }
 745
 746 inline v_float64x2 v_sqrt(const v_float64x2& x)
 747 { return v_float64x2(_mm_sqrt_pd(x.val)); }
 748
 749 inline v_float64x2 v_invsqrt(const v_float64x2& x)
 750 {
 751     static const __m128d v_1 = _mm_set1_pd(1.);
 752     return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
 753 }
 754
     // Defines v_abs for a signed integer vector type _Tpsvec, returning the
     // unsigned vector type _Tpuvec. The result is _mm_<func>_ep<suffix>(x, 0 - x),
     // where the negation is the wrapping subtraction _mm_sub_ep<subWidth>.
 755 #define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
 756 inline _Tpuvec v_abs(const _Tpsvec& x) \
 757 { return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
 758
     // 8-bit lanes: unsigned minimum of x and -x; 16-bit lanes: signed maximum of x and -x.
 759 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
 760 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
 761 inline v_uint32x4 v_abs(const v_int32x4& x)
 762 {
 763     __m128i s = _mm_srli_epi32(x.val, 31);
 764     __m128i f = _mm_srai_epi32(x.val, 31);
 765     return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
 766 }
 767 inline v_float32x4 v_abs(const v_float32x4& x)
 768 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
 769 inline v_float64x2 v_abs(const v_float64x2& x)
 770 {
 771     return v_float64x2(_mm_and_pd(x.val,
 772         _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
 773 }
 774
 775 // TODO: exp, log, sin, cos
 776
     // Defines a binary function `func(a, b)` on _Tpvec that forwards both
     // operands' registers to the two-argument intrinsic `intrin`.
 777 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
 778 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
 779 { \
 780     return _Tpvec(intrin(a.val, b.val)); \
 781 }
 782
     // v_min / v_max for the lane types that have a direct SSE/SSE2 min/max intrinsic.
 783 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
 784 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
 785 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
 786 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
 787 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
 788 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
 789 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
 790 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
 791
 792 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
 793 {
 794     __m128i delta = _mm_set1_epi8((char)-128);
 795     return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
 796                                                        _mm_xor_si128(b.val, delta))));
 797 }
 798 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
 799 {
 800     __m128i delta = _mm_set1_epi8((char)-128);
 801     return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
 802                                                        _mm_xor_si128(b.val, delta))));
 803 }
 804 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
 805 {
 806     return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
 807 }
 808 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
 809 {
 810     return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
 811 }
 812 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
 813 {
 814     __m128i delta = _mm_set1_epi32((int)0x80000000);
 815     __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
 816     return v_uint32x4(v_select_si128(mask, b.val, a.val));
 817 }
 818 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
 819 {
 820     __m128i delta = _mm_set1_epi32((int)0x80000000);
 821     __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
 822     return v_uint32x4(v_select_si128(mask, a.val, b.val));
 823 }
 824 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
 825 {
 826     return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
 827 }
 828 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
 829 {
 830     return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
 831 }
 832
     // Defines the six comparison operators for an unsigned/signed integer vector
     // pair of the same lane width. Each operator returns a vector of the operand
     // type holding the raw result of the compare intrinsic (a per-lane mask).
     //  - == uses _mm_cmpeq_<suffix>; != is the same result XORed with all ones.
     //  - Signed < and > use _mm_cmpgt_<suffix> directly; <= and >= are the
     //    complement of the opposite strict comparison.
     //  - Unsigned ordering first XORs both operands with `sbit` broadcast to every
     //    lane (the sign bit), so that the signed compare orders them as unsigned.
 833 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
 834 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
 835 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
 836 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
 837 { \
 838     __m128i not_mask = _mm_set1_epi32(-1); \
 839     return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
 840 } \
 841 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
 842 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
 843 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
 844 { \
 845     __m128i not_mask = _mm_set1_epi32(-1); \
 846     return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
 847 } \
 848 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
 849 { \
 850     __m128i smask = _mm_set1_##suffix(sbit); \
 851     return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
 852 } \
 853 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
 854 { \
 855     __m128i smask = _mm_set1_##suffix(sbit); \
 856     return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
 857 } \
 858 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
 859 { \
 860     __m128i smask = _mm_set1_##suffix(sbit); \
 861     __m128i not_mask = _mm_set1_epi32(-1); \
 862     __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
 863     return _Tpuvec(_mm_xor_si128(res, not_mask)); \
 864 } \
 865 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
 866 { \
 867     __m128i smask = _mm_set1_##suffix(sbit); \
 868     __m128i not_mask = _mm_set1_epi32(-1); \
 869     __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
 870     return _Tpuvec(_mm_xor_si128(res, not_mask)); \
 871 } \
 872 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
 873 { \
 874     return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
 875 } \
 876 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
 877 { \
 878     return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
 879 } \
 880 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
 881 { \
 882     __m128i not_mask = _mm_set1_epi32(-1); \
 883     return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
 884 } \
 885 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
 886 { \
 887     __m128i not_mask = _mm_set1_epi32(-1); \
 888     return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
 889 }
 890
     // 8-, 16- and 32-bit lanes; the last argument is the lane's sign-bit pattern.
 891 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
 892 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
 893 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
 894
     // Defines the six comparison operators for a floating-point vector type by
     // forwarding to the matching _mm_cmp{eq,neq,lt,gt,le,ge}_<suffix> intrinsic.
     // The result is returned as a vector of the operand type (per-lane mask).
 895 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
 896 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
 897 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
 898 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
 899 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
 900 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
 901 { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
 902 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
 903 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
 904 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
 905 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
 906 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
 907 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
 908
 909 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
 910 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
 911
 912 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
 913 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
 914 { return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
 915 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
 916 { return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
 917
 918 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64);
 919 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64);
 920
     // v_add_wrap / v_sub_wrap for 8- and 16-bit lanes, mapped onto the
     // non-saturating (wrapping) _mm_add_epi8/16 and _mm_sub_epi8/16 intrinsics.
 921 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
 922 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
 923 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
 924 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
 925 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
 926 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
 927 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
 928 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
 929
     // Defines v_absdiff for 8- or 16-bit lanes; both overloads return the
     // unsigned vector type.
     //  - Unsigned: sum of the two saturating differences (a - b) and (b - a);
     //    one of them is always zero, so the sum is |a - b|.
     //  - Signed: both operands are first XORed with `smask32` (the lane sign bits,
     //    packed into a 32-bit pattern) and then treated as unsigned.
 930 #define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
 931 inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
 932 { \
 933     return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
 934 } \
 935 inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
 936 { \
 937     __m128i smask = _mm_set1_epi32(smask32); \
 938     __m128i a1 = _mm_xor_si128(a.val, smask); \
 939     __m128i b1 = _mm_xor_si128(b.val, smask); \
 940     return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
 941 }
 942
 943 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
 944 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
 945
 946 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
 947 {
 948     return v_max(a, b) - v_min(a, b);
 949 }
 950
 951 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
 952 {
 953     __m128i d = _mm_sub_epi32(a.val, b.val);
 954     __m128i m = _mm_cmpgt_epi32(b.val, a.val);
 955     return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
 956 }
 957
     // Defines miscellaneous floating-point helpers for _Tpvec:
     //  - v_absdiff(a, b):       (a - b) with the sign bit cleared via `absmask_vec`.
     //  - v_magnitude(a, b):     sqrt(a*a + b*b).
     //  - v_sqr_magnitude(a, b): a*a + b*b.
     //  - v_muladd(a, b, c):     a*b + c, as a separate multiply and add.
     // `absmask_vec` is an integer register expression that is reinterpreted as
     // the float register type. `_Tp` is not referenced in the macro body.
 958 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
 959 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
 960 { \
 961     _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
 962     return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
 963 } \
 964 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
 965 { \
 966     _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
 967     return _Tpvec(_mm_sqrt_##suffix(res)); \
 968 } \
 969 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
 970 { \
 971     _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
 972     return _Tpvec(res); \
 973 } \
 974 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
 975 { \
 976     return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
 977 }
 978
 979 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
 980 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
 981
     // Defines per-lane bit shifts for an unsigned/signed vector pair:
     //  - operator << / >> taking the shift count as a run-time int;
     //  - v_shl<imm> / v_shr<imm> taking it as a template argument.
     // Left shifts use _mm_slli_<suffix> for both types. Right shifts use the
     // logical _mm_srli_<suffix> for the unsigned type and the `srai` argument
     // (an arithmetic-shift function) for the signed type.
 982 #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
 983 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
 984 { \
 985     return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
 986 } \
 987 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
 988 { \
 989     return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
 990 } \
 991 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
 992 { \
 993     return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
 994 } \
 995 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
 996 { \
 997     return _Tpsvec(srai(a.val, imm)); \
 998 } \
 999 template<int imm> \
1000 inline _Tpuvec v_shl(const _Tpuvec& a) \
1001 { \
1002     return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1003 } \
1004 template<int imm> \
1005 inline _Tpsvec v_shl(const _Tpsvec& a) \
1006 { \
1007     return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1008 } \
1009 template<int imm> \
1010 inline _Tpuvec v_shr(const _Tpuvec& a) \
1011 { \
1012     return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1013 } \
1014 template<int imm> \
1015 inline _Tpsvec v_shr(const _Tpsvec& a) \
1016 { \
1017     return _Tpsvec(srai(a.val, imm)); \
1018 }
1019
     // The 64-bit signed right shift uses v_srai_epi64, a helper defined elsewhere
     // in this file, in place of a native intrinsic.
1020 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
1021 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
1022 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
1023
1024 template<int imm, typename _Tpvec>
1025 inline _Tpvec v_rotate_right(const _Tpvec &a)
1026 {
1027     enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) };
1028     return _Tpvec(_mm_srli_si128(a.val, CV_SHIFT));
1029 }
1030 template<int imm, typename _Tpvec>
1031 inline _Tpvec v_rotate_left(const _Tpvec &a)
1032 {
1033     enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) };
1034     return _Tpvec(_mm_slli_si128(a.val, CV_SHIFT));
1035 }
1036 template<int imm, typename _Tpvec>
1037 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1038 {
1039     enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) };
1040     enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) };
1041     return _Tpvec(_mm_or_si128(_mm_srli_si128(a.val, CV_SHIFT1), _mm_slli_si128(b.val, CV_SHIFT2)));
1042 }
1043 template<int imm, typename _Tpvec>
1044 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1045 {
1046     enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) };
1047     enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) };
1048     return _Tpvec(_mm_or_si128(_mm_slli_si128(a.val, CV_SHIFT1), _mm_srli_si128(b.val, CV_SHIFT2)));
1049 }
1050
     // Defines the load/store family for an integer vector type with scalar type _Tp:
     //  - v_load / v_store:                 full 128-bit unaligned access (loadu/storeu).
     //  - v_load_aligned / v_store_aligned: full 128-bit access with the aligned
     //                                      intrinsics (_mm_load_si128 / _mm_store_si128).
     //  - v_load_low:    loads 64 bits with _mm_loadl_epi64 (low half of the register).
     //  - v_load_halves: low half from ptr0, high half from ptr1 (64 bits each).
     //  - v_store_low / v_store_high: store the low / high 64 bits of the register.
1051 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1052 inline _Tpvec v_load(const _Tp* ptr) \
1053 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
1054 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1055 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
1056 inline _Tpvec v_load_low(const _Tp* ptr) \
1057 { return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
1058 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1059 { \
1060     return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1061                                      _mm_loadl_epi64((const __m128i*)ptr1))); \
1062 } \
1063 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1064 { _mm_storeu_si128((__m128i*)ptr, a.val); } \
1065 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1066 { _mm_store_si128((__m128i*)ptr, a.val); } \
1067 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1068 { _mm_storel_epi64((__m128i*)ptr, a.val); } \
1069 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1070 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
1071
1072 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
1073 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
1074 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
1075 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
1076 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1077 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
1078 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
1079 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
1080
     // Floating-point counterpart of the integer load/store family. Full-width
     // accesses use the typed _mm_loadu/_mm_load/_mm_storeu/_mm_store_<suffix>
     // intrinsics; the 64-bit half accesses go through the integer intrinsics
     // with a register reinterpretation (_mm_castsi128_<suffix> and back).
1081 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
1082 inline _Tpvec v_load(const _Tp* ptr) \
1083 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \
1084 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1085 { return _Tpvec(_mm_load_##suffix(ptr)); } \
1086 inline _Tpvec v_load_low(const _Tp* ptr) \
1087 { return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
1088 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1089 { \
1090     return _Tpvec(_mm_castsi128_##suffix( \
1091         _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1092                            _mm_loadl_epi64((const __m128i*)ptr1)))); \
1093 } \
1094 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1095 { _mm_storeu_##suffix(ptr, a.val); } \
1096 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1097 { _mm_store_##suffix(ptr, a.val); } \
1098 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1099 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
1100 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1101 { \
1102     __m128i a1 = _mm_cast##suffix##_si128(a.val); \
1103     _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
1104 }
1105
1106 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
1107 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
1108
1109 #if CV_FP16
     // Loads 64 bits (four 16-bit half-float values stored as shorts) from ptr.
1110 inline v_float16x4 v_load_f16(const short* ptr)
1111 { return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
     // Stores the low 64 bits of `a` to ptr. `a` is only read, so it is taken by
     // const reference; this also lets callers pass const objects and temporaries.
1112 inline void v_store_f16(short* ptr, const v_float16x4& a)
1113 { _mm_storel_epi64((__m128i*)ptr, a.val); }
1114 #endif
1115
     // Horizontal min/max reduction over eight 16-bit lanes. The register is
     // folded onto itself three times (byte shifts of 8, 4 and 2), after which
     // lane 0 holds the result; it is extracted with _mm_cvtsi128_si32 and
     // truncated to the scalar type.
     // The unsigned overload XORs every lane with `sbit` first so that the signed
     // min/max intrinsic orders the values as unsigned, then XORs the extracted
     // scalar with `sbit` again to undo the bias.
1116 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
1117 inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
1118 { \
1119     __m128i val = a.val; \
1120     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1121     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1122     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1123     return (scalartype)_mm_cvtsi128_si32(val); \
1124 } \
1125 inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
1126 { \
1127     __m128i val = a.val; \
1128     __m128i smask = _mm_set1_epi16(sbit); \
1129     val = _mm_xor_si128(val, smask); \
1130     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1131     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1132     val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1133     return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^  sbit); \
1134 }
     // Horizontal sum over eight 16-bit lanes using the same three-step fold.
     // The partial sums use the saturating adds (_mm_adds_epi16 / _mm_adds_epu16),
     // so the result saturates rather than wraps.
1135 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
1136 inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
1137 { \
1138     __m128i val = a.val; \
1139     val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
1140     val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
1141     val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
1142     return (scalartype)_mm_cvtsi128_si32(val); \
1143 } \
1144 inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
1145 { \
1146     __m128i val = a.val; \
1147     val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
1148     val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
1149     val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
1150     return (unsigned scalartype)_mm_cvtsi128_si32(val); \
1151 }
     // Each invocation defines both the signed (v_int16x8) and unsigned
     // (v_uint16x8) overloads.
1152 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
1153 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
1154 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
1155
     // Horizontal sum over four 32-bit lanes: the upper half is added onto the
     // lower half, then lane 1 onto lane 0, and lane 0 is extracted with
     // _mm_cvt<extract>. `cast_from` / `cast_to` convert to and from __m128i around
     // the byte shift, which is only available for integer registers.
1156 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
1157 inline scalartype v_reduce_sum(const _Tpvec& a) \
1158 { \
1159     regtype val = a.val; \
1160     val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
1161     val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
1162     return (scalartype)_mm_cvt##extract(val); \
1163 }
1164
     // Horizontal min/max over four lanes, done in scalar code: the vector is
     // stored to a 16-byte-aligned buffer and reduced pairwise with `scalar_func`.
1165 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
1166 inline scalartype v_reduce_##func(const _Tpvec& a) \
1167 { \
1168     scalartype CV_DECL_ALIGNED(16) buf[4]; \
1169     v_store_aligned(buf, a); \
1170     scalartype s0 = scalar_func(buf[0], buf[1]); \
1171     scalartype s1 = scalar_func(buf[2], buf[3]); \
1172     return scalar_func(s0, s1); \
1173 }
1174
1175 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1176 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1177 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
1178
1179 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1180                                  const v_float32x4& c, const v_float32x4& d)
1181 {
1182 #if CV_SSE3
1183     __m128 ab = _mm_hadd_ps(a.val, b.val);
1184     __m128 cd = _mm_hadd_ps(c.val, d.val);
1185     return v_float32x4(_mm_hadd_ps(ab, cd));
1186 #else
1187     __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
1188     __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
1189     return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
1190 #endif
1191 }
1192
     // v_reduce_max / v_reduce_min for the four-lane 32-bit types, reduced in
     // scalar code with std::max / std::min.
1193 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
1194 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
1195 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
1196 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
1197 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
1198 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
1199
     // Defines v_popcount for _Tpvec: counts the set bits in each 32-bit group of
     // the register and returns the four counts as a v_uint32x4, regardless of the
     // input lane width.
     // The first three steps are the usual masked pairwise sums (1-, 2-, 4-bit
     // fields), leaving a per-byte count. The two byte-shifted adds then
     // accumulate the four byte counts of each group into its lowest byte, which
     // the final AND isolates.
1200 #define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
1201 inline v_uint32x4 v_popcount(const _Tpvec& a) \
1202 { \
1203     __m128i m1 = _mm_set1_epi32(0x55555555); \
1204     __m128i m2 = _mm_set1_epi32(0x33333333); \
1205     __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
1206     __m128i p = a.val; \
1207     p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
1208     p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
1209     p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
1210     p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
1211     p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
1212     return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
1213 }
1214
1215 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
1216 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
1217 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
1218 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
1219 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
1220 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
1221
     // Defines three sign-bit queries for _Tpvec, all built on _mm_movemask_<suffix>:
     //  - v_signmask:  movemask of pack_op(a.val), combined with `signmask` via and_op.
     //  - v_check_all: true when the movemask of a.val, combined with `allmask` via
     //                 and_op, equals `allmask`.
     //  - v_check_any: true when that same combination is non-zero.
     // `pack_op` narrows the lanes before the movemask in v_signmask; `and_op` is a
     // two-argument helper macro defined elsewhere in this file.
1222 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
1223 inline int v_signmask(const _Tpvec& a) \
1224 { \
1225     return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
1226 } \
1227 inline bool v_check_all(const _Tpvec& a) \
1228 { return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
1229 inline bool v_check_any(const _Tpvec& a) \
1230 { return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
1231
     // Narrows 16-bit lanes to bytes with signed saturation; the register is
     // packed with itself.
1232 #define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
1233 inline __m128i v_packq_epi32(__m128i a)
1234 {
1235     __m128i b = _mm_packs_epi32(a, a);
1236     return _mm_packs_epi16(b, b);
1237 }
1238
     // Integer types use the byte movemask: 16- and 32-bit lanes are narrowed to
     // bytes for v_signmask (255 / 15 keep one bit per lane), while v_check_all /
     // v_check_any test only the bits of each lane's most significant byte
     // (0xaaaa = every second byte, 0x8888 = every fourth byte).
     // Float and double use the per-lane movemask directly.
     // NOTE(review): OPENCV_HAL_NOP, OPENCV_HAL_1ST and OPENCV_HAL_AND are defined
     // outside this chunk; presumably identity, "first argument" and bitwise AND.
1239 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
1240 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
1241 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
1242 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
1243 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
1244 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
1245 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
1246 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
1247
     // Defines v_select(mask, a, b): a bitwise blend computed as
     // b ^ ((b ^ a) & mask), i.e. bits of `a` where the mask bit is set and bits
     // of `b` where it is clear.
1248 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
1249 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1250 { \
1251     return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
1252 }
1253
1254 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
1255 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
1256 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
1257 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
1258 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
1259 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
     // The 64-bit integer variants are intentionally left disabled here.
1260 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
1261 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
1262 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
1263 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
1264
     // Defines lane widening (to twice the width) for an unsigned and a signed type:
     //  - v_expand(a, b0, b1): widens the low half of `a` into b0 and the high
     //    half into b1.
     //  - v_load_expand(ptr):  loads 64 bits from ptr and widens them into one vector.
     // Unsigned lanes are zero-extended by interleaving with a zero register.
     // Signed lanes are sign-extended by interleaving the register with itself and
     // arithmetic-shifting each wide lane right by `shift` (the narrow lane width).
1265 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
1266 inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
1267 { \
1268     __m128i z = _mm_setzero_si128(); \
1269     b0.val = _mm_unpacklo_##suffix(a.val, z); \
1270     b1.val = _mm_unpackhi_##suffix(a.val, z); \
1271 } \
1272 inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
1273 { \
1274     __m128i z = _mm_setzero_si128(); \
1275     return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
1276 } \
1277 inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
1278 { \
1279     b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
1280     b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
1281 } \
1282 inline _Tpwsvec v_load_expand(const _Tps* ptr) \
1283 { \
1284     __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
1285     return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
1286 }
1287
1288 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
1289 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
1290
1291 inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
1292 {
1293     __m128i z = _mm_setzero_si128();
1294     b0.val = _mm_unpacklo_epi32(a.val, z);
1295     b1.val = _mm_unpackhi_epi32(a.val, z);
1296 }
1297 inline v_uint64x2 v_load_expand(const unsigned* ptr)
1298 {
1299     __m128i z = _mm_setzero_si128();
1300     return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
1301 }
1302 inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
1303 {
1304     __m128i s = _mm_srai_epi32(a.val, 31);
1305     b0.val = _mm_unpacklo_epi32(a.val, s);
1306     b1.val = _mm_unpackhi_epi32(a.val, s);
1307 }
1308 inline v_int64x2 v_load_expand(const int* ptr)
1309 {
1310     __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
1311     __m128i s = _mm_srai_epi32(a, 31);
1312     return v_int64x2(_mm_unpacklo_epi32(a, s));
1313 }
1314
1315 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1316 {
1317     __m128i z = _mm_setzero_si128();
1318     __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
1319     return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
1320 }
1321
1322 inline v_int32x4 v_load_expand_q(const schar* ptr)
1323 {
1324     __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
1325     a = _mm_unpacklo_epi8(a, a);
1326     a = _mm_unpacklo_epi8(a, a);
1327     return v_int32x4(_mm_srai_epi32(a, 24));
1328 }
1329
     // Defines lane-interleaving and half-combining helpers for _Tpvec:
     //  - v_zip(a0, a1, b0, b1): b0 = interleaved low halves, b1 = interleaved
     //    high halves of a0 and a1 (_mm_unpacklo/_mm_unpackhi_<suffix>).
     //  - v_combine_low(a, b):   low 64 bits of a followed by low 64 bits of b.
     //  - v_combine_high(a, b):  high 64 bits of a followed by high 64 bits of b.
     //  - v_recombine(a, b, c, d): c = combine_low(a, b), d = combine_high(a, b).
     // `cast_from` / `cast_to` convert the register to and from __m128i so that the
     // 64-bit integer unpack can be used for floating-point types as well.
1330 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
1331 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1332 { \
1333     b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
1334     b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
1335 } \
1336 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1337 { \
1338     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1339     return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
1340 } \
1341 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1342 { \
1343     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1344     return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
1345 } \
1346 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1347 { \
1348     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1349     c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
1350     d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
1351 }
1352
1353 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1354 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1355 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1356 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1357 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1358 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1359 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
1360 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
1361
1362 template<int s, typename _Tpvec>
1363 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
1364 {
1365     const int w = sizeof(typename _Tpvec::lane_type);
1366     const int n = _Tpvec::nlanes;
1367     __m128i ra, rb;
1368     ra = _mm_srli_si128(a.val, s*w);
1369     rb = _mm_slli_si128(b.val, (n-s)*w);
1370     return _Tpvec(_mm_or_si128(ra, rb));
1371 }
1372
1373 inline v_int32x4 v_round(const v_float32x4& a)
1374 { return v_int32x4(_mm_cvtps_epi32(a.val)); }
1375
1376 inline v_int32x4 v_floor(const v_float32x4& a)
1377 {
1378     __m128i a1 = _mm_cvtps_epi32(a.val);
1379     __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
1380     return v_int32x4(_mm_add_epi32(a1, mask));
1381 }
1382
1383 inline v_int32x4 v_ceil(const v_float32x4& a)
1384 {
1385     __m128i a1 = _mm_cvtps_epi32(a.val);
1386     __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
1387     return v_int32x4(_mm_sub_epi32(a1, mask));
1388 }
1389
1390 inline v_int32x4 v_trunc(const v_float32x4& a)
1391 { return v_int32x4(_mm_cvttps_epi32(a.val)); }
1392
1393 inline v_int32x4 v_round(const v_float64x2& a)
1394 { return v_int32x4(_mm_cvtpd_epi32(a.val)); }
1395
1396 inline v_int32x4 v_floor(const v_float64x2& a)
1397 {
1398     __m128i a1 = _mm_cvtpd_epi32(a.val);
1399     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
1400     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
1401     return v_int32x4(_mm_add_epi32(a1, mask));
1402 }
1403
1404 inline v_int32x4 v_ceil(const v_float64x2& a)
1405 {
1406     __m128i a1 = _mm_cvtpd_epi32(a.val);
1407     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
1408     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
1409     return v_int32x4(_mm_sub_epi32(a1, mask));
1410 }
1411
1412 inline v_int32x4 v_trunc(const v_float64x2& a)
1413 { return v_int32x4(_mm_cvttpd_epi32(a.val)); }
1414
     // Defines v_transpose4x4: treats a0..a3 as the rows of a 4x4 matrix of 32-bit
     // lanes and writes its transpose to b0..b3.
     // Step 1 interleaves 32-bit lanes of row pairs (a0,a1) and (a2,a3); step 2
     // combines the 64-bit halves of those intermediates. `cast_from` / `cast_to`
     // convert to and from __m128i so the 64-bit integer unpack also serves floats.
1415 #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1416 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1417                            const _Tpvec& a2, const _Tpvec& a3, \
1418                            _Tpvec& b0, _Tpvec& b1, \
1419                            _Tpvec& b2, _Tpvec& b3) \
1420 { \
1421     __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
1422     __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
1423     __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
1424     __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
1425 \
1426     b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
1427     b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
1428     b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
1429     b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
1430 }
1431
1432 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1433 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1434 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
1435
1436 // adapted from sse_utils.hpp
     // Loads 32 bytes of two-channel interleaved data from ptr (unaligned) and
     // splits them into two vectors. The split is done with four successive rounds
     // of byte interleaving (unpacklo/unpackhi) of the two registers; the net
     // effect is that `a` receives the even-indexed bytes and `b` the odd-indexed
     // bytes of the input.
1437 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
1438 {
1439     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
1440     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
1441
1442     __m128i t10 = _mm_unpacklo_epi8(t00, t01);
1443     __m128i t11 = _mm_unpackhi_epi8(t00, t01);
1444
1445     __m128i t20 = _mm_unpacklo_epi8(t10, t11);
1446     __m128i t21 = _mm_unpackhi_epi8(t10, t11);
1447
1448     __m128i t30 = _mm_unpacklo_epi8(t20, t21);
1449     __m128i t31 = _mm_unpackhi_epi8(t20, t21);
1450
1451     a.val = _mm_unpacklo_epi8(t30, t31);
1452     b.val = _mm_unpackhi_epi8(t30, t31);
1453 }
1454
1455 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
1456 {
1457     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
1458     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
1459     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
1460
1461     __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
1462     __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
1463     __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
1464
1465     __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
1466     __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
1467     __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
1468
1469     __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
1470     __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
1471     __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
1472
1473     a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
1474     b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
1475     c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
1476 }
1477
1478 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
1479 {
1480     __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
1481     __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
1482     __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
1483     __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
1484
1485     __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
1486     __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
1487     __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
1488     __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
1489
1490     u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
1491     u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
1492     u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
1493     u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
1494
1495     v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
1496     v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
1497     v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
1498     v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
1499
1500     a.val = _mm_unpacklo_epi8(v0, v1);
1501     b.val = _mm_unpackhi_epi8(v0, v1);
1502     c.val = _mm_unpacklo_epi8(v2, v3);
1503     d.val = _mm_unpackhi_epi8(v2, v3);
1504 }
1505
1506 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
1507 {
1508     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
1509     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
1510     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
1511
1512     __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
1513     __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
1514     __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
1515
1516     __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
1517     __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
1518     __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
1519
1520     a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
1521     b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
1522     c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
1523 }
1524
1525 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
1526 {
1527     __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
1528     __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
1529     __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
1530     __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
1531
1532     __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
1533     __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
1534     __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
1535     __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
1536
1537     u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
1538     u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
1539     u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
1540     u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
1541
1542     a.val = _mm_unpacklo_epi16(u0, u1);
1543     b.val = _mm_unpackhi_epi16(u0, u1);
1544     c.val = _mm_unpacklo_epi16(u2, u3);
1545     d.val = _mm_unpackhi_epi16(u2, u3);
1546 }
1547
1548 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
1549 {
1550     __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
1551     __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
1552     __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
1553
1554     __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
1555     __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
1556     __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
1557
1558     a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
1559     b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
1560     c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
1561 }
1562
1563 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
1564 {
1565     v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
1566     v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
1567     v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
1568     v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
1569
1570     v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
1571 }
1572
1573 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
1574 {
1575     __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
1576     __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
1577     __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));
1578
1579     a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
1580     b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
1581     c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
1582 }
1583
1584 inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
1585 {
1586     v_uint64x2 t0, t1, t2;
1587     v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
1588     a = v_reinterpret_as_s64(t0);
1589     b = v_reinterpret_as_s64(t1);
1590     c = v_reinterpret_as_s64(t2);
1591 }
1592
1593 inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
1594 {
1595     v_uint64x2 t0, t1, t2;
1596     v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
1597     a = v_reinterpret_as_f64(t0);
1598     b = v_reinterpret_as_f64(t1);
1599     c = v_reinterpret_as_f64(t2);
1600 }
1601
1602 // 2-channel, float only
1603 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
1604 {
1605     const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
1606
1607     __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
1608     __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
1609
1610     a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
1611     b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
1612 }
1613
1614 inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
1615 {
1616     __m128i t0, t1;
1617     t0 = _mm_unpacklo_epi16(a.val, b.val);
1618     t1 = _mm_unpackhi_epi16(a.val, b.val);
1619     _mm_storeu_si128((__m128i*)(ptr), t0);
1620     _mm_storeu_si128((__m128i*)(ptr + 8), t1);
1621 }
1622
1623 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
1624 {
1625     __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
1626     __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
1627
1628     _mm_storeu_si128((__m128i*)(ptr), v0);
1629     _mm_storeu_si128((__m128i*)(ptr + 16), v1);
1630 }
1631
1632 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
1633                                 const v_uint8x16& c )
1634 {
1635     __m128i z = _mm_setzero_si128();
1636     __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
1637     __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
1638     __m128i c0 = _mm_unpacklo_epi8(c.val, z);
1639     __m128i c1 = _mm_unpackhi_epi8(c.val, z);
1640
1641     __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
1642     __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
1643     __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
1644     __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
1645
1646     __m128i p10 = _mm_unpacklo_epi32(p00, p01);
1647     __m128i p11 = _mm_unpackhi_epi32(p00, p01);
1648     __m128i p12 = _mm_unpacklo_epi32(p02, p03);
1649     __m128i p13 = _mm_unpackhi_epi32(p02, p03);
1650
1651     __m128i p20 = _mm_unpacklo_epi64(p10, p11);
1652     __m128i p21 = _mm_unpackhi_epi64(p10, p11);
1653     __m128i p22 = _mm_unpacklo_epi64(p12, p13);
1654     __m128i p23 = _mm_unpackhi_epi64(p12, p13);
1655
1656     p20 = _mm_slli_si128(p20, 1);
1657     p22 = _mm_slli_si128(p22, 1);
1658
1659     __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
1660     __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
1661     __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
1662     __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
1663
1664     __m128i p40 = _mm_unpacklo_epi64(p30, p31);
1665     __m128i p41 = _mm_unpackhi_epi64(p30, p31);
1666     __m128i p42 = _mm_unpacklo_epi64(p32, p33);
1667     __m128i p43 = _mm_unpackhi_epi64(p32, p33);
1668
1669     __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
1670     __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
1671     __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
1672
1673     _mm_storeu_si128((__m128i*)(ptr), v0);
1674     _mm_storeu_si128((__m128i*)(ptr + 16), v1);
1675     _mm_storeu_si128((__m128i*)(ptr + 32), v2);
1676 }
1677
1678 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
1679                                 const v_uint8x16& c, const v_uint8x16& d)
1680 {
1681     // a0 a1 a2 a3 ....
1682     // b0 b1 b2 b3 ....
1683     // c0 c1 c2 c3 ....
1684     // d0 d1 d2 d3 ....
1685     __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
1686     __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
1687     __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
1688     __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
1689
1690     __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
1691     __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
1692     __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
1693     __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
1694
1695     _mm_storeu_si128((__m128i*)ptr, v0);
1696     _mm_storeu_si128((__m128i*)(ptr + 16), v2);
1697     _mm_storeu_si128((__m128i*)(ptr + 32), v1);
1698     _mm_storeu_si128((__m128i*)(ptr + 48), v3);
1699 }
1700
1701 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
1702                                 const v_uint16x8& b,
1703                                 const v_uint16x8& c )
1704 {
1705     __m128i z = _mm_setzero_si128();
1706     __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
1707     __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
1708     __m128i c0 = _mm_unpacklo_epi16(c.val, z);
1709     __m128i c1 = _mm_unpackhi_epi16(c.val, z);
1710
1711     __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
1712     __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
1713     __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
1714     __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
1715
1716     __m128i p20 = _mm_unpacklo_epi64(p10, p11);
1717     __m128i p21 = _mm_unpackhi_epi64(p10, p11);
1718     __m128i p22 = _mm_unpacklo_epi64(p12, p13);
1719     __m128i p23 = _mm_unpackhi_epi64(p12, p13);
1720
1721     p20 = _mm_slli_si128(p20, 2);
1722     p22 = _mm_slli_si128(p22, 2);
1723
1724     __m128i p30 = _mm_unpacklo_epi64(p20, p21);
1725     __m128i p31 = _mm_unpackhi_epi64(p20, p21);
1726     __m128i p32 = _mm_unpacklo_epi64(p22, p23);
1727     __m128i p33 = _mm_unpackhi_epi64(p22, p23);
1728
1729     __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
1730     __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
1731     __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
1732
1733     _mm_storeu_si128((__m128i*)(ptr), v0);
1734     _mm_storeu_si128((__m128i*)(ptr + 8), v1);
1735     _mm_storeu_si128((__m128i*)(ptr + 16), v2);
1736 }
1737
1738 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
1739                                 const v_uint16x8& c, const v_uint16x8& d)
1740 {
1741     // a0 a1 a2 a3 ....
1742     // b0 b1 b2 b3 ....
1743     // c0 c1 c2 c3 ....
1744     // d0 d1 d2 d3 ....
1745     __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
1746     __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
1747     __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
1748     __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
1749
1750     __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
1751     __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
1752     __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
1753     __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
1754
1755     _mm_storeu_si128((__m128i*)ptr, v0);
1756     _mm_storeu_si128((__m128i*)(ptr + 8), v2);
1757     _mm_storeu_si128((__m128i*)(ptr + 16), v1);
1758     _mm_storeu_si128((__m128i*)(ptr + 24), v3);
1759 }
1760
1761 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
1762                                 const v_uint32x4& c )
1763 {
1764     v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
1765     v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
1766
1767     __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
1768     __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
1769     __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
1770
1771     _mm_storeu_si128((__m128i*)ptr, v0);
1772     _mm_storeu_si128((__m128i*)(ptr + 4), v1);
1773     _mm_storeu_si128((__m128i*)(ptr + 8), v2);
1774 }
1775
1776 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
1777                                const v_uint32x4& c, const v_uint32x4& d)
1778 {
1779     v_uint32x4 t0, t1, t2, t3;
1780     v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
1781     v_store(ptr, t0);
1782     v_store(ptr + 4, t1);
1783     v_store(ptr + 8, t2);
1784     v_store(ptr + 12, t3);
1785 }
1786
1787 // 2-channel, float only
1788 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
1789 {
1790     // a0 a1 a2 a3 ...
1791     // b0 b1 b2 b3 ...
1792     __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
1793     __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
1794
1795     _mm_storeu_ps(ptr, u0);
1796     _mm_storeu_ps((ptr + 4), u1);
1797 }
1798
1799 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
1800 {
1801     __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
1802     __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
1803     __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);
1804
1805     _mm_storeu_si128((__m128i*)ptr, t0);
1806     _mm_storeu_si128((__m128i*)(ptr + 2), t1);
1807     _mm_storeu_si128((__m128i*)(ptr + 4), t2);
1808 }
1809
1810 inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
1811 {
1812     v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
1813 }
1814
1815 inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1816 {
1817     v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
1818 }
1819
1820 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
1821 inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
1822                                  _Tpvec& b0, _Tpvec& c0 ) \
1823 { \
1824     _Tpuvec a1, b1, c1; \
1825     v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
1826     a0 = v_reinterpret_as_##suffix(a1); \
1827     b0 = v_reinterpret_as_##suffix(b1); \
1828     c0 = v_reinterpret_as_##suffix(c1); \
1829 } \
1830 inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
1831                                  _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
1832 { \
1833     _Tpuvec a1, b1, c1, d1; \
1834     v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
1835     a0 = v_reinterpret_as_##suffix(a1); \
1836     b0 = v_reinterpret_as_##suffix(b1); \
1837     c0 = v_reinterpret_as_##suffix(c1); \
1838     d0 = v_reinterpret_as_##suffix(d1); \
1839 } \
1840 inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
1841                                const _Tpvec& b0, const _Tpvec& c0 ) \
1842 { \
1843     _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
1844     _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
1845     _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
1846     v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
1847 } \
1848 inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
1849                                const _Tpvec& c0, const _Tpvec& d0 ) \
1850 { \
1851     _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
1852     _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
1853     _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
1854     _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
1855     v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
1856 }
1857
1858 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
1859 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
1860 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
1861 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
1862
1863 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
1864 {
1865     return v_float32x4(_mm_cvtepi32_ps(a.val));
1866 }
1867
1868 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
1869 {
1870     return v_float32x4(_mm_cvtpd_ps(a.val));
1871 }
1872
1873 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
1874 {
1875     return v_float64x2(_mm_cvtepi32_pd(a.val));
1876 }
1877
1878 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
1879 {
1880     return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
1881 }
1882
1883 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
1884 {
1885     return v_float64x2(_mm_cvtps_pd(a.val));
1886 }
1887
1888 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
1889 {
1890     return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val),8))));
1891 }
1892
1893 #if CV_FP16
1894 inline v_float32x4 v_cvt_f32(const v_float16x4& a)
1895 {
1896     return v_float32x4(_mm_cvtph_ps(a.val));
1897 }
1898
1899 inline v_float16x4 v_cvt_f16(const v_float32x4& a)
1900 {
1901     return v_float16x4(_mm_cvtps_ph(a.val, 0));
1902 }
1903 #endif
1904
1905 //! @name Check SIMD support
1906 //! @{
1907 //! @brief Check CPU capability of SIMD operation
1908 static inline bool hasSIMD128()
1909 {
1910     return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false;
1911 }
1912
1913 //! @}
1914
1915 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
1916
1917 //! @endcond
1918
1919 }
1920
1921 #endif