/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef OPENCV_HAL_SSE_HPP
#define OPENCV_HAL_SSE_HPP
#include <algorithm>
#include "opencv2/core/utility.hpp"

#define CV_SIMD128 1
#define CV_SIMD128_64F 1

namespace cv
{

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16
{
    typedef uchar lane_type;
    enum { nlanes = 16 };

    v_uint8x16() : val(_mm_setzero_si128()) {}
    explicit v_uint8x16(__m128i v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    uchar get0() const
    {
        return (uchar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
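
// A minimal usage sketch (illustrative values, not part of the original file):
//   v_uint8x16 v(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
//   uchar first = v.get0(); // reads lane 0 -> 0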

struct v_int8x16
{
    typedef schar lane_type;
    enum { nlanes = 16 };

    v_int8x16() : val(_mm_setzero_si128()) {}
    explicit v_int8x16(__m128i v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    schar get0() const
    {
        return (schar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    enum { nlanes = 8 };

    v_uint16x8() : val(_mm_setzero_si128()) {}
    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    ushort get0() const
    {
        return (ushort)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_int16x8
{
    typedef short lane_type;
    enum { nlanes = 8 };

    v_int16x8() : val(_mm_setzero_si128()) {}
    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    enum { nlanes = 4 };

    v_uint32x4() : val(_mm_setzero_si128()) {}
    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
    }
    unsigned get0() const
    {
        return (unsigned)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_int32x4
{
    typedef int lane_type;
    enum { nlanes = 4 };

    v_int32x4() : val(_mm_setzero_si128()) {}
    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _mm_setr_epi32(v0, v1, v2, v3);
    }
    int get0() const
    {
        return _mm_cvtsi128_si32(val);
    }

    __m128i val;
};

struct v_float32x4
{
    typedef float lane_type;
    enum { nlanes = 4 };

    v_float32x4() : val(_mm_setzero_ps()) {}
    explicit v_float32x4(__m128 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _mm_setr_ps(v0, v1, v2, v3);
    }
    float get0() const
    {
        return _mm_cvtss_f32(val);
    }

    __m128 val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    enum { nlanes = 2 };

    v_uint64x2() : val(_mm_setzero_si128()) {}
    explicit v_uint64x2(__m128i v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    uint64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    }

    __m128i val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    enum { nlanes = 2 };

    v_int64x2() : val(_mm_setzero_si128()) {}
    explicit v_int64x2(__m128i v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    int64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    }

    __m128i val;
};

struct v_float64x2
{
    typedef double lane_type;
    enum { nlanes = 2 };

    v_float64x2() : val(_mm_setzero_pd()) {}
    explicit v_float64x2(__m128d v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        val = _mm_setr_pd(v0, v1);
    }
    double get0() const
    {
        return _mm_cvtsd_f64(val);
    }

    __m128d val;
};

struct v_float16x4
{
    typedef short lane_type;
    enum { nlanes = 4 };

    v_float16x4() : val(_mm_setzero_si128()) {}
    explicit v_float16x4(__m128i v) : val(v) {}
    v_float16x4(short v0, short v1, short v2, short v3)
    {
        val = _mm_setr_epi16(v0, v1, v2, v3, 0, 0, 0, 0);
    }
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};

#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }

OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)

inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }

template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
template<typename _Tpvec> inline
v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }

#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
{ return _Tpvec(_mm_castps_si128(a.val)); } \
inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
{ return _Tpvec(_mm_castpd_si128(a.val)); }

OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)

inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) { return a; }
inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) { return a; }
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) { return v_float32x4(_mm_castpd_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) { return v_float64x2(_mm_castps_pd(a.val)); }
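
// Usage sketch (illustrative): broadcast a scalar and reinterpret the bits.
//   v_float32x4 ones = v_setall_f32(1.0f);
//   v_int32x4 bits = v_reinterpret_as_s32(ones); // each lane: 0x3f800000
// Reinterpretation is a zero-cost cast of the 128-bit register, not a value
// conversion.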

//////////////// PACK ///////////////
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i delta = _mm_set1_epi16(255);
    return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
                                       _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
}

inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16(255);
    __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }

inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
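
// Usage sketch (illustrative): saturating pack of two u16 vectors into one
// u8 vector; values above 255 saturate.
//   v_uint16x8 x = v_setall_u16(300), y = v_setall_u16(7);
//   v_uint8x16 r = v_pack(x, y); // lanes 0..7 = 255, lanes 8..15 = 7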

template<int n> inline
v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
                                       _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

template<int n> inline
v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                       _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }

inline void v_pack_store(schar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }

template<int n> inline
v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                     _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
}
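
// The v_rshr_pack* functions compute (x + (1 << (n-1))) >> n, i.e. a shift
// with round-to-nearest. Worked example for n = 2: x = 10 gives
// (10 + 2) >> 2 = 3, whereas a plain truncating shift gives 10 >> 2 = 2.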

// bit-wise "mask ? a : b"
inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
{
    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
}
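
// The identity b ^ ((a ^ b) & mask) yields a where mask bits are set and b
// where they are clear: with mask = all-ones it reduces to b ^ (a ^ b) = a,
// with mask = 0 it reduces to b.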

inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
    __m128i r = _mm_packs_epi32(a1, b1);
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i r = _mm_packs_epi32(a1, a1);
    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}
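
// SSE2 has no unsigned 32->16 saturating pack, so the two functions above
// emulate it: lanes with the sign bit set (>= 2^31 as unsigned, which must
// saturate anyway) are first replaced by 65535, the range is shifted down by
// 32768 so the signed pack (_mm_packs_epi32) saturates correctly, and the
// bias is added back afterwards via _mm_sub_epi16(r, -32768).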

template<int n> inline
v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(a.val, delta32);
    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, r);
}

template<int n> inline
v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
    return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }

inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
}

template<int n> inline
v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                     _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
}

// [a0 0 | b0 0]  [a1 0 | b1 0]
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
}

inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

// [a0 0 | b0 0]  [a1 0 | b1 0]
inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
}

inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

template<int n> inline
v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline __m128i v_sign_epi64(__m128i a)
{
    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
}

inline __m128i v_srai_epi64(__m128i a, int imm)
{
    __m128i smask = v_sign_epi64(a);
    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
}

template<int n> inline
v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
}
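
// v_matmul computes m0*v[0] + m1*v[1] + m2*v[2] + m3*v[3], i.e. the product
// of a 4x4 matrix (given as its four column vectors m0..m3) with the vector v.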

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
}

#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { \
        return _Tpvec(intrin(a.val, b.val)); \
    } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { \
        a.val = intrin(a.val, b.val); \
        return a; \
    }

OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)

inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
}
inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_int32x4(_mm_unpacklo_epi64(d0, d1));
}
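
// SSE2 lacks a 32-bit lane-wise multiply, so the operators above build it
// from _mm_mul_epu32, which multiplies only the even lanes (0 and 2) into
// 64-bit products: one call handles the even lanes, a second handles the odd
// lanes after a 32-bit shift, and the unpacks re-interleave the low 32 bits
// of each product. Those low 32 bits are identical for signed and unsigned
// inputs, so the same sequence serves v_int32x4.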
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
{
    a = a * b;
    return a;
}
inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
{
    a = a * b;
    return a;
}

inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    c.val = _mm_unpacklo_epi64(c0, c1);
    d.val = _mm_unpackhi_epi64(c0, c1);
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    return v_int32x4(_mm_madd_epi16(a.val, b.val));
}

#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { \
        return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
    }

OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))

inline v_float32x4 v_sqrt(const v_float32x4& x)
{ return v_float32x4(_mm_sqrt_ps(x.val)); }

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    __m128 t = x.val;
    __m128 h = _mm_mul_ps(t, _0_5);
    t = _mm_rsqrt_ps(t);
    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
    return v_float32x4(t);
}
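
// One Newton-Raphson step refines the ~12-bit _mm_rsqrt_ps estimate t:
// t' = t * (1.5 - 0.5 * x * t^2), roughly doubling the number of correct bits.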

inline v_float64x2 v_sqrt(const v_float64x2& x)
{ return v_float64x2(_mm_sqrt_pd(x.val)); }

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    static const __m128d v_1 = _mm_set1_pd(1.);
    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}

#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }

OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
inline v_uint32x4 v_abs(const v_int32x4& x)
{
    __m128i s = _mm_srli_epi32(x.val, 31);
    __m128i f = _mm_srai_epi32(x.val, 31);
    return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
}
inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
inline v_float64x2 v_abs(const v_float64x2& x)
{
    return v_float64x2(_mm_and_pd(x.val,
        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
}

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)

inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
{
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
}
inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
{
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
}
inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
}
inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
{
    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
}
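
// SSE2 has no _mm_min_epu16/_mm_max_epu16, so the u16 versions rely on
// saturating arithmetic: a - (a -sat b) equals min(a, b), since (a -sat b)
// is 0 whenever a <= b; similarly (a -sat b) +sat b equals max(a, b).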

inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, b.val, a.val));
}
inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, a.val, b.val));
}
inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
{
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
}
inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
{
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
}

#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
}

OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
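
// The unsigned comparisons above reuse the signed _mm_cmpgt_* instructions by
// flipping the sign bit of both operands (XOR with smask): x ^ 0x80... maps
// unsigned order onto signed order, e.g. for u8, 0 maps to -128 and 255 to 127.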

#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }

OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)

#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }

OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64);
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64);

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)

#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
{ \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
} \
inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i smask = _mm_set1_epi32(smask32); \
    __m128i a1 = _mm_xor_si128(a.val, smask); \
    __m128i b1 = _mm_xor_si128(b.val, smask); \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
}

OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)

inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{
    return v_max(a, b) - v_min(a, b);
}

inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    __m128i d = _mm_sub_epi32(a.val, b.val);
    __m128i m = _mm_cmpgt_epi32(b.val, a.val);
    return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}

#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    return _Tpvec(_mm_sqrt_##suffix(res)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    return _Tpvec(res); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
}

OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))

#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
}

OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a)
{
    enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(_mm_srli_si128(a.val, CV_SHIFT));
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a)
{
    enum { CV_SHIFT = imm*(sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(_mm_slli_si128(a.val, CV_SHIFT));
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
{
    enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) };
    enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(_mm_or_si128(_mm_srli_si128(a.val, CV_SHIFT1), _mm_slli_si128(b.val, CV_SHIFT2)));
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
{
    enum { CV_SHIFT1 = imm*(sizeof(typename _Tpvec::lane_type)) };
    enum { CV_SHIFT2 = 16 - imm*(sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(_mm_or_si128(_mm_slli_si128(a.val, CV_SHIFT1), _mm_srli_si128(b.val, CV_SHIFT2)));
}
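
// Usage sketch (illustrative): lane-wise rotation across two registers.
//   v_int32x4 a(1, 2, 3, 4), b(5, 6, 7, 8);
//   v_int32x4 r = v_rotate_right<1>(a, b); // lanes: 2 3 4 5
// The single-argument forms shift in zeros instead of lanes of b.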

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }

OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_castsi128_##suffix( \
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)

inline v_float16x4 v_load_f16(const short* ptr)
{ return v_float16x4(_mm_loadl_epi64((const __m128i*)ptr)); }
inline void v_store_f16(short* ptr, const v_float16x4& a)
{ _mm_storel_epi64((__m128i*)ptr, a.val); }

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi16(sbit); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
    return (unsigned scalartype)_mm_cvtsi128_si32(val); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    regtype val = a.val; \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
    return (scalartype)_mm_cvt##extract(val); \
}

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)

inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_SSE3
    __m128 ab = _mm_hadd_ps(a.val, b.val);
    __m128 cd = _mm_hadd_ps(c.val, d.val);
    return v_float32x4(_mm_hadd_ps(ab, cd));
#else
    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
#endif
}
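
// v_reduce_sum4 effectively transposes the four inputs and sums, so lane i of
// the result is the horizontal sum of the i-th argument; e.g. lane 0 holds
// a[0] + a[1] + a[2] + a[3].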

OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)

#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
    __m128i m1 = _mm_set1_epi32(0x55555555); \
    __m128i m2 = _mm_set1_epi32(0x33333333); \
    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
    __m128i p = a.val; \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
    p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
    p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
    return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
}

OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
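
// This is the classic SWAR popcount: each masked add folds neighbouring bit
// groups (1-bit, then 2-bit, then 4-bit counts per byte), and the final byte
// adds gather a per-32-bit-lane total. Worked example for one byte:
// 0b10110100 -> pair counts 01,10,01,00 -> nibble counts 0011,0001 -> 4.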

#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
{ \
    return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
} \
inline bool v_check_all(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
inline bool v_check_any(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }

#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
inline __m128i v_packq_epi32(__m128i a)
{
    __m128i b = _mm_packs_epi32(a, a);
    return _mm_packs_epi16(b, b);
}

OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)

#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)

#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
{ \
    __m128i z = _mm_setzero_si128(); \
    b0.val = _mm_unpacklo_##suffix(a.val, z); \
    b1.val = _mm_unpackhi_##suffix(a.val, z); \
} \
inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
{ \
    __m128i z = _mm_setzero_si128(); \
    return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
} \
inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
{ \
    b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
    b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
} \
inline _Tpwsvec v_load_expand(const _Tps* ptr) \
{ \
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
    return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
}

OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)

inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
{
    __m128i z = _mm_setzero_si128();
    b0.val = _mm_unpacklo_epi32(a.val, z);
    b1.val = _mm_unpackhi_epi32(a.val, z);
}
inline v_uint64x2 v_load_expand(const unsigned* ptr)
{
    __m128i z = _mm_setzero_si128();
    return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
}
inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
{
    __m128i s = _mm_srai_epi32(a.val, 31);
    b0.val = _mm_unpacklo_epi32(a.val, s);
    b1.val = _mm_unpackhi_epi32(a.val, s);
}
inline v_int64x2 v_load_expand(const int* ptr)
{
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
    __m128i s = _mm_srai_epi32(a, 31);
    return v_int64x2(_mm_unpacklo_epi32(a, s));
}

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    __m128i z = _mm_setzero_si128();
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    a = _mm_unpacklo_epi8(a, a);
    a = _mm_unpacklo_epi8(a, a);
    return v_int32x4(_mm_srai_epi32(a, 24));
}
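
// Sign extension without a dedicated instruction: duplicating each byte twice
// with _mm_unpacklo_epi8 leaves it replicated across a 32-bit lane, and the
// arithmetic shift right by 24 keeps the top copy while replicating its sign
// bit across the rest of the lane.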

#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
}

OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)

template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
    const int w = sizeof(typename _Tpvec::lane_type);
    const int n = _Tpvec::nlanes;
    __m128i ra, rb;
    ra = _mm_srli_si128(a.val, s*w);
    rb = _mm_slli_si128(b.val, (n-s)*w);
    return _Tpvec(_mm_or_si128(ra, rb));
}
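
// Usage sketch (illustrative): v_extract<s> concatenates the top lanes of a
// with the bottom lanes of b.
//   v_int32x4 a(1, 2, 3, 4), b(5, 6, 7, 8);
//   v_int32x4 r = v_extract<2>(a, b); // lanes: 3 4 5 6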

inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(_mm_cvtps_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(_mm_cvttps_epi32(a.val)); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }

#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
\
    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
}

OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)

// adapted from sse_utils.hpp
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi8(t00, t01);
    __m128i t11 = _mm_unpackhi_epi8(t00, t01);

    __m128i t20 = _mm_unpacklo_epi8(t10, t11);
    __m128i t21 = _mm_unpackhi_epi8(t10, t11);

    __m128i t30 = _mm_unpacklo_epi8(t20, t21);
    __m128i t31 = _mm_unpackhi_epi8(t20, t21);

    a.val = _mm_unpacklo_epi8(t30, t31);
    b.val = _mm_unpackhi_epi8(t30, t31);
}

inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
}
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr);        // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...

    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...

    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi8(v0, v1);
    b.val = _mm_unpackhi_epi8(v0, v1);
    c.val = _mm_unpacklo_epi8(v2, v3);
    d.val = _mm_unpackhi_epi8(v2, v3);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr);        // a0 b0 c0 d0 a1 b1 c1 d1
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8));  // a2 b2 c2 d2 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...

    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi16(u0, u1);
    b.val = _mm_unpackhi_epi16(u0, u1);
    c.val = _mm_unpacklo_epi16(u2, u3);
    d.val = _mm_unpackhi_epi16(u2, u3);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));

    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
    v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
    v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
    v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}

inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));

    a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
    c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
}

inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
{
    v_uint64x2 t0, t1, t2;
    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
    a = v_reinterpret_as_s64(t0);
    b = v_reinterpret_as_s64(t1);
    c = v_reinterpret_as_s64(t2);
}

inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
{
    v_uint64x2 t0, t1, t2;
    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
    a = v_reinterpret_as_f64(t0);
    b = v_reinterpret_as_f64(t1);
    c = v_reinterpret_as_f64(t2);
}

// 2-channel, float only
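// _MM_SHUFFLE(2, 0, 2, 0) picks the even (a) lanes from both inputs and
// _MM_SHUFFLE(3, 1, 3, 1) the odd (b) lanes, so one shuffle per output register
// de-interleaves the eight floats.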
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
    __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3

    a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
    b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}

inline void v_store_interleave( short* ptr, const v_int16x8& a, const v_int16x8& b )
{
    __m128i t0, t1;
    t0 = _mm_unpacklo_epi16(a.val, b.val);
    t1 = _mm_unpackhi_epi16(a.val, b.val);
    _mm_storeu_si128((__m128i*)(ptr), t0);
    _mm_storeu_si128((__m128i*)(ptr + 8), t1);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
}

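// 3-channel stores fake a fourth (zero) channel, interleave as if there were four
// channels, then squeeze each 4-byte group down to 3 bytes with shift/or combines
// before the three 16-byte stores.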
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d)
{
    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
}

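// The 3-channel ushort writer below follows essentially the same zero-padding
// scheme as the uchar path above, with 16-bit lanes and the byte shifts scaled
// to match.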
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d)
{
    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
}

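// For 32-bit lanes a 3-channel store can reuse v_transpose4x4: transpose a, b, c
// plus a zero row, then shift/or compaction drops the zero padding before storing.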
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c )
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);

    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
}

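// With four 32-bit channels, interleaving is exactly a 4x4 transpose, so
// v_transpose4x4 does all the work and the results are stored directly.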
inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d)
{
    v_uint32x4 t0, t1, t2, t3;
    v_transpose4x4(a, b, c, d, t0, t1, t2, t3);

    v_store(ptr, t0);
    v_store(ptr + 4, t1);
    v_store(ptr + 8, t2);
    v_store(ptr + 12, t3);
}

// 2-channel, float only
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
{
    __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
    __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3

    _mm_storeu_ps(ptr, u0);
    _mm_storeu_ps((ptr + 4), u1);
}

inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
    __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);

    _mm_storeu_si128((__m128i*)ptr, t0);
    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
    _mm_storeu_si128((__m128i*)(ptr + 4), t2);
}

inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
{
    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}

inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}

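// Signed and float element types reinterpret to the matching unsigned type and
// forward to the implementations above; the reinterpret does not change the bits.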
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
{ \
    _Tpuvec a1, b1, c1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
{ \
    _Tpuvec a1, b1, c1, d1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
    d0 = v_reinterpret_as_##suffix(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
                                const _Tpvec& b0, const _Tpvec& c0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
                                const _Tpvec& c0, const _Tpvec& d0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)

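// Illustrative usage sketch (not part of this header's API): the helpers above
// let channel-wise kernels run directly on packed pixel rows. For example,
// swapping the first and third channels of a packed 3-channel uchar row, assuming
// the pixel count n is a multiple of 16, could look like:
//
//   void swap_channels_0_2(uchar* row, int n)
//   {
//       for( int i = 0; i < n; i += 16 )
//       {
//           v_uint8x16 c0, c1, c2;
//           v_load_deinterleave(row + i*3, c0, c1, c2);
//           v_store_interleave(row + i*3, c2, c1, c0);
//       }
//   }
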
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val, 8)));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a.val), 8))));
}

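// Half-precision conversions: _mm_cvtph_ps / _mm_cvtps_ph are F16C instructions,
// so the two overloads below are only compiled when the build enables CV_FP16.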
#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x4& a)
{
    return v_float32x4(_mm_cvtph_ps(a.val));
}

inline v_float16x4 v_cvt_f16(const v_float32x4& a)
{
    return v_float16x4(_mm_cvtps_ph(a.val, 0));
}
#endif

//! @name Check SIMD support
//! @{
//! @brief Check CPU capability for SIMD operations
static inline bool hasSIMD128()
{
    return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false;
}

//! @}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

//! @endcond

}

#endif