/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_SSE_HPP
#define OPENCV_HAL_SSE_HPP

#include <algorithm>
#include "opencv2/core/utility.hpp"

#define CV_SIMD128 1
#define CV_SIMD128_64F 1

namespace cv
{

//! @cond IGNORED

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16
{
    typedef uchar lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 16 };

    v_uint8x16() : val(_mm_setzero_si128()) {}
    explicit v_uint8x16(__m128i v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    uchar get0() const
    {
        return (uchar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_int8x16
{
    typedef schar lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 16 };

    v_int8x16() : val(_mm_setzero_si128()) {}
    explicit v_int8x16(__m128i v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    schar get0() const
    {
        return (schar)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_uint16x8
{
    typedef ushort lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    v_uint16x8() : val(_mm_setzero_si128()) {}
    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    ushort get0() const
    {
        return (ushort)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_int16x8
{
    typedef short lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    v_int16x8() : val(_mm_setzero_si128()) {}
    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_uint32x4
{
    typedef unsigned lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };

    v_uint32x4() : val(_mm_setzero_si128()) {}
    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
    }
    unsigned get0() const
    {
        return (unsigned)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_int32x4
{
    typedef int lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };

    v_int32x4() : val(_mm_setzero_si128()) {}
    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _mm_setr_epi32(v0, v1, v2, v3);
    }
    int get0() const
    {
        return _mm_cvtsi128_si32(val);
    }

    __m128i val;
};
struct v_float32x4
{
    typedef float lane_type;
    typedef __m128 vector_type;
    enum { nlanes = 4 };

    v_float32x4() : val(_mm_setzero_ps()) {}
    explicit v_float32x4(__m128 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _mm_setr_ps(v0, v1, v2, v3);
    }
    float get0() const
    {
        return _mm_cvtss_f32(val);
    }

    __m128 val;
};
struct v_uint64x2
{
    typedef uint64 lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 2 };

    v_uint64x2() : val(_mm_setzero_si128()) {}
    explicit v_uint64x2(__m128i v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    uint64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    }

    __m128i val;
};
struct v_int64x2
{
    typedef int64 lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 2 };

    v_int64x2() : val(_mm_setzero_si128()) {}
    explicit v_int64x2(__m128i v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    int64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    }

    __m128i val;
};
struct v_float64x2
{
    typedef double lane_type;
    typedef __m128d vector_type;
    enum { nlanes = 2 };

    v_float64x2() : val(_mm_setzero_pd()) {}
    explicit v_float64x2(__m128d v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        val = _mm_setr_pd(v0, v1);
    }
    double get0() const
    {
        return _mm_cvtsd_f64(val);
    }

    __m128d val;
};
struct v_float16x8
{
    typedef short lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    v_float16x8() : val(_mm_setzero_si128()) {}
    explicit v_float16x8(__m128i v) : val(v) {}
    v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
    }
    short get0() const
    {
        return (short)_mm_cvtsi128_si32(val);
    }

    __m128i val;
};
inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); }
inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); }
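
// Illustrative usage of the wrappers above (the variable names are hypothetical;
// v_setzero_f32 and friends are defined later in this file):
//
//     v_float32x4 a(1.f, 2.f, 3.f, 4.f);  // lanes set via _mm_setr_ps
//     v_float32x4 z = v_setzero_f32();    // all lanes zero
//     float first = a.get0();             // reads lane 0 only -> 1.f
//
// Each struct is a thin, zero-overhead wrapper around a single 128-bit register.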
namespace hal_sse_internal
{
    template <typename to_sse_type, typename from_sse_type>
    to_sse_type v_sse_reinterpret_as(const from_sse_type& val);

#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
    template <> inline \
    to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
    { return sse_cast_intrin(a); }

    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
}
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }

OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)

inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
template<typename _Tpvec> inline
v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }

#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
{ return _Tpvec(_mm_castps_si128(a.val)); } \
inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
{ return _Tpvec(_mm_castpd_si128(a.val)); }

OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)

inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) { return a; }
inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) { return a; }
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) { return v_float32x4(_mm_castpd_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) { return v_float64x2(_mm_castps_pd(a.val)); }
//////////////// PACK ///////////////
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i delta = _mm_set1_epi16(255);
    return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
                                       _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
}
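
// Note on the pattern above: SSE2 has no unsigned 16-bit minimum, so min(a, 255)
// is built from saturating subtraction: _mm_subs_epu16(a, 255) == max(a - 255, 0),
// and subtracting that from a clamps every lane to [0, 255]. The clamp is needed
// because _mm_packus_epi16 treats its input as signed, so lanes >= 0x8000 would
// otherwise saturate to 0 instead of 255.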
inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16(255);
    __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }

inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }

template<int n> inline
v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
                                       _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

template<int n> inline
v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                       _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}
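
// All v_rshr_* variants implement round-half-up: adding 1 << (n-1) before the
// shift turns truncation into rounding to nearest. For example, with n == 2,
// (7 + 2) >> 2 == 2 (since 7/4 == 1.75 rounds to 2), while plain 7 >> 2 == 1.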
inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }

inline void v_pack_store(schar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }

template<int n> inline
v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                     _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
}
// byte-wise "mask ? a : b"
inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
{
#if CV_SSE4_1
    return _mm_blendv_epi8(b, a, mask);
#else
    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
#endif
}
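
// The SSE2 fallback is the classic branchless blend: b ^ ((a ^ b) & mask)
// yields a wherever mask bits are 1 and b wherever they are 0, so it matches
// _mm_blendv_epi8 as long as each lane of the mask is all-ones or all-zeros.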
inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
    __m128i r = _mm_packs_epi32(a1, b1);
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}
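
// Rationale for the bias, in outline: SSE2 only provides a *signed* 32->16 pack,
// so lanes are clamped toward [0, 65535], shifted down by 32768 into the signed
// range, packed with signed saturation, and then shifted back up by subtracting
// -32768 (i.e. adding 32768) in 16-bit arithmetic.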
inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i r = _mm_packs_epi32(a1, a1);
    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

template<int n> inline
v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(a.val, delta32);
    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, r);
}

template<int n> inline
v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
    return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }

inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
}

template<int n> inline
v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                     _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
}

// pack 64-bit lanes to 32 bits by truncation: (a0 | a1), (b0 | b1) -> (a0 a1 b0 b1)
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); // a0.lo a1.lo b0.lo b1.lo
}

inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

// pack 64-bit lanes to 32 bits by truncation: (a0 | a1), (b0 | b1) -> (a0 a1 b0 b1)
inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0.lo b0.lo a0.hi b0.hi
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1.lo b1.lo a1.hi b1.hi
    return v_int32x4(_mm_unpacklo_epi32(v0, v1));  // a0.lo a1.lo b0.lo b1.lo
}

inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

template<int n> inline
v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0.lo b0.lo a0.hi b0.hi
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1.lo b1.lo a1.hi b1.hi
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}
inline __m128i v_sign_epi64(__m128i a)
{
    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // m0 m0 | m1 m1
}

inline __m128i v_srai_epi64(__m128i a, int imm)
{
    __m128i smask = v_sign_epi64(a);
    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
}
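
// SSE2 has no 64-bit arithmetic right shift, so v_srai_epi64 emulates one:
// with m = sign(a) replicated across all bits, (a ^ m) is non-negative, the
// logical shift on it is then exact, and xoring with m again restores the
// sign, since x >> n == ~(~x >> n) for negative x under arithmetic shifting.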
template<int n> inline
v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0.lo b0.lo a0.hi b0.hi
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1.lo b1.lo a1.hi b1.hi
    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
}
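
// v_matmul multiplies a 4x4 matrix, given as its four columns m0..m3, by the
// column vector v: each lane of v is broadcast and accumulated lane-wise, so
// dst = v0*m0 + v1*m1 + v2*m2 + v3*m3. Illustrative call (the caller-side
// names here are hypothetical):
//
//     v_float32x4 y = v_matmul(x, col0, col1, col2, col3);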
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
}

#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
} \
inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
{ \
    a.val = intrin(a.val, b.val); \
    return a; \
}

OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
}
inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_mullo_epi32(a.val, b.val));
#else
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_int32x4(_mm_unpacklo_epi64(d0, d1));
#endif
}
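
// SSE2 lacks a 32-bit low multiply (_mm_mullo_epi32 is SSE4.1), so the
// fallback builds one from _mm_mul_epu32, which multiplies lanes 0 and 2 into
// two 64-bit products; shifting a and b right by 32 exposes lanes 1 and 3,
// and the unpack sequence re-interleaves the four low 32-bit halves back into
// their original lane order. The same bit pattern is valid for the signed
// overload because the low 32 bits of a product do not depend on signedness.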
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
{
    a = a * b;
    return a;
}
inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
{
    a = a * b;
    return a;
}
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    c.val = _mm_unpacklo_epi64(c0, c1);
    d.val = _mm_unpackhi_epi64(c0, c1);
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    return v_int32x4(_mm_madd_epi16(a.val, b.val));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
}
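
// _mm_madd_epi16 multiplies eight signed 16-bit pairs and sums adjacent
// products into four 32-bit lanes, i.e. dst_i = a[2i]*b[2i] + a[2i+1]*b[2i+1],
// which is exactly the dot-product-expand contract of v_dotprod.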
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
inline _Tpvec operator ~ (const _Tpvec& a) \
{ \
    return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
}

OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
inline v_float32x4 v_sqrt(const v_float32x4& x)
{ return v_float32x4(_mm_sqrt_ps(x.val)); }

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    __m128 t = x.val;
    __m128 h = _mm_mul_ps(t, _0_5);
    t = _mm_rsqrt_ps(t);
    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
    return v_float32x4(t);
}
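
// v_invsqrt refines the ~12-bit _mm_rsqrt_ps estimate with one Newton-Raphson
// step for 1/sqrt(x): y1 = y0 * (1.5 - 0.5 * x * y0^2), which roughly doubles
// the number of correct bits at the cost of a few extra multiplies.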
inline v_float64x2 v_sqrt(const v_float64x2& x)
{ return v_float64x2(_mm_sqrt_pd(x.val)); }

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    static const __m128d v_1 = _mm_set1_pd(1.);
    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}

#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }

OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
inline v_uint32x4 v_abs(const v_int32x4& x)
{
    __m128i s = _mm_srli_epi32(x.val, 31);
    __m128i f = _mm_srai_epi32(x.val, 31);
    return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
}
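
// Branchless two's-complement absolute value: f = x >> 31 (arithmetic) is
// all-ones for negative lanes, s = x >> 31 (logical) is 1 there, and
// (x ^ f) + s == ~x + 1 == -x for those lanes while leaving non-negative
// lanes untouched. Note that |INT_MIN| still wraps to INT_MIN, as it must
// in 32-bit arithmetic.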
inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
inline v_float64x2 v_abs(const v_float64x2& x)
{
    return v_float64x2(_mm_and_pd(x.val,
        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
}

// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
    return v_int8x16(_mm_min_epi8(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
#endif
}
inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
    return v_int8x16(_mm_max_epi8(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
#endif
}
inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_SSE4_1
    return v_uint16x8(_mm_min_epu16(a.val, b.val));
#else
    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
#endif
}
inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_SSE4_1
    return v_uint16x8(_mm_max_epu16(a.val, b.val));
#else
    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
#endif
}
inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_SSE4_1
    return v_uint32x4(_mm_min_epu32(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, b.val, a.val));
#endif
}
inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_SSE4_1
    return v_uint32x4(_mm_max_epu32(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, a.val, b.val));
#endif
}
inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_min_epi32(a.val, b.val));
#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
#endif
}
inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_max_epi32(a.val, b.val));
#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
#endif
}
#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
}

OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }

OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)

#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }

OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
{ \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
} \
inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i smask = _mm_set1_epi32(smask32); \
    __m128i a1 = _mm_xor_si128(a.val, smask); \
    __m128i b1 = _mm_xor_si128(b.val, smask); \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
}

OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)

inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{
    return v_max(a, b) - v_min(a, b);
}

inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    __m128i d = _mm_sub_epi32(a.val, b.val);
    __m128i m = _mm_cmpgt_epi32(b.val, a.val);
    return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
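
// Same sign trick as v_abs: m is all-ones exactly where a < b, i.e. where
// d = a - b is negative (modulo wraparound), and (d ^ m) - m negates those
// lanes, yielding |a - b| as an unsigned result.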
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return a * b + c;
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_FMA3
    return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
#else
    return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
#endif
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
#if CV_FMA3
    return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
#else
    return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
#endif
}
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpvec res = v_fma(a, a, b*b); \
    return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return v_fma(a, b, c); \
}

OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
}

OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
namespace hal_sse_internal
{
    template <int imm,
        bool is_invalid = ((imm < 0) || (imm > 16)),
        bool is_first = (imm == 0),
        bool is_half = (imm == 8),
        bool is_second = (imm == 16),
        bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
    class v_sse_palignr_u8_class;

    template <int imm>
    class v_sse_palignr_u8_class<imm, true, false, false, false, false>;

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, true, false, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i&) const
        {
            return a;
        }
    };

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, true, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
        }
    };

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, true, false>
    {
    public:
        inline __m128i operator()(const __m128i&, const __m128i& b) const
        {
            return b;
        }
    };

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, false, true>
    {
#if CV_SSSE3
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            return _mm_alignr_epi8(b, a, imm);
        }
#else
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            enum { imm2 = (sizeof(__m128i) - imm) };
            return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
        }
#endif
    };

    template <int imm>
    inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
    {
        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
        return v_sse_palignr_u8_class<imm>()(a, b);
    }
}
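
// Why the tag dispatch above: the byte count for _mm_alignr_epi8 (and for the
// SSE2 or/shift fallback) must be a compile-time constant, and the boundary
// cases imm == 0, 8 and 16 have cheaper exact forms (pass a through, splice
// the two 64-bit halves, pass b through), so each case is routed to its own
// specialization; out-of-range imm is rejected at compile time through the
// deliberately undefined is_invalid specialization and the static assert.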
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_srli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_slli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(a.val),
            v_sse_reinterpret_as<__m128i>(b.val))));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_sse_internal;
    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(b.val),
            v_sse_reinterpret_as<__m128i>(a.val))));
}
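
// Illustrative behaviour (the values shown are hypothetical): for
// a = (a0, a1, a2, a3) and b = (b0, b1, b2, b3) as v_int32x4,
//
//     v_rotate_right<1>(a)    -> (a1, a2, a3, 0)
//     v_rotate_right<1>(a, b) -> (a1, a2, a3, b0)
//     v_rotate_left<1>(a, b)  -> (b3, a0, a1, a2)
//
// i.e. "rotate" is a lane shift across the register (pair), not a wrap-around
// of a single vector.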
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }

OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)

#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_castsi128_##suffix( \
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)

inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }

inline void v_store(short* ptr, const v_float16x8& a)
{ _mm_storeu_si128((__m128i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ _mm_store_si128((__m128i*)ptr, a.val); }
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi16(sbit); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
    return (unsigned scalartype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    regtype val = a.val; \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
    return (scalartype)_mm_cvt##extract(val); \
}

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_SSE3
    __m128 ab = _mm_hadd_ps(a.val, b.val);
    __m128 cd = _mm_hadd_ps(c.val, d.val);
    return v_float32x4(_mm_hadd_ps(ab, cd));
#else
    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
#endif
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)

#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
    __m128i m1 = _mm_set1_epi32(0x55555555); \
    __m128i m2 = _mm_set1_epi32(0x33333333); \
    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
    __m128i p = a.val; \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
    p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
    p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
    return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
}

OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
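
// The popcount kernel is the classic SWAR ladder: the three masked-add steps
// widen the per-field bit counts from 1 to 2 to 4 bits, leaving each byte
// holding its own popcount (0..8); the two byte-wise shifted adds then
// accumulate the four byte counts of every 32-bit lane into that lane's low
// byte, which the final mask keeps. Counts therefore land per 32-bit lane
// regardless of the input's element width, which is why one macro serves all
// six instantiations.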
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
{ \
    return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
} \
inline bool v_check_all(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
inline bool v_check_any(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }

#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
inline __m128i v_packq_epi32(__m128i a)
{
    __m128i b = _mm_packs_epi32(a, a);
    return _mm_packs_epi16(b, b);
}

OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
#if CV_SSE4_1

#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)

#else // CV_SSE4_1

#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)

#endif
#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
{ \
    __m128i z = _mm_setzero_si128(); \
    b0.val = _mm_unpacklo_##suffix(a.val, z); \
    b1.val = _mm_unpackhi_##suffix(a.val, z); \
} \
inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
{ \
    __m128i z = _mm_setzero_si128(); \
    return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
} \
inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
{ \
    b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
    b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
} \
inline _Tpwsvec v_load_expand(const _Tps* ptr) \
{ \
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
    return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
}

OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)

inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
{
    __m128i z = _mm_setzero_si128();
    b0.val = _mm_unpacklo_epi32(a.val, z);
    b1.val = _mm_unpackhi_epi32(a.val, z);
}
inline v_uint64x2 v_load_expand(const unsigned* ptr)
{
    __m128i z = _mm_setzero_si128();
    return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
}
inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
{
    __m128i s = _mm_srai_epi32(a.val, 31);
    b0.val = _mm_unpacklo_epi32(a.val, s);
    b1.val = _mm_unpackhi_epi32(a.val, s);
}
inline v_int64x2 v_load_expand(const int* ptr)
{
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
    __m128i s = _mm_srai_epi32(a, 31);
    return v_int64x2(_mm_unpacklo_epi32(a, s));
}

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    __m128i z = _mm_setzero_si128();
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    a = _mm_unpacklo_epi8(a, a);
    a = _mm_unpacklo_epi8(a, a);
    return v_int32x4(_mm_srai_epi32(a, 24));
}
#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
}

OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)

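// Usage sketch (illustrative): with a0 = (x0 x1 x2 x3) and a1 = (y0 y1 y2 y3),
//
//     v_float32x4 b0, b1;
//     v_zip(a0, a1, b0, b1);                    // b0 = (x0 y0 x1 y1), b1 = (x2 y2 x3 y3)
//     v_float32x4 lo = v_combine_low(a0, a1);   // (x0 x1 y0 y1)
//     v_float32x4 hi = v_combine_high(a0, a1);  // (x2 x3 y2 y3)
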
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
    return v_rotate_right<s>(a, b);
}

inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(_mm_cvtps_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(_mm_cvttps_epi32(a.val)); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }

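// Worked example of the fix-up trick used by v_floor/v_ceil above:
// _mm_cvtps_epi32 rounds to nearest-even, and the comparison yields an
// all-ones lane (i.e. -1) exactly where the rounded value overshot, so
//
//     v_floor(v_float32x4(1.5f, -1.5f, 2.f, -2.f)) == v_int32x4(1, -2, 2, -2)
//       // cvt -> (2, -2, 2, -2); mask = (-1, 0, 0, 0); add -> (1, -2, 2, -2)
//     v_ceil(v_float32x4(1.5f, -1.5f, 2.f, -2.f))  == v_int32x4(2, -1, 2, -2)
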
#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
\
    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
}

OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)

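// Usage sketch (illustrative, assuming a row-major 4x4 float matrix 'm'):
//
//     v_float32x4 r0 = v_load(m), r1 = v_load(m + 4), r2 = v_load(m + 8), r3 = v_load(m + 12);
//     v_float32x4 c0, c1, c2, c3;
//     v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3); // c0 = (m[0], m[4], m[8], m[12])
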
// load deinterleave
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi8(t00, t01);
    __m128i t11 = _mm_unpackhi_epi8(t00, t01);

    __m128i t20 = _mm_unpacklo_epi8(t10, t11);
    __m128i t21 = _mm_unpackhi_epi8(t10, t11);

    __m128i t30 = _mm_unpacklo_epi8(t20, t21);
    __m128i t31 = _mm_unpackhi_epi8(t20, t21);

    a.val = _mm_unpacklo_epi8(t30, t31);
    b.val = _mm_unpackhi_epi8(t30, t31);
}

inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSE4_1
    static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
    __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
    __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
    __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
    static const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
    static const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
    static const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
    a0 = _mm_shuffle_epi8(a0, sh_b);
    b0 = _mm_shuffle_epi8(b0, sh_g);
    c0 = _mm_shuffle_epi8(c0, sh_r);
    a.val = a0;
    b.val = b0;
    c.val = c0;
#elif CV_SSSE3
    static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
    static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
    static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);

    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i s0 = _mm_shuffle_epi8(t0, m0);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);

    t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
    a.val = _mm_alignr_epi8(s2, t0, 5);

    t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
    b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);

    t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
    c.val = _mm_alignr_epi8(t2, s0, 11);
#else
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
#endif
}

inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr);        // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...

    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...

    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi8(v0, v1);
    b.val = _mm_unpackhi_epi8(v0, v1);
    c.val = _mm_unpacklo_epi8(v2, v3);
    d.val = _mm_unpackhi_epi8(v2, v3);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));     // a0 b0 a1 b1 a2 b2 a3 b3
    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7

    __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
    __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
    __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
    __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7

    a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
    b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
#if CV_SSE4_1
    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
    __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
    __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
    __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
    __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);

    static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    static const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
    static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    a0 = _mm_shuffle_epi8(a0, sh_a);
    b0 = _mm_shuffle_epi8(b0, sh_b);
    c0 = _mm_shuffle_epi8(c0, sh_c);
    a.val = a0;
    b.val = b0;
    c.val = c0;
#else
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
#endif
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr);        // a0 b0 c0 d0 a1 b1 c1 d1
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8));  // a2 b2 c2 d2 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...

    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi16(u0, u1);
    b.val = _mm_unpackhi_epi16(u0, u1);
    c.val = _mm_unpacklo_epi16(u2, u3);
    d.val = _mm_unpackhi_epi16(u2, u3);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
{
    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));     // a0 b0 a1 b1
    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3

    __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
    __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3

    a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
    b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));

    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
    v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
    v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
    v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

    v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
}

inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
    __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3

    a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
    b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}

inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
{
    __m128 t0 = _mm_loadu_ps(ptr + 0);
    __m128 t1 = _mm_loadu_ps(ptr + 4);
    __m128 t2 = _mm_loadu_ps(ptr + 8);

    __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
    a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));

    __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
    __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
    b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));

    __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
    c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
}

inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
{
    __m128 t0 = _mm_loadu_ps(ptr + 0);
    __m128 t1 = _mm_loadu_ps(ptr + 4);
    __m128 t2 = _mm_loadu_ps(ptr + 8);
    __m128 t3 = _mm_loadu_ps(ptr + 12);
    __m128 t02lo = _mm_unpacklo_ps(t0, t2);
    __m128 t13lo = _mm_unpacklo_ps(t1, t3);
    __m128 t02hi = _mm_unpackhi_ps(t0, t2);
    __m128 t13hi = _mm_unpackhi_ps(t1, t3);
    a.val = _mm_unpacklo_ps(t02lo, t13lo);
    b.val = _mm_unpackhi_ps(t02lo, t13lo);
    c.val = _mm_unpacklo_ps(t02hi, t13hi);
    d.val = _mm_unpackhi_ps(t02hi, t13hi);
}

inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
{
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));

    a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
    b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
}

inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);       // a0, b0
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1

    t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0

    a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
    c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
}

inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
                                v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
{
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);       // a0 b0
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
    __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1

    a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
    b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
    c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
    d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
}

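// Usage sketch (illustrative, assuming 'bgr' points at 48 readable bytes of
// packed B,G,R pixels): deinterleaving yields one register per channel,
//
//     v_uint8x16 b, g, r;
//     v_load_deinterleave(bgr, b, g, r); // b = (B0..B15), g = (G0..G15), r = (R0..R15)
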
// store interleave

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
#if CV_SSE4_1
    static const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
    static const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
    static const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);

    static const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
    static const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
    __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
    __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
    __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
#elif CV_SSSE3
    static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
    static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
    static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);

    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
    t0 = _mm_alignr_epi8(c.val, t0, 5);
    __m128i s0 = _mm_shuffle_epi8(t0, m0);

    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);

    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
    t2 = _mm_alignr_epi8(t2, a.val, 11);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);

    _mm_storeu_si128((__m128i*)ptr, s0);
    _mm_storeu_si128((__m128i*)(ptr + 16), s1);
    _mm_storeu_si128((__m128i*)(ptr + 32), s2);
#else
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
#endif
}

inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d)
{
    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b )
{
    __m128i t0, t1;
    t0 = _mm_unpacklo_epi16(a.val, b.val);
    t1 = _mm_unpackhi_epi16(a.val, b.val);
    _mm_storeu_si128((__m128i*)(ptr), t0);
    _mm_storeu_si128((__m128i*)(ptr + 8), t1);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
#if CV_SSE4_1
    static const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
    static const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
    static const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
    __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
    __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
    __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);

    __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
    __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
    __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
#else
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
#endif
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d)
{
    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b )
{
    __m128i t0 = _mm_unpacklo_epi32(a.val, b.val);
    __m128i t1 = _mm_unpackhi_epi32(a.val, b.val);

    _mm_storeu_si128((__m128i*)ptr, t0);
    _mm_storeu_si128((__m128i*)(ptr + 4), t1);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c )
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);

    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d)
{
    v_uint32x4 t0, t1, t2, t3;
    v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
    v_store(ptr, t0);
    v_store(ptr + 4, t1);
    v_store(ptr + 8, t2);
    v_store(ptr + 12, t3);
}

// 2-channel, float only
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
{
    __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
    __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3

    _mm_storeu_ps(ptr, u0);
    _mm_storeu_ps((ptr + 4), u1);
}

inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
    __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
    __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
    __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
    __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));

    _mm_storeu_ps(ptr + 0, v0);
    _mm_storeu_ps(ptr + 4, v1);
    _mm_storeu_ps(ptr + 8, v2);
}

inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
                               const v_float32x4& c, const v_float32x4& d)
{
    __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
    __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
    __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
    __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
    __m128 v0 = _mm_unpacklo_ps(u0, u1);
    __m128 v2 = _mm_unpacklo_ps(u2, u3);
    __m128 v1 = _mm_unpackhi_ps(u0, u1);
    __m128 v3 = _mm_unpackhi_ps(u2, u3);

    _mm_storeu_ps(ptr + 0, v0);
    _mm_storeu_ps(ptr + 4, v1);
    _mm_storeu_ps(ptr + 8, v2);
    _mm_storeu_ps(ptr + 12, v3);
}

inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b)
{
    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i t1 = _mm_unpackhi_epi64(a.val, b.val);

    _mm_storeu_si128((__m128i*)ptr, t0);
    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
}

inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
    __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);

    _mm_storeu_si128((__m128i*)ptr, t0);
    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
    _mm_storeu_si128((__m128i*)(ptr + 4), t2);
}

inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, const v_uint64x2& d)
{
    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i t1 = _mm_unpacklo_epi64(c.val, d.val);
    __m128i t2 = _mm_unpackhi_epi64(a.val, b.val);
    __m128i t3 = _mm_unpackhi_epi64(c.val, d.val);

    _mm_storeu_si128((__m128i*)ptr, t0);
    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
    _mm_storeu_si128((__m128i*)(ptr + 4), t2);
    _mm_storeu_si128((__m128i*)(ptr + 6), t3);
}

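// Usage sketch (illustrative, continuing the deinterleave example above):
// storing packs the per-channel registers back into interleaved memory,
//
//     v_store_interleave(bgr, b, g, r); // writes B0 G0 R0 B1 G1 R1 ... (48 bytes)
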
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1, b1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1, b1, c1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
} \
inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1, b1, c1, d1; \
    v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix0(a1); \
    b0 = v_reinterpret_as_##suffix0(b1); \
    c0 = v_reinterpret_as_##suffix0(c1); \
    d0 = v_reinterpret_as_##suffix0(d1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0 ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    v_store_interleave((_Tp1*)ptr, a1, b1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0 ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
                                const _Tpvec0& c0, const _Tpvec0& d0 ) \
{ \
    _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
    _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
    _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
    _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
    v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val, 8)));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
}

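// Usage sketch (illustrative): v_float64x2 holds only two lanes, so converting
// a full v_float32x4 takes the plain and the _high variant together,
//
//     v_float64x2 lo = v_cvt_f64(a);      // (a0, a1)
//     v_float64x2 hi = v_cvt_f64_high(a); // (a2, a3)
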
#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
    return v_float32x4(_mm_cvtph_ps(a.val));
}

inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
    return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val)));
}

inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
    return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0)));
}
#endif

////////////// Lookup table access ////////////////////

inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}

inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    int idx[2];
    v_store_low(idx, idxvec);
    return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
}

// loads pairs from the table and deinterleaves them, e.g. returns:
//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
// note that the indices are float-element indices, not indices of the pairs.
// in theory, this function can be used to implement bilinear interpolation,
// when idxvec contains the offsets within the image.
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    __m128 z = _mm_setzero_ps();
    __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
    __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
    xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
    xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
    __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
    __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
    x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
    y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
}

inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int idx[2];
    v_store_low(idx, idxvec);
    __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
    __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
    x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
    y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
}

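// Illustrative sketch of the bilinear-interpolation use mentioned above,
// assuming 'img' is a single-channel float image with row stride 'step'
// (in floats) and 'ofs' holds four in-range top-left sample offsets:
//
//     v_float32x4 x0, x1, y0, y1;
//     v_lut_deinterleave(img, ofs, x0, x1);                      // two top neighbours
//     v_lut_deinterleave(img, ofs + v_setall_s32(step), y0, y1); // two bottom neighbours
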
inline void v_cleanup() {}

//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operations
static inline bool hasSIMD128()
{
    return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false;
}

//! @}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END