/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_SSE_HPP
#define OPENCV_HAL_SSE_HPP

#include <algorithm>
#include "opencv2/core/utility.hpp"

#define CV_SIMD128 1
#define CV_SIMD128_64F 1

namespace cv
{

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
struct v_uint8x16
{
    typedef uchar lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 16 };

    v_uint8x16() : val(_mm_setzero_si128()) {}
    explicit v_uint8x16(__m128i v) : val(v) {}
    v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
               uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    uchar get0() const
    { return (uchar)_mm_cvtsi128_si32(val); }

    __m128i val;
};

struct v_int8x16
{
    typedef schar lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 16 };

    v_int8x16() : val(_mm_setzero_si128()) {}
    explicit v_int8x16(__m128i v) : val(v) {}
    v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
              schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
    {
        val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
                            (char)v4, (char)v5, (char)v6, (char)v7,
                            (char)v8, (char)v9, (char)v10, (char)v11,
                            (char)v12, (char)v13, (char)v14, (char)v15);
    }
    schar get0() const
    { return (schar)_mm_cvtsi128_si32(val); }

    __m128i val;
};

struct v_uint16x8
{
    typedef ushort lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    v_uint16x8() : val(_mm_setzero_si128()) {}
    explicit v_uint16x8(__m128i v) : val(v) {}
    v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    ushort get0() const
    { return (ushort)_mm_cvtsi128_si32(val); }

    __m128i val;
};

struct v_int16x8
{
    typedef short lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    v_int16x8() : val(_mm_setzero_si128()) {}
    explicit v_int16x8(__m128i v) : val(v) {}
    v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
                             (short)v4, (short)v5, (short)v6, (short)v7);
    }
    short get0() const
    { return (short)_mm_cvtsi128_si32(val); }

    __m128i val;
};

struct v_uint32x4
{
    typedef unsigned lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };

    v_uint32x4() : val(_mm_setzero_si128()) {}
    explicit v_uint32x4(__m128i v) : val(v) {}
    v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    {
        val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
    }
    unsigned get0() const
    { return (unsigned)_mm_cvtsi128_si32(val); }

    __m128i val;
};

struct v_int32x4
{
    typedef int lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 4 };

    v_int32x4() : val(_mm_setzero_si128()) {}
    explicit v_int32x4(__m128i v) : val(v) {}
    v_int32x4(int v0, int v1, int v2, int v3)
    {
        val = _mm_setr_epi32(v0, v1, v2, v3);
    }
    int get0() const
    { return _mm_cvtsi128_si32(val); }

    __m128i val;
};

struct v_float32x4
{
    typedef float lane_type;
    typedef __m128 vector_type;
    enum { nlanes = 4 };

    v_float32x4() : val(_mm_setzero_ps()) {}
    explicit v_float32x4(__m128 v) : val(v) {}
    v_float32x4(float v0, float v1, float v2, float v3)
    {
        val = _mm_setr_ps(v0, v1, v2, v3);
    }
    float get0() const
    { return _mm_cvtss_f32(val); }

    __m128 val;
};

struct v_uint64x2
{
    typedef uint64 lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 2 };

    v_uint64x2() : val(_mm_setzero_si128()) {}
    explicit v_uint64x2(__m128i v) : val(v) {}
    v_uint64x2(uint64 v0, uint64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    uint64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (unsigned)a | ((uint64)(unsigned)b << 32);
    }

    __m128i val;
};

struct v_int64x2
{
    typedef int64 lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 2 };

    v_int64x2() : val(_mm_setzero_si128()) {}
    explicit v_int64x2(__m128i v) : val(v) {}
    v_int64x2(int64 v0, int64 v1)
    {
        val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    }
    int64 get0() const
    {
        int a = _mm_cvtsi128_si32(val);
        int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
        return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    }

    __m128i val;
};

struct v_float64x2
{
    typedef double lane_type;
    typedef __m128d vector_type;
    enum { nlanes = 2 };

    v_float64x2() : val(_mm_setzero_pd()) {}
    explicit v_float64x2(__m128d v) : val(v) {}
    v_float64x2(double v0, double v1)
    {
        val = _mm_setr_pd(v0, v1);
    }
    double get0() const
    { return _mm_cvtsd_f64(val); }

    __m128d val;
};

struct v_float16x8
{
    typedef short lane_type;
    typedef __m128i vector_type;
    enum { nlanes = 8 };

    v_float16x8() : val(_mm_setzero_si128()) {}
    explicit v_float16x8(__m128i v) : val(v) {}
    v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    {
        val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
    }
    short get0() const
    { return (short)_mm_cvtsi128_si32(val); }

    __m128i val;
};

inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); }
inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); }
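
// Illustrative sketch (not part of the original header): each wrapper above gives
// value semantics around a raw __m128/__m128i/__m128d register. A vector is built
// from scalars and its first lane is read back with get0().
inline int example_vector_roundtrip()
{
    v_int32x4 v(1, 2, 3, 4);    // lanes: 1 2 3 4
    v_int32x4 z;                // default-constructed: all lanes zero
    return v.get0() + z.get0(); // 1 + 0
}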
namespace hal_sse_internal
{
    template <typename to_sse_type, typename from_sse_type>
    to_sse_type v_sse_reinterpret_as(const from_sse_type& val);

#define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
    template<> inline \
    to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
    { return sse_cast_intrin(a); }

    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
    OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
}
#define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }

OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)

inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }

template<typename _Tpvec> inline
v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
template<typename _Tpvec> inline
v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
{ return v_float32x4(_mm_castsi128_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
{ return v_float64x2(_mm_castsi128_pd(a.val)); }

#define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
{ return _Tpvec(_mm_castps_si128(a.val)); } \
inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
{ return _Tpvec(_mm_castpd_si128(a.val)); }

OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)

inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) { return a; }
inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) { return a; }
inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) { return v_float32x4(_mm_castpd_ps(a.val)); }
inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) { return v_float64x2(_mm_castps_pd(a.val)); }
//////////////// PACK ///////////////
inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    __m128i delta = _mm_set1_epi16(255);
    return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
                                       _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
}

inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16(255);
    __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
{ return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }

inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }

template<int n> inline
v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
                                       _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

template<int n> inline
v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                       _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
{
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
}

inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }

inline void v_pack_store(schar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }

template<int n> inline
v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
                                     _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
{
    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
}
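
// Illustrative sketch (not part of the original header): v_rshr_pack computes
// (x + (1 << (n-1))) >> n on each lane before narrowing, i.e. a round-to-nearest
// division by 2^n followed by a saturating pack.
inline v_int8x16 example_rshr_pack(const v_int16x8& a, const v_int16x8& b)
{
    // each output lane becomes saturate_cast<schar>((lane + 2) >> 2)
    return v_rshr_pack<2>(a, b);
}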
// byte-wise "mask ? a : b"
inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
{
#if CV_SSE4_1
    return _mm_blendv_epi8(b, a, mask);
#else
    return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
#endif
}
inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
    __m128i r = _mm_packs_epi32(a1, b1);
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    __m128i r = _mm_packs_epi32(a1, a1);
    _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

template<int n> inline
v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
}

template<int n> inline
void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
    return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}

inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(a.val, delta32);
    __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, r);
}

template<int n> inline
v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
    return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
}

template<int n> inline
void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
{ return v_int16x8(_mm_packs_epi32(a.val, b.val)); }

inline void v_pack_store(short* ptr, const v_int32x4& a)
{
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
}

template<int n> inline
v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
                                     _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
}

template<int n> inline
void v_rshr_pack_store(short* ptr, const v_int32x4& a)
{
    __m128i delta = _mm_set1_epi32(1 << (n-1));
    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
    _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
}
// [a0 0 | b0 0]  [a1 0 | b1 0]
inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
}

inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}

// [a0 0 | b0 0]  [a1 0 | b1 0]
inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
{
    __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
}

inline void v_pack_store(int* ptr, const v_int64x2& a)
{
    __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a1);
}
template<int n> inline
v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
    return v_uint32x4(_mm_unpacklo_epi32(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
{
    uint64 delta = (uint64)1 << (n-1);
    v_uint64x2 delta2(delta, delta);
    __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}

inline __m128i v_sign_epi64(__m128i a)
{
    return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
}

inline __m128i v_srai_epi64(__m128i a, int imm)
{
    __m128i smask = v_sign_epi64(a);
    return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
}

template<int n> inline
v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
    __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
    return v_int32x4(_mm_unpacklo_epi32(v0, v1));
}

template<int n> inline
void v_rshr_pack_store(int* ptr, const v_int64x2& a)
{
    int64 delta = (int64)1 << (n-1);
    v_int64x2 delta2(delta, delta);
    __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    _mm_storel_epi64((__m128i*)ptr, a2);
}
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
                            const v_float32x4& m1, const v_float32x4& m2,
                            const v_float32x4& m3)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);

    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
}
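
// Illustrative sketch (not part of the original header): v_matmul treats m0..m3 as
// the columns of a 4x4 matrix, so the result is m0*v0 + m1*v1 + m2*v2 + m3*v3.
inline v_float32x4 example_transform_point(const v_float32x4& p)
{
    // hypothetical identity-matrix columns; substitute a real transform
    v_float32x4 c0(1.f, 0.f, 0.f, 0.f), c1(0.f, 1.f, 0.f, 0.f),
                c2(0.f, 0.f, 1.f, 0.f), c3(0.f, 0.f, 0.f, 1.f);
    return v_matmul(p, c0, c1, c2, c3); // == p for the identity matrix
}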
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
    inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    { \
        return _Tpvec(intrin(a.val, b.val)); \
    } \
    inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    { \
        a.val = intrin(a.val, b.val); \
        return a; \
    }

OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
}
inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_mullo_epi32(a.val, b.val));
#else
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    return v_int32x4(_mm_unpacklo_epi64(d0, d1));
#endif
}
inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
{
    a = a * b;
    return a;
}
inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
{
    a = a * b;
    return a;
}
inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
                         v_int32x4& c, v_int32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
                         v_uint32x4& c, v_uint32x4& d)
{
    __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
    c.val = _mm_unpacklo_epi16(v0, v1);
    d.val = _mm_unpackhi_epi16(v0, v1);
}

inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
                         v_uint64x2& c, v_uint64x2& d)
{
    __m128i c0 = _mm_mul_epu32(a.val, b.val);
    __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    c.val = _mm_unpacklo_epi64(c0, c1);
    d.val = _mm_unpackhi_epi64(c0, c1);
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    return v_int32x4(_mm_madd_epi16(a.val, b.val));
}

inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
    return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
}
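
// Illustrative sketch (not part of the original header): _mm_madd_epi16 multiplies
// eight 16-bit pairs and sums adjacent products, so v_dotprod maps lanes
// (a0..a7)x(b0..b7) to four 32-bit lanes (a0*b0+a1*b1, ..., a6*b6+a7*b7).
inline v_int32x4 example_dotprod_accumulate(const v_int16x8& a, const v_int16x8& b,
                                            const v_int16x8& c, const v_int16x8& d)
{
    return v_dotprod(c, d, v_dotprod(a, b)); // pairwise a.b plus pairwise c.d
}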
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
    OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
    inline _Tpvec operator ~ (const _Tpvec& a) \
    { \
        return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
    }

OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
inline v_float32x4 v_sqrt(const v_float32x4& x)
{ return v_float32x4(_mm_sqrt_ps(x.val)); }

inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    __m128 t = x.val;
    __m128 h = _mm_mul_ps(t, _0_5);
    t = _mm_rsqrt_ps(t);
    t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
    return v_float32x4(t);
}

inline v_float64x2 v_sqrt(const v_float64x2& x)
{ return v_float64x2(_mm_sqrt_pd(x.val)); }

inline v_float64x2 v_invsqrt(const v_float64x2& x)
{
    static const __m128d v_1 = _mm_set1_pd(1.);
    return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
}
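
// Scalar reference for the float path above (illustrative sketch, not part of the
// original header): one Newton-Raphson step y' = y * (1.5 - 0.5*x*y*y) refines the
// ~12-bit _mm_rsqrt_ps estimate y toward 1/sqrt(x); the double path simply divides.
inline float example_invsqrt_refine(float x, float y)
{
    return y * (1.5f - 0.5f * x * y * y); // y is a rough estimate of 1/sqrt(x)
}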
#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }

OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
inline v_uint32x4 v_abs(const v_int32x4& x)
{
    __m128i s = _mm_srli_epi32(x.val, 31);
    __m128i f = _mm_srai_epi32(x.val, 31);
    return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
}
inline v_float32x4 v_abs(const v_float32x4& x)
{ return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
inline v_float64x2 v_abs(const v_float64x2& x)
{
    return v_float64x2(_mm_and_pd(x.val,
        _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
}
// TODO: exp, log, sin, cos

#define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(intrin(a.val, b.val)); \
}

OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
    return v_int8x16(_mm_min_epi8(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
#endif
}
inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
    return v_int8x16(_mm_max_epi8(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi8((char)-128);
    return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
                                                       _mm_xor_si128(b.val, delta))));
#endif
}
inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_SSE4_1
    return v_uint16x8(_mm_min_epu16(a.val, b.val));
#else
    return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
#endif
}
inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
{
#if CV_SSE4_1
    return v_uint16x8(_mm_max_epu16(a.val, b.val));
#else
    return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
#endif
}
inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_SSE4_1
    return v_uint32x4(_mm_min_epu32(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, b.val, a.val));
#endif
}
inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
{
#if CV_SSE4_1
    return v_uint32x4(_mm_max_epu32(a.val, b.val));
#else
    __m128i delta = _mm_set1_epi32((int)0x80000000);
    __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    return v_uint32x4(v_select_si128(mask, a.val, b.val));
#endif
}
inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_min_epi32(a.val, b.val));
#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
#endif
}
inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
    return v_int32x4(_mm_max_epi32(a.val, b.val));
#else
    return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
#endif
}
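
// Illustrative sketch (not part of the original header): v_min/v_max compose into a
// branch-free clamp, the usual way these kernels bound pixel values.
inline v_int32x4 example_clamp(const v_int32x4& x, const v_int32x4& lo, const v_int32x4& hi)
{
    return v_min(v_max(x, lo), hi); // per-lane min(hi, max(lo, x))
}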
#define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
{ return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
{ return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
} \
inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
} \
inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
{ \
    __m128i smask = _mm_set1_##suffix(sbit); \
    __m128i not_mask = _mm_set1_epi32(-1); \
    __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
    return _Tpuvec(_mm_xor_si128(res, not_mask)); \
} \
inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
} \
inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
} \
inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
} \
inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i not_mask = _mm_set1_epi32(-1); \
    return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
}

OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
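
// Illustrative sketch (not part of the original header): SSE2 has no unsigned
// compares, so the macro above XORs both operands with the sign bit (sbit) and
// reuses the signed _mm_cmpgt_*; all comparisons yield all-ones/all-zeros lane masks.
inline v_uint8x16 example_mask_ge(const v_uint8x16& a, const v_uint8x16& b)
{
    return (a >= b) & v_setall_u8(1); // 1 where a >= b, 0 elsewhere
}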
#define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }

OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)

#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }

OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
{ \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
} \
inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
{ \
    __m128i smask = _mm_set1_epi32(smask32); \
    __m128i a1 = _mm_xor_si128(a.val, smask); \
    __m128i b1 = _mm_xor_si128(b.val, smask); \
    return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
}

OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)

inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
{
    return v_max(a, b) - v_min(a, b);
}

inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
{
    __m128i d = _mm_sub_epi32(a.val, b.val);
    __m128i m = _mm_cmpgt_epi32(b.val, a.val);
    return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
}
inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return a * b + c;
}

inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
{
    return v_fma(a, b, c);
}

inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
#if CV_FMA3
    return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
#else
    return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
#endif
}

inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
#if CV_FMA3
    return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
#else
    return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
#endif
}
#define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
    return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
} \
inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    _Tpvec res = v_fma(a, a, b*b); \
    return _Tpvec(_mm_sqrt_##suffix(res.val)); \
} \
inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
{ \
    return v_fma(a, a, b*b); \
} \
inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
{ \
    return v_fma(a, b, c); \
}

OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
#define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shl(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shl(const _Tpsvec& a) \
{ \
    return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpuvec v_shr(const _Tpuvec& a) \
{ \
    return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
} \
template<int imm> \
inline _Tpsvec v_shr(const _Tpsvec& a) \
{ \
    return _Tpsvec(srai(a.val, imm)); \
}

OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
namespace hal_sse_internal
{
    template <int imm,
        bool is_invalid = ((imm < 0) || (imm > 16)),
        bool is_first = (imm == 0),
        bool is_half = (imm == 8),
        bool is_second = (imm == 16),
        bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
    class v_sse_palignr_u8_class;

    template <int imm>
    class v_sse_palignr_u8_class<imm, true, false, false, false, false>;

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, true, false, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i&) const
        { return a; }
    };

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, true, false, false>
    {
    public:
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        { return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b); }
    };

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, true, false>
    {
    public:
        inline __m128i operator()(const __m128i&, const __m128i& b) const
        { return b; }
    };

    template <int imm>
    class v_sse_palignr_u8_class<imm, false, false, false, false, true>
    {
    public:
#if CV_SSSE3
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        { return _mm_alignr_epi8(b, a, imm); }
#else
        inline __m128i operator()(const __m128i& a, const __m128i& b) const
        {
            enum { imm2 = (sizeof(__m128i) - imm) };
            return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
        }
#endif
    };

    template<int imm>
    inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
    {
        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
        return v_sse_palignr_u8_class<imm>()(a, b);
    }
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_srli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        _mm_slli_si128(
            v_sse_reinterpret_as<__m128i>(a.val), imm2)));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_sse_internal;
    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(a.val),
            v_sse_reinterpret_as<__m128i>(b.val))));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
{
    using namespace hal_sse_internal;
    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
    return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
        v_sse_palignr_u8<imm2>(
            v_sse_reinterpret_as<__m128i>(b.val),
            v_sse_reinterpret_as<__m128i>(a.val))));
}
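
// Illustrative sketch (not part of the original header): the rotate helpers shift
// whole lanes across the register; the two-argument forms pull the vacated lanes
// from a second vector, which is how sliding-window filters stitch adjacent loads.
inline v_int32x4 example_shift_in(const v_int32x4& cur, const v_int32x4& next)
{
    return v_rotate_right<1>(cur, next); // lanes: cur1 cur2 cur3 next0
}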
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                                     _mm_loadl_epi64((const __m128i*)ptr1))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_si128((__m128i*)ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_si128((__m128i*)ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, a.val); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }

OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(_mm_load_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ \
    return _Tpvec(_mm_castsi128_##suffix( \
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
                           _mm_loadl_epi64((const __m128i*)ptr1)))); \
} \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
{ _mm_storeu_##suffix(ptr, a.val); } \
inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
{ _mm_store_##suffix(ptr, a.val); } \
inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
{ _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
{ \
    __m128i a1 = _mm_cast##suffix##_si128(a.val); \
    _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)

inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }

inline void v_store(short* ptr, const v_float16x8& a)
{ _mm_storeu_si128((__m128i*)ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ _mm_store_si128((__m128i*)ptr, a.val); }
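
// Illustrative sketch (not part of the original header): a typical load/compute/store
// tile built from the unaligned entry points defined above.
inline void example_add_rows(const float* src1, const float* src2, float* dst)
{
    v_float32x4 x = v_load(src1), y = v_load(src2); // 4 floats each; unaligned is OK
    v_store(dst, x + y);
}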
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    __m128i smask = _mm_set1_epi16(sbit); \
    val = _mm_xor_si128(val, smask); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
    val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
    return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
    val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
    return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
{ \
    __m128i val = a.val; \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
    val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
    return (unsigned scalartype)_mm_cvtsi128_si32(val); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
{ \
    regtype val = a.val; \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
    val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
    return (scalartype)_mm_cvt##extract(val); \
}

#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
    scalartype CV_DECL_ALIGNED(16) buf[4]; \
    v_store_aligned(buf, a); \
    scalartype s0 = scalar_func(buf[0], buf[1]); \
    scalartype s1 = scalar_func(buf[2], buf[3]); \
    return scalar_func(s0, s1); \
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)

inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
{
#if CV_SSE3
    __m128 ab = _mm_hadd_ps(a.val, b.val);
    __m128 cd = _mm_hadd_ps(c.val, d.val);
    return v_float32x4(_mm_hadd_ps(ab, cd));
#else
    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
#endif
}

OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
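
// Illustrative sketch (not part of the original header): horizontal reductions
// collapse a register to one scalar, e.g. the tail of a dot-product loop.
inline float example_horizontal_sum(const v_float32x4& acc)
{
    return v_reduce_sum(acc); // acc0 + acc1 + acc2 + acc3
}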
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
    __m128i m1 = _mm_set1_epi32(0x55555555); \
    __m128i m2 = _mm_set1_epi32(0x33333333); \
    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
    __m128i p = a.val; \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
    p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
    p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
    return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
}

OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
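
// Illustrative sketch (not part of the original header): v_popcount yields per-lane
// bit counts via the classic bit-slicing reduction above; summing the lanes counts
// every set bit in the register, a common step when tallying mask hits.
inline unsigned example_count_bits(const v_uint8x16& a)
{
    return v_reduce_sum(v_popcount(a)); // total number of set bits in 16 bytes
}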
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
{ \
    return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
} \
inline bool v_check_all(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
inline bool v_check_any(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }

#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
inline __m128i v_packq_epi32(__m128i a)
{
    __m128i b = _mm_packs_epi32(a, a);
    return _mm_packs_epi16(b, b);
}

OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
#if CV_SSE4_1

#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)

#else // CV_SSE4_1

#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
{ \
    return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
}

OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
// OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)

#endif
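
// Illustrative sketch (not part of the original header): v_select expects the
// all-ones/all-zeros lane masks produced by the comparison operators and blends
// each lane from a (mask set) or b (mask clear).
inline v_float32x4 example_threshold(const v_float32x4& x, const v_float32x4& thresh)
{
    return v_select(x > thresh, x, v_setzero_f32()); // keep x where x > thresh, else 0
}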
#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
{ \
    __m128i z = _mm_setzero_si128(); \
    b0.val = _mm_unpacklo_##suffix(a.val, z); \
    b1.val = _mm_unpackhi_##suffix(a.val, z); \
} \
inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
{ \
    __m128i z = _mm_setzero_si128(); \
    return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
} \
inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
{ \
    b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
    b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
} \
inline _Tpwsvec v_load_expand(const _Tps* ptr) \
{ \
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
    return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
}

OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)

inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
{
    __m128i z = _mm_setzero_si128();
    b0.val = _mm_unpacklo_epi32(a.val, z);
    b1.val = _mm_unpackhi_epi32(a.val, z);
}
inline v_uint64x2 v_load_expand(const unsigned* ptr)
{
    __m128i z = _mm_setzero_si128();
    return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
}
inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
{
    __m128i s = _mm_srai_epi32(a.val, 31);
    b0.val = _mm_unpacklo_epi32(a.val, s);
    b1.val = _mm_unpackhi_epi32(a.val, s);
}
inline v_int64x2 v_load_expand(const int* ptr)
{
    __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
    __m128i s = _mm_srai_epi32(a, 31);
    return v_int64x2(_mm_unpacklo_epi32(a, s));
}

inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{
    __m128i z = _mm_setzero_si128();
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
}

inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
    a = _mm_unpacklo_epi8(a, a);
    a = _mm_unpacklo_epi8(a, a);
    return v_int32x4(_mm_srai_epi32(a, 24));
}
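
// Illustrative sketch (not part of the original header): widening loads promote
// narrow elements before arithmetic that would overflow them, e.g. summing bytes.
inline v_uint32x4 example_widen_bytes(const uchar* ptr)
{
    return v_load_expand_q(ptr); // 4 uchars zero-extended to 4 uint32 lanes
}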
#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
{ \
    b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
    b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
} \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
} \
inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
} \
inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
{ \
    __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
    c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
    d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
}

OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
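
// Illustrative usage sketch (added commentary): v_zip interleaves two registers
// lane by lane; v_combine_low/v_combine_high concatenate their low or high
// halves, and v_recombine produces both at once. For v_uint32x4
// a = (a0,a1,a2,a3) and b = (b0,b1,b2,b3):
//
//   v_uint32x4 lo, hi;
//   v_zip(a, b, lo, hi);                 // lo = (a0,b0,a1,b1), hi = (a2,b2,a3,b3)
//   v_uint32x4 l = v_combine_low(a, b);  // l  = (a0,a1,b0,b1)
//   v_uint32x4 h = v_combine_high(a, b); // h  = (a2,a3,b2,b3)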
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
    return v_rotate_right<s>(a, b);
}
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(_mm_cvtps_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val); // round to nearest
    // mask lanes are -1 where the rounded value exceeds the input;
    // adding them decrements exactly those lanes, yielding floor()
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float32x4& a)
{
    __m128i a1 = _mm_cvtps_epi32(a.val);
    // subtracting -1 increments the lanes where the input exceeds the rounded value
    __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(_mm_cvttps_epi32(a.val)); }

inline v_int32x4 v_round(const v_float64x2& a)
{ return v_int32x4(_mm_cvtpd_epi32(a.val)); }

inline v_int32x4 v_floor(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_add_epi32(a1, mask));
}

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    __m128i a1 = _mm_cvtpd_epi32(a.val);
    __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
    mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
    return v_int32x4(_mm_sub_epi32(a1, mask));
}

inline v_int32x4 v_trunc(const v_float64x2& a)
{ return v_int32x4(_mm_cvttpd_epi32(a.val)); }
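
// Illustrative values (added commentary): the four conversions differ only for
// lanes with a fractional part. For a = v_float32x4(-1.5f, -0.5f, 0.5f, 1.5f):
//
//   v_round(a) -> (-2, 0, 0, 2)   // nearest, ties to even (default SSE mode)
//   v_floor(a) -> (-2, -1, 0, 1)  // toward -infinity
//   v_ceil(a)  -> (-1, 0, 1, 2)   // toward +infinity
//   v_trunc(a) -> (-1, 0, 0, 1)   // toward zero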
#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
\
    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
}

OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
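
// Illustrative usage sketch (added commentary): v_transpose4x4 treats its first
// four arguments as the rows of a 4x4 matrix and writes out the columns:
//
//   v_float32x4 r0(0, 1, 2, 3), r1(4, 5, 6, 7), r2(8, 9, 10, 11), r3(12, 13, 14, 15);
//   v_float32x4 c0, c1, c2, c3;
//   v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3); // c0 = (0, 4, 8, 12), ...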
// adopted from sse_utils.hpp
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    // four rounds of byte unpacking perform a perfect shuffle that separates
    // even-indexed bytes (channel a) from odd-indexed bytes (channel b)
    __m128i t10 = _mm_unpacklo_epi8(t00, t01);
    __m128i t11 = _mm_unpackhi_epi8(t00, t01);

    __m128i t20 = _mm_unpacklo_epi8(t10, t11);
    __m128i t21 = _mm_unpackhi_epi8(t10, t11);

    __m128i t30 = _mm_unpacklo_epi8(t20, t21);
    __m128i t31 = _mm_unpackhi_epi8(t20, t21);

    a.val = _mm_unpacklo_epi8(t30, t31);
    b.val = _mm_unpackhi_epi8(t30, t31);
}
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSSE3
    static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
    static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
    static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);

    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i s0 = _mm_shuffle_epi8(t0, m0);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);

    t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
    a.val = _mm_alignr_epi8(s2, t0, 5);

    t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
    b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);

    t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
    c.val = _mm_alignr_epi8(t2, s0, 11);
#else
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
#endif
}
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...

    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...

    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi8(v0, v1);
    b.val = _mm_unpackhi_epi8(v0, v1);
    c.val = _mm_unpacklo_epi8(v2, v3);
    d.val = _mm_unpackhi_epi8(v2, v3);
}
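
// Illustrative usage sketch (added commentary): the overloads above split
// interleaved pixels into one register per channel. For packed RGBA this reads
// 64 bytes and yields 16 pixels of each channel:
//
//   // 'rgba' is a hypothetical pointer to at least 64 bytes of packed RGBA data
//   v_uint8x16 r, g, b, a;
//   v_load_deinterleave(rgba, r, g, b, a);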
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...

    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi16(u0, u1);
    b.val = _mm_unpackhi_epi16(u0, u1);
    c.val = _mm_unpacklo_epi16(u2, u3);
    d.val = _mm_unpackhi_epi16(u2, u3);
}
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));

    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
}
inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
    v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
    v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
    v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
{
    __m128 t0 = _mm_loadu_ps(ptr + 0);
    __m128 t1 = _mm_loadu_ps(ptr + 4);
    __m128 t2 = _mm_loadu_ps(ptr + 8);

    __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
    a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));

    __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
    __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
    b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));

    __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
    c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
{
    __m128 t0 = _mm_loadu_ps(ptr + 0);
    __m128 t1 = _mm_loadu_ps(ptr + 4);
    __m128 t2 = _mm_loadu_ps(ptr + 8);
    __m128 t3 = _mm_loadu_ps(ptr + 12);
    __m128 t02lo = _mm_unpacklo_ps(t0, t2);
    __m128 t13lo = _mm_unpacklo_ps(t1, t3);
    __m128 t02hi = _mm_unpackhi_ps(t0, t2);
    __m128 t13hi = _mm_unpackhi_ps(t1, t3);
    a.val = _mm_unpacklo_ps(t02lo, t13lo);
    b.val = _mm_unpackhi_ps(t02lo, t13lo);
    c.val = _mm_unpacklo_ps(t02hi, t13hi);
    d.val = _mm_unpackhi_ps(t02hi, t13hi);
}
inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
{
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4));

    a = v_uint64x2(_mm_unpacklo_epi64(t0, _mm_unpackhi_epi64(t1, t1)));
    b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
    c = v_uint64x2(_mm_unpacklo_epi64(t1, _mm_unpackhi_epi64(t2, t2)));
}
inline void v_load_deinterleave(const int64 *ptr, v_int64x2& a, v_int64x2& b, v_int64x2& c)
{
    v_uint64x2 t0, t1, t2;
    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
    a = v_reinterpret_as_s64(t0);
    b = v_reinterpret_as_s64(t1);
    c = v_reinterpret_as_s64(t2);
}

inline void v_load_deinterleave(const double *ptr, v_float64x2& a, v_float64x2& b, v_float64x2& c)
{
    v_uint64x2 t0, t1, t2;
    v_load_deinterleave((const uint64*)ptr, t0, t1, t2);
    a = v_reinterpret_as_f64(t0);
    b = v_reinterpret_as_f64(t1);
    c = v_reinterpret_as_f64(t2);
}
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 u0 = _mm_loadu_ps(ptr);       // a0 b0 a1 b1
    __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3

    a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
    b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
}
inline void v_load_deinterleave(const short* ptr, v_int16x8& a, v_int16x8& b)
{
    __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));     // a0 b0 a1 b1 a2 b2 a3 b3
    __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7

    __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
    __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
    __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
    __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7

    a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
    b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
}
inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
{
    v_int16x8 sa, sb;
    v_load_deinterleave((const short*)ptr, sa, sb);
    a = v_reinterpret_as_u16(sa);
    b = v_reinterpret_as_u16(sb);
}
inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
{
    __m128i t0, t1;
    t0 = _mm_unpacklo_epi16(a.val, b.val);
    t1 = _mm_unpackhi_epi16(a.val, b.val);
    _mm_storeu_si128((__m128i*)(ptr), t0);
    _mm_storeu_si128((__m128i*)(ptr + 8), t1);
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b)
{
    __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
#if CV_SSSE3
    static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
    static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
    static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);

    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
    t0 = _mm_alignr_epi8(c.val, t0, 5);
    __m128i s0 = _mm_shuffle_epi8(t0, m0);

    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);

    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
    t2 = _mm_alignr_epi8(t2, a.val, 11);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);

    _mm_storeu_si128((__m128i*)ptr, s0);
    _mm_storeu_si128((__m128i*)(ptr + 16), s1);
    _mm_storeu_si128((__m128i*)(ptr + 32), s2);
#else
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
#endif
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d)
{
    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
}
inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d)
{
    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
}
inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c )
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);

    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
}
inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d)
{
    v_uint32x4 t0, t1, t2, t3;
    v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
    v_store(ptr, t0);
    v_store(ptr + 4, t1);
    v_store(ptr + 8, t2);
    v_store(ptr + 12, t3);
}
// 2-channel, float only
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
{
    __m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
    __m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3

    _mm_storeu_ps(ptr, u0);
    _mm_storeu_ps((ptr + 4), u1);
}
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
{
    __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
    __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
    __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
    __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
    __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));

    _mm_storeu_ps(ptr + 0, v0);
    _mm_storeu_ps(ptr + 4, v1);
    _mm_storeu_ps(ptr + 8, v2);
}
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
                               const v_float32x4& c, const v_float32x4& d)
{
    __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
    __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
    __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
    __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
    __m128 v0 = _mm_unpacklo_ps(u0, u1);
    __m128 v2 = _mm_unpacklo_ps(u2, u3);
    __m128 v1 = _mm_unpackhi_ps(u0, u1);
    __m128 v3 = _mm_unpackhi_ps(u2, u3);

    _mm_storeu_ps(ptr + 0, v0);
    _mm_storeu_ps(ptr + 4, v1);
    _mm_storeu_ps(ptr + 8, v2);
    _mm_storeu_ps(ptr + 12, v3);
}
inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c)
{
    __m128i t0 = _mm_unpacklo_epi64(a.val, b.val);
    __m128i t1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
    __m128i t2 = _mm_unpackhi_epi64(b.val, c.val);

    _mm_storeu_si128((__m128i*)ptr, t0);
    _mm_storeu_si128((__m128i*)(ptr + 2), t1);
    _mm_storeu_si128((__m128i*)(ptr + 4), t2);
}

inline void v_store_interleave(int64 *ptr, const v_int64x2& a, const v_int64x2& b, const v_int64x2& c)
{
    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}

inline void v_store_interleave(double *ptr, const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
{
    v_store_interleave((uint64*)ptr, v_reinterpret_as_u64(a), v_reinterpret_as_u64(b), v_reinterpret_as_u64(c));
}
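
// Illustrative usage sketch (added commentary): v_store_interleave is the
// inverse of v_load_deinterleave, packing per-channel registers back into
// interleaved memory. A typical round trip over packed 3-channel pixels:
//
//   v_uint8x16 b, g, r;
//   v_load_deinterleave(src, b, g, r); // src: >= 48 bytes of packed BGR
//   // ... per-channel processing ...
//   v_store_interleave(dst, b, g, r);  // dst: room for 48 bytes
//
// (src and dst are hypothetical uchar pointers.)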
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
{ \
    _Tpuvec a1, b1, c1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
{ \
    _Tpuvec a1, b1, c1, d1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
    d0 = v_reinterpret_as_##suffix(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
                                const _Tpvec& b0, const _Tpvec& c0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
                                const _Tpvec& c0, const _Tpvec& d0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
}
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
//OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
{
    return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val, 8)));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(a.val));
}

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
}
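
// Illustrative usage sketch (added commentary): widening float32 to float64
// goes half a register at a time, since two v_float64x2 cover one v_float32x4;
// the two-argument v_cvt_f32 packs them back:
//
//   v_float32x4 f(0.5f, 1.5f, 2.5f, 3.5f);
//   v_float64x2 lo = v_cvt_f64(f);      // (0.5, 1.5)
//   v_float64x2 hi = v_cvt_f64_high(f); // (2.5, 3.5)
//   v_float32x4 g = v_cvt_f32(lo, hi);  // back to (0.5f, 1.5f, 2.5f, 3.5f)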
#if CV_FP16
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
    return v_float32x4(_mm_cvtph_ps(a.val));
}

inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
    return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val)));
}

inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
    return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0)));
}
#endif // CV_FP16
////////////// Lookup table access ////////////////////
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}

inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
}
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
    int idx[2];
    v_store_low(idx, idxvec); // only the two low indices are used
    return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
}
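
// Illustrative usage sketch (added commentary): v_lut gathers scattered table
// elements using the integer lanes of idxvec as indices:
//
//   float tab[256];                  // hypothetical lookup table
//   v_int32x4 idx(0, 10, 20, 30);
//   v_float32x4 v = v_lut(tab, idx); // (tab[0], tab[10], tab[20], tab[30])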
// loads pairs from the table and deinterleaves them, e.g. returns:
//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
// note that the indices are float indices, not float-pair indices.
// in theory, this function can be used to implement bilinear interpolation,
// when idxvec contains the offsets within the image.
inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
{
    int CV_DECL_ALIGNED(32) idx[4];
    v_store_aligned(idx, idxvec);
    __m128 z = _mm_setzero_ps();
    __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
    __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
    xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
    xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
    __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
    __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
    x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
    y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
}
inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
{
    int idx[2];
    v_store_low(idx, idxvec);
    __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
    __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
    x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
    y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
}
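
// Illustrative sketch of the bilinear use mentioned above (hypothetical helper,
// not part of this header; assumes a single-channel float image, 'ofs' holding
// each sample's top-left pixel offset, and tx/ty the per-lane fractional
// coordinates):
//
//   inline v_float32x4 bilinear4(const float* img, int stride,
//                                const v_int32x4& ofs,
//                                const v_float32x4& tx, const v_float32x4& ty)
//   {
//       v_float32x4 p00, p01, p10, p11;
//       v_lut_deinterleave(img, ofs, p00, p01);                        // top row pairs
//       v_lut_deinterleave(img, ofs + v_setall_s32(stride), p10, p11); // bottom row pairs
//       v_float32x4 top = p00 + tx * (p01 - p00);
//       v_float32x4 bot = p10 + tx * (p11 - p10);
//       return top + ty * (bot - top);
//   }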
inline void v_cleanup() {}
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
    return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false;
}
//! @}

CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

}

#endif // OPENCV_HAL_SSE_HPP