1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
17 // Third party copyrights are property of their respective owners.
19 // Redistribution and use in source and binary forms, with or without modification,
20 // are permitted provided that the following conditions are met:
22 // * Redistributions of source code must retain the above copyright notice,
23 // this list of conditions and the following disclaimer.
25 // * Redistributions in binary form must reproduce the above copyright notice,
26 // this list of conditions and the following disclaimer in the documentation
27 // and/or other materials provided with the distribution.
29 // * The name of the copyright holders may not be used to endorse or promote products
30 // derived from this software without specific prior written permission.
32 // This software is provided by the copyright holders and contributors "as is" and
33 // any express or implied warranties, including, but not limited to, the implied
34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
35 // In no event shall the Intel Corporation or contributors be liable for any direct,
36 // indirect, incidental, special, exemplary, or consequential damages
37 // (including, but not limited to, procurement of substitute goods or services;
38 // loss of use, data, or profits; or business interruption) however caused
39 // and on any theory of liability, whether in contract, strict liability,
40 // or tort (including negligence or otherwise) arising in any way out of
41 // the use of this software, even if advised of the possibility of such damage.
45 #ifndef OPENCV_HAL_SSE_HPP
46 #define OPENCV_HAL_SSE_HPP
49 #include "opencv2/core/utility.hpp"
52 #define CV_SIMD128_64F 1
59 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
63 typedef uchar lane_type;
64 typedef __m128i vector_type;
67 v_uint8x16() : val(_mm_setzero_si128()) {}
68 explicit v_uint8x16(__m128i v) : val(v) {}
69 v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
70 uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
72 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
73 (char)v4, (char)v5, (char)v6, (char)v7,
74 (char)v8, (char)v9, (char)v10, (char)v11,
75 (char)v12, (char)v13, (char)v14, (char)v15);
79 return (uchar)_mm_cvtsi128_si32(val);
87 typedef schar lane_type;
88 typedef __m128i vector_type;
91 v_int8x16() : val(_mm_setzero_si128()) {}
92 explicit v_int8x16(__m128i v) : val(v) {}
93 v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
94 schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
96 val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
97 (char)v4, (char)v5, (char)v6, (char)v7,
98 (char)v8, (char)v9, (char)v10, (char)v11,
99 (char)v12, (char)v13, (char)v14, (char)v15);
103 return (schar)_mm_cvtsi128_si32(val);
111 typedef ushort lane_type;
112 typedef __m128i vector_type;
115 v_uint16x8() : val(_mm_setzero_si128()) {}
116 explicit v_uint16x8(__m128i v) : val(v) {}
117 v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
119 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
120 (short)v4, (short)v5, (short)v6, (short)v7);
124 return (ushort)_mm_cvtsi128_si32(val);
132 typedef short lane_type;
133 typedef __m128i vector_type;
136 v_int16x8() : val(_mm_setzero_si128()) {}
137 explicit v_int16x8(__m128i v) : val(v) {}
138 v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
140 val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
141 (short)v4, (short)v5, (short)v6, (short)v7);
145 return (short)_mm_cvtsi128_si32(val);
153 typedef unsigned lane_type;
154 typedef __m128i vector_type;
157 v_uint32x4() : val(_mm_setzero_si128()) {}
158 explicit v_uint32x4(__m128i v) : val(v) {}
159 v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
161 val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
163 unsigned get0() const
165 return (unsigned)_mm_cvtsi128_si32(val);
173 typedef int lane_type;
174 typedef __m128i vector_type;
177 v_int32x4() : val(_mm_setzero_si128()) {}
178 explicit v_int32x4(__m128i v) : val(v) {}
179 v_int32x4(int v0, int v1, int v2, int v3)
181 val = _mm_setr_epi32(v0, v1, v2, v3);
185 return _mm_cvtsi128_si32(val);
193 typedef float lane_type;
194 typedef __m128 vector_type;
197 v_float32x4() : val(_mm_setzero_ps()) {}
198 explicit v_float32x4(__m128 v) : val(v) {}
199 v_float32x4(float v0, float v1, float v2, float v3)
201 val = _mm_setr_ps(v0, v1, v2, v3);
205 return _mm_cvtss_f32(val);
213 typedef uint64 lane_type;
214 typedef __m128i vector_type;
217 v_uint64x2() : val(_mm_setzero_si128()) {}
218 explicit v_uint64x2(__m128i v) : val(v) {}
219 v_uint64x2(uint64 v0, uint64 v1)
221 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
225 int a = _mm_cvtsi128_si32(val);
226 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
227 return (unsigned)a | ((uint64)(unsigned)b << 32);
235 typedef int64 lane_type;
236 typedef __m128i vector_type;
239 v_int64x2() : val(_mm_setzero_si128()) {}
240 explicit v_int64x2(__m128i v) : val(v) {}
241 v_int64x2(int64 v0, int64 v1)
243 val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
247 int a = _mm_cvtsi128_si32(val);
248 int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
249 return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
257 typedef double lane_type;
258 typedef __m128d vector_type;
261 v_float64x2() : val(_mm_setzero_pd()) {}
262 explicit v_float64x2(__m128d v) : val(v) {}
263 v_float64x2(double v0, double v1)
265 val = _mm_setr_pd(v0, v1);
269 return _mm_cvtsd_f64(val);
277 typedef short lane_type;
278 typedef __m128i vector_type;
281 v_float16x8() : val(_mm_setzero_si128()) {}
282 explicit v_float16x8(__m128i v) : val(v) {}
283 v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
285 val = _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
289 return (short)_mm_cvtsi128_si32(val);
294 inline v_float16x8 v_setzero_f16() { return v_float16x8(_mm_setzero_si128()); }
295 inline v_float16x8 v_setall_f16(short val) { return v_float16x8(_mm_set1_epi16(val)); }
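// Each v_* wrapper above holds a single 128-bit SSE register (__m128i/__m128/__m128d)
// and records its lane type for the generic universal-intrinsics layer. A minimal usage
// sketch, with illustrative values only:
//
//     v_float32x4 a(1.f, 2.f, 3.f, 4.f);  // packs via _mm_setr_ps
//     float f0 = a.get0();                // reads lane 0 -> 1.f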
297 namespace hal_sse_internal
299 template <typename to_sse_type, typename from_sse_type>
300 to_sse_type v_sse_reinterpret_as(const from_sse_type& val);
302 #define OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(to_sse_type, from_sse_type, sse_cast_intrin) \
304 to_sse_type v_sse_reinterpret_as(const from_sse_type& a) \
305 { return sse_cast_intrin(a); }
307 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128i, OPENCV_HAL_NOP)
308 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128, _mm_castps_si128)
309 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128i, __m128d, _mm_castpd_si128)
310 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128i, _mm_castsi128_ps)
311 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128, OPENCV_HAL_NOP)
312 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128, __m128d, _mm_castpd_ps)
313 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128i, _mm_castsi128_pd)
314 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128, _mm_castps_pd)
315 OPENCV_HAL_IMPL_SSE_REINTERPRET_RAW(__m128d, __m128d, OPENCV_HAL_NOP)
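// The v_sse_reinterpret_as<> helpers above give template code a uniform way to bit-cast
// between the raw SSE register types: same-type "casts" are no-ops and cross-type ones
// map to the matching _mm_cast* intrinsic, e.g. (some_ps_register is a placeholder name)
//
//     __m128i bits = v_sse_reinterpret_as<__m128i>(some_ps_register);  // _mm_castps_si128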
318 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
319 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
320 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
321 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
322 { return _Tpvec(cast(a.val)); }
324 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
325 OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
326 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
327 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
328 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
329 OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
330 OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
331 OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
333 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
334 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
335 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
336 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
338 template<typename _Tpvec> inline
339 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
340 template<typename _Tpvec> inline
341 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
342 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
343 { return v_float32x4(_mm_castsi128_ps(a.val)); }
344 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
345 { return v_float32x4(_mm_castsi128_ps(a.val)); }
346 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
347 { return v_float64x2(_mm_castsi128_pd(a.val)); }
348 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
349 { return v_float64x2(_mm_castsi128_pd(a.val)); }
351 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
352 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
353 { return _Tpvec(_mm_castps_si128(a.val)); } \
354 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
355 { return _Tpvec(_mm_castpd_si128(a.val)); }
357 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
358 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
359 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
360 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
361 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
362 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
363 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
364 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
366 inline v_float32x4 v_reinterpret_as_f32(const v_float32x4& a) {return a; }
367 inline v_float64x2 v_reinterpret_as_f64(const v_float64x2& a) {return a; }
368 inline v_float32x4 v_reinterpret_as_f32(const v_float64x2& a) {return v_float32x4(_mm_castpd_ps(a.val)); }
369 inline v_float64x2 v_reinterpret_as_f64(const v_float32x4& a) {return v_float64x2(_mm_castps_pd(a.val)); }
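// All v_reinterpret_as_* conversions above are pure bit reinterpretations, not value
// conversions. For example, building a float vector from a known bit pattern:
//
//     v_uint32x4 bits = v_setall_u32(0x3f800000u);    // IEEE-754 pattern of 1.0f
//     v_float32x4 ones = v_reinterpret_as_f32(bits);  // every lane becomes 1.0f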
371 //////////////// PACK ///////////////
372 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
374 __m128i delta = _mm_set1_epi16(255);
375 return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
376 _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
379 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
381 __m128i delta = _mm_set1_epi16(255);
382 __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
383 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
386 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
387 { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
389 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
390 { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
392 template<int n> inline
393 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
395 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
396 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
397 return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
398 _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
401 template<int n> inline
402 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
404 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
405 __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
406 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
409 template<int n> inline
410 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
412 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
413 return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
414 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
417 template<int n> inline
418 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
420 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
421 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
422 _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
425 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
426 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
428 inline void v_pack_store(schar* ptr, const v_int16x8& a)
429 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
431 template<int n> inline
432 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
434 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
435 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
436 return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
437 _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
439 template<int n> inline
440 void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
442 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
443 __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
444 __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
445 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
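// The v_pack family above narrows two source vectors into one destination with
// saturation; v_rshr_pack<n> first adds the rounding constant 1 << (n-1) and shifts
// right by n. Illustrative sketch (values chosen to show the saturation):
//
//     v_uint16x8 a = v_setall_u16(300), b = v_setall_u16(10);
//     v_uint8x16 p = v_pack(a, b);          // lanes 0..7 = 255 (saturated), lanes 8..15 = 10
//     v_uint8x16 r = v_rshr_pack<2>(a, a);  // (300 + 2) >> 2 = 75 in every lane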
449 // byte-wise "mask ? a : b"
450 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
453 return _mm_blendv_epi8(b, a, mask);
455 return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
459 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
461 __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
462 __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
463 __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
464 __m128i r = _mm_packs_epi32(a1, b1);
465 return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
468 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
470 __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
471 __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
472 __m128i r = _mm_packs_epi32(a1, a1);
473 _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
476 template<int n> inline
477 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
479 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
480 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
481 __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
482 return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
485 template<int n> inline
486 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
488 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
489 __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
490 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
491 _mm_storel_epi64((__m128i*)ptr, a2);
494 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
496 __m128i delta32 = _mm_set1_epi32(32768);
497 __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
498 return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
501 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
503 __m128i delta32 = _mm_set1_epi32(32768);
504 __m128i a1 = _mm_sub_epi32(a.val, delta32);
505 __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
506 _mm_storel_epi64((__m128i*)ptr, r);
509 template<int n> inline
510 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
512 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
513 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
514 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
515 __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
516 __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
517 return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
520 template<int n> inline
521 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
523 __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
524 __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
525 __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
526 _mm_storel_epi64((__m128i*)ptr, a2);
529 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
530 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
532 inline void v_pack_store(short* ptr, const v_int32x4& a)
534 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
537 template<int n> inline
538 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
540 __m128i delta = _mm_set1_epi32(1 << (n-1));
541 return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
542 _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
545 template<int n> inline
546 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
548 __m128i delta = _mm_set1_epi32(1 << (n-1));
549 __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
550 _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a1, a1));
554 // a = [a0 0 | a1 0], b = [b0 0 | b1 0]: keep the low 32 bits of each 64-bit lane
555 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
557 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 b0 x x
558 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1 b1 x x
559 return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); // a0 a1 b0 b1
562 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
564 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
565 _mm_storel_epi64((__m128i*)ptr, a1);
568 // a = [a0 0 | a1 0], b = [b0 0 | b1 0]: keep the low 32 bits of each 64-bit lane
569 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
571 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 b0 x x
572 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // a1 b1 x x
573 return v_int32x4(_mm_unpacklo_epi32(v0, v1)); // a0 a1 b0 b1
576 inline void v_pack_store(int* ptr, const v_int64x2& a)
578 __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
579 _mm_storel_epi64((__m128i*)ptr, a1);
582 template<int n> inline
583 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
585 uint64 delta = (uint64)1 << (n-1);
586 v_uint64x2 delta2(delta, delta);
587 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
588 __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
589 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 b0 x x
590 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1 b1 x x
591 return v_uint32x4(_mm_unpacklo_epi32(v0, v1)); // a0 a1 b0 b1
594 template<int n> inline
595 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
597 uint64 delta = (uint64)1 << (n-1);
598 v_uint64x2 delta2(delta, delta);
599 __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
600 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
601 _mm_storel_epi64((__m128i*)ptr, a2);
604 inline __m128i v_sign_epi64(__m128i a)
606 return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
609 inline __m128i v_srai_epi64(__m128i a, int imm)
611 __m128i smask = v_sign_epi64(a);
612 return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
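// SSE2 has no 64-bit arithmetic shift, so v_srai_epi64 emulates it: with sign = (a < 0 ?
// ~0 : 0) replicated per 64-bit lane, ((a ^ sign) >> imm) ^ sign (logical shift) equals
// the arithmetic shift a >> imm for both positive and negative lanes.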
615 template<int n> inline
616 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
618 int64 delta = (int64)1 << (n-1);
619 v_int64x2 delta2(delta, delta);
620 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
621 __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
622 __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 b0 x x
623 __m128i v1 = _mm_unpackhi_epi32(a1, b1); // a1 b1 x x
624 return v_int32x4(_mm_unpacklo_epi32(v0, v1)); // a0 a1 b0 b1
627 template<int n> inline
628 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
630 int64 delta = (int64)1 << (n-1);
631 v_int64x2 delta2(delta, delta);
632 __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
633 __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
634 _mm_storel_epi64((__m128i*)ptr, a2);
637 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
638 const v_float32x4& m1, const v_float32x4& m2,
639 const v_float32x4& m3)
641 __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
642 __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
643 __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
644 __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
646 return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
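// v_matmul treats m0..m3 as the columns of a 4x4 matrix and returns
// v0*m0 + v1*m1 + v2*m2 + v3*m3. A hedged sketch (M is an illustrative pointer to a
// column-major 4x4 float matrix; v_load is defined later in this header):
//
//     v_float32x4 c0 = v_load(M), c1 = v_load(M + 4), c2 = v_load(M + 8), c3 = v_load(M + 12);
//     v_float32x4 y  = v_matmul(x, c0, c1, c2, c3);   // y = M * x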
649 inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
650 const v_float32x4& m1, const v_float32x4& m2,
651 const v_float32x4& a)
653 __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
654 __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
655 __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
657 return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
660 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
661 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
663 return _Tpvec(intrin(a.val, b.val)); \
665 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
667 a.val = intrin(a.val, b.val); \
671 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
672 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
673 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
674 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
675 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
676 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
677 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
678 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
679 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
680 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
681 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
682 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
683 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
684 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
685 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
686 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
687 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
688 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
689 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
690 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
691 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
692 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
693 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
694 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
695 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
696 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
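// Note the operator choices above: 8- and 16-bit +/- use the saturating adds/subs
// intrinsics, while 32- and 64-bit +/- wrap around; wrapping 8/16-bit arithmetic is
// provided separately as v_add_wrap / v_sub_wrap further below. For example:
//
//     v_uint8x16 s = v_setall_u8(200) + v_setall_u8(100);            // saturates to 255
//     v_uint8x16 w = v_add_wrap(v_setall_u8(200), v_setall_u8(100)); // wraps to 44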
698 inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
700 __m128i c0 = _mm_mul_epu32(a.val, b.val);
701 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
702 __m128i d0 = _mm_unpacklo_epi32(c0, c1);
703 __m128i d1 = _mm_unpackhi_epi32(c0, c1);
704 return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
706 inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
709 return v_int32x4(_mm_mullo_epi32(a.val, b.val));
711 __m128i c0 = _mm_mul_epu32(a.val, b.val);
712 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
713 __m128i d0 = _mm_unpacklo_epi32(c0, c1);
714 __m128i d1 = _mm_unpackhi_epi32(c0, c1);
715 return v_int32x4(_mm_unpacklo_epi64(d0, d1));
718 inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
719 {
720 a = a * b;
721 return a;
722 }
723 inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
724 {
725 a = a * b;
726 return a;
727 }
729 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
730 v_int32x4& c, v_int32x4& d)
732 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
733 __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
734 c.val = _mm_unpacklo_epi16(v0, v1);
735 d.val = _mm_unpackhi_epi16(v0, v1);
738 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
739 v_uint32x4& c, v_uint32x4& d)
741 __m128i v0 = _mm_mullo_epi16(a.val, b.val);
742 __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
743 c.val = _mm_unpacklo_epi16(v0, v1);
744 d.val = _mm_unpackhi_epi16(v0, v1);
747 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
748 v_uint64x2& c, v_uint64x2& d)
750 __m128i c0 = _mm_mul_epu32(a.val, b.val);
751 __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
752 c.val = _mm_unpacklo_epi64(c0, c1);
753 d.val = _mm_unpackhi_epi64(c0, c1);
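// v_mul_expand keeps the full-width products: c receives the products of the low half
// of the lanes and d those of the high half. Sketch:
//
//     v_uint16x8 a = v_setall_u16(1000), b = v_setall_u16(70);
//     v_uint32x4 lo, hi;
//     v_mul_expand(a, b, lo, hi);   // every lane of lo and hi holds 70000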
756 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
758 return v_int32x4(_mm_madd_epi16(a.val, b.val));
761 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
763 return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
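// v_dotprod maps to PMADDWD: it multiplies adjacent 16-bit lane pairs and sums each pair
// into a 32-bit lane; the three-argument form also adds the accumulator c. A typical
// fixed-point dot-product loop body (p and q are illustrative short pointers):
//
//     v_int32x4 acc = v_setzero_s32();
//     acc = v_dotprod(v_load(p), v_load(q), acc);  // acc[i] += p[2i]*q[2i] + p[2i+1]*q[2i+1]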
766 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
767 OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
768 OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
769 OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
770 inline _Tpvec operator ~ (const _Tpvec& a) \
772 return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
775 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
776 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
777 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
778 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
779 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
780 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
781 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
782 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
783 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
784 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
786 inline v_float32x4 v_sqrt(const v_float32x4& x)
787 { return v_float32x4(_mm_sqrt_ps(x.val)); }
789 inline v_float32x4 v_invsqrt(const v_float32x4& x)
791 const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
792 __m128 t = _mm_rsqrt_ps(x.val);
793 __m128 h = _mm_mul_ps(t, _0_5);
795 t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
796 return v_float32x4(t);
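// v_invsqrt starts from the ~12-bit _mm_rsqrt_ps estimate and applies one Newton-Raphson
// refinement t = t * (1.5 - 0.5 * x * t * t), roughly doubling the number of correct bits;
// the double-precision version below simply divides 1.0 by _mm_sqrt_pd.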
799 inline v_float64x2 v_sqrt(const v_float64x2& x)
800 { return v_float64x2(_mm_sqrt_pd(x.val)); }
802 inline v_float64x2 v_invsqrt(const v_float64x2& x)
804 const __m128d v_1 = _mm_set1_pd(1.);
805 return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
808 #define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
809 inline _Tpuvec v_abs(const _Tpsvec& x) \
810 { return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
812 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint8x16, v_int8x16, min, u8, i8)
813 OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(v_uint16x8, v_int16x8, max, i16, i16)
814 inline v_uint32x4 v_abs(const v_int32x4& x)
816 __m128i s = _mm_srli_epi32(x.val, 31);
817 __m128i f = _mm_srai_epi32(x.val, 31);
818 return v_uint32x4(_mm_add_epi32(_mm_xor_si128(x.val, f), s));
820 inline v_float32x4 v_abs(const v_float32x4& x)
821 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
822 inline v_float64x2 v_abs(const v_float64x2& x)
824 return v_float64x2(_mm_and_pd(x.val,
825 _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
828 // TODO: exp, log, sin, cos
830 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
831 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
833 return _Tpvec(intrin(a.val, b.val)); \
836 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
837 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
838 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
839 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
840 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
841 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
842 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
843 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
845 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
848 return v_int8x16(_mm_min_epi8(a.val, b.val));
850 __m128i delta = _mm_set1_epi8((char)-128);
851 return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
852 _mm_xor_si128(b.val, delta))));
855 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
858 return v_int8x16(_mm_max_epi8(a.val, b.val));
860 __m128i delta = _mm_set1_epi8((char)-128);
861 return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
862 _mm_xor_si128(b.val, delta))));
865 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
868 return v_uint16x8(_mm_min_epu16(a.val, b.val));
870 return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
873 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
876 return v_uint16x8(_mm_max_epu16(a.val, b.val));
878 return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
881 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
884 return v_uint32x4(_mm_min_epu32(a.val, b.val));
886 __m128i delta = _mm_set1_epi32((int)0x80000000);
887 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
888 return v_uint32x4(v_select_si128(mask, b.val, a.val));
891 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
894 return v_uint32x4(_mm_max_epu32(a.val, b.val));
896 __m128i delta = _mm_set1_epi32((int)0x80000000);
897 __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
898 return v_uint32x4(v_select_si128(mask, a.val, b.val));
901 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
904 return v_int32x4(_mm_min_epi32(a.val, b.val));
906 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
909 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
912 return v_int32x4(_mm_max_epi32(a.val, b.val));
914 return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
918 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
919 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
920 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
921 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
923 __m128i not_mask = _mm_set1_epi32(-1); \
924 return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
926 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
927 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
928 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
930 __m128i not_mask = _mm_set1_epi32(-1); \
931 return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
933 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
935 __m128i smask = _mm_set1_##suffix(sbit); \
936 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
938 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
940 __m128i smask = _mm_set1_##suffix(sbit); \
941 return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
943 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
945 __m128i smask = _mm_set1_##suffix(sbit); \
946 __m128i not_mask = _mm_set1_epi32(-1); \
947 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
948 return _Tpuvec(_mm_xor_si128(res, not_mask)); \
950 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
952 __m128i smask = _mm_set1_##suffix(sbit); \
953 __m128i not_mask = _mm_set1_epi32(-1); \
954 __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
955 return _Tpuvec(_mm_xor_si128(res, not_mask)); \
957 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
959 return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
961 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
963 return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
965 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
967 __m128i not_mask = _mm_set1_epi32(-1); \
968 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
970 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
972 __m128i not_mask = _mm_set1_epi32(-1); \
973 return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
976 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
977 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
978 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
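// Unsigned integer comparisons above are built from the signed _mm_cmpgt_* intrinsics by
// XOR-ing both operands with the sign bit (sbit), which maps unsigned order onto signed
// order. Comparison results are all-ones masks per lane, e.g.:
//
//     v_uint8x16 m = v_setall_u8(200) > v_setall_u8(100);   // every lane = 0xff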
980 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
981 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
982 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
983 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
984 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
985 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
986 { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
987 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
988 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
989 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
990 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
991 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
992 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
994 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
995 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
997 #define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
998 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
999 { return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
1000 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
1001 { return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
1003 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
1004 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
1006 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
1007 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
1008 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
1009 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
1010 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
1011 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
1012 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
1013 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
1015 #define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
1016 inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
1018 return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
1020 inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
1022 __m128i smask = _mm_set1_epi32(smask32); \
1023 __m128i a1 = _mm_xor_si128(a.val, smask); \
1024 __m128i b1 = _mm_xor_si128(b.val, smask); \
1025 return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
1028 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
1029 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
1031 inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b)
1033 return v_max(a, b) - v_min(a, b);
1036 inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b)
1038 __m128i d = _mm_sub_epi32(a.val, b.val);
1039 __m128i m = _mm_cmpgt_epi32(b.val, a.val);
1040 return v_uint32x4(_mm_sub_epi32(_mm_xor_si128(d, m), m));
1043 inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1044 {
1045 return a * b + c;
1046 }
1048 inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
1050 return v_fma(a, b, c);
1053 inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
1056 return v_float32x4(_mm_fmadd_ps(a.val, b.val, c.val));
1058 return v_float32x4(_mm_add_ps(_mm_mul_ps(a.val, b.val), c.val));
1062 inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
1065 return v_float64x2(_mm_fmadd_pd(a.val, b.val, c.val));
1067 return v_float64x2(_mm_add_pd(_mm_mul_pd(a.val, b.val), c.val));
1071 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
1072 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
1074 _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
1075 return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
1077 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1079 _Tpvec res = v_fma(a, a, b*b); \
1080 return _Tpvec(_mm_sqrt_##suffix(res.val)); \
1082 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1084 return v_fma(a, a, b*b); \
1086 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1088 return v_fma(a, b, c); \
1091 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
1092 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
1094 #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
1095 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
1097 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1099 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
1101 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1103 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
1105 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1107 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
1109 return _Tpsvec(srai(a.val, imm)); \
1112 inline _Tpuvec v_shl(const _Tpuvec& a) \
1114 return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
1117 inline _Tpsvec v_shl(const _Tpsvec& a) \
1119 return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
1122 inline _Tpuvec v_shr(const _Tpuvec& a) \
1124 return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
1127 inline _Tpsvec v_shr(const _Tpsvec& a) \
1129 return _Tpsvec(srai(a.val, imm)); \
1132 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
1133 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
1134 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
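// There is intentionally no 8-bit shift (SSE provides none), and the signed 64-bit right
// shift reuses the v_srai_epi64 emulation above. Both run-time and template forms exist:
//
//     v_uint32x4 a = v_setall_u32(8) << 4;       // every lane = 128
//     v_uint32x4 b = v_shl<4>(v_setall_u32(8));  // same, with a compile-time shift count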
1136 namespace hal_sse_internal
1139 bool is_invalid = ((imm < 0) || (imm > 16)),
1140 bool is_first = (imm == 0),
1141 bool is_half = (imm == 8),
1142 bool is_second = (imm == 16),
1143 bool is_other = (((imm > 0) && (imm < 8)) || ((imm > 8) && (imm < 16)))>
1144 class v_sse_palignr_u8_class;
1147 class v_sse_palignr_u8_class<imm, true, false, false, false, false>;
1150 class v_sse_palignr_u8_class<imm, false, true, false, false, false>
1153 inline __m128i operator()(const __m128i& a, const __m128i&) const
1160 class v_sse_palignr_u8_class<imm, false, false, true, false, false>
1163 inline __m128i operator()(const __m128i& a, const __m128i& b) const
1165 return _mm_unpacklo_epi64(_mm_unpackhi_epi64(a, a), b);
1170 class v_sse_palignr_u8_class<imm, false, false, false, true, false>
1173 inline __m128i operator()(const __m128i&, const __m128i& b) const
1180 class v_sse_palignr_u8_class<imm, false, false, false, false, true>
1184 inline __m128i operator()(const __m128i& a, const __m128i& b) const
1186 return _mm_alignr_epi8(b, a, imm);
1190 inline __m128i operator()(const __m128i& a, const __m128i& b) const
1192 enum { imm2 = (sizeof(__m128i) - imm) };
1193 return _mm_or_si128(_mm_srli_si128(a, imm), _mm_slli_si128(b, imm2));
1199 inline __m128i v_sse_palignr_u8(const __m128i& a, const __m128i& b)
1201 CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_sse_palignr_u8.");
1202 return v_sse_palignr_u8_class<imm>()(a, b);
1206 template<int imm, typename _Tpvec>
1207 inline _Tpvec v_rotate_right(const _Tpvec &a)
1209 using namespace hal_sse_internal;
1210 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1211 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1213 v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1216 template<int imm, typename _Tpvec>
1217 inline _Tpvec v_rotate_left(const _Tpvec &a)
1219 using namespace hal_sse_internal;
1220 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1221 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1223 v_sse_reinterpret_as<__m128i>(a.val), imm2)));
1226 template<int imm, typename _Tpvec>
1227 inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
1229 using namespace hal_sse_internal;
1230 enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
1231 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1232 v_sse_palignr_u8<imm2>(
1233 v_sse_reinterpret_as<__m128i>(a.val),
1234 v_sse_reinterpret_as<__m128i>(b.val))));
1237 template<int imm, typename _Tpvec>
1238 inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
1240 using namespace hal_sse_internal;
1241 enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
1242 return _Tpvec(v_sse_reinterpret_as<typename _Tpvec::vector_type>(
1243 v_sse_palignr_u8<imm2>(
1244 v_sse_reinterpret_as<__m128i>(b.val),
1245 v_sse_reinterpret_as<__m128i>(a.val))));
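// v_rotate_right<imm>(a) shifts lanes towards index 0, filling the vacated top lanes with
// zeros; the two-argument forms pull the fill lanes from the second vector instead. Sketch:
//
//     v_int32x4 a(0, 1, 2, 3), b(4, 5, 6, 7);
//     v_int32x4 r = v_rotate_right<1>(a, b);   // r = (1, 2, 3, 4)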
1248 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
1249 inline _Tpvec v_load(const _Tp* ptr) \
1250 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
1251 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1252 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
1253 inline _Tpvec v_load_low(const _Tp* ptr) \
1254 { return _Tpvec(_mm_loadl_epi64((const __m128i*)ptr)); } \
1255 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1257 return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1258 _mm_loadl_epi64((const __m128i*)ptr1))); \
1260 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1261 { _mm_storeu_si128((__m128i*)ptr, a.val); } \
1262 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1263 { _mm_store_si128((__m128i*)ptr, a.val); } \
1264 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1265 { _mm_stream_si128((__m128i*)ptr, a.val); } \
1266 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1268 if( mode == hal::STORE_UNALIGNED ) \
1269 _mm_storeu_si128((__m128i*)ptr, a.val); \
1270 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
1271 _mm_stream_si128((__m128i*)ptr, a.val); \
1273 _mm_store_si128((__m128i*)ptr, a.val); \
1275 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1276 { _mm_storel_epi64((__m128i*)ptr, a.val); } \
1277 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1278 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
1280 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
1281 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
1282 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
1283 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
1284 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
1285 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
1286 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
1287 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
1289 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
1290 inline _Tpvec v_load(const _Tp* ptr) \
1291 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \
1292 inline _Tpvec v_load_aligned(const _Tp* ptr) \
1293 { return _Tpvec(_mm_load_##suffix(ptr)); } \
1294 inline _Tpvec v_load_low(const _Tp* ptr) \
1295 { return _Tpvec(_mm_castsi128_##suffix(_mm_loadl_epi64((const __m128i*)ptr))); } \
1296 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
1298 return _Tpvec(_mm_castsi128_##suffix( \
1299 _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
1300 _mm_loadl_epi64((const __m128i*)ptr1)))); \
1302 inline void v_store(_Tp* ptr, const _Tpvec& a) \
1303 { _mm_storeu_##suffix(ptr, a.val); } \
1304 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
1305 { _mm_store_##suffix(ptr, a.val); } \
1306 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
1307 { _mm_stream_##suffix(ptr, a.val); } \
1308 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
1310 if( mode == hal::STORE_UNALIGNED ) \
1311 _mm_storeu_##suffix(ptr, a.val); \
1312 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
1313 _mm_stream_##suffix(ptr, a.val); \
1315 _mm_store_##suffix(ptr, a.val); \
1317 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
1318 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
1319 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
1321 __m128i a1 = _mm_cast##suffix##_si128(a.val); \
1322 _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
1325 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
1326 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
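// v_load / v_store use unaligned loads and stores, the *_aligned variants require 16-byte
// aligned pointers, and v_store_aligned_nocache issues a non-temporal (streaming) store.
// Sketch:
//
//     float CV_DECL_ALIGNED(16) buf[4];
//     v_store_aligned(buf, v_setall_f32(1.f));
//     v_float32x4 x = v_load_aligned(buf);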
1328 inline v_float16x8 v_load_f16(const short* ptr)
1329 { return v_float16x8(_mm_loadu_si128((const __m128i*)ptr)); }
1330 inline v_float16x8 v_load_f16_aligned(const short* ptr)
1331 { return v_float16x8(_mm_load_si128((const __m128i*)ptr)); }
1333 inline void v_store(short* ptr, const v_float16x8& a)
1334 { _mm_storeu_si128((__m128i*)ptr, a.val); }
1335 inline void v_store_aligned(short* ptr, const v_float16x8& a)
1336 { _mm_store_si128((__m128i*)ptr, a.val); }
1338 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
1339 inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
1341 __m128i val = a.val; \
1342 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1343 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1344 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1345 return (scalartype)_mm_cvtsi128_si32(val); \
1347 inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
1349 __m128i val = a.val; \
1350 __m128i smask = _mm_set1_epi16(sbit); \
1351 val = _mm_xor_si128(val, smask); \
1352 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,8)); \
1353 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,4)); \
1354 val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
1355 return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
1357 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
1358 inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
1360 __m128i val = a.val; \
1361 val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
1362 val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
1363 val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
1364 return (scalartype)_mm_cvtsi128_si32(val); \
1366 inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
1368 __m128i val = a.val; \
1369 val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
1370 val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
1371 val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
1372 return (unsigned scalartype)_mm_cvtsi128_si32(val); \
1374 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
1375 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
1376 OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
1378 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
1379 inline scalartype v_reduce_sum(const _Tpvec& a) \
1381 regtype val = a.val; \
1382 val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 8))); \
1383 val = _mm_add_##suffix(val, cast_to(_mm_srli_si128(cast_from(val), 4))); \
1384 return (scalartype)_mm_cvt##extract(val); \
1387 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
1388 inline scalartype v_reduce_##func(const _Tpvec& a) \
1390 scalartype CV_DECL_ALIGNED(16) buf[4]; \
1391 v_store_aligned(buf, a); \
1392 scalartype s0 = scalar_func(buf[0], buf[1]); \
1393 scalartype s1 = scalar_func(buf[2], buf[3]); \
1394 return scalar_func(s0, s1); \
1397 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1398 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
1399 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
1401 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
1402 const v_float32x4& c, const v_float32x4& d)
1405 __m128 ab = _mm_hadd_ps(a.val, b.val);
1406 __m128 cd = _mm_hadd_ps(c.val, d.val);
1407 return v_float32x4(_mm_hadd_ps(ab, cd));
1409 __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));
1410 __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));
1411 return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));
1415 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
1416 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
1417 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
1418 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
1419 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
1420 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
1422 #define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
1423 inline v_uint32x4 v_popcount(const _Tpvec& a) \
1425 __m128i m1 = _mm_set1_epi32(0x55555555); \
1426 __m128i m2 = _mm_set1_epi32(0x33333333); \
1427 __m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
1428 __m128i p = a.val; \
1429 p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
1430 p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
1431 p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
1432 p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
1433 p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
1434 return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
1437 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
1438 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
1439 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
1440 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
1441 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
1442 OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
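// v_popcount uses the classic SWAR scheme: bits are summed in 2-, 4- and 8-bit groups via
// the 0x55..., 0x33... and 0x0f... masks, then the byte sums are folded so that each
// 32-bit lane of the result holds the population count of the corresponding 32 input bits.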
1444 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
1445 inline int v_signmask(const _Tpvec& a) \
1447 return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
1449 inline bool v_check_all(const _Tpvec& a) \
1450 { return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
1451 inline bool v_check_any(const _Tpvec& a) \
1452 { return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
1454 #define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
1455 inline __m128i v_packq_epi32(__m128i a)
1457 __m128i b = _mm_packs_epi32(a, a);
1458 return _mm_packs_epi16(b, b);
1461 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
1462 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
1463 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
1464 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
1465 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
1466 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
1467 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
1468 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
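// v_signmask collects the per-lane sign (or comparison-mask) bits into an int, and
// v_check_all / v_check_any report whether all / any lanes are set. For example:
//
//     v_float32x4 a(1.f, -2.f, 3.f, -4.f);
//     int m = v_signmask(a);   // 0b1010 == 10 (lanes 1 and 3 are negative)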
1471 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
1472 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1474 return _Tpvec(cast_ret(_mm_blendv_##suffix(cast(b.val), cast(a.val), cast(mask.val)))); \
1477 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1478 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1479 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1480 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, OPENCV_HAL_NOP, OPENCV_HAL_NOP, epi8)
1481 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1482 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, _mm_castps_si128, _mm_castsi128_ps, ps)
1483 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, TBD, TBD, pd)
1484 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, TBD, TBD, ps)
1485 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, OPENCV_HAL_NOP, OPENCV_HAL_NOP, ps)
1486 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, OPENCV_HAL_NOP, OPENCV_HAL_NOP, pd)
1490 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
1491 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
1493 return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
1496 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
1497 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
1498 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
1499 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
1500 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
1501 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
1502 // OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
1503 // OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
1504 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
1505 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
1508 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
1509 inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
1511 __m128i z = _mm_setzero_si128(); \
1512 b0.val = _mm_unpacklo_##suffix(a.val, z); \
1513 b1.val = _mm_unpackhi_##suffix(a.val, z); \
1515 inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
1517 __m128i z = _mm_setzero_si128(); \
1518 return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
1520 inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
1522 b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
1523 b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
1525 inline _Tpwsvec v_load_expand(const _Tps* ptr) \
1527 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
1528 return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
1531 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
1532 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
1534 inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
1536 __m128i z = _mm_setzero_si128();
1537 b0.val = _mm_unpacklo_epi32(a.val, z);
1538 b1.val = _mm_unpackhi_epi32(a.val, z);
1540 inline v_uint64x2 v_load_expand(const unsigned* ptr)
1542 __m128i z = _mm_setzero_si128();
1543 return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
1545 inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
1547 __m128i s = _mm_srai_epi32(a.val, 31);
1548 b0.val = _mm_unpacklo_epi32(a.val, s);
1549 b1.val = _mm_unpackhi_epi32(a.val, s);
1551 inline v_int64x2 v_load_expand(const int* ptr)
1553 __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
1554 __m128i s = _mm_srai_epi32(a, 31);
1555 return v_int64x2(_mm_unpacklo_epi32(a, s));
1558 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
1560 __m128i z = _mm_setzero_si128();
1561 __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
1562 return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
1565 inline v_int32x4 v_load_expand_q(const schar* ptr)
1567 __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
1568 a = _mm_unpacklo_epi8(a, a);
1569 a = _mm_unpacklo_epi8(a, a);
1570 return v_int32x4(_mm_srai_epi32(a, 24));
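// Usage sketch (illustrative only): widening 8-bit data before accumulation,
// assuming the generic API from intrin.hpp; `src8` is a placeholder pointer.
//
//     v_uint8x16 a = v_load(src8);                // 16 x u8
//     v_uint16x8 lo, hi;
//     v_expand(a, lo, hi);                        // zero-extend to 2 x (8 x u16)
//     v_uint16x8 b = v_load_expand(src8 + 16);    // load 8 x u8, widen to u16
//     v_uint32x4 c = v_load_expand_q(src8 + 24);  // load 4 x u8, widen to u32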
1573 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
1574 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
1576 b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
1577 b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
1579 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
1581 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1582 return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
1584 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
1586 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1587 return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
1589 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
1591 __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
1592 c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
1593 d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
1596 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1597 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1598 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1599 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1600 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1601 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1602 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
1603 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
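// Usage sketch (illustrative only): interleaving two planes with v_zip and
// splitting/joining halves with v_combine_*, assuming the generic API from intrin.hpp;
// `pa` and `pb` are placeholder pointers.
//
//     v_uint16x8 a = v_load(pa), b = v_load(pb);   // pa, pb: const ushort*
//     v_uint16x8 lo, hi;
//     v_zip(a, b, lo, hi);                         // lo = a0 b0 a1 b1 ..., hi = a4 b4 ...
//     v_uint16x8 low  = v_combine_low(a, b);       // a0..a3 b0..b3
//     v_uint16x8 high = v_combine_high(a, b);      // a4..a7 b4..b7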
1605 template<int s, typename _Tpvec>
1606 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
1608 return v_rotate_right<s>(a, b);
1611 inline v_int32x4 v_round(const v_float32x4& a)
1612 { return v_int32x4(_mm_cvtps_epi32(a.val)); }
1614 inline v_int32x4 v_floor(const v_float32x4& a)
1616 __m128i a1 = _mm_cvtps_epi32(a.val);
1617 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
1618 return v_int32x4(_mm_add_epi32(a1, mask));
1621 inline v_int32x4 v_ceil(const v_float32x4& a)
1623 __m128i a1 = _mm_cvtps_epi32(a.val);
1624 __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
1625 return v_int32x4(_mm_sub_epi32(a1, mask));
1628 inline v_int32x4 v_trunc(const v_float32x4& a)
1629 { return v_int32x4(_mm_cvttps_epi32(a.val)); }
1631 inline v_int32x4 v_round(const v_float64x2& a)
1632 { return v_int32x4(_mm_cvtpd_epi32(a.val)); }
1634 inline v_int32x4 v_floor(const v_float64x2& a)
1636 __m128i a1 = _mm_cvtpd_epi32(a.val);
1637 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
1638 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
1639 return v_int32x4(_mm_add_epi32(a1, mask));
1642 inline v_int32x4 v_ceil(const v_float64x2& a)
1644 __m128i a1 = _mm_cvtpd_epi32(a.val);
1645 __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
1646 mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
1647 return v_int32x4(_mm_sub_epi32(a1, mask));
1650 inline v_int32x4 v_trunc(const v_float64x2& a)
1651 { return v_int32x4(_mm_cvttpd_epi32(a.val)); }
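// Usage sketch (illustrative only): the four float->int conversions differ only in
// the rounding rule (nearest / down / up / toward zero); `fsrc` is a placeholder.
//
//     v_float32x4 v = v_load(fsrc);       // e.g. (1.5, -1.5, 2.7, -2.7)
//     v_int32x4 r = v_round(v);           // nearest (ties to even):  2, -2,  3, -3
//     v_int32x4 f = v_floor(v);           //                          1, -2,  2, -3
//     v_int32x4 c = v_ceil(v);            //                          2, -1,  3, -2
//     v_int32x4 t = v_trunc(v);           //                          1, -1,  2, -2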
1653 #define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1654 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1655 const _Tpvec& a2, const _Tpvec& a3, \
1656 _Tpvec& b0, _Tpvec& b1, \
1657 _Tpvec& b2, _Tpvec& b3) \
1659 __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
1660 __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
1661 __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
1662 __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
1664 b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
1665 b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
1666 b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
1667 b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
1670 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1671 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1672 OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
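// Usage sketch (illustrative only): in-register 4x4 transpose of a float tile,
// assuming the generic API; `m` is a placeholder for a row-major 4x4 block.
//
//     v_float32x4 r0 = v_load(m + 0), r1 = v_load(m + 4);
//     v_float32x4 r2 = v_load(m + 8), r3 = v_load(m + 12);
//     v_float32x4 c0, c1, c2, c3;
//     v_transpose4x4(r0, r1, r2, r3, c0, c1, c2, c3);  // c_i holds column i of the tile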
1674 // load deinterleave
1675 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
1677 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
1678 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
1680 __m128i t10 = _mm_unpacklo_epi8(t00, t01);
1681 __m128i t11 = _mm_unpackhi_epi8(t00, t01);
1683 __m128i t20 = _mm_unpacklo_epi8(t10, t11);
1684 __m128i t21 = _mm_unpackhi_epi8(t10, t11);
1686 __m128i t30 = _mm_unpacklo_epi8(t20, t21);
1687 __m128i t31 = _mm_unpackhi_epi8(t20, t21);
1689 a.val = _mm_unpacklo_epi8(t30, t31);
1690 b.val = _mm_unpackhi_epi8(t30, t31);
1693 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
#if CV_SSE4_1
1696 const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
1697 const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
1698 __m128i s0 = _mm_loadu_si128((const __m128i*)ptr);
1699 __m128i s1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
1700 __m128i s2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
1701 __m128i a0 = _mm_blendv_epi8(_mm_blendv_epi8(s0, s1, m0), s2, m1);
1702 __m128i b0 = _mm_blendv_epi8(_mm_blendv_epi8(s1, s2, m0), s0, m1);
1703 __m128i c0 = _mm_blendv_epi8(_mm_blendv_epi8(s2, s0, m0), s1, m1);
1704 const __m128i sh_b = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
1705 const __m128i sh_g = _mm_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14);
1706 const __m128i sh_r = _mm_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
1707 a0 = _mm_shuffle_epi8(a0, sh_b);
1708 b0 = _mm_shuffle_epi8(b0, sh_g);
1709 c0 = _mm_shuffle_epi8(c0, sh_r);
a.val = a0;
b.val = b0;
c.val = c0;
#elif CV_SSSE3
1714 const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
1715 const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
1716 const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
1718 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
1719 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
1720 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
1722 __m128i s0 = _mm_shuffle_epi8(t0, m0);
1723 __m128i s1 = _mm_shuffle_epi8(t1, m1);
1724 __m128i s2 = _mm_shuffle_epi8(t2, m2);
1726 t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
1727 a.val = _mm_alignr_epi8(s2, t0, 5);
1729 t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
1730 b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
1732 t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
1733 c.val = _mm_alignr_epi8(t2, s0, 11);
#else
1735 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
1736 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
1737 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
1739 __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
1740 __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
1741 __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));
1743 __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
1744 __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
1745 __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));
1747 __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
1748 __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
1749 __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));
1751 a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
1752 b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
1753 c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
#endif // CV_SSE4_1 / CV_SSSE3
1757 inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
1759 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
1760 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
1761 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
1762 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...
1764 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
1765 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
1766 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
1767 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...
1769 u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
1770 u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
1771 u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
1772 u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...
1774 v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
1775 v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
1776 v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
1777 v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...
1779 a.val = _mm_unpacklo_epi8(v0, v1);
1780 b.val = _mm_unpackhi_epi8(v0, v1);
1781 c.val = _mm_unpacklo_epi8(v2, v3);
1782 d.val = _mm_unpackhi_epi8(v2, v3);
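// Usage sketch (illustrative only): splitting packed RGBA pixels into planes,
// assuming the generic API; `rgba` is a placeholder pointing to 16 interleaved
// 4-byte pixels. The 3-channel overload does the same for packed BGR/RGB data.
//
//     v_uint8x16 r, g, b, a;
//     v_load_deinterleave(rgba, r, g, b, a);   // 64 bytes -> four 16-lane planes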
1785 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b)
1787 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1 a2 b2 a3 b3
1788 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8)); // a4 b4 a5 b5 a6 b6 a7 b7
1790 __m128i v2 = _mm_unpacklo_epi16(v0, v1); // a0 a4 b0 b4 a1 a5 b1 b5
1791 __m128i v3 = _mm_unpackhi_epi16(v0, v1); // a2 a6 b2 b6 a3 a7 b3 b7
1792 __m128i v4 = _mm_unpacklo_epi16(v2, v3); // a0 a2 a4 a6 b0 b2 b4 b6
1793 __m128i v5 = _mm_unpackhi_epi16(v2, v3); // a1 a3 a5 a7 b1 b3 b5 b7
1795 a.val = _mm_unpacklo_epi16(v4, v5); // a0 a1 a2 a3 a4 a5 a6 a7
1796 b.val = _mm_unpackhi_epi16(v4, v5); // b0 b1 b2 b3 b4 b5 b6 b7
1799 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
#if CV_SSE4_1
1802 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr));
1803 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 8));
1804 __m128i v2 = _mm_loadu_si128((__m128i*)(ptr + 16));
1805 __m128i a0 = _mm_blend_epi16(_mm_blend_epi16(v0, v1, 0x92), v2, 0x24);
1806 __m128i b0 = _mm_blend_epi16(_mm_blend_epi16(v2, v0, 0x92), v1, 0x24);
1807 __m128i c0 = _mm_blend_epi16(_mm_blend_epi16(v1, v2, 0x92), v0, 0x24);
1809 const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
1810 const __m128i sh_b = _mm_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
1811 const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
1812 a0 = _mm_shuffle_epi8(a0, sh_a);
1813 b0 = _mm_shuffle_epi8(b0, sh_b);
1814 c0 = _mm_shuffle_epi8(c0, sh_c);
a.val = a0;
b.val = b0;
c.val = c0;
#else
1820 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
1821 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
1822 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));
1824 __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
1825 __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
1826 __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));
1828 __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
1829 __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
1830 __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));
1832 a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
1833 b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
1834 c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
#endif // CV_SSE4_1
1838 inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
1840 __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
1841 __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
1842 __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
1843 __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...
1845 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
1846 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
1847 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
1848 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...
1850 u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
1851 u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
1852 u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
1853 u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...
1855 a.val = _mm_unpacklo_epi16(u0, u1);
1856 b.val = _mm_unpackhi_epi16(u0, u1);
1857 c.val = _mm_unpacklo_epi16(u2, u3);
1858 d.val = _mm_unpackhi_epi16(u2, u3);
1861 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b)
1863 __m128i v0 = _mm_loadu_si128((__m128i*)(ptr)); // a0 b0 a1 b1
1864 __m128i v1 = _mm_loadu_si128((__m128i*)(ptr + 4)); // a2 b2 a3 b3
1866 __m128i v2 = _mm_unpacklo_epi32(v0, v1); // a0 a2 b0 b2
1867 __m128i v3 = _mm_unpackhi_epi32(v0, v1); // a1 a3 b1 b3
1869 a.val = _mm_unpacklo_epi32(v2, v3); // a0 a1 a2 a3
1870 b.val = _mm_unpackhi_epi32(v2, v3); // b0 b1 b2 b3
1873 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
1875 __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
1876 __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
1877 __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));
1879 __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
1880 __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
1881 __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));
1883 a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
1884 b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
1885 c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
1888 inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
1890 v_uint32x4 s0(_mm_loadu_si128((const __m128i*)ptr)); // a0 b0 c0 d0
1891 v_uint32x4 s1(_mm_loadu_si128((const __m128i*)(ptr + 4))); // a1 b1 c1 d1
1892 v_uint32x4 s2(_mm_loadu_si128((const __m128i*)(ptr + 8))); // a2 b2 c2 d2
1893 v_uint32x4 s3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3
1895 v_transpose4x4(s0, s1, s2, s3, a, b, c, d);
1898 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
1900 const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
1902 __m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
1903 __m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
1905 a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
1906 b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 b2 b3
1909 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c)
1911 __m128 t0 = _mm_loadu_ps(ptr + 0);
1912 __m128 t1 = _mm_loadu_ps(ptr + 4);
1913 __m128 t2 = _mm_loadu_ps(ptr + 8);
1915 __m128 at12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 1, 0, 2));
1916 a.val = _mm_shuffle_ps(t0, at12, _MM_SHUFFLE(2, 0, 3, 0));
1918 __m128 bt01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 0, 0, 1));
1919 __m128 bt12 = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(0, 2, 0, 3));
1920 b.val = _mm_shuffle_ps(bt01, bt12, _MM_SHUFFLE(2, 0, 2, 0));
1922 __m128 ct01 = _mm_shuffle_ps(t0, t1, _MM_SHUFFLE(0, 1, 0, 2));
1923 c.val = _mm_shuffle_ps(ct01, t2, _MM_SHUFFLE(3, 0, 2, 0));
1926 inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d)
1928 __m128 t0 = _mm_loadu_ps(ptr + 0);
1929 __m128 t1 = _mm_loadu_ps(ptr + 4);
1930 __m128 t2 = _mm_loadu_ps(ptr + 8);
1931 __m128 t3 = _mm_loadu_ps(ptr + 12);
1932 __m128 t02lo = _mm_unpacklo_ps(t0, t2);
1933 __m128 t13lo = _mm_unpacklo_ps(t1, t3);
1934 __m128 t02hi = _mm_unpackhi_ps(t0, t2);
1935 __m128 t13hi = _mm_unpackhi_ps(t1, t3);
1936 a.val = _mm_unpacklo_ps(t02lo, t13lo);
1937 b.val = _mm_unpackhi_ps(t02lo, t13lo);
1938 c.val = _mm_unpacklo_ps(t02hi, t13hi);
1939 d.val = _mm_unpackhi_ps(t02hi, t13hi);
1942 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b)
1944 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
1945 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2));
1947 a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
1948 b = v_uint64x2(_mm_unpackhi_epi64(t0, t1));
1951 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c)
1953 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0, b0
1954 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0, a1
1955 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // b1, c1
1957 t1 = _mm_shuffle_epi32(t1, 0x4e); // a1, c0
1959 a = v_uint64x2(_mm_unpacklo_epi64(t0, t1));
1960 b = v_uint64x2(_mm_unpacklo_epi64(_mm_unpackhi_epi64(t0, t0), t2));
1961 c = v_uint64x2(_mm_unpackhi_epi64(t1, t2));
1964 inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a,
1965 v_uint64x2& b, v_uint64x2& c, v_uint64x2& d)
1967 __m128i t0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0
1968 __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 2)); // c0 d0
1969 __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 4)); // a1 b1
1970 __m128i t3 = _mm_loadu_si128((const __m128i*)(ptr + 6)); // c1 d1
1972 a = v_uint64x2(_mm_unpacklo_epi64(t0, t2));
1973 b = v_uint64x2(_mm_unpackhi_epi64(t0, t2));
1974 c = v_uint64x2(_mm_unpacklo_epi64(t1, t3));
1975 d = v_uint64x2(_mm_unpackhi_epi64(t1, t3));
1980 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
1981 hal::StoreMode mode = hal::STORE_UNALIGNED)
1983 __m128i v0 = _mm_unpacklo_epi8(a.val, b.val);
1984 __m128i v1 = _mm_unpackhi_epi8(a.val, b.val);
1986 if( mode == hal::STORE_ALIGNED_NOCACHE )
1988 _mm_stream_si128((__m128i*)(ptr), v0);
1989 _mm_stream_si128((__m128i*)(ptr + 16), v1);
1991 else if( mode == hal::STORE_ALIGNED )
1993 _mm_store_si128((__m128i*)(ptr), v0);
1994 _mm_store_si128((__m128i*)(ptr + 16), v1);
1998 _mm_storeu_si128((__m128i*)(ptr), v0);
1999 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2003 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2004 const v_uint8x16& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
#if CV_SSE4_1
2007 const __m128i sh_a = _mm_setr_epi8(0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2008 const __m128i sh_b = _mm_setr_epi8(5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2009 const __m128i sh_c = _mm_setr_epi8(10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2010 __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2011 __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2012 __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2014 const __m128i m0 = _mm_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2015 const __m128i m1 = _mm_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2016 __m128i v0 = _mm_blendv_epi8(_mm_blendv_epi8(a0, b0, m1), c0, m0);
2017 __m128i v1 = _mm_blendv_epi8(_mm_blendv_epi8(b0, c0, m1), a0, m0);
2018 __m128i v2 = _mm_blendv_epi8(_mm_blendv_epi8(c0, a0, m1), b0, m0);
#elif CV_SSSE3
2020 const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
2021 const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
2022 const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
2024 __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
2025 t0 = _mm_alignr_epi8(c.val, t0, 5);
2026 __m128i v0 = _mm_shuffle_epi8(t0, m0);
2028 __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
2029 t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
2030 __m128i v1 = _mm_shuffle_epi8(t1, m1);
2032 __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
2033 t2 = _mm_alignr_epi8(t2, a.val, 11);
2034 __m128i v2 = _mm_shuffle_epi8(t2, m2);
#else
2036 __m128i z = _mm_setzero_si128();
2037 __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
2038 __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
2039 __m128i c0 = _mm_unpacklo_epi8(c.val, z);
2040 __m128i c1 = _mm_unpackhi_epi8(c.val, z);
2042 __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
2043 __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
2044 __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
2045 __m128i p03 = _mm_unpackhi_epi16(ab1, c1);
2047 __m128i p10 = _mm_unpacklo_epi32(p00, p01);
2048 __m128i p11 = _mm_unpackhi_epi32(p00, p01);
2049 __m128i p12 = _mm_unpacklo_epi32(p02, p03);
2050 __m128i p13 = _mm_unpackhi_epi32(p02, p03);
2052 __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2053 __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2054 __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2055 __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2057 p20 = _mm_slli_si128(p20, 1);
2058 p22 = _mm_slli_si128(p22, 1);
2060 __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
2061 __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
2062 __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
2063 __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);
2065 __m128i p40 = _mm_unpacklo_epi64(p30, p31);
2066 __m128i p41 = _mm_unpackhi_epi64(p30, p31);
2067 __m128i p42 = _mm_unpacklo_epi64(p32, p33);
2068 __m128i p43 = _mm_unpackhi_epi64(p32, p33);
2070 __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
2071 __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
2072 __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));
#endif // CV_SSE4_1 / CV_SSSE3
2075 if( mode == hal::STORE_ALIGNED_NOCACHE )
2077 _mm_stream_si128((__m128i*)(ptr), v0);
2078 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2079 _mm_stream_si128((__m128i*)(ptr + 32), v2);
2081 else if( mode == hal::STORE_ALIGNED )
2083 _mm_store_si128((__m128i*)(ptr), v0);
2084 _mm_store_si128((__m128i*)(ptr + 16), v1);
2085 _mm_store_si128((__m128i*)(ptr + 32), v2);
2089 _mm_storeu_si128((__m128i*)(ptr), v0);
2090 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2091 _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2095 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
2096 const v_uint8x16& c, const v_uint8x16& d,
2097 hal::StoreMode mode = hal::STORE_UNALIGNED)
2103 __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
2104 __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
2105 __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
2106 __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...
2108 __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
2109 __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
2110 __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
2111 __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...
2113 if( mode == hal::STORE_ALIGNED_NOCACHE )
2115 _mm_stream_si128((__m128i*)(ptr), v0);
2116 _mm_stream_si128((__m128i*)(ptr + 16), v1);
2117 _mm_stream_si128((__m128i*)(ptr + 32), v2);
2118 _mm_stream_si128((__m128i*)(ptr + 48), v3);
2120 else if( mode == hal::STORE_ALIGNED )
2122 _mm_store_si128((__m128i*)(ptr), v0);
2123 _mm_store_si128((__m128i*)(ptr + 16), v1);
2124 _mm_store_si128((__m128i*)(ptr + 32), v2);
2125 _mm_store_si128((__m128i*)(ptr + 48), v3);
2129 _mm_storeu_si128((__m128i*)(ptr), v0);
2130 _mm_storeu_si128((__m128i*)(ptr + 16), v1);
2131 _mm_storeu_si128((__m128i*)(ptr + 32), v2);
2132 _mm_storeu_si128((__m128i*)(ptr + 48), v3);
2136 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2137 hal::StoreMode mode = hal::STORE_UNALIGNED)
2139 __m128i v0 = _mm_unpacklo_epi16(a.val, b.val);
2140 __m128i v1 = _mm_unpackhi_epi16(a.val, b.val);
2142 if( mode == hal::STORE_ALIGNED_NOCACHE )
2144 _mm_stream_si128((__m128i*)(ptr), v0);
2145 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2147 else if( mode == hal::STORE_ALIGNED )
2149 _mm_store_si128((__m128i*)(ptr), v0);
2150 _mm_store_si128((__m128i*)(ptr + 8), v1);
2154 _mm_storeu_si128((__m128i*)(ptr), v0);
2155 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2159 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
2160 const v_uint16x8& b, const v_uint16x8& c,
2161 hal::StoreMode mode = hal::STORE_UNALIGNED)
#if CV_SSE4_1
2164 const __m128i sh_a = _mm_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2165 const __m128i sh_b = _mm_setr_epi8(10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2166 const __m128i sh_c = _mm_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2167 __m128i a0 = _mm_shuffle_epi8(a.val, sh_a);
2168 __m128i b0 = _mm_shuffle_epi8(b.val, sh_b);
2169 __m128i c0 = _mm_shuffle_epi8(c.val, sh_c);
2171 __m128i v0 = _mm_blend_epi16(_mm_blend_epi16(a0, b0, 0x92), c0, 0x24);
2172 __m128i v1 = _mm_blend_epi16(_mm_blend_epi16(c0, a0, 0x92), b0, 0x24);
2173 __m128i v2 = _mm_blend_epi16(_mm_blend_epi16(b0, c0, 0x92), a0, 0x24);
#else
2175 __m128i z = _mm_setzero_si128();
2176 __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
2177 __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
2178 __m128i c0 = _mm_unpacklo_epi16(c.val, z);
2179 __m128i c1 = _mm_unpackhi_epi16(c.val, z);
2181 __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
2182 __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
2183 __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
2184 __m128i p13 = _mm_unpackhi_epi32(ab1, c1);
2186 __m128i p20 = _mm_unpacklo_epi64(p10, p11);
2187 __m128i p21 = _mm_unpackhi_epi64(p10, p11);
2188 __m128i p22 = _mm_unpacklo_epi64(p12, p13);
2189 __m128i p23 = _mm_unpackhi_epi64(p12, p13);
2191 p20 = _mm_slli_si128(p20, 2);
2192 p22 = _mm_slli_si128(p22, 2);
2194 __m128i p30 = _mm_unpacklo_epi64(p20, p21);
2195 __m128i p31 = _mm_unpackhi_epi64(p20, p21);
2196 __m128i p32 = _mm_unpacklo_epi64(p22, p23);
2197 __m128i p33 = _mm_unpackhi_epi64(p22, p23);
2199 __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
2200 __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
2201 __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));
#endif // CV_SSE4_1
2203 if( mode == hal::STORE_ALIGNED_NOCACHE )
2205 _mm_stream_si128((__m128i*)(ptr), v0);
2206 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2207 _mm_stream_si128((__m128i*)(ptr + 16), v2);
2209 else if( mode == hal::STORE_ALIGNED )
2211 _mm_store_si128((__m128i*)(ptr), v0);
2212 _mm_store_si128((__m128i*)(ptr + 8), v1);
2213 _mm_store_si128((__m128i*)(ptr + 16), v2);
2217 _mm_storeu_si128((__m128i*)(ptr), v0);
2218 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2219 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2223 inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
2224 const v_uint16x8& c, const v_uint16x8& d,
2225 hal::StoreMode mode = hal::STORE_UNALIGNED)
2231 __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
2232 __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
2233 __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
2234 __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...
2236 __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
2237 __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
2238 __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
2239 __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...
2241 if( mode == hal::STORE_ALIGNED_NOCACHE )
2243 _mm_stream_si128((__m128i*)(ptr), v0);
2244 _mm_stream_si128((__m128i*)(ptr + 8), v1);
2245 _mm_stream_si128((__m128i*)(ptr + 16), v2);
2246 _mm_stream_si128((__m128i*)(ptr + 24), v3);
2248 else if( mode == hal::STORE_ALIGNED )
2250 _mm_store_si128((__m128i*)(ptr), v0);
2251 _mm_store_si128((__m128i*)(ptr + 8), v1);
2252 _mm_store_si128((__m128i*)(ptr + 16), v2);
2253 _mm_store_si128((__m128i*)(ptr + 24), v3);
2257 _mm_storeu_si128((__m128i*)(ptr), v0);
2258 _mm_storeu_si128((__m128i*)(ptr + 8), v1);
2259 _mm_storeu_si128((__m128i*)(ptr + 16), v2);
2260 _mm_storeu_si128((__m128i*)(ptr + 24), v3);
2264 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2265 hal::StoreMode mode = hal::STORE_UNALIGNED)
2267 __m128i v0 = _mm_unpacklo_epi32(a.val, b.val);
2268 __m128i v1 = _mm_unpackhi_epi32(a.val, b.val);
2270 if( mode == hal::STORE_ALIGNED_NOCACHE )
2272 _mm_stream_si128((__m128i*)(ptr), v0);
2273 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2275 else if( mode == hal::STORE_ALIGNED )
2277 _mm_store_si128((__m128i*)(ptr), v0);
2278 _mm_store_si128((__m128i*)(ptr + 4), v1);
2282 _mm_storeu_si128((__m128i*)(ptr), v0);
2283 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2287 inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2288 const v_uint32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2290 v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
2291 v_transpose4x4(a, b, c, z, u0, u1, u2, u3);
2293 __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
2294 __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
2295 __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));
2297 if( mode == hal::STORE_ALIGNED_NOCACHE )
2299 _mm_stream_si128((__m128i*)(ptr), v0);
2300 _mm_stream_si128((__m128i*)(ptr + 4), v1);
2301 _mm_stream_si128((__m128i*)(ptr + 8), v2);
2303 else if( mode == hal::STORE_ALIGNED )
2305 _mm_store_si128((__m128i*)(ptr), v0);
2306 _mm_store_si128((__m128i*)(ptr + 4), v1);
2307 _mm_store_si128((__m128i*)(ptr + 8), v2);
2311 _mm_storeu_si128((__m128i*)(ptr), v0);
2312 _mm_storeu_si128((__m128i*)(ptr + 4), v1);
2313 _mm_storeu_si128((__m128i*)(ptr + 8), v2);
2317 inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
2318 const v_uint32x4& c, const v_uint32x4& d,
2319 hal::StoreMode mode = hal::STORE_UNALIGNED)
2321 v_uint32x4 v0, v1, v2, v3;
2322 v_transpose4x4(a, b, c, d, v0, v1, v2, v3);
2324 if( mode == hal::STORE_ALIGNED_NOCACHE )
2326 _mm_stream_si128((__m128i*)(ptr), v0.val);
2327 _mm_stream_si128((__m128i*)(ptr + 4), v1.val);
2328 _mm_stream_si128((__m128i*)(ptr + 8), v2.val);
2329 _mm_stream_si128((__m128i*)(ptr + 12), v3.val);
2331 else if( mode == hal::STORE_ALIGNED )
2333 _mm_store_si128((__m128i*)(ptr), v0.val);
2334 _mm_store_si128((__m128i*)(ptr + 4), v1.val);
2335 _mm_store_si128((__m128i*)(ptr + 8), v2.val);
2336 _mm_store_si128((__m128i*)(ptr + 12), v3.val);
2340 _mm_storeu_si128((__m128i*)(ptr), v0.val);
2341 _mm_storeu_si128((__m128i*)(ptr + 4), v1.val);
2342 _mm_storeu_si128((__m128i*)(ptr + 8), v2.val);
2343 _mm_storeu_si128((__m128i*)(ptr + 12), v3.val);
2347 // 2-channel, float only
2348 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2349 hal::StoreMode mode = hal::STORE_UNALIGNED)
2351 __m128 v0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
2352 __m128 v1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
2354 if( mode == hal::STORE_ALIGNED_NOCACHE )
2356 _mm_stream_ps(ptr, v0);
2357 _mm_stream_ps(ptr + 4, v1);
2359 else if( mode == hal::STORE_ALIGNED )
2361 _mm_store_ps(ptr, v0);
2362 _mm_store_ps(ptr + 4, v1);
2366 _mm_storeu_ps(ptr, v0);
2367 _mm_storeu_ps(ptr + 4, v1);
2371 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2372 const v_float32x4& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2374 __m128 u0 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(0, 0, 0, 0));
2375 __m128 u1 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(1, 1, 0, 0));
2376 __m128 v0 = _mm_shuffle_ps(u0, u1, _MM_SHUFFLE(2, 0, 2, 0));
2377 __m128 u2 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(1, 1, 1, 1));
2378 __m128 u3 = _mm_shuffle_ps(a.val, b.val, _MM_SHUFFLE(2, 2, 2, 2));
2379 __m128 v1 = _mm_shuffle_ps(u2, u3, _MM_SHUFFLE(2, 0, 2, 0));
2380 __m128 u4 = _mm_shuffle_ps(c.val, a.val, _MM_SHUFFLE(3, 3, 2, 2));
2381 __m128 u5 = _mm_shuffle_ps(b.val, c.val, _MM_SHUFFLE(3, 3, 3, 3));
2382 __m128 v2 = _mm_shuffle_ps(u4, u5, _MM_SHUFFLE(2, 0, 2, 0));
2384 if( mode == hal::STORE_ALIGNED_NOCACHE )
2386 _mm_stream_ps(ptr, v0);
2387 _mm_stream_ps(ptr + 4, v1);
2388 _mm_stream_ps(ptr + 8, v2);
2390 else if( mode == hal::STORE_ALIGNED )
2392 _mm_store_ps(ptr, v0);
2393 _mm_store_ps(ptr + 4, v1);
2394 _mm_store_ps(ptr + 8, v2);
2398 _mm_storeu_ps(ptr, v0);
2399 _mm_storeu_ps(ptr + 4, v1);
2400 _mm_storeu_ps(ptr + 8, v2);
2404 inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b,
2405 const v_float32x4& c, const v_float32x4& d,
2406 hal::StoreMode mode = hal::STORE_UNALIGNED)
2408 __m128 u0 = _mm_unpacklo_ps(a.val, c.val);
2409 __m128 u1 = _mm_unpacklo_ps(b.val, d.val);
2410 __m128 u2 = _mm_unpackhi_ps(a.val, c.val);
2411 __m128 u3 = _mm_unpackhi_ps(b.val, d.val);
2412 __m128 v0 = _mm_unpacklo_ps(u0, u1);
2413 __m128 v2 = _mm_unpacklo_ps(u2, u3);
2414 __m128 v1 = _mm_unpackhi_ps(u0, u1);
2415 __m128 v3 = _mm_unpackhi_ps(u2, u3);
2417 if( mode == hal::STORE_ALIGNED_NOCACHE )
2419 _mm_stream_ps(ptr, v0);
2420 _mm_stream_ps(ptr + 4, v1);
2421 _mm_stream_ps(ptr + 8, v2);
2422 _mm_stream_ps(ptr + 12, v3);
2424 else if( mode == hal::STORE_ALIGNED )
2426 _mm_store_ps(ptr, v0);
2427 _mm_store_ps(ptr + 4, v1);
2428 _mm_store_ps(ptr + 8, v2);
2429 _mm_store_ps(ptr + 12, v3);
2433 _mm_storeu_ps(ptr, v0);
2434 _mm_storeu_ps(ptr + 4, v1);
2435 _mm_storeu_ps(ptr + 8, v2);
2436 _mm_storeu_ps(ptr + 12, v3);
2440 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2441 hal::StoreMode mode = hal::STORE_UNALIGNED)
2443 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2444 __m128i v1 = _mm_unpackhi_epi64(a.val, b.val);
2446 if( mode == hal::STORE_ALIGNED_NOCACHE )
2448 _mm_stream_si128((__m128i*)(ptr), v0);
2449 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2451 else if( mode == hal::STORE_ALIGNED )
2453 _mm_store_si128((__m128i*)(ptr), v0);
2454 _mm_store_si128((__m128i*)(ptr + 2), v1);
2458 _mm_storeu_si128((__m128i*)(ptr), v0);
2459 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2463 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2464 const v_uint64x2& c, hal::StoreMode mode = hal::STORE_UNALIGNED)
2466 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2467 __m128i v1 = _mm_unpacklo_epi64(c.val, _mm_unpackhi_epi64(a.val, a.val));
2468 __m128i v2 = _mm_unpackhi_epi64(b.val, c.val);
2470 if( mode == hal::STORE_ALIGNED_NOCACHE )
2472 _mm_stream_si128((__m128i*)(ptr), v0);
2473 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2474 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2476 else if( mode == hal::STORE_ALIGNED )
2478 _mm_store_si128((__m128i*)(ptr), v0);
2479 _mm_store_si128((__m128i*)(ptr + 2), v1);
2480 _mm_store_si128((__m128i*)(ptr + 4), v2);
2484 _mm_storeu_si128((__m128i*)(ptr), v0);
2485 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2486 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2490 inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b,
2491 const v_uint64x2& c, const v_uint64x2& d,
2492 hal::StoreMode mode = hal::STORE_UNALIGNED)
2494 __m128i v0 = _mm_unpacklo_epi64(a.val, b.val);
2495 __m128i v1 = _mm_unpacklo_epi64(c.val, d.val);
2496 __m128i v2 = _mm_unpackhi_epi64(a.val, b.val);
2497 __m128i v3 = _mm_unpackhi_epi64(c.val, d.val);
2499 if( mode == hal::STORE_ALIGNED_NOCACHE )
2501 _mm_stream_si128((__m128i*)(ptr), v0);
2502 _mm_stream_si128((__m128i*)(ptr + 2), v1);
2503 _mm_stream_si128((__m128i*)(ptr + 4), v2);
2504 _mm_stream_si128((__m128i*)(ptr + 6), v3);
2506 else if( mode == hal::STORE_ALIGNED )
2508 _mm_store_si128((__m128i*)(ptr), v0);
2509 _mm_store_si128((__m128i*)(ptr + 2), v1);
2510 _mm_store_si128((__m128i*)(ptr + 4), v2);
2511 _mm_store_si128((__m128i*)(ptr + 6), v3);
2515 _mm_storeu_si128((__m128i*)(ptr), v0);
2516 _mm_storeu_si128((__m128i*)(ptr + 2), v1);
2517 _mm_storeu_si128((__m128i*)(ptr + 4), v2);
2518 _mm_storeu_si128((__m128i*)(ptr + 6), v3);
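// Usage sketch (illustrative only): re-packing planes into interleaved output with
// the overloads above, assuming the generic API; `dst`, `b`, `g`, `r` are placeholders.
// When `dst` is 16-byte aligned, the aligned or streaming modes can be requested.
//
//     v_store_interleave(dst, b, g, r);                              // unaligned (default)
//     v_store_interleave(dst, b, g, r, hal::STORE_ALIGNED);          // aligned store
//     v_store_interleave(dst, b, g, r, hal::STORE_ALIGNED_NOCACHE);  // non-temporal store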
2522 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2523 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2526 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2527 a0 = v_reinterpret_as_##suffix0(a1); \
2528 b0 = v_reinterpret_as_##suffix0(b1); \
2530 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2532 _Tpvec1 a1, b1, c1; \
2533 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2534 a0 = v_reinterpret_as_##suffix0(a1); \
2535 b0 = v_reinterpret_as_##suffix0(b1); \
2536 c0 = v_reinterpret_as_##suffix0(c1); \
2538 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2540 _Tpvec1 a1, b1, c1, d1; \
2541 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2542 a0 = v_reinterpret_as_##suffix0(a1); \
2543 b0 = v_reinterpret_as_##suffix0(b1); \
2544 c0 = v_reinterpret_as_##suffix0(c1); \
2545 d0 = v_reinterpret_as_##suffix0(d1); \
2547 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2548 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2550 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2551 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2552 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2554 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2555 const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2557 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2558 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2559 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2560 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2562 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2563 const _Tpvec0& c0, const _Tpvec0& d0, \
2564 hal::StoreMode mode = hal::STORE_UNALIGNED ) \
2566 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2567 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2568 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2569 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2570 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2573 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
2574 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
2575 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
2576 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64)
2577 OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, v_uint64x2, uint64, u64)
2579 inline v_float32x4 v_cvt_f32(const v_int32x4& a)
2581 return v_float32x4(_mm_cvtepi32_ps(a.val));
2584 inline v_float32x4 v_cvt_f32(const v_float64x2& a)
2586 return v_float32x4(_mm_cvtpd_ps(a.val));
2589 inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b)
2591 return v_float32x4(_mm_movelh_ps(_mm_cvtpd_ps(a.val), _mm_cvtpd_ps(b.val)));
2594 inline v_float64x2 v_cvt_f64(const v_int32x4& a)
2596 return v_float64x2(_mm_cvtepi32_pd(a.val));
2599 inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
2601 return v_float64x2(_mm_cvtepi32_pd(_mm_srli_si128(a.val,8)));
2604 inline v_float64x2 v_cvt_f64(const v_float32x4& a)
2606 return v_float64x2(_mm_cvtps_pd(a.val));
2609 inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
2611 return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
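// Usage sketch (illustrative only): widening/narrowing between int, float and double
// lanes with the conversions above; `isrc` is a placeholder pointer.
//
//     v_int32x4   i  = v_load(isrc);
//     v_float32x4 f  = v_cvt_f32(i);           // 4 x int32  -> 4 x float
//     v_float64x2 d0 = v_cvt_f64(f);           // low  2 floats -> 2 x double
//     v_float64x2 d1 = v_cvt_f64_high(f);      // high 2 floats -> 2 x double
//     v_float32x4 g  = v_cvt_f32(d0, d1);      // back to 4 x float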
#if CV_FP16
2615 inline v_float32x4 v_cvt_f32(const v_float16x8& a)
2617 return v_float32x4(_mm_cvtph_ps(a.val));
2620 inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
2622 return v_float32x4(_mm_cvtph_ps(_mm_unpackhi_epi64(a.val, a.val)));
2625 inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
2627 return v_float16x8(_mm_unpacklo_epi64(_mm_cvtps_ph(a.val, 0), _mm_cvtps_ph(b.val, 0)));
#endif // CV_FP16
2631 ////////////// Lookup table access ////////////////////
2633 inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
2635 int CV_DECL_ALIGNED(32) idx[4];
2636 v_store_aligned(idx, idxvec);
2637 return v_int32x4(_mm_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
2640 inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
2642 int CV_DECL_ALIGNED(32) idx[4];
2643 v_store_aligned(idx, idxvec);
2644 return v_float32x4(_mm_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
2647 inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
int idx[2];
2650 v_store_low(idx, idxvec);
2651 return v_float64x2(_mm_setr_pd(tab[idx[0]], tab[idx[1]]));
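// Usage sketch (illustrative only): gathering table entries by per-lane index with
// the v_lut overloads above; `lut` and `offsets` are placeholders.
//
//     v_int32x4   idx = v_load(offsets);       // 4 indices into the table
//     v_float32x4 val = v_lut(lut, idx);       // (lut[idx0], lut[idx1], lut[idx2], lut[idx3])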
2654 // loads pairs from the table and deinterleaves them, e.g. returns:
2655 //   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
2656 //   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
2657 // note that the indices are indices of individual floats, not of float pairs.
2658 // in theory, this function can be used to implement bilinear interpolation,
2659 // when idxvec contains the offsets within the image (a sketch follows the float overload below).
2660 inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
2662 int CV_DECL_ALIGNED(32) idx[4];
2663 v_store_aligned(idx, idxvec);
2664 __m128 z = _mm_setzero_ps();
2665 __m128 xy01 = _mm_loadl_pi(z, (__m64*)(tab + idx[0]));
2666 __m128 xy23 = _mm_loadl_pi(z, (__m64*)(tab + idx[2]));
2667 xy01 = _mm_loadh_pi(xy01, (__m64*)(tab + idx[1]));
2668 xy23 = _mm_loadh_pi(xy23, (__m64*)(tab + idx[3]));
2669 __m128 xxyy02 = _mm_unpacklo_ps(xy01, xy23);
2670 __m128 xxyy13 = _mm_unpackhi_ps(xy01, xy23);
2671 x = v_float32x4(_mm_unpacklo_ps(xxyy02, xxyy13));
2672 y = v_float32x4(_mm_unpackhi_ps(xxyy02, xxyy13));
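// Bilinear-interpolation sketch referenced in the comment above (illustrative only,
// assuming the generic API from intrin.hpp). `row0`/`row1` point to two image rows,
// `ofs` holds four per-pixel column offsets, `wx`/`wy` are the fractional weights;
// all of these names are placeholders.
//
//     v_int32x4 idx = v_load(ofs);
//     v_float32x4 x0, x1, y0, y1;
//     v_lut_deinterleave(row0, idx, x0, x1);           // x0 = row0[ofs], x1 = row0[ofs+1]
//     v_lut_deinterleave(row1, idx, y0, y1);           // y0 = row1[ofs], y1 = row1[ofs+1]
//     v_float32x4 top = v_muladd(x1 - x0, wx, x0);     // horizontal blend, row 0
//     v_float32x4 bot = v_muladd(y1 - y0, wx, y0);     // horizontal blend, row 1
//     v_float32x4 res = v_muladd(bot - top, wy, top);  // vertical blend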
2675 inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
int idx[2];
2678 v_store_low(idx, idxvec);
2679 __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
2680 __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
2681 x = v_float64x2(_mm_unpacklo_pd(xy0, xy1));
2682 y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
2685 inline void v_cleanup() {}
2687 //! @name Check SIMD support
2689 //! @brief Check CPU capability of SIMD operation
2690 static inline bool hasSIMD128()
2692 return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false;
2697 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END