1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html
5 #ifndef OPENCV_HAL_INTRIN_AVX_HPP
6 #define OPENCV_HAL_INTRIN_AVX_HPP
8 #define CV_SIMD256 1
9 #define CV_SIMD256_64F 1
11 namespace cv
12 {
16 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
18 ///////// Utils ////////////
20 inline __m256i _v256_combine(const __m128i& lo, const __m128i& hi)
21 { return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); }
23 inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
24 { return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1); }
26 inline __m256d _v256_combine(const __m128d& lo, const __m128d& hi)
27 { return _mm256_insertf128_pd(_mm256_castpd128_pd256(lo), hi, 1); }
29 inline int _v_cvtsi256_si32(const __m256i& a)
30 { return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); }
32 inline __m256i _v256_shuffle_odd_64(const __m256i& v)
33 { return _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)); }
35 inline __m256d _v256_shuffle_odd_64(const __m256d& v)
36 { return _mm256_permute4x64_pd(v, _MM_SHUFFLE(3, 1, 2, 0)); }
38 template<int imm>
39 inline __m256i _v256_permute2x128(const __m256i& a, const __m256i& b)
40 { return _mm256_permute2x128_si256(a, b, imm); }
42 template<int imm>
43 inline __m256 _v256_permute2x128(const __m256& a, const __m256& b)
44 { return _mm256_permute2f128_ps(a, b, imm); }
46 template<int imm>
47 inline __m256d _v256_permute2x128(const __m256d& a, const __m256d& b)
48 { return _mm256_permute2f128_pd(a, b, imm); }
50 template<int imm, typename _Tpvec>
51 inline _Tpvec v256_permute2x128(const _Tpvec& a, const _Tpvec& b)
52 { return _Tpvec(_v256_permute2x128<imm>(a.val, b.val)); }
54 template<int imm>
55 inline __m256i _v256_permute4x64(const __m256i& a)
56 { return _mm256_permute4x64_epi64(a, imm); }
58 template<int imm>
59 inline __m256d _v256_permute4x64(const __m256d& a)
60 { return _mm256_permute4x64_pd(a, imm); }
62 template<int imm, typename _Tpvec>
63 inline _Tpvec v256_permute4x64(const _Tpvec& a)
64 { return _Tpvec(_v256_permute4x64<imm>(a.val)); }
66 inline __m128i _v256_extract_high(const __m256i& v)
67 { return _mm256_extracti128_si256(v, 1); }
69 inline __m128 _v256_extract_high(const __m256& v)
70 { return _mm256_extractf128_ps(v, 1); }
72 inline __m128d _v256_extract_high(const __m256d& v)
73 { return _mm256_extractf128_pd(v, 1); }
75 inline __m128i _v256_extract_low(const __m256i& v)
76 { return _mm256_castsi256_si128(v); }
78 inline __m128 _v256_extract_low(const __m256& v)
79 { return _mm256_castps256_ps128(v); }
81 inline __m128d _v256_extract_low(const __m256d& v)
82 { return _mm256_castpd256_pd128(v); }
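// Illustrative note (added commentary, not part of the original header): these helpers
// paper over the fact that most AVX2 instructions operate on two independent 128-bit
// lanes. A minimal sketch of how they compose, assuming an AVX2-capable build:
//
//     __m128i lo = _mm_set1_epi32(1), hi = _mm_set1_epi32(2);
//     __m256i v  = _v256_combine(lo, hi);          // [1 1 1 1 | 2 2 2 2]
//     __m128i l  = _v256_extract_low(v);           // back to [1 1 1 1]
//     __m128i h  = _v256_extract_high(v);          // [2 2 2 2]
//     int first  = _v_cvtsi256_si32(v);            // lowest 32-bit lane, here 1
//     __m256i d  = _v256_shuffle_odd_64(v);        // 64-bit chunks reordered as 0, 2, 1, 3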
84 ///////// Types ////////////
88 typedef uchar lane_type;
92 explicit v_uint8x32(__m256i v) : val(v) {}
93 v_uint8x32(uchar v0, uchar v1, uchar v2, uchar v3,
94 uchar v4, uchar v5, uchar v6, uchar v7,
95 uchar v8, uchar v9, uchar v10, uchar v11,
96 uchar v12, uchar v13, uchar v14, uchar v15,
97 uchar v16, uchar v17, uchar v18, uchar v19,
98 uchar v20, uchar v21, uchar v22, uchar v23,
99 uchar v24, uchar v25, uchar v26, uchar v27,
100 uchar v28, uchar v29, uchar v30, uchar v31)
102 val = _mm256_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
103 (char)v4, (char)v5, (char)v6 , (char)v7, (char)v8, (char)v9,
104 (char)v10, (char)v11, (char)v12, (char)v13, (char)v14, (char)v15,
105 (char)v16, (char)v17, (char)v18, (char)v19, (char)v20, (char)v21,
106 (char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
107 (char)v28, (char)v29, (char)v30, (char)v31);
109 v_uint8x32() : val(_mm256_setzero_si256()) {}
110 uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
115 typedef schar lane_type;
116 enum { nlanes = 32 };
119 explicit v_int8x32(__m256i v) : val(v) {}
120 v_int8x32(schar v0, schar v1, schar v2, schar v3,
121 schar v4, schar v5, schar v6, schar v7,
122 schar v8, schar v9, schar v10, schar v11,
123 schar v12, schar v13, schar v14, schar v15,
124 schar v16, schar v17, schar v18, schar v19,
125 schar v20, schar v21, schar v22, schar v23,
126 schar v24, schar v25, schar v26, schar v27,
127 schar v28, schar v29, schar v30, schar v31)
129 val = _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
130 v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
131 v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
133 v_int8x32() : val(_mm256_setzero_si256()) {}
134 schar get0() const { return (schar)_v_cvtsi256_si32(val); }
139 typedef ushort lane_type;
140 enum { nlanes = 16 };
143 explicit v_uint16x16(__m256i v) : val(v) {}
144 v_uint16x16(ushort v0, ushort v1, ushort v2, ushort v3,
145 ushort v4, ushort v5, ushort v6, ushort v7,
146 ushort v8, ushort v9, ushort v10, ushort v11,
147 ushort v12, ushort v13, ushort v14, ushort v15)
149 val = _mm256_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
150 (short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
151 (short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
153 v_uint16x16() : val(_mm256_setzero_si256()) {}
154 ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
159 typedef short lane_type;
160 enum { nlanes = 16 };
163 explicit v_int16x16(__m256i v) : val(v) {}
164 v_int16x16(short v0, short v1, short v2, short v3,
165 short v4, short v5, short v6, short v7,
166 short v8, short v9, short v10, short v11,
167 short v12, short v13, short v14, short v15)
169 val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
170 v8, v9, v10, v11, v12, v13, v14, v15);
172 v_int16x16() : val(_mm256_setzero_si256()) {}
173 short get0() const { return (short)_v_cvtsi256_si32(val); }
178 typedef unsigned lane_type;
182 explicit v_uint32x8(__m256i v) : val(v) {}
183 v_uint32x8(unsigned v0, unsigned v1, unsigned v2, unsigned v3,
184 unsigned v4, unsigned v5, unsigned v6, unsigned v7)
186 val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
187 (unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
189 v_uint32x8() : val(_mm256_setzero_si256()) {}
190 unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
195 typedef int lane_type;
199 explicit v_int32x8(__m256i v) : val(v) {}
200 v_int32x8(int v0, int v1, int v2, int v3,
201 int v4, int v5, int v6, int v7)
203 val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
205 v_int32x8() : val(_mm256_setzero_si256()) {}
206 int get0() const { return _v_cvtsi256_si32(val); }
211 typedef float lane_type;
215 explicit v_float32x8(__m256 v) : val(v) {}
216 v_float32x8(float v0, float v1, float v2, float v3,
217 float v4, float v5, float v6, float v7)
219 val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
221 v_float32x8() : val(_mm256_setzero_ps()) {}
222 float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
227 typedef uint64 lane_type;
231 explicit v_uint64x4(__m256i v) : val(v) {}
232 v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
233 { val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
234 v_uint64x4() : val(_mm256_setzero_si256()) {}
235 uint64 get0() const
236 { return (uint64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); }
241 typedef int64 lane_type;
245 explicit v_int64x4(__m256i v) : val(v) {}
246 v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
247 { val = _mm256_setr_epi64x(v0, v1, v2, v3); }
248 v_int64x4() : val(_mm256_setzero_si256()) {}
249 int64 get0() const { return (int64)_mm_cvtsi128_si64(_mm256_castsi256_si128(val)); }
254 typedef double lane_type;
258 explicit v_float64x4(__m256d v) : val(v) {}
259 v_float64x4(double v0, double v1, double v2, double v3)
260 { val = _mm256_setr_pd(v0, v1, v2, v3); }
261 v_float64x4() : val(_mm256_setzero_pd()) {}
262 double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
267 typedef short lane_type;
268 enum { nlanes = 16 };
271 explicit v_float16x16(__m256i v) : val(v) {}
272 v_float16x16(short v0, short v1, short v2, short v3,
273 short v4, short v5, short v6, short v7,
274 short v8, short v9, short v10, short v11,
275 short v12, short v13, short v14, short v15)
277 val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
279 v_float16x16() : val(_mm256_setzero_si256()) {}
280 short get0() const { return (short)_v_cvtsi256_si32(val); }
282 inline v_float16x16 v256_setzero_f16() { return v_float16x16(_mm256_setzero_si256()); }
283 inline v_float16x16 v256_setall_f16(short val) { return v_float16x16(_mm256_set1_epi16(val)); }
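// Added note (not part of the original header): each v_*x* wrapper above follows the same
// pattern, with the struct headers abridged in this listing: a `lane_type` typedef, an
// `nlanes` enum, a raw `__m256i`/`__m256`/`__m256d` member named `val`, an explicit
// constructor from the native register, a lane-wise constructor, a zero-initializing default
// constructor, and get0() returning lane 0. Hedged usage sketch:
//
//     v_int32x8 a(1, 2, 3, 4, 5, 6, 7, 8);     // lanes set via _mm256_setr_epi32
//     int first = a.get0();                    // 1, the lowest lane
//     v_float32x8 z;                           // default-constructed: all lanes 0.f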
285 //////////////// Load and store operations ///////////////
287 #define OPENCV_HAL_IMPL_AVX_LOADSTORE(_Tpvec, _Tp) \
288 inline _Tpvec v256_load(const _Tp* ptr) \
289 { return _Tpvec(_mm256_loadu_si256((const __m256i*)ptr)); } \
290 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
291 { return _Tpvec(_mm256_load_si256((const __m256i*)ptr)); } \
292 inline _Tpvec v256_load_low(const _Tp* ptr) \
294 __m128i v128 = _mm_loadu_si128((const __m128i*)ptr); \
295 return _Tpvec(_mm256_castsi128_si256(v128)); \
297 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
299 __m128i vlo = _mm_loadu_si128((const __m128i*)ptr0); \
300 __m128i vhi = _mm_loadu_si128((const __m128i*)ptr1); \
301 return _Tpvec(_v256_combine(vlo, vhi)); \
303 inline void v_store(_Tp* ptr, const _Tpvec& a) \
304 { _mm256_storeu_si256((__m256i*)ptr, a.val); } \
305 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
306 { _mm256_store_si256((__m256i*)ptr, a.val); } \
307 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
308 { _mm256_stream_si256((__m256i*)ptr, a.val); } \
309 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
311 if( mode == hal::STORE_UNALIGNED ) \
312 _mm256_storeu_si256((__m256i*)ptr, a.val); \
313 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
314 _mm256_stream_si256((__m256i*)ptr, a.val); \
316 _mm256_store_si256((__m256i*)ptr, a.val); \
318 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
319 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_low(a.val)); } \
320 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
321 { _mm_storeu_si128((__m128i*)ptr, _v256_extract_high(a.val)); }
323 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint8x32, uchar)
324 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int8x32, schar)
325 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint16x16, ushort)
326 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int16x16, short)
327 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint32x8, unsigned)
328 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int32x8, int)
329 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_uint64x4, uint64)
330 OPENCV_HAL_IMPL_AVX_LOADSTORE(v_int64x4, int64)
332 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \
333 inline _Tpvec v256_load(const _Tp* ptr) \
334 { return _Tpvec(_mm256_loadu_##suffix(ptr)); } \
335 inline _Tpvec v256_load_aligned(const _Tp* ptr) \
336 { return _Tpvec(_mm256_load_##suffix(ptr)); } \
337 inline _Tpvec v256_load_low(const _Tp* ptr) \
339 return _Tpvec(_mm256_cast##suffix##128_##suffix##256 \
340 (_mm_loadu_##suffix(ptr))); \
342 inline _Tpvec v256_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
344 halfreg vlo = _mm_loadu_##suffix(ptr0); \
345 halfreg vhi = _mm_loadu_##suffix(ptr1); \
346 return _Tpvec(_v256_combine(vlo, vhi)); \
348 inline void v_store(_Tp* ptr, const _Tpvec& a) \
349 { _mm256_storeu_##suffix(ptr, a.val); } \
350 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
351 { _mm256_store_##suffix(ptr, a.val); } \
352 inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
353 { _mm256_stream_##suffix(ptr, a.val); } \
354 inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \
356 if( mode == hal::STORE_UNALIGNED ) \
357 _mm256_storeu_##suffix(ptr, a.val); \
358 else if( mode == hal::STORE_ALIGNED_NOCACHE ) \
359 _mm256_stream_##suffix(ptr, a.val); \
361 _mm256_store_##suffix(ptr, a.val); \
363 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
364 { _mm_storeu_##suffix(ptr, _v256_extract_low(a.val)); } \
365 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
366 { _mm_storeu_##suffix(ptr, _v256_extract_high(a.val)); }
368 OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float32x8, float, ps, __m128)
369 OPENCV_HAL_IMPL_AVX_LOADSTORE_FLT(v_float64x4, double, pd, __m128d)
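// Added usage sketch for the load/store wrappers above (illustrative only; `buf` is a
// hypothetical caller-owned array, and the aligned/streaming variants require 32-byte
// alignment):
//
//     float buf[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
//     v_float32x8 v = v256_load(buf);                  // unaligned 256-bit load
//     v_store(buf, v);                                 // unaligned store
//     v_store_low(buf, v);                             // only lanes 0..3
//     v_store(buf, v, hal::STORE_ALIGNED_NOCACHE);     // streaming store; buf must be 32-byte aligned
//     v_float32x8 h = v256_load_halves(buf, buf + 4);  // low half from buf, high half from buf + 4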
371 #define OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, _Tpvecf, suffix, cast) \
372 inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \
373 { return _Tpvec(cast(a.val)); }
375 #define OPENCV_HAL_IMPL_AVX_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \
376 inline _Tpvec v256_setzero_##suffix() \
377 { return _Tpvec(_mm256_setzero_si256()); } \
378 inline _Tpvec v256_setall_##suffix(_Tp v) \
379 { return _Tpvec(_mm256_set1_##ssuffix((ctype_s)v)); } \
380 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, OPENCV_HAL_NOP) \
381 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, OPENCV_HAL_NOP) \
382 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, OPENCV_HAL_NOP) \
383 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, OPENCV_HAL_NOP) \
384 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, OPENCV_HAL_NOP) \
385 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, OPENCV_HAL_NOP) \
386 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, OPENCV_HAL_NOP) \
387 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, OPENCV_HAL_NOP) \
388 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float32x8, suffix, _mm256_castps_si256) \
389 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_float64x4, suffix, _mm256_castpd_si256)
391 OPENCV_HAL_IMPL_AVX_INIT(v_uint8x32, uchar, u8, epi8, char)
392 OPENCV_HAL_IMPL_AVX_INIT(v_int8x32, schar, s8, epi8, char)
393 OPENCV_HAL_IMPL_AVX_INIT(v_uint16x16, ushort, u16, epi16, short)
394 OPENCV_HAL_IMPL_AVX_INIT(v_int16x16, short, s16, epi16, short)
395 OPENCV_HAL_IMPL_AVX_INIT(v_uint32x8, unsigned, u32, epi32, int)
396 OPENCV_HAL_IMPL_AVX_INIT(v_int32x8, int, s32, epi32, int)
397 OPENCV_HAL_IMPL_AVX_INIT(v_uint64x4, uint64, u64, epi64x, int64)
398 OPENCV_HAL_IMPL_AVX_INIT(v_int64x4, int64, s64, epi64x, int64)
400 #define OPENCV_HAL_IMPL_AVX_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \
401 inline _Tpvec v256_setzero_##suffix() \
402 { return _Tpvec(_mm256_setzero_##zsuffix()); } \
403 inline _Tpvec v256_setall_##suffix(_Tp v) \
404 { return _Tpvec(_mm256_set1_##zsuffix(v)); } \
405 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint8x32, suffix, cast) \
406 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int8x32, suffix, cast) \
407 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint16x16, suffix, cast) \
408 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int16x16, suffix, cast) \
409 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint32x8, suffix, cast) \
410 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int32x8, suffix, cast) \
411 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_uint64x4, suffix, cast) \
412 OPENCV_HAL_IMPL_AVX_CAST(_Tpvec, v_int64x4, suffix, cast)
414 OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float32x8, float, f32, ps, _mm256_castsi256_ps)
415 OPENCV_HAL_IMPL_AVX_INIT_FLT(v_float64x4, double, f64, pd, _mm256_castsi256_pd)
417 inline v_float32x8 v_reinterpret_as_f32(const v_float32x8& a)
418 { return a; }
419 inline v_float32x8 v_reinterpret_as_f32(const v_float64x4& a)
420 { return v_float32x8(_mm256_castpd_ps(a.val)); }
422 inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
423 { return a; }
424 inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
425 { return v_float64x4(_mm256_castps_pd(a.val)); }
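// Added sketch: v256_setall_* broadcasts one scalar to every lane, v256_setzero_* clears the
// register, and v_reinterpret_as_* is a bit-level cast that never converts values
// (illustrative only):
//
//     v_uint8x32 ones  = v256_setall_u8((uchar)0xff);   // 32 lanes of 0xff
//     v_int32x8  zeros = v256_setzero_s32();
//     v_float32x8 f    = v_reinterpret_as_f32(v256_setall_s32(0x3f800000)); // every lane becomes 1.0f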
427 inline v_float16x16 v256_load_f16(const short* ptr)
428 { return v_float16x16(_mm256_loadu_si256((const __m256i*)ptr)); }
429 inline v_float16x16 v256_load_f16_aligned(const short* ptr)
430 { return v_float16x16(_mm256_load_si256((const __m256i*)ptr)); }
432 inline void v_store(short* ptr, const v_float16x16& a)
433 { _mm256_storeu_si256((__m256i*)ptr, a.val); }
434 inline void v_store_aligned(short* ptr, const v_float16x16& a)
435 { _mm256_store_si256((__m256i*)ptr, a.val); }
438 /*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
439 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
440 { return _Tpvec(perm(a.val, b.val, 0x20)); } \
441 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
442 { return _Tpvec(perm(a.val, b.val, 0x31)); } \
443 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
444 _Tpvec& c, _Tpvec& d) \
445 { c = v_combine_low(a, b); d = v_combine_high(a, b); }
447 #define OPENCV_HAL_IMPL_AVX_UNPACKS(_Tpvec, suffix) \
448 OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, _mm256_permute2x128_si256) \
449 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, \
450 _Tpvec& b0, _Tpvec& b1) \
452 __m256i v0 = _v256_shuffle_odd_64(a0.val); \
453 __m256i v1 = _v256_shuffle_odd_64(a1.val); \
454 b0.val = _mm256_unpacklo_##suffix(v0, v1); \
455 b1.val = _mm256_unpackhi_##suffix(v0, v1); \
458 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint8x32, epi8)
459 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int8x32, epi8)
460 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint16x16, epi16)
461 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int16x16, epi16)
462 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint32x8, epi32)
463 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int32x8, epi32)
464 OPENCV_HAL_IMPL_AVX_UNPACKS(v_uint64x4, epi64)
465 OPENCV_HAL_IMPL_AVX_UNPACKS(v_int64x4, epi64)
466 OPENCV_HAL_IMPL_AVX_COMBINE(v_float32x8, _mm256_permute2f128_ps)
467 OPENCV_HAL_IMPL_AVX_COMBINE(v_float64x4, _mm256_permute2f128_pd)
469 inline void v_zip(const v_float32x8& a0, const v_float32x8& a1, v_float32x8& b0, v_float32x8& b1)
471 __m256 v0 = _mm256_unpacklo_ps(a0.val, a1.val);
472 __m256 v1 = _mm256_unpackhi_ps(a0.val, a1.val);
473 v_recombine(v_float32x8(v0), v_float32x8(v1), b0, b1);
476 inline void v_zip(const v_float64x4& a0, const v_float64x4& a1, v_float64x4& b0, v_float64x4& b1)
478 __m256d v0 = _v256_shuffle_odd_64(a0.val);
479 __m256d v1 = _v256_shuffle_odd_64(a1.val);
480 b0.val = _mm256_unpacklo_pd(v0, v1);
481 b1.val = _mm256_unpackhi_pd(v0, v1);
482 }*/
484 //////////////// Variant Value reordering ///////////////
487 #define OPENCV_HAL_IMPL_AVX_UNPACK(_Tpvec, suffix) \
488 inline _Tpvec v256_unpacklo(const _Tpvec& a, const _Tpvec& b) \
489 { return _Tpvec(_mm256_unpacklo_##suffix(a.val, b.val)); } \
490 inline _Tpvec v256_unpackhi(const _Tpvec& a, const _Tpvec& b) \
491 { return _Tpvec(_mm256_unpackhi_##suffix(a.val, b.val)); }
493 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint8x32, epi8)
494 OPENCV_HAL_IMPL_AVX_UNPACK(v_int8x32, epi8)
495 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint16x16, epi16)
496 OPENCV_HAL_IMPL_AVX_UNPACK(v_int16x16, epi16)
497 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint32x8, epi32)
498 OPENCV_HAL_IMPL_AVX_UNPACK(v_int32x8, epi32)
499 OPENCV_HAL_IMPL_AVX_UNPACK(v_uint64x4, epi64)
500 OPENCV_HAL_IMPL_AVX_UNPACK(v_int64x4, epi64)
501 OPENCV_HAL_IMPL_AVX_UNPACK(v_float32x8, ps)
502 OPENCV_HAL_IMPL_AVX_UNPACK(v_float64x4, pd)
505 #define OPENCV_HAL_IMPL_AVX_BLEND(_Tpvec, suffix) \
506 template<int m> \
507 inline _Tpvec v256_blend(const _Tpvec& a, const _Tpvec& b) \
508 { return _Tpvec(_mm256_blend_##suffix(a.val, b.val, m)); }
510 OPENCV_HAL_IMPL_AVX_BLEND(v_uint16x16, epi16)
511 OPENCV_HAL_IMPL_AVX_BLEND(v_int16x16, epi16)
512 OPENCV_HAL_IMPL_AVX_BLEND(v_uint32x8, epi32)
513 OPENCV_HAL_IMPL_AVX_BLEND(v_int32x8, epi32)
514 OPENCV_HAL_IMPL_AVX_BLEND(v_float32x8, ps)
515 OPENCV_HAL_IMPL_AVX_BLEND(v_float64x4, pd)
517 template<int m>
518 inline v_uint64x4 v256_blend(const v_uint64x4& a, const v_uint64x4& b)
519 {
520 enum {M0 = m};
521 enum {M1 = (M0 | (M0 << 2)) & 0x33};
522 enum {M2 = (M1 | (M1 << 1)) & 0x55};
523 enum {MM = M2 | (M2 << 1)};
524 return v_uint64x4(_mm256_blend_epi32(a.val, b.val, MM));
525 }
526 template<int m>
527 inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
528 { return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
531 // todo: emulate 64-bit
532 #define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \
533 template<int m> \
534 inline _Tpvec v256_shuffle(const _Tpvec& a) \
535 { return _Tpvec(_mm256_##intrin(a.val, m)); }
537 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_uint32x8, shuffle_epi32)
538 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_int32x8, shuffle_epi32)
539 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float32x8, permute_ps)
540 OPENCV_HAL_IMPL_AVX_SHUFFLE(v_float64x4, permute_pd)
542 template<typename _Tpvec>
543 inline void v256_zip(const _Tpvec& a, const _Tpvec& b, _Tpvec& ab0, _Tpvec& ab1)
545 ab0 = v256_unpacklo(a, b);
546 ab1 = v256_unpackhi(a, b);
549 template<typename _Tpvec>
550 inline _Tpvec v256_combine_diagonal(const _Tpvec& a, const _Tpvec& b)
551 { return _Tpvec(_mm256_blend_epi32(a.val, b.val, 0xf0)); }
553 inline v_float32x8 v256_combine_diagonal(const v_float32x8& a, const v_float32x8& b)
554 { return v256_blend<0xf0>(a, b); }
556 inline v_float64x4 v256_combine_diagonal(const v_float64x4& a, const v_float64x4& b)
557 { return v256_blend<0xc>(a, b); }
559 template<typename _Tpvec>
560 inline _Tpvec v256_alignr_128(const _Tpvec& a, const _Tpvec& b)
561 { return v256_permute2x128<0x21>(a, b); }
563 template<typename _Tpvec>
564 inline _Tpvec v256_alignr_64(const _Tpvec& a, const _Tpvec& b)
565 { return _Tpvec(_mm256_alignr_epi8(a.val, b.val, 8)); }
566 inline v_float64x4 v256_alignr_64(const v_float64x4& a, const v_float64x4& b)
567 { return v_float64x4(_mm256_shuffle_pd(b.val, a.val, _MM_SHUFFLE(0, 0, 1, 1))); }
568 // todo: emulate float32
570 template<typename _Tpvec>
571 inline _Tpvec v256_swap_halves(const _Tpvec& a)
572 { return v256_permute2x128<1>(a, a); }
574 template<typename _Tpvec>
575 inline _Tpvec v256_reverse_64(const _Tpvec& a)
576 { return v256_permute4x64<_MM_SHUFFLE(0, 1, 2, 3)>(a); }
579 #define OPENCV_HAL_IMPL_AVX_ZIP(_Tpvec) \
580 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
581 { return v256_permute2x128<0x20>(a, b); } \
582 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
583 { return v256_permute2x128<0x31>(a, b); } \
584 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \
585 _Tpvec& c, _Tpvec& d) \
587 _Tpvec a1b0 = v256_alignr_128(a, b); \
588 c = v256_combine_diagonal(a, a1b0); \
589 d = v256_combine_diagonal(a1b0, b); \
591 inline void v_zip(const _Tpvec& a, const _Tpvec& b, \
592 _Tpvec& ab0, _Tpvec& ab1) \
594 _Tpvec ab0ab2, ab1ab3; \
595 v256_zip(a, b, ab0ab2, ab1ab3); \
596 v_recombine(ab0ab2, ab1ab3, ab0, ab1); \
599 OPENCV_HAL_IMPL_AVX_ZIP(v_uint8x32)
600 OPENCV_HAL_IMPL_AVX_ZIP(v_int8x32)
601 OPENCV_HAL_IMPL_AVX_ZIP(v_uint16x16)
602 OPENCV_HAL_IMPL_AVX_ZIP(v_int16x16)
603 OPENCV_HAL_IMPL_AVX_ZIP(v_uint32x8)
604 OPENCV_HAL_IMPL_AVX_ZIP(v_int32x8)
605 OPENCV_HAL_IMPL_AVX_ZIP(v_uint64x4)
606 OPENCV_HAL_IMPL_AVX_ZIP(v_int64x4)
607 OPENCV_HAL_IMPL_AVX_ZIP(v_float32x8)
608 OPENCV_HAL_IMPL_AVX_ZIP(v_float64x4)
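// Added sketch of the recombination helpers above. v_combine_low/high concatenate the low
// (resp. high) 128-bit halves of two vectors, while v_zip interleaves lanes pairwise across
// the full 256-bit width, e.g. for 8 floats (illustrative lane values):
//
//     a = [a0 a1 a2 a3 a4 a5 a6 a7], b = [b0 b1 b2 b3 b4 b5 b6 b7]
//     v_combine_low(a, b)  -> [a0 a1 a2 a3 b0 b1 b2 b3]
//     v_combine_high(a, b) -> [a4 a5 a6 a7 b4 b5 b6 b7]
//     v_zip(a, b, ab0, ab1): ab0 = [a0 b0 a1 b1 a2 b2 a3 b3], ab1 = [a4 b4 a5 b5 a6 b6 a7 b7]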
610 ////////// Arithmetic, bitwise and comparison operations /////////
612 /* Element-wise binary and unary operations */
615 #define OPENCV_HAL_IMPL_AVX_BIN_OP(bin_op, _Tpvec, intrin) \
616 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
617 { return _Tpvec(intrin(a.val, b.val)); } \
618 inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
619 { a.val = intrin(a.val, b.val); return a; }
621 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint8x32, _mm256_adds_epu8)
622 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint8x32, _mm256_subs_epu8)
623 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
624 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int8x32, _mm256_subs_epi8)
625 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint16x16, _mm256_adds_epu16)
626 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint16x16, _mm256_subs_epu16)
627 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint16x16, _mm256_mullo_epi16)
628 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int16x16, _mm256_adds_epi16)
629 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int16x16, _mm256_subs_epi16)
630 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int16x16, _mm256_mullo_epi16)
631 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint32x8, _mm256_add_epi32)
632 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint32x8, _mm256_sub_epi32)
633 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_uint32x8, _mm256_mullo_epi32)
634 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int32x8, _mm256_add_epi32)
635 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int32x8, _mm256_sub_epi32)
636 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_int32x8, _mm256_mullo_epi32)
637 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_uint64x4, _mm256_add_epi64)
638 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_uint64x4, _mm256_sub_epi64)
639 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int64x4, _mm256_add_epi64)
640 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_int64x4, _mm256_sub_epi64)
642 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float32x8, _mm256_add_ps)
643 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float32x8, _mm256_sub_ps)
644 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float32x8, _mm256_mul_ps)
645 OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float32x8, _mm256_div_ps)
646 OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_float64x4, _mm256_add_pd)
647 OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
648 OPENCV_HAL_IMPL_AVX_BIN_OP(*, v_float64x4, _mm256_mul_pd)
649 OPENCV_HAL_IMPL_AVX_BIN_OP(/, v_float64x4, _mm256_div_pd)
651 inline void v_mul_expand(const v_int16x16& a, const v_int16x16& b,
652 v_int32x8& c, v_int32x8& d)
654 v_int16x16 vhi = v_int16x16(_mm256_mulhi_epi16(a.val, b.val));
656 v_int16x16 v0, v1;
657 v_zip(a * b, vhi, v0, v1);
659 c = v_reinterpret_as_s32(v0);
660 d = v_reinterpret_as_s32(v1);
663 inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
664 v_uint32x8& c, v_uint32x8& d)
666 v_uint16x16 vhi = v_uint16x16(_mm256_mulhi_epu16(a.val, b.val));
668 v_uint16x16 v0, v1;
669 v_zip(a * b, vhi, v0, v1);
671 c = v_reinterpret_as_u32(v0);
672 d = v_reinterpret_as_u32(v1);
675 inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
676 v_uint64x4& c, v_uint64x4& d)
678 __m256i v0 = _mm256_mul_epu32(a.val, b.val);
679 __m256i v1 = _mm256_mul_epu32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
680 v_zip(v_uint64x4(v0), v_uint64x4(v1), c, d);
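// Added note on the v_mul_expand trick above: _mm256_mullo_* (via operator*) gives the low
// half of each product and _mm256_mulhi_* the high half; zipping the two interleaves them so
// each pair of adjacent narrow lanes forms one full double-width product. Hedged usage sketch:
//
//     v_uint16x16 a = v256_setall_u16((ushort)60000), b = v256_setall_u16((ushort)3);
//     v_uint32x8 lo, hi;
//     v_mul_expand(a, b, lo, hi);   // every 32-bit lane now holds 180000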
684 /** Non-saturating arithmetic **/
685 #define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
686 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
687 { return _Tpvec(intrin(a.val, b.val)); }
689 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint8x32, _mm256_add_epi8)
690 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int8x32, _mm256_add_epi8)
691 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_uint16x16, _mm256_add_epi16)
692 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_add_wrap, v_int16x16, _mm256_add_epi16)
693 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint8x32, _mm256_sub_epi8)
694 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int8x32, _mm256_sub_epi8)
695 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_uint16x16, _mm256_sub_epi16)
696 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_sub_wrap, v_int16x16, _mm256_sub_epi16)
698 /** Bitwise shifts **/
699 #define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
700 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
701 { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
702 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
703 { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
704 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
705 { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
706 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
707 { return _Tpsvec(srai(a.val, imm)); } \
708 template<int imm> \
709 inline _Tpuvec v_shl(const _Tpuvec& a) \
710 { return _Tpuvec(_mm256_slli_##suffix(a.val, imm)); } \
711 template<int imm> \
712 inline _Tpsvec v_shl(const _Tpsvec& a) \
713 { return _Tpsvec(_mm256_slli_##suffix(a.val, imm)); } \
714 template<int imm> \
715 inline _Tpuvec v_shr(const _Tpuvec& a) \
716 { return _Tpuvec(_mm256_srli_##suffix(a.val, imm)); } \
717 template<int imm> \
718 inline _Tpsvec v_shr(const _Tpsvec& a) \
719 { return _Tpsvec(srai(a.val, imm)); }
721 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint16x16, v_int16x16, epi16, _mm256_srai_epi16)
722 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint32x8, v_int32x8, epi32, _mm256_srai_epi32)
724 inline __m256i _mm256_srai_epi64xx(const __m256i a, int imm)
726 __m256i d = _mm256_set1_epi64x((int64)1 << 63);
727 __m256i r = _mm256_srli_epi64(_mm256_add_epi64(a, d), imm);
728 return _mm256_sub_epi64(r, _mm256_srli_epi64(d, imm));
730 OPENCV_HAL_IMPL_AVX_SHIFT_OP(v_uint64x4, v_int64x4, epi64, _mm256_srai_epi64xx)
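// Added note: AVX2 has no arithmetic 64-bit right shift, so _mm256_srai_epi64xx above
// emulates it by biasing with 2^63 before a logical shift and subtracting the shifted bias
// afterwards: ((a + 2^63) >> n) - (2^63 >> n) reproduces the sign extension. Hedged sketch
// of the shift wrappers (compile-time counts use the template forms):
//
//     v_int32x8 x = v256_setall_s32(-16);
//     v_int32x8 a = x >> 2;        // arithmetic shift, every lane -4
//     v_int32x8 l = v_shl<2>(x);   // every lane -64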
733 /** Bitwise logic **/
734 #define OPENCV_HAL_IMPL_AVX_LOGIC_OP(_Tpvec, suffix, not_const) \
735 OPENCV_HAL_IMPL_AVX_BIN_OP(&, _Tpvec, _mm256_and_##suffix) \
736 OPENCV_HAL_IMPL_AVX_BIN_OP(|, _Tpvec, _mm256_or_##suffix) \
737 OPENCV_HAL_IMPL_AVX_BIN_OP(^, _Tpvec, _mm256_xor_##suffix) \
738 inline _Tpvec operator ~ (const _Tpvec& a) \
739 { return _Tpvec(_mm256_xor_##suffix(a.val, not_const)); }
741 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint8x32, si256, _mm256_set1_epi32(-1))
742 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int8x32, si256, _mm256_set1_epi32(-1))
743 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint16x16, si256, _mm256_set1_epi32(-1))
744 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int16x16, si256, _mm256_set1_epi32(-1))
745 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint32x8, si256, _mm256_set1_epi32(-1))
746 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int32x8, si256, _mm256_set1_epi32(-1))
747 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_uint64x4, si256, _mm256_set1_epi64x(-1))
748 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_int64x4, si256, _mm256_set1_epi64x(-1))
749 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float32x8, ps, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
750 OPENCV_HAL_IMPL_AVX_LOGIC_OP(v_float64x4, pd, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
753 #define OPENCV_HAL_IMPL_AVX_SELECT(_Tpvec, suffix) \
754 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
755 { return _Tpvec(_mm256_blendv_##suffix(b.val, a.val, mask.val)); }
757 OPENCV_HAL_IMPL_AVX_SELECT(v_uint8x32, epi8)
758 OPENCV_HAL_IMPL_AVX_SELECT(v_int8x32, epi8)
759 OPENCV_HAL_IMPL_AVX_SELECT(v_uint16x16, epi8)
760 OPENCV_HAL_IMPL_AVX_SELECT(v_int16x16, epi8)
761 OPENCV_HAL_IMPL_AVX_SELECT(v_uint32x8, epi8)
762 OPENCV_HAL_IMPL_AVX_SELECT(v_int32x8, epi8)
763 OPENCV_HAL_IMPL_AVX_SELECT(v_float32x8, ps)
764 OPENCV_HAL_IMPL_AVX_SELECT(v_float64x4, pd)
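// Added sketch: v_select picks lanes from `a` where the mask lane is all ones and from `b`
// where it is all zeros; masks normally come from the comparison operators defined below.
//
//     v_int32x8 a = v256_setall_s32(1), b = v256_setall_s32(2);
//     v_int32x8 m = a > b;               // all-zero mask here
//     v_int32x8 r = v_select(m, a, b);   // every lane 2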
767 #define OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpvec) \
768 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
769 { return ~(a == b); } \
770 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
772 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
773 { return ~(a < b); } \
774 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
777 #define OPENCV_HAL_IMPL_AVX_CMP_OP_INT(_Tpuvec, _Tpsvec, suffix, sbit) \
778 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
779 { return _Tpuvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
780 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
782 __m256i smask = _mm256_set1_##suffix(sbit); \
783 return _Tpuvec(_mm256_cmpgt_##suffix( \
784 _mm256_xor_si256(a.val, smask), \
785 _mm256_xor_si256(b.val, smask))); \
787 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
788 { return _Tpsvec(_mm256_cmpeq_##suffix(a.val, b.val)); } \
789 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
790 { return _Tpsvec(_mm256_cmpgt_##suffix(a.val, b.val)); } \
791 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpuvec) \
792 OPENCV_HAL_IMPL_AVX_CMP_OP_OV(_Tpsvec)
794 OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint8x32, v_int8x32, epi8, (char)-128)
795 OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint16x16, v_int16x16, epi16, (short)-32768)
796 OPENCV_HAL_IMPL_AVX_CMP_OP_INT(v_uint32x8, v_int32x8, epi32, (int)0x80000000)
798 #define OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(_Tpvec) \
799 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
800 { return _Tpvec(_mm256_cmpeq_epi64(a.val, b.val)); } \
801 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
802 { return ~(a == b); }
804 OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_uint64x4)
805 OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
807 #define OPENCV_HAL_IMPL_AVX_CMP_FLT(bin_op, imm8, _Tpvec, suffix) \
808 inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
809 { return _Tpvec(_mm256_cmp_##suffix(a.val, b.val, imm8)); }
811 #define OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(_Tpvec, suffix) \
812 OPENCV_HAL_IMPL_AVX_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, suffix) \
813 OPENCV_HAL_IMPL_AVX_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, suffix) \
814 OPENCV_HAL_IMPL_AVX_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, suffix) \
815 OPENCV_HAL_IMPL_AVX_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, suffix) \
816 OPENCV_HAL_IMPL_AVX_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, suffix) \
817 OPENCV_HAL_IMPL_AVX_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, suffix)
819 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
820 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
823 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32, _mm256_min_epu8)
824 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32, _mm256_max_epu8)
825 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int8x32, _mm256_min_epi8)
826 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int8x32, _mm256_max_epi8)
827 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint16x16, _mm256_min_epu16)
828 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint16x16, _mm256_max_epu16)
829 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int16x16, _mm256_min_epi16)
830 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int16x16, _mm256_max_epi16)
831 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint32x8, _mm256_min_epu32)
832 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint32x8, _mm256_max_epu32)
833 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_int32x8, _mm256_min_epi32)
834 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_int32x8, _mm256_max_epi32)
835 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float32x8, _mm256_min_ps)
836 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float32x8, _mm256_max_ps)
837 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_float64x4, _mm256_min_pd)
838 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_float64x4, _mm256_max_pd)
841 template<int imm>
842 inline v_uint8x32 v_rotate_left(const v_uint8x32& a, const v_uint8x32& b)
843 {
844 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x03);
846 switch (imm)
847 {
848 case 0: return a;
849 case 32: return b;
850 case 16: return v_uint8x32(swap);
851 }
853 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(a.val, swap, 16 - imm));
854 if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(swap, b.val, 32 - imm));
856 return v_uint8x32();
857 }
859 template<int imm>
860 inline v_uint8x32 v_rotate_right(const v_uint8x32& a, const v_uint8x32& b)
861 {
862 __m256i swap = _mm256_permute2x128_si256(a.val, b.val, 0x21);
864 switch (imm)
865 {
866 case 0: return a;
867 case 32: return b;
868 case 16: return v_uint8x32(swap);
869 }
871 if (imm < 16) return v_uint8x32(_mm256_alignr_epi8(swap, a.val, imm));
872 if (imm < 32) return v_uint8x32(_mm256_alignr_epi8(b.val, swap, imm - 16));
874 return v_uint8x32();
875 }
877 template<int imm>
878 inline v_uint8x32 v_rotate_left(const v_uint8x32& a)
879 {
880 v_uint8x32 res;
881 // per the intrinsic's CASE/ESAC pseudocode: when control[3] is set, that 128-bit half is zeroed ([127:0] = 0)
882 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(0, 0, 2, 0));
889 res.val = _mm256_alignr_epi8(a.val, swapz, 16 - imm);
891 res.val = _mm256_slli_si256(swapz, imm - 16);
897 template<int imm>
898 inline v_uint8x32 v_rotate_right(const v_uint8x32& a)
899 {
900 v_uint8x32 res;
901 // per the intrinsic's CASE/ESAC pseudocode: when control[3] is set, that 128-bit half is zeroed ([127:0] = 0)
902 __m256i swapz = _mm256_permute2x128_si256(a.val, a.val, _MM_SHUFFLE(2, 0, 0, 1));
909 res.val = _mm256_alignr_epi8(swapz, a.val, imm);
911 res.val = _mm256_srli_si256(swapz, imm - 16);
917 #define OPENCV_HAL_IMPL_AVX_ROTATE_CAST(intrin, _Tpvec, cast) \
918 template<int imm> \
919 inline _Tpvec intrin(const _Tpvec& a, const _Tpvec& b) \
920 { \
921 const int w = sizeof(typename _Tpvec::lane_type); \
922 v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a), \
923 v_reinterpret_as_u8(b)); \
924 return _Tpvec(cast(ret.val)); \
925 } \
926 template<int imm> \
927 inline _Tpvec intrin(const _Tpvec& a) \
928 { \
929 const int w = sizeof(typename _Tpvec::lane_type); \
930 v_uint8x32 ret = intrin<imm*w>(v_reinterpret_as_u8(a)); \
931 return _Tpvec(cast(ret.val)); \
932 }
934 #define OPENCV_HAL_IMPL_AVX_ROTATE(_Tpvec) \
935 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, _Tpvec, OPENCV_HAL_NOP) \
936 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, _Tpvec, OPENCV_HAL_NOP)
938 OPENCV_HAL_IMPL_AVX_ROTATE(v_int8x32)
939 OPENCV_HAL_IMPL_AVX_ROTATE(v_uint16x16)
940 OPENCV_HAL_IMPL_AVX_ROTATE(v_int16x16)
941 OPENCV_HAL_IMPL_AVX_ROTATE(v_uint32x8)
942 OPENCV_HAL_IMPL_AVX_ROTATE(v_int32x8)
943 OPENCV_HAL_IMPL_AVX_ROTATE(v_uint64x4)
944 OPENCV_HAL_IMPL_AVX_ROTATE(v_int64x4)
946 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float32x8, _mm256_castsi256_ps)
947 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps)
948 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
949 OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
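// Added note: the v_rotate_* helpers above shift whole lanes, not bits. The template
// parameter is the lane count; the two-argument forms pull the vacated lanes from the
// second vector, the one-argument forms fill them with zeros. Hedged sketch with 32 uchar lanes:
//
//     v_uint8x32 a = v256_setall_u8((uchar)1), b = v256_setall_u8((uchar)2);
//     v_uint8x32 r = v_rotate_right<4>(a, b);  // lanes 0..27 from a[4..31], lanes 28..31 from b[0..3]
//     v_uint8x32 s = v_rotate_left<16>(a);     // low 16 lanes become 0, high 16 lanes take a[0..15]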
951 ////////// Reduce and mask /////////
954 #define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
955 inline sctype v_reduce_##func(const _Tpvec& a) \
957 __m128i v0 = _v256_extract_low(a.val); \
958 __m128i v1 = _v256_extract_high(a.val); \
959 v0 = intrin(v0, v1); \
960 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
961 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
962 v0 = intrin(v0, _mm_srli_si128(v0, 2)); \
963 return (sctype) _mm_cvtsi128_si32(v0); \
966 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, min, _mm_min_epu16)
967 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, min, _mm_min_epi16)
968 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_uint16x16, ushort, max, _mm_max_epu16)
969 OPENCV_HAL_IMPL_AVX_REDUCE_16(v_int16x16, short, max, _mm_max_epi16)
971 #define OPENCV_HAL_IMPL_AVX_REDUCE_8(_Tpvec, sctype, func, intrin) \
972 inline sctype v_reduce_##func(const _Tpvec& a) \
974 __m128i v0 = _v256_extract_low(a.val); \
975 __m128i v1 = _v256_extract_high(a.val); \
976 v0 = intrin(v0, v1); \
977 v0 = intrin(v0, _mm_srli_si128(v0, 8)); \
978 v0 = intrin(v0, _mm_srli_si128(v0, 4)); \
979 return (sctype) _mm_cvtsi128_si32(v0); \
982 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, min, _mm_min_epu32)
983 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, min, _mm_min_epi32)
984 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_uint32x8, unsigned, max, _mm_max_epu32)
985 OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
987 #define OPENCV_HAL_IMPL_AVX_REDUCE_FLT(func, intrin) \
988 inline float v_reduce_##func(const v_float32x8& a) \
990 __m128 v0 = _v256_extract_low(a.val); \
991 __m128 v1 = _v256_extract_high(a.val); \
992 v0 = intrin(v0, v1); \
993 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
994 v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 3))); \
995 return _mm_cvtss_f32(v0); \
998 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
999 OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
1001 inline ushort v_reduce_sum(const v_uint16x16& a)
1003 __m128i a0 = _v256_extract_low(a.val);
1004 __m128i a1 = _v256_extract_high(a.val);
1006 __m128i s0 = _mm_adds_epu16(a0, a1);
1007 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
1008 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
1009 s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));
1011 return (ushort)_mm_cvtsi128_si32(s0);
1014 inline short v_reduce_sum(const v_int16x16& a)
1016 __m256i s0 = _mm256_hadds_epi16(a.val, a.val);
1017 s0 = _mm256_hadds_epi16(s0, s0);
1018 s0 = _mm256_hadds_epi16(s0, s0);
1020 __m128i s1 = _v256_extract_high(s0);
1021 s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);
1023 return (short)_mm_cvtsi128_si32(s1);
1026 inline int v_reduce_sum(const v_int32x8& a)
1028 __m256i s0 = _mm256_hadd_epi32(a.val, a.val);
1029 s0 = _mm256_hadd_epi32(s0, s0);
1031 __m128i s1 = _v256_extract_high(s0);
1032 s1 = _mm_add_epi32(_v256_extract_low(s0), s1);
1034 return _mm_cvtsi128_si32(s1);
1037 inline unsigned v_reduce_sum(const v_uint32x8& a)
1038 { return v_reduce_sum(v_reinterpret_as_s32(a)); }
1040 inline float v_reduce_sum(const v_float32x8& a)
1042 __m256 s0 = _mm256_hadd_ps(a.val, a.val);
1043 s0 = _mm256_hadd_ps(s0, s0);
1045 __m128 s1 = _v256_extract_high(s0);
1046 s1 = _mm_add_ps(_v256_extract_low(s0), s1);
1048 return _mm_cvtss_f32(s1);
1051 inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
1052 const v_float32x8& c, const v_float32x8& d)
1054 __m256 ab = _mm256_hadd_ps(a.val, b.val);
1055 __m256 cd = _mm256_hadd_ps(c.val, d.val);
1056 return v_float32x8(_mm256_hadd_ps(ab, cd));
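// Added sketch of the horizontal reductions above (illustrative only):
//
//     v_float32x8 v(1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
//     float s = v_reduce_sum(v);   // 36.f
//     float m = v_reduce_max(v);   // 8.f
//
// v_reduce_sum4(a, b, c, d) instead yields a vector of partial sums, laid out per 128-bit
// half: [sum(a0..a3) sum(b0..b3) sum(c0..c3) sum(d0..d3) | sum(a4..a7) sum(b4..b7) ...].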
1060 #define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
1061 inline v_uint32x8 v_popcount(const _Tpvec& a) \
1063 const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
1064 const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
1065 const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
1066 v_uint32x8 p = v_reinterpret_as_u32(a); \
1067 p = ((p >> 1) & m1) + (p & m1); \
1068 p = ((p >> 2) & m2) + (p & m2); \
1069 p = ((p >> 4) & m4) + (p & m4); \
1070 p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
1071 return p; \
1072 }
1074 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
1075 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
1076 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
1077 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
1078 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
1079 OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
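// Added note on the SWAR popcount above: each pass sums adjacent bit groups in place
// (1-bit pairs, then 2-bit fields, then nibbles), and the final _mm256_sad_epu8 against zero
// adds the per-byte counts into each 64-bit group. For one byte, e.g. 0b10110100:
// 2-bit counts 01|10|01|00 -> 4-bit counts 0011|0001 -> byte total 4 set bits.
//
//     v_uint8x32 v = v256_setall_u8((uchar)0xb4);   // 0b10110100, 4 set bits per byte
//     v_uint32x8 c = v_popcount(v);                 // per 64-bit group: {32, 0, 32, 0, ...} as 32-bit lanes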
1082 inline int v_signmask(const v_int8x32& a)
1083 { return _mm256_movemask_epi8(a.val); }
1084 inline int v_signmask(const v_uint8x32& a)
1085 { return v_signmask(v_reinterpret_as_s8(a)); }
1087 inline int v_signmask(const v_int16x16& a)
1089 v_int8x32 v = v_int8x32(_mm256_packs_epi16(a.val, a.val));
1090 return (v_signmask(v) & 0xff) | ((v_signmask(v) >> 8) & 0xff00); // packs_epi16 works per 128-bit lane: fold the upper half's bits (16..23) down to 8..15
1092 inline int v_signmask(const v_uint16x16& a)
1093 { return v_signmask(v_reinterpret_as_s16(a)); }
1095 inline int v_signmask(const v_int32x8& a)
1097 __m256i a16 = _mm256_packs_epi32(a.val, a.val);
1098 v_int8x32 v = v_int8x32(_mm256_packs_epi16(a16, a16));
1099 return (v_signmask(v) & 0xf) | ((v_signmask(v) >> 12) & 0xf0); // packs work per 128-bit lane: fold the upper half's bits (16..19) down to 4..7
1101 inline int v_signmask(const v_uint32x8& a)
1102 { return v_signmask(v_reinterpret_as_s32(a)); }
1104 inline int v_signmask(const v_float32x8& a)
1105 { return _mm256_movemask_ps(a.val); }
1106 inline int v_signmask(const v_float64x4& a)
1107 { return _mm256_movemask_pd(a.val); }
1110 #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \
1111 inline bool v_check_all(const _Tpvec& a) \
1113 int mask = v_signmask(v_reinterpret_as_s8(a)); \
1114 return and_op(mask, allmask) == allmask; \
1116 inline bool v_check_any(const _Tpvec& a) \
1118 int mask = v_signmask(v_reinterpret_as_s8(a)); \
1119 return and_op(mask, allmask) != 0; \
1122 OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, OPENCV_HAL_1ST, -1)
1123 OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, OPENCV_HAL_1ST, -1)
1124 OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaaaaaa) // one signmask bit per byte: test the MSB byte of all 16 lanes
1125 OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16, OPENCV_HAL_AND, (int)0xaaaaaaaa)
1126 OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, OPENCV_HAL_AND, (int)0x88888888) // MSB byte of all 8 lanes
1127 OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, OPENCV_HAL_AND, (int)0x88888888)
1129 #define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \
1130 inline bool v_check_all(const _Tpvec& a) \
1132 int mask = v_signmask(a); \
1133 return mask == allmask; \
1135 inline bool v_check_any(const _Tpvec& a) \
1137 int mask = v_signmask(a); \
1141 OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255)
1142 OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15)
1145 ////////// Other math /////////
1147 /** Some frequent operations **/
1148 #define OPENCV_HAL_IMPL_AVX_MULADD(_Tpvec, suffix) \
1149 inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1150 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1151 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
1152 { return _Tpvec(_mm256_fmadd_##suffix(a.val, b.val, c.val)); } \
1153 inline _Tpvec v_sqrt(const _Tpvec& x) \
1154 { return _Tpvec(_mm256_sqrt_##suffix(x.val)); } \
1155 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
1156 { return v_fma(a, a, b * b); } \
1157 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
1158 { return v_sqrt(v_fma(a, a, b*b)); }
1160 OPENCV_HAL_IMPL_AVX_MULADD(v_float32x8, ps)
1161 OPENCV_HAL_IMPL_AVX_MULADD(v_float64x4, pd)
1163 inline v_float32x8 v_invsqrt(const v_float32x8& x)
1165 v_float32x8 half = x * v256_setall_f32(0.5);
1166 v_float32x8 t = v_float32x8(_mm256_rsqrt_ps(x.val));
1167 // todo: _mm256_fnmsub_ps
1168 t *= v256_setall_f32(1.5) - ((t * t) * half);
1169 return t;
1172 inline v_float64x4 v_invsqrt(const v_float64x4& x)
1174 return v256_setall_f64(1.) / v_sqrt(x);
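// Added note: v_fma/v_muladd map straight to FMA instructions, and the float32 v_invsqrt
// refines the hardware rsqrt estimate with one Newton-Raphson step, t' = t*(1.5 - 0.5*x*t*t).
// Hedged usage sketch:
//
//     v_float32x8 x = v256_setall_f32(2.f), y = v256_setall_f32(3.f), c = v256_setall_f32(1.f);
//     v_float32x8 r = v_fma(x, y, c);        // every lane 7.f
//     v_float32x8 m = v_magnitude(x, y);     // every lane sqrt(13.f)
//     v_float32x8 i = v_invsqrt(x);          // approximately 0.7071f per lane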
1177 /** Absolute values **/
1178 #define OPENCV_HAL_IMPL_AVX_ABS(_Tpvec, suffix) \
1179 inline v_u##_Tpvec v_abs(const v_##_Tpvec& x) \
1180 { return v_u##_Tpvec(_mm256_abs_##suffix(x.val)); }
1182 OPENCV_HAL_IMPL_AVX_ABS(int8x32, epi8)
1183 OPENCV_HAL_IMPL_AVX_ABS(int16x16, epi16)
1184 OPENCV_HAL_IMPL_AVX_ABS(int32x8, epi32)
1186 inline v_float32x8 v_abs(const v_float32x8& x)
1187 { return x & v_float32x8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))); }
1188 inline v_float64x4 v_abs(const v_float64x4& x)
1189 { return x & v_float64x4(_mm256_castsi256_pd(_mm256_srli_epi64(_mm256_set1_epi64x(-1), 1))); }
1191 /** Absolute difference **/
1192 inline v_uint8x32 v_absdiff(const v_uint8x32& a, const v_uint8x32& b)
1193 { return v_add_wrap(a - b, b - a); }
1194 inline v_uint16x16 v_absdiff(const v_uint16x16& a, const v_uint16x16& b)
1195 { return v_add_wrap(a - b, b - a); }
1196 inline v_uint32x8 v_absdiff(const v_uint32x8& a, const v_uint32x8& b)
1197 { return v_max(a, b) - v_min(a, b); }
1199 inline v_uint8x32 v_absdiff(const v_int8x32& a, const v_int8x32& b)
1201 v_int8x32 d = v_sub_wrap(a, b);
1202 v_int8x32 m = a < b;
1203 return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m));
1206 inline v_uint16x16 v_absdiff(const v_int16x16& a, const v_int16x16& b)
1207 { return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); }
1209 inline v_uint32x8 v_absdiff(const v_int32x8& a, const v_int32x8& b)
1211 v_int32x8 d = a - b;
1212 v_int32x8 m = a < b;
1213 return v_reinterpret_as_u32((d ^ m) - m);
1216 inline v_float32x8 v_absdiff(const v_float32x8& a, const v_float32x8& b)
1217 { return v_abs(a - b); }
1219 inline v_float64x4 v_absdiff(const v_float64x4& a, const v_float64x4& b)
1220 { return v_abs(a - b); }
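// Added note on the signed v_absdiff variants above: with m = (a < b) as an all-ones/all-zeros
// mask, (d ^ m) - m conditionally negates d = a - b, i.e. it computes |a - b| without a branch
// (two's complement: -d == (d ^ -1) + 1). The wrap variants keep the result well defined when
// a - b overflows the signed lane. Hedged sketch:
//
//     v_int32x8 a = v256_setall_s32(-5), b = v256_setall_s32(7);
//     v_uint32x8 d = v_absdiff(a, b);   // every lane 12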
1222 ////////// Conversions /////////
1225 inline v_int32x8 v_round(const v_float32x8& a)
1226 { return v_int32x8(_mm256_cvtps_epi32(a.val)); }
1228 inline v_int32x8 v_round(const v_float64x4& a)
1229 { return v_int32x8(_mm256_castsi128_si256(_mm256_cvtpd_epi32(a.val))); }
1231 inline v_int32x8 v_trunc(const v_float32x8& a)
1232 { return v_int32x8(_mm256_cvttps_epi32(a.val)); }
1234 inline v_int32x8 v_trunc(const v_float64x4& a)
1235 { return v_int32x8(_mm256_castsi128_si256(_mm256_cvttpd_epi32(a.val))); }
1237 inline v_int32x8 v_floor(const v_float32x8& a)
1238 { return v_int32x8(_mm256_cvttps_epi32(_mm256_floor_ps(a.val))); }
1240 inline v_int32x8 v_floor(const v_float64x4& a)
1241 { return v_trunc(v_float64x4(_mm256_floor_pd(a.val))); }
1243 inline v_int32x8 v_ceil(const v_float32x8& a)
1244 { return v_int32x8(_mm256_cvttps_epi32(_mm256_ceil_ps(a.val))); }
1246 inline v_int32x8 v_ceil(const v_float64x4& a)
1247 { return v_trunc(v_float64x4(_mm256_ceil_pd(a.val))); }
1250 inline v_float32x8 v_cvt_f32(const v_int32x8& a)
1251 { return v_float32x8(_mm256_cvtepi32_ps(a.val)); }
1253 inline v_float32x8 v_cvt_f32(const v_float64x4& a)
1254 { return v_float32x8(_mm256_castps128_ps256(_mm256_cvtpd_ps(a.val))); }
1256 inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
1258 __m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
1259 return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1));
1262 inline v_float64x4 v_cvt_f64(const v_int32x8& a)
1263 { return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_low(a.val))); }
1265 inline v_float64x4 v_cvt_f64_high(const v_int32x8& a)
1266 { return v_float64x4(_mm256_cvtepi32_pd(_v256_extract_high(a.val))); }
1268 inline v_float64x4 v_cvt_f64(const v_float32x8& a)
1269 { return v_float64x4(_mm256_cvtps_pd(_v256_extract_low(a.val))); }
1271 inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
1272 { return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
1275 inline v_float32x8 v_cvt_f32(const v_float16x16& a)
1276 { return v_float32x8(_mm256_cvtph_ps(_v256_extract_low(a.val))); }
1278 inline v_float32x8 v_cvt_f32_high(const v_float16x16& a)
1279 { return v_float32x8(_mm256_cvtph_ps(_v256_extract_high(a.val))); }
1281 inline v_float16x16 v_cvt_f16(const v_float32x8& a, const v_float32x8& b)
1283 __m128i ah = _mm256_cvtps_ph(a.val, 0), bh = _mm256_cvtps_ph(b.val, 0);
1284 return v_float16x16(_mm256_inserti128_si256(_mm256_castsi128_si256(ah), bh, 1));
1288 ////////////// Lookup table access ////////////////////
1290 inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
1292 int CV_DECL_ALIGNED(32) idx[8];
1293 v_store_aligned(idx, idxvec);
1294 return v_int32x8(_mm256_setr_epi32(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1295 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1298 inline v_float32x8 v_lut(const float* tab, const v_int32x8& idxvec)
1300 int CV_DECL_ALIGNED(32) idx[8];
1301 v_store_aligned(idx, idxvec);
1302 return v_float32x8(_mm256_setr_ps(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]],
1303 tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]));
1306 inline v_float64x4 v_lut(const double* tab, const v_int32x8& idxvec)
1308 int CV_DECL_ALIGNED(32) idx[8];
1309 v_store_aligned(idx, idxvec);
1310 return v_float64x4(_mm256_setr_pd(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]));
1313 inline void v_lut_deinterleave(const float* tab, const v_int32x8& idxvec, v_float32x8& x, v_float32x8& y)
1315 int CV_DECL_ALIGNED(32) idx[8];
1316 v_store_aligned(idx, idxvec);
1317 __m128 z = _mm_setzero_ps();
1318 __m128 xy01, xy45, xy23, xy67;
1319 xy01 = _mm_loadl_pi(z, (const __m64*)(tab + idx[0]));
1320 xy01 = _mm_loadh_pi(xy01, (const __m64*)(tab + idx[1]));
1321 xy45 = _mm_loadl_pi(z, (const __m64*)(tab + idx[4]));
1322 xy45 = _mm_loadh_pi(xy45, (const __m64*)(tab + idx[5]));
1323 __m256 xy0145 = _v256_combine(xy01, xy45);
1324 xy23 = _mm_loadl_pi(z, (const __m64*)(tab + idx[2]));
1325 xy23 = _mm_loadh_pi(xy23, (const __m64*)(tab + idx[3]));
1326 xy67 = _mm_loadl_pi(z, (const __m64*)(tab + idx[6]));
1327 xy67 = _mm_loadh_pi(xy67, (const __m64*)(tab + idx[7]));
1328 __m256 xy2367 = _v256_combine(xy23, xy67);
1330 __m256 xxyy0145 = _mm256_unpacklo_ps(xy0145, xy2367);
1331 __m256 xxyy2367 = _mm256_unpackhi_ps(xy0145, xy2367);
1333 x = v_float32x8(_mm256_unpacklo_ps(xxyy0145, xxyy2367));
1334 y = v_float32x8(_mm256_unpackhi_ps(xxyy0145, xxyy2367));
1337 inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_float64x4& x, v_float64x4& y)
1339 int CV_DECL_ALIGNED(32) idx[4];
1340 v_store_low(idx, idxvec);
1341 __m128d xy0 = _mm_loadu_pd(tab + idx[0]);
1342 __m128d xy2 = _mm_loadu_pd(tab + idx[2]);
1343 __m128d xy1 = _mm_loadu_pd(tab + idx[1]);
1344 __m128d xy3 = _mm_loadu_pd(tab + idx[3]);
1345 __m256d xy02 = _v256_combine(xy0, xy2);
1346 __m256d xy13 = _v256_combine(xy1, xy3);
1348 x = v_float64x4(_mm256_unpacklo_pd(xy02, xy13));
1349 y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
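// Added sketch for the gather-style lookups above (`tab` is a hypothetical caller-owned
// table; every index read from the vector must be valid):
//
//     float tab[256];                       // filled by the caller
//     v_int32x8 idx(0, 2, 4, 6, 8, 10, 12, 14);
//     v_float32x8 v = v_lut(tab, idx);      // v[i] = tab[idx[i]]
//     v_float32x8 x, y;
//     v_lut_deinterleave(tab, idx, x, y);   // x[i] = tab[idx[i]], y[i] = tab[idx[i] + 1]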
1352 ////////// Matrix operations /////////
1354 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
1355 { return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
1357 inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
1358 { return v_dotprod(a, b) + c; }
1360 #define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
1361 v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
1363 inline v_float32x8 v_matmul(const v_float32x8& v, const v_float32x8& m0,
1364 const v_float32x8& m1, const v_float32x8& m2,
1365 const v_float32x8& m3)
1367 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1368 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1369 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1370 v_float32x8 v37 = OPENCV_HAL_AVX_SPLAT2_PS(v, 3);
1371 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3)));
1374 inline v_float32x8 v_matmuladd(const v_float32x8& v, const v_float32x8& m0,
1375 const v_float32x8& m1, const v_float32x8& m2,
1376 const v_float32x8& a)
1378 v_float32x8 v04 = OPENCV_HAL_AVX_SPLAT2_PS(v, 0);
1379 v_float32x8 v15 = OPENCV_HAL_AVX_SPLAT2_PS(v, 1);
1380 v_float32x8 v26 = OPENCV_HAL_AVX_SPLAT2_PS(v, 2);
1381 return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a)));
1384 #define OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
1385 inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
1386 const _Tpvec& a2, const _Tpvec& a3, \
1387 _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \
1389 __m256i t0 = cast_from(_mm256_unpacklo_##suffix(a0.val, a1.val)); \
1390 __m256i t1 = cast_from(_mm256_unpacklo_##suffix(a2.val, a3.val)); \
1391 __m256i t2 = cast_from(_mm256_unpackhi_##suffix(a0.val, a1.val)); \
1392 __m256i t3 = cast_from(_mm256_unpackhi_##suffix(a2.val, a3.val)); \
1393 b0.val = cast_to(_mm256_unpacklo_epi64(t0, t1)); \
1394 b1.val = cast_to(_mm256_unpackhi_epi64(t0, t1)); \
1395 b2.val = cast_to(_mm256_unpacklo_epi64(t2, t3)); \
1396 b3.val = cast_to(_mm256_unpackhi_epi64(t2, t3)); \
1399 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_uint32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1400 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_int32x8, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
1401 OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_castsi256_ps)
1403 //////////////// Value reordering ///////////////
1406 #define OPENCV_HAL_IMPL_AVX_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \
1407 inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
1409 b0.val = intrin(_v256_extract_low(a.val)); \
1410 b1.val = intrin(_v256_extract_high(a.val)); \
1412 inline _Tpwvec v256_load_expand(const _Tp* ptr) \
1414 __m128i a = _mm_loadu_si128((const __m128i*)ptr); \
1415 return _Tpwvec(intrin(a)); \
1418 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint8x32, v_uint16x16, uchar, _mm256_cvtepu8_epi16)
1419 OPENCV_HAL_IMPL_AVX_EXPAND(v_int8x32, v_int16x16, schar, _mm256_cvtepi8_epi16)
1420 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint16x16, v_uint32x8, ushort, _mm256_cvtepu16_epi32)
1421 OPENCV_HAL_IMPL_AVX_EXPAND(v_int16x16, v_int32x8, short, _mm256_cvtepi16_epi32)
1422 OPENCV_HAL_IMPL_AVX_EXPAND(v_uint32x8, v_uint64x4, unsigned, _mm256_cvtepu32_epi64)
1423 OPENCV_HAL_IMPL_AVX_EXPAND(v_int32x8, v_int64x4, int, _mm256_cvtepi32_epi64)
1425 #define OPENCV_HAL_IMPL_AVX_EXPAND_Q(_Tpvec, _Tp, intrin) \
1426 inline _Tpvec v256_load_expand_q(const _Tp* ptr) \
1428 __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
1429 return _Tpvec(intrin(a)); \
1432 OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_uint32x8, uchar, _mm256_cvtepu8_epi32)
1433 OPENCV_HAL_IMPL_AVX_EXPAND_Q(v_int32x8, schar, _mm256_cvtepi8_epi32)
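// Added sketch: v_expand widens every lane of one vector into two vectors with double-width
// lanes, while v256_load_expand/_q widen directly while loading (8-bit to 32-bit for the _q
// form). Illustrative only; `buf` is a hypothetical caller array:
//
//     uchar buf[32] = { 0 };
//     v_uint16x16 lo, hi;
//     v_expand(v256_load(buf), lo, hi);          // 32 uchars -> two vectors of 16-bit lanes
//     v_uint16x16 w = v256_load_expand(buf);     // loads 16 uchars, zero-extends to 16 bits
//     v_uint32x8  q = v256_load_expand_q(buf);   // loads 8 uchars, zero-extends to 32 bits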
1437 inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
1438 { return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
1440 inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
1441 { return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); }
1443 inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
1444 { return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); }
1446 inline void v_pack_store(schar* ptr, const v_int16x16& a)
1447 { v_store_low(ptr, v_pack(a, a)); }
1449 inline void v_pack_store(uchar* ptr, const v_uint16x16& a)
1450 { v_store_low(ptr, v_pack(a, a)); }
1452 inline void v_pack_u_store(uchar* ptr, const v_int16x16& a)
1453 { v_store_low(ptr, v_pack_u(a, a)); }
1455 template<int n> inline
1456 v_uint8x32 v_rshr_pack(const v_uint16x16& a, const v_uint16x16& b)
1458 // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
1459 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
1460 return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
1461 v_reinterpret_as_s16((b + delta) >> n));
1464 template<int n> inline
1465 void v_rshr_pack_store(uchar* ptr, const v_uint16x16& a)
1467 v_uint16x16 delta = v256_setall_u16((short)(1 << (n-1)));
1468 v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
1471 template<int n> inline
1472 v_uint8x32 v_rshr_pack_u(const v_int16x16& a, const v_int16x16& b)
1474 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1475 return v_pack_u((a + delta) >> n, (b + delta) >> n);
1478 template<int n> inline
1479 void v_rshr_pack_u_store(uchar* ptr, const v_int16x16& a)
1481 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1482 v_pack_u_store(ptr, (a + delta) >> n);
1485 template<int n> inline
1486 v_int8x32 v_rshr_pack(const v_int16x16& a, const v_int16x16& b)
1488 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1489 return v_pack((a + delta) >> n, (b + delta) >> n);
1492 template<int n> inline
1493 void v_rshr_pack_store(schar* ptr, const v_int16x16& a)
1495 v_int16x16 delta = v256_setall_s16((short)(1 << (n-1)));
1496 v_pack_store(ptr, (a + delta) >> n);
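// Added note on the pack helpers above: v_pack narrows with saturation (v_pack_u to the
// unsigned type), and the v_rshr_* forms add 1 << (n-1) before shifting, i.e. they perform a
// rounding rather than truncating division by 2^n. Hedged sketch:
//
//     v_int16x16 a = v256_setall_s16(300), b = v256_setall_s16(-300);
//     v_int8x32 p = v_pack(a, b);            // saturates to 127 / -128
//     v_int8x32 r = v_rshr_pack<2>(a, b);    // (300 + 2) >> 2 = 75, (-300 + 2) >> 2 = -75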
1500 inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
1501 { return v_int16x16(_v256_shuffle_odd_64(_mm256_packs_epi32(a.val, b.val))); }
1503 inline v_uint16x16 v_pack(const v_uint32x8& a, const v_uint32x8& b)
1504 { return v_uint16x16(_v256_shuffle_odd_64(_mm256_packus_epi32(a.val, b.val))); }
1506 inline v_uint16x16 v_pack_u(const v_int32x8& a, const v_int32x8& b)
1507 { return v_pack(v_reinterpret_as_u32(a), v_reinterpret_as_u32(b)); }
1509 inline void v_pack_store(short* ptr, const v_int32x8& a)
1510 { v_store_low(ptr, v_pack(a, a)); }
1512 inline void v_pack_store(ushort* ptr, const v_uint32x8& a)
1513 { v_store_low(ptr, v_pack(a, a)); }
1515 inline void v_pack_u_store(ushort* ptr, const v_int32x8& a)
1516 { v_store_low(ptr, v_pack_u(a, a)); }
1519 template<int n> inline
1520 v_uint16x16 v_rshr_pack(const v_uint32x8& a, const v_uint32x8& b)
1521 {
1522 // we assume that n > 0, and so the shifted 32-bit values can be treated as signed numbers.
1523 v_uint32x8 delta = v256_setall_u32(1 << (n-1));
1524 return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
1525 v_reinterpret_as_s32((b + delta) >> n));
1526 }
1528 template<int n> inline
1529 void v_rshr_pack_store(ushort* ptr, const v_uint32x8& a)
1530 {
1531 v_uint32x8 delta = v256_setall_u32(1 << (n-1));
1532 v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
1533 }
1535 template<int n> inline
1536 v_uint16x16 v_rshr_pack_u(const v_int32x8& a, const v_int32x8& b)
1537 {
1538 v_int32x8 delta = v256_setall_s32(1 << (n-1));
1539 return v_pack_u((a + delta) >> n, (b + delta) >> n);
1540 }
1542 template<int n> inline
1543 void v_rshr_pack_u_store(ushort* ptr, const v_int32x8& a)
1544 {
1545 v_int32x8 delta = v256_setall_s32(1 << (n-1));
1546 v_pack_u_store(ptr, (a + delta) >> n);
1547 }
1549 template<int n> inline
1550 v_int16x16 v_rshr_pack(const v_int32x8& a, const v_int32x8& b)
1551 {
1552 v_int32x8 delta = v256_setall_s32(1 << (n-1));
1553 return v_pack((a + delta) >> n, (b + delta) >> n);
1554 }
1556 template<int n> inline
1557 void v_rshr_pack_store(short* ptr, const v_int32x8& a)
1558 {
1559 v_int32x8 delta = v256_setall_s32(1 << (n-1));
1560 v_pack_store(ptr, (a + delta) >> n);
1561 }
1564 // Non-saturating pack
1565 inline v_uint32x8 v_pack(const v_uint64x4& a, const v_uint64x4& b)
1566 {
1567 __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
1568 __m256i b0 = _mm256_shuffle_epi32(b.val, _MM_SHUFFLE(0, 0, 2, 0));
1569 __m256i ab = _mm256_unpacklo_epi64(a0, b0); // a0, a1, b0, b1, a2, a3, b2, b3
1570 return v_uint32x8(_v256_shuffle_odd_64(ab));
1571 }
1573 inline v_int32x8 v_pack(const v_int64x4& a, const v_int64x4& b)
1574 { return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
1576 inline void v_pack_store(unsigned* ptr, const v_uint64x4& a)
1577 {
1578 __m256i a0 = _mm256_shuffle_epi32(a.val, _MM_SHUFFLE(0, 0, 2, 0));
1579 v_store_low(ptr, v_uint32x8(_v256_shuffle_odd_64(a0)));
1580 }
1582 inline void v_pack_store(int* ptr, const v_int64x4& b)
1583 { v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
1585 template<int n> inline
1586 v_uint32x8 v_rshr_pack(const v_uint64x4& a, const v_uint64x4& b)
1587 {
1588 v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
1589 return v_pack((a + delta) >> n, (b + delta) >> n);
1590 }
1592 template<int n> inline
1593 void v_rshr_pack_store(unsigned* ptr, const v_uint64x4& a)
1594 {
1595 v_uint64x4 delta = v256_setall_u64((uint64)1 << (n-1));
1596 v_pack_store(ptr, (a + delta) >> n);
1597 }
1599 template<int n> inline
1600 v_int32x8 v_rshr_pack(const v_int64x4& a, const v_int64x4& b)
1601 {
1602 v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
1603 return v_pack((a + delta) >> n, (b + delta) >> n);
1604 }
1606 template<int n> inline
1607 void v_rshr_pack_store(int* ptr, const v_int64x4& a)
1608 {
1609 v_int64x4 delta = v256_setall_s64((int64)1 << (n-1));
1610 v_pack_store(ptr, (a + delta) >> n);
1611 }
1614 // Reinterpret: it's implemented above, together with the load and store operations
1617 #define OPENCV_HAL_IMPL_AVX_EXTRACT(_Tpvec) \
1618 template<int s> \
1619 inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
1620 { return v_rotate_right<s>(a, b); }
1622 OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint8x32)
1623 OPENCV_HAL_IMPL_AVX_EXTRACT(v_int8x32)
1624 OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint16x16)
1625 OPENCV_HAL_IMPL_AVX_EXTRACT(v_int16x16)
1626 OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint32x8)
1627 OPENCV_HAL_IMPL_AVX_EXTRACT(v_int32x8)
1628 OPENCV_HAL_IMPL_AVX_EXTRACT(v_uint64x4)
1629 OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
1630 OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
1631 OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
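// Usage sketch (hypothetical helper): v_extract<s> returns the lanes of the concatenation
// (a, b) starting at lane s, which is convenient for sliding-window access across the
// boundary between two adjacent loads.
inline v_uint8x32 _v256_example_shift_window(const v_uint8x32& cur, const v_uint8x32& next)
{
    return v_extract<1>(cur, next); // lanes 1..31 of cur followed by lane 0 of next
}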
1634 ///////////////////// load deinterleave /////////////////////////////
1636 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& a, v_uint8x32& b )
1637 {
1638 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
1639 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
1641 const __m256i sh = _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
1642 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
1643 __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
1644 __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
1645 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
1646 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
1647 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
1648 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
1649 a = v_uint8x32(a0);
1650 b = v_uint8x32(b0);
1651 }
1653 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& a, v_uint16x16& b )
1654 {
1655 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
1656 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
1658 const __m256i sh = _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1659 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
1660 __m256i p0 = _mm256_shuffle_epi8(ab0, sh);
1661 __m256i p1 = _mm256_shuffle_epi8(ab1, sh);
1662 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
1663 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
1664 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
1665 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
1666 a = v_uint16x16(a0);
1667 b = v_uint16x16(b0);
1668 }
1670 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& a, v_uint32x8& b )
1671 {
1672 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
1673 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1675 const int sh = 0+2*4+1*16+3*64;
1676 __m256i p0 = _mm256_shuffle_epi32(ab0, sh);
1677 __m256i p1 = _mm256_shuffle_epi32(ab1, sh);
1678 __m256i pl = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
1679 __m256i ph = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
1680 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
1681 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
1682 a = v_uint32x8(a0);
1683 b = v_uint32x8(b0);
1684 }
1686 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& a, v_uint64x4& b )
1687 {
1688 __m256i ab0 = _mm256_loadu_si256((const __m256i*)ptr);
1689 __m256i ab1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
1691 __m256i pl = _mm256_permute2x128_si256(ab0, ab1, 0 + 2*16);
1692 __m256i ph = _mm256_permute2x128_si256(ab0, ab1, 1 + 3*16);
1693 __m256i a0 = _mm256_unpacklo_epi64(pl, ph);
1694 __m256i b0 = _mm256_unpackhi_epi64(pl, ph);
1695 a = v_uint64x4(a0);
1696 b = v_uint64x4(b0);
1697 }
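// Usage sketch (hypothetical helper): the two-channel deinterleave above is handy for
// splitting interleaved pairs such as (re, im) or (x, y) point data into separate planes;
// it assumes the arithmetic operators defined earlier in this header.
inline v_uint16x16 _v256_example_sum_xy(const ushort* interleaved_xy)
{
    v_uint16x16 x, y;
    v_load_deinterleave(interleaved_xy, x, y); // x0,y0,x1,y1,... -> x0,x1,... and y0,y1,...
    return x + y;
}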
1699 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r )
1700 {
1701 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1702 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
1703 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
1705 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
1706 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
1708 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
1709 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
1710 const __m256i m1 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
1711 -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1);
1713 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
1714 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
1715 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
1717 const __m256i
1718 sh_b = _mm256_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13,
1719 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13),
1720 sh_g = _mm256_setr_epi8(1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14,
1721 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14),
1722 sh_r = _mm256_setr_epi8(2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15,
1723 2, 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15);
1724 b0 = _mm256_shuffle_epi8(b0, sh_b);
1725 g0 = _mm256_shuffle_epi8(g0, sh_g);
1726 r0 = _mm256_shuffle_epi8(r0, sh_r);
1728 b = v_uint8x32(b0);
1729 g = v_uint8x32(g0);
1730 r = v_uint8x32(r0);
1731 }
1733 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r )
1734 {
1735 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1736 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
1737 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
1739 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
1740 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
1742 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
1743 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
1744 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
1745 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
1746 __m256i b0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_low, s02_high, m0), bgr1, m1);
1747 __m256i g0 = _mm256_blendv_epi8(_mm256_blendv_epi8(bgr1, s02_low, m0), s02_high, m1);
1748 __m256i r0 = _mm256_blendv_epi8(_mm256_blendv_epi8(s02_high, s02_low, m1), bgr1, m0);
1749 const __m256i sh_b = _mm256_setr_epi8(0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
1750 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
1751 const __m256i sh_g = _mm256_setr_epi8(2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13,
1752 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13);
1753 const __m256i sh_r = _mm256_setr_epi8(4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
1754 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
1755 b0 = _mm256_shuffle_epi8(b0, sh_b);
1756 g0 = _mm256_shuffle_epi8(g0, sh_g);
1757 r0 = _mm256_shuffle_epi8(r0, sh_r);
1759 b = v_uint16x16(b0);
1760 g = v_uint16x16(g0);
1761 r = v_uint16x16(r0);
1762 }
1764 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r )
1765 {
1766 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1767 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1768 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
1770 __m256i s02_low = _mm256_permute2x128_si256(bgr0, bgr2, 0 + 2*16);
1771 __m256i s02_high = _mm256_permute2x128_si256(bgr0, bgr2, 1 + 3*16);
1773 __m256i b0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_low, s02_high, 0x24), bgr1, 0x92);
1774 __m256i g0 = _mm256_blend_epi32(_mm256_blend_epi32(s02_high, s02_low, 0x92), bgr1, 0x24);
1775 __m256i r0 = _mm256_blend_epi32(_mm256_blend_epi32(bgr1, s02_low, 0x24), s02_high, 0x92);
1777 b0 = _mm256_shuffle_epi32(b0, 0x6c);
1778 g0 = _mm256_shuffle_epi32(g0, 0xb1);
1779 r0 = _mm256_shuffle_epi32(r0, 0xc6);
1781 b = v_uint32x8(b0);
1782 g = v_uint32x8(g0);
1783 r = v_uint32x8(r0);
1784 }
1786 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r )
1787 {
1788 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1789 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
1790 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1792 __m256i s01 = _mm256_blend_epi32(bgr0, bgr1, 0xf0);
1793 __m256i s12 = _mm256_blend_epi32(bgr1, bgr2, 0xf0);
1794 __m256i s20r = _mm256_permute4x64_epi64(_mm256_blend_epi32(bgr2, bgr0, 0xf0), 0x1b);
1795 __m256i b0 = _mm256_unpacklo_epi64(s01, s20r);
1796 __m256i g0 = _mm256_alignr_epi8(s12, s01, 8);
1797 __m256i r0 = _mm256_unpackhi_epi64(s20r, s12);
1799 b = v_uint64x4(b0);
1800 g = v_uint64x4(g0);
1801 r = v_uint64x4(r0);
1802 }
1804 inline void v_load_deinterleave( const uchar* ptr, v_uint8x32& b, v_uint8x32& g, v_uint8x32& r, v_uint8x32& a )
1805 {
1806 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1807 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
1808 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 64));
1809 __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 96));
1810 const __m256i sh = _mm256_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
1811 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
1813 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
1814 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
1815 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
1816 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
1818 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
1819 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
1820 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
1821 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
1823 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
1824 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
1825 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
1826 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
1828 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
1829 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
1830 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
1831 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
1833 b = v_uint8x32(b0);
1834 g = v_uint8x32(g0);
1835 r = v_uint8x32(r0);
1836 a = v_uint8x32(a0);
1837 }
1839 inline void v_load_deinterleave( const ushort* ptr, v_uint16x16& b, v_uint16x16& g, v_uint16x16& r, v_uint16x16& a )
1840 {
1841 __m256i bgr0 = _mm256_loadu_si256((const __m256i*)ptr);
1842 __m256i bgr1 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
1843 __m256i bgr2 = _mm256_loadu_si256((const __m256i*)(ptr + 32));
1844 __m256i bgr3 = _mm256_loadu_si256((const __m256i*)(ptr + 48));
1845 const __m256i sh = _mm256_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
1846 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
1847 __m256i p0 = _mm256_shuffle_epi8(bgr0, sh);
1848 __m256i p1 = _mm256_shuffle_epi8(bgr1, sh);
1849 __m256i p2 = _mm256_shuffle_epi8(bgr2, sh);
1850 __m256i p3 = _mm256_shuffle_epi8(bgr3, sh);
1852 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
1853 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
1854 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
1855 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
1857 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
1858 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
1859 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
1860 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
1862 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
1863 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
1864 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
1865 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
1867 b = v_uint16x16(b0);
1868 g = v_uint16x16(g0);
1869 r = v_uint16x16(r0);
1870 a = v_uint16x16(a0);
1871 }
1873 inline void v_load_deinterleave( const unsigned* ptr, v_uint32x8& b, v_uint32x8& g, v_uint32x8& r, v_uint32x8& a )
1874 {
1875 __m256i p0 = _mm256_loadu_si256((const __m256i*)ptr);
1876 __m256i p1 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1877 __m256i p2 = _mm256_loadu_si256((const __m256i*)(ptr + 16));
1878 __m256i p3 = _mm256_loadu_si256((const __m256i*)(ptr + 24));
1880 __m256i p01l = _mm256_unpacklo_epi32(p0, p1);
1881 __m256i p01h = _mm256_unpackhi_epi32(p0, p1);
1882 __m256i p23l = _mm256_unpacklo_epi32(p2, p3);
1883 __m256i p23h = _mm256_unpackhi_epi32(p2, p3);
1885 __m256i pll = _mm256_permute2x128_si256(p01l, p23l, 0 + 2*16);
1886 __m256i plh = _mm256_permute2x128_si256(p01l, p23l, 1 + 3*16);
1887 __m256i phl = _mm256_permute2x128_si256(p01h, p23h, 0 + 2*16);
1888 __m256i phh = _mm256_permute2x128_si256(p01h, p23h, 1 + 3*16);
1890 __m256i b0 = _mm256_unpacklo_epi32(pll, plh);
1891 __m256i g0 = _mm256_unpackhi_epi32(pll, plh);
1892 __m256i r0 = _mm256_unpacklo_epi32(phl, phh);
1893 __m256i a0 = _mm256_unpackhi_epi32(phl, phh);
1895 b = v_uint32x8(b0);
1896 g = v_uint32x8(g0);
1897 r = v_uint32x8(r0);
1898 a = v_uint32x8(a0);
1899 }
1901 inline void v_load_deinterleave( const uint64* ptr, v_uint64x4& b, v_uint64x4& g, v_uint64x4& r, v_uint64x4& a )
1902 {
1903 __m256i bgra0 = _mm256_loadu_si256((const __m256i*)ptr);
1904 __m256i bgra1 = _mm256_loadu_si256((const __m256i*)(ptr + 4));
1905 __m256i bgra2 = _mm256_loadu_si256((const __m256i*)(ptr + 8));
1906 __m256i bgra3 = _mm256_loadu_si256((const __m256i*)(ptr + 12));
1908 __m256i l02 = _mm256_permute2x128_si256(bgra0, bgra2, 0 + 2*16);
1909 __m256i h02 = _mm256_permute2x128_si256(bgra0, bgra2, 1 + 3*16);
1910 __m256i l13 = _mm256_permute2x128_si256(bgra1, bgra3, 0 + 2*16);
1911 __m256i h13 = _mm256_permute2x128_si256(bgra1, bgra3, 1 + 3*16);
1913 __m256i b0 = _mm256_unpacklo_epi64(l02, l13);
1914 __m256i g0 = _mm256_unpackhi_epi64(l02, l13);
1915 __m256i r0 = _mm256_unpacklo_epi64(h02, h13);
1916 __m256i a0 = _mm256_unpackhi_epi64(h02, h13);
1918 b = v_uint64x4(b0);
1919 g = v_uint64x4(g0);
1920 r = v_uint64x4(r0);
1921 a = v_uint64x4(a0);
1922 }
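// Usage sketch (hypothetical helper): the four-channel deinterleave is typically used to
// peel one plane out of packed BGRA pixels, e.g. to test or process the alpha channel
// separately from the colour channels.
inline v_uint8x32 _v256_example_extract_alpha(const uchar* bgra)
{
    v_uint8x32 b, g, r, a;
    v_load_deinterleave(bgra, b, g, r, a);
    return a;
}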
1924 ///////////////////////////// store interleave /////////////////////////////////////
1926 inline void v_store_interleave( uchar* ptr, const v_uint8x32& x, const v_uint8x32& y,
1927 hal::StoreMode mode=hal::STORE_UNALIGNED )
1929 __m256i xy_l = _mm256_unpacklo_epi8(x.val, y.val);
1930 __m256i xy_h = _mm256_unpackhi_epi8(x.val, y.val);
1932 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
1933 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
1935 if( mode == hal::STORE_ALIGNED_NOCACHE )
1937 _mm256_stream_si256((__m256i*)ptr, xy0);
1938 _mm256_stream_si256((__m256i*)(ptr + 32), xy1);
1940 else if( mode == hal::STORE_ALIGNED )
1942 _mm256_store_si256((__m256i*)ptr, xy0);
1943 _mm256_store_si256((__m256i*)(ptr + 32), xy1);
1947 _mm256_storeu_si256((__m256i*)ptr, xy0);
1948 _mm256_storeu_si256((__m256i*)(ptr + 32), xy1);
1952 inline void v_store_interleave( ushort* ptr, const v_uint16x16& x, const v_uint16x16& y,
1953 hal::StoreMode mode=hal::STORE_UNALIGNED )
1955 __m256i xy_l = _mm256_unpacklo_epi16(x.val, y.val);
1956 __m256i xy_h = _mm256_unpackhi_epi16(x.val, y.val);
1958 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
1959 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
1961 if( mode == hal::STORE_ALIGNED_NOCACHE )
1963 _mm256_stream_si256((__m256i*)ptr, xy0);
1964 _mm256_stream_si256((__m256i*)(ptr + 16), xy1);
1966 else if( mode == hal::STORE_ALIGNED )
1968 _mm256_store_si256((__m256i*)ptr, xy0);
1969 _mm256_store_si256((__m256i*)(ptr + 16), xy1);
1973 _mm256_storeu_si256((__m256i*)ptr, xy0);
1974 _mm256_storeu_si256((__m256i*)(ptr + 16), xy1);
1978 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& x, const v_uint32x8& y,
1979 hal::StoreMode mode=hal::STORE_UNALIGNED )
1981 __m256i xy_l = _mm256_unpacklo_epi32(x.val, y.val);
1982 __m256i xy_h = _mm256_unpackhi_epi32(x.val, y.val);
1984 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
1985 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
1987 if( mode == hal::STORE_ALIGNED_NOCACHE )
1989 _mm256_stream_si256((__m256i*)ptr, xy0);
1990 _mm256_stream_si256((__m256i*)(ptr + 8), xy1);
1992 else if( mode == hal::STORE_ALIGNED )
1994 _mm256_store_si256((__m256i*)ptr, xy0);
1995 _mm256_store_si256((__m256i*)(ptr + 8), xy1);
1999 _mm256_storeu_si256((__m256i*)ptr, xy0);
2000 _mm256_storeu_si256((__m256i*)(ptr + 8), xy1);
2004 inline void v_store_interleave( uint64* ptr, const v_uint64x4& x, const v_uint64x4& y,
2005 hal::StoreMode mode=hal::STORE_UNALIGNED )
2007 __m256i xy_l = _mm256_unpacklo_epi64(x.val, y.val);
2008 __m256i xy_h = _mm256_unpackhi_epi64(x.val, y.val);
2010 __m256i xy0 = _mm256_permute2x128_si256(xy_l, xy_h, 0 + 2*16);
2011 __m256i xy1 = _mm256_permute2x128_si256(xy_l, xy_h, 1 + 3*16);
2013 if( mode == hal::STORE_ALIGNED_NOCACHE )
2015 _mm256_stream_si256((__m256i*)ptr, xy0);
2016 _mm256_stream_si256((__m256i*)(ptr + 4), xy1);
2018 else if( mode == hal::STORE_ALIGNED )
2020 _mm256_store_si256((__m256i*)ptr, xy0);
2021 _mm256_store_si256((__m256i*)(ptr + 4), xy1);
2025 _mm256_storeu_si256((__m256i*)ptr, xy0);
2026 _mm256_storeu_si256((__m256i*)(ptr + 4), xy1);
2030 inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g, const v_uint8x32& r,
2031 hal::StoreMode mode=hal::STORE_UNALIGNED )
2033 const __m256i sh_b = _mm256_setr_epi8(
2034 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5,
2035 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5);
2036 const __m256i sh_g = _mm256_setr_epi8(
2037 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10,
2038 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10);
2039 const __m256i sh_r = _mm256_setr_epi8(
2040 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15,
2041 10, 5, 0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15);
2043 __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
2044 __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
2045 __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
2047 const __m256i m0 = _mm256_setr_epi8(0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0,
2048 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0);
2049 const __m256i m1 = _mm256_setr_epi8(0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0,
2050 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0);
2052 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2053 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2054 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2056 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2057 __m256i bgr1 = _mm256_permute2x128_si256(p2, p0, 0 + 3*16);
2058 __m256i bgr2 = _mm256_permute2x128_si256(p1, p2, 1 + 3*16);
2060 if( mode == hal::STORE_ALIGNED_NOCACHE )
2062 _mm256_stream_si256((__m256i*)ptr, bgr0);
2063 _mm256_stream_si256((__m256i*)(ptr + 32), bgr1);
2064 _mm256_stream_si256((__m256i*)(ptr + 64), bgr2);
2066 else if( mode == hal::STORE_ALIGNED )
2068 _mm256_store_si256((__m256i*)ptr, bgr0);
2069 _mm256_store_si256((__m256i*)(ptr + 32), bgr1);
2070 _mm256_store_si256((__m256i*)(ptr + 64), bgr2);
2074 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2075 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr1);
2076 _mm256_storeu_si256((__m256i*)(ptr + 64), bgr2);
2080 inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g, const v_uint16x16& r,
2081 hal::StoreMode mode=hal::STORE_UNALIGNED )
2083 const __m256i sh_b = _mm256_setr_epi8(
2084 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11,
2085 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11);
2086 const __m256i sh_g = _mm256_setr_epi8(
2087 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5,
2088 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5);
2089 const __m256i sh_r = _mm256_setr_epi8(
2090 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15,
2091 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15);
2093 __m256i b0 = _mm256_shuffle_epi8(b.val, sh_b);
2094 __m256i g0 = _mm256_shuffle_epi8(g.val, sh_g);
2095 __m256i r0 = _mm256_shuffle_epi8(r.val, sh_r);
2097 const __m256i m0 = _mm256_setr_epi8(0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1,
2098 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0);
2099 const __m256i m1 = _mm256_setr_epi8(0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0,
2100 -1, -1, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, -1, -1, 0, 0);
2102 __m256i p0 = _mm256_blendv_epi8(_mm256_blendv_epi8(b0, g0, m0), r0, m1);
2103 __m256i p1 = _mm256_blendv_epi8(_mm256_blendv_epi8(g0, r0, m0), b0, m1);
2104 __m256i p2 = _mm256_blendv_epi8(_mm256_blendv_epi8(r0, b0, m0), g0, m1);
2106 __m256i bgr0 = _mm256_permute2x128_si256(p0, p2, 0 + 2*16);
2107 //__m256i bgr1 = p1;
2108 __m256i bgr2 = _mm256_permute2x128_si256(p0, p2, 1 + 3*16);
2110 if( mode == hal::STORE_ALIGNED_NOCACHE )
2112 _mm256_stream_si256((__m256i*)ptr, bgr0);
2113 _mm256_stream_si256((__m256i*)(ptr + 16), p1);
2114 _mm256_stream_si256((__m256i*)(ptr + 32), bgr2);
2116 else if( mode == hal::STORE_ALIGNED )
2118 _mm256_store_si256((__m256i*)ptr, bgr0);
2119 _mm256_store_si256((__m256i*)(ptr + 16), p1);
2120 _mm256_store_si256((__m256i*)(ptr + 32), bgr2);
2124 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2125 _mm256_storeu_si256((__m256i*)(ptr + 16), p1);
2126 _mm256_storeu_si256((__m256i*)(ptr + 32), bgr2);
2130 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g, const v_uint32x8& r,
2131 hal::StoreMode mode=hal::STORE_UNALIGNED )
2133 __m256i b0 = _mm256_shuffle_epi32(b.val, 0x6c);
2134 __m256i g0 = _mm256_shuffle_epi32(g.val, 0xb1);
2135 __m256i r0 = _mm256_shuffle_epi32(r.val, 0xc6);
2137 __m256i p0 = _mm256_blend_epi32(_mm256_blend_epi32(b0, g0, 0x92), r0, 0x24);
2138 __m256i p1 = _mm256_blend_epi32(_mm256_blend_epi32(g0, r0, 0x92), b0, 0x24);
2139 __m256i p2 = _mm256_blend_epi32(_mm256_blend_epi32(r0, b0, 0x92), g0, 0x24);
2141 __m256i bgr0 = _mm256_permute2x128_si256(p0, p1, 0 + 2*16);
2142 //__m256i bgr1 = p2;
2143 __m256i bgr2 = _mm256_permute2x128_si256(p0, p1, 1 + 3*16);
2145 if( mode == hal::STORE_ALIGNED_NOCACHE )
2147 _mm256_stream_si256((__m256i*)ptr, bgr0);
2148 _mm256_stream_si256((__m256i*)(ptr + 8), p2);
2149 _mm256_stream_si256((__m256i*)(ptr + 16), bgr2);
2151 else if( mode == hal::STORE_ALIGNED )
2153 _mm256_store_si256((__m256i*)ptr, bgr0);
2154 _mm256_store_si256((__m256i*)(ptr + 8), p2);
2155 _mm256_store_si256((__m256i*)(ptr + 16), bgr2);
2159 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2160 _mm256_storeu_si256((__m256i*)(ptr + 8), p2);
2161 _mm256_storeu_si256((__m256i*)(ptr + 16), bgr2);
2165 inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g, const v_uint64x4& r,
2166 hal::StoreMode mode=hal::STORE_UNALIGNED )
2168 __m256i s01 = _mm256_unpacklo_epi64(b.val, g.val);
2169 __m256i s12 = _mm256_unpackhi_epi64(g.val, r.val);
2170 __m256i s20 = _mm256_blend_epi32(r.val, b.val, 0xcc);
2172 __m256i bgr0 = _mm256_permute2x128_si256(s01, s20, 0 + 2*16);
2173 __m256i bgr1 = _mm256_blend_epi32(s01, s12, 0x0f);
2174 __m256i bgr2 = _mm256_permute2x128_si256(s20, s12, 1 + 3*16);
2176 if( mode == hal::STORE_ALIGNED_NOCACHE )
2178 _mm256_stream_si256((__m256i*)ptr, bgr0);
2179 _mm256_stream_si256((__m256i*)(ptr + 4), bgr1);
2180 _mm256_stream_si256((__m256i*)(ptr + 8), bgr2);
2182 else if( mode == hal::STORE_ALIGNED )
2184 _mm256_store_si256((__m256i*)ptr, bgr0);
2185 _mm256_store_si256((__m256i*)(ptr + 4), bgr1);
2186 _mm256_store_si256((__m256i*)(ptr + 8), bgr2);
2190 _mm256_storeu_si256((__m256i*)ptr, bgr0);
2191 _mm256_storeu_si256((__m256i*)(ptr + 4), bgr1);
2192 _mm256_storeu_si256((__m256i*)(ptr + 8), bgr2);
2196 inline void v_store_interleave( uchar* ptr, const v_uint8x32& b, const v_uint8x32& g,
2197 const v_uint8x32& r, const v_uint8x32& a,
2198 hal::StoreMode mode=hal::STORE_UNALIGNED )
2200 __m256i bg0 = _mm256_unpacklo_epi8(b.val, g.val);
2201 __m256i bg1 = _mm256_unpackhi_epi8(b.val, g.val);
2202 __m256i ra0 = _mm256_unpacklo_epi8(r.val, a.val);
2203 __m256i ra1 = _mm256_unpackhi_epi8(r.val, a.val);
2205 __m256i bgra0_ = _mm256_unpacklo_epi16(bg0, ra0);
2206 __m256i bgra1_ = _mm256_unpackhi_epi16(bg0, ra0);
2207 __m256i bgra2_ = _mm256_unpacklo_epi16(bg1, ra1);
2208 __m256i bgra3_ = _mm256_unpackhi_epi16(bg1, ra1);
2210 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2211 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2212 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2213 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2215 if( mode == hal::STORE_ALIGNED_NOCACHE )
2217 _mm256_stream_si256((__m256i*)ptr, bgra0);
2218 _mm256_stream_si256((__m256i*)(ptr + 32), bgra1);
2219 _mm256_stream_si256((__m256i*)(ptr + 64), bgra2);
2220 _mm256_stream_si256((__m256i*)(ptr + 96), bgra3);
2222 else if( mode == hal::STORE_ALIGNED )
2224 _mm256_store_si256((__m256i*)ptr, bgra0);
2225 _mm256_store_si256((__m256i*)(ptr + 32), bgra1);
2226 _mm256_store_si256((__m256i*)(ptr + 64), bgra2);
2227 _mm256_store_si256((__m256i*)(ptr + 96), bgra3);
2231 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2232 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra1);
2233 _mm256_storeu_si256((__m256i*)(ptr + 64), bgra2);
2234 _mm256_storeu_si256((__m256i*)(ptr + 96), bgra3);
2238 inline void v_store_interleave( ushort* ptr, const v_uint16x16& b, const v_uint16x16& g,
2239 const v_uint16x16& r, const v_uint16x16& a,
2240 hal::StoreMode mode=hal::STORE_UNALIGNED )
2242 __m256i bg0 = _mm256_unpacklo_epi16(b.val, g.val);
2243 __m256i bg1 = _mm256_unpackhi_epi16(b.val, g.val);
2244 __m256i ra0 = _mm256_unpacklo_epi16(r.val, a.val);
2245 __m256i ra1 = _mm256_unpackhi_epi16(r.val, a.val);
2247 __m256i bgra0_ = _mm256_unpacklo_epi32(bg0, ra0);
2248 __m256i bgra1_ = _mm256_unpackhi_epi32(bg0, ra0);
2249 __m256i bgra2_ = _mm256_unpacklo_epi32(bg1, ra1);
2250 __m256i bgra3_ = _mm256_unpackhi_epi32(bg1, ra1);
2252 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2253 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2254 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2255 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2257 if( mode == hal::STORE_ALIGNED_NOCACHE )
2259 _mm256_stream_si256((__m256i*)ptr, bgra0);
2260 _mm256_stream_si256((__m256i*)(ptr + 16), bgra1);
2261 _mm256_stream_si256((__m256i*)(ptr + 32), bgra2);
2262 _mm256_stream_si256((__m256i*)(ptr + 48), bgra3);
2264 else if( mode == hal::STORE_ALIGNED )
2266 _mm256_store_si256((__m256i*)ptr, bgra0);
2267 _mm256_store_si256((__m256i*)(ptr + 16), bgra1);
2268 _mm256_store_si256((__m256i*)(ptr + 32), bgra2);
2269 _mm256_store_si256((__m256i*)(ptr + 48), bgra3);
2273 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2274 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra1);
2275 _mm256_storeu_si256((__m256i*)(ptr + 32), bgra2);
2276 _mm256_storeu_si256((__m256i*)(ptr + 48), bgra3);
2280 inline void v_store_interleave( unsigned* ptr, const v_uint32x8& b, const v_uint32x8& g,
2281 const v_uint32x8& r, const v_uint32x8& a,
2282 hal::StoreMode mode=hal::STORE_UNALIGNED )
2284 __m256i bg0 = _mm256_unpacklo_epi32(b.val, g.val);
2285 __m256i bg1 = _mm256_unpackhi_epi32(b.val, g.val);
2286 __m256i ra0 = _mm256_unpacklo_epi32(r.val, a.val);
2287 __m256i ra1 = _mm256_unpackhi_epi32(r.val, a.val);
2289 __m256i bgra0_ = _mm256_unpacklo_epi64(bg0, ra0);
2290 __m256i bgra1_ = _mm256_unpackhi_epi64(bg0, ra0);
2291 __m256i bgra2_ = _mm256_unpacklo_epi64(bg1, ra1);
2292 __m256i bgra3_ = _mm256_unpackhi_epi64(bg1, ra1);
2294 __m256i bgra0 = _mm256_permute2x128_si256(bgra0_, bgra1_, 0 + 2*16);
2295 __m256i bgra2 = _mm256_permute2x128_si256(bgra0_, bgra1_, 1 + 3*16);
2296 __m256i bgra1 = _mm256_permute2x128_si256(bgra2_, bgra3_, 0 + 2*16);
2297 __m256i bgra3 = _mm256_permute2x128_si256(bgra2_, bgra3_, 1 + 3*16);
2299 if( mode == hal::STORE_ALIGNED_NOCACHE )
2301 _mm256_stream_si256((__m256i*)ptr, bgra0);
2302 _mm256_stream_si256((__m256i*)(ptr + 8), bgra1);
2303 _mm256_stream_si256((__m256i*)(ptr + 16), bgra2);
2304 _mm256_stream_si256((__m256i*)(ptr + 24), bgra3);
2306 else if( mode == hal::STORE_ALIGNED )
2308 _mm256_store_si256((__m256i*)ptr, bgra0);
2309 _mm256_store_si256((__m256i*)(ptr + 8), bgra1);
2310 _mm256_store_si256((__m256i*)(ptr + 16), bgra2);
2311 _mm256_store_si256((__m256i*)(ptr + 24), bgra3);
2315 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2316 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra1);
2317 _mm256_storeu_si256((__m256i*)(ptr + 16), bgra2);
2318 _mm256_storeu_si256((__m256i*)(ptr + 24), bgra3);
2322 inline void v_store_interleave( uint64* ptr, const v_uint64x4& b, const v_uint64x4& g,
2323 const v_uint64x4& r, const v_uint64x4& a,
2324 hal::StoreMode mode=hal::STORE_UNALIGNED )
2326 __m256i bg0 = _mm256_unpacklo_epi64(b.val, g.val);
2327 __m256i bg1 = _mm256_unpackhi_epi64(b.val, g.val);
2328 __m256i ra0 = _mm256_unpacklo_epi64(r.val, a.val);
2329 __m256i ra1 = _mm256_unpackhi_epi64(r.val, a.val);
2331 __m256i bgra0 = _mm256_permute2x128_si256(bg0, ra0, 0 + 2*16);
2332 __m256i bgra1 = _mm256_permute2x128_si256(bg1, ra1, 0 + 2*16);
2333 __m256i bgra2 = _mm256_permute2x128_si256(bg0, ra0, 1 + 3*16);
2334 __m256i bgra3 = _mm256_permute2x128_si256(bg1, ra1, 1 + 3*16);
2336 if( mode == hal::STORE_ALIGNED_NOCACHE )
2338 _mm256_stream_si256((__m256i*)ptr, bgra0);
2339 _mm256_stream_si256((__m256i*)(ptr + 4), bgra1);
2340 _mm256_stream_si256((__m256i*)(ptr + 8), bgra2);
2341 _mm256_stream_si256((__m256i*)(ptr + 12), bgra3);
2343 else if( mode == hal::STORE_ALIGNED )
2345 _mm256_store_si256((__m256i*)ptr, bgra0);
2346 _mm256_store_si256((__m256i*)(ptr + 4), bgra1);
2347 _mm256_store_si256((__m256i*)(ptr + 8), bgra2);
2348 _mm256_store_si256((__m256i*)(ptr + 12), bgra3);
2352 _mm256_storeu_si256((__m256i*)ptr, bgra0);
2353 _mm256_storeu_si256((__m256i*)(ptr + 4), bgra1);
2354 _mm256_storeu_si256((__m256i*)(ptr + 8), bgra2);
2355 _mm256_storeu_si256((__m256i*)(ptr + 12), bgra3);
2359 #define OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \
2360 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \
2361 { \
2362 _Tpvec1 a1, b1; \
2363 v_load_deinterleave((const _Tp1*)ptr, a1, b1); \
2364 a0 = v_reinterpret_as_##suffix0(a1); \
2365 b0 = v_reinterpret_as_##suffix0(b1); \
2366 } \
2367 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \
2368 { \
2369 _Tpvec1 a1, b1, c1; \
2370 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \
2371 a0 = v_reinterpret_as_##suffix0(a1); \
2372 b0 = v_reinterpret_as_##suffix0(b1); \
2373 c0 = v_reinterpret_as_##suffix0(c1); \
2374 } \
2375 inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \
2376 { \
2377 _Tpvec1 a1, b1, c1, d1; \
2378 v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \
2379 a0 = v_reinterpret_as_##suffix0(a1); \
2380 b0 = v_reinterpret_as_##suffix0(b1); \
2381 c0 = v_reinterpret_as_##suffix0(c1); \
2382 d0 = v_reinterpret_as_##suffix0(d1); \
2383 } \
2384 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2385 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2386 { \
2387 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2388 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2389 v_store_interleave((_Tp1*)ptr, a1, b1, mode); \
2390 } \
2391 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \
2392 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2393 { \
2394 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2395 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2396 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2397 v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \
2398 } \
2399 inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \
2400 const _Tpvec0& c0, const _Tpvec0& d0, \
2401 hal::StoreMode mode=hal::STORE_UNALIGNED ) \
2402 { \
2403 _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \
2404 _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \
2405 _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \
2406 _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \
2407 v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \
2408 }
2410 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int8x32, schar, s8, v_uint8x32, uchar, u8)
2411 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int16x16, short, s16, v_uint16x16, ushort, u16)
2412 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int32x8, int, s32, v_uint32x8, unsigned, u32)
2413 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, unsigned, u32)
2414 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
2415 OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
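// Usage sketch (hypothetical helper): combining the deinterleaving loads with the
// interleaving stores gives a simple in-register channel swap, here BGR -> RGB for one
// batch of 32 pixels; the store uses the default hal::STORE_UNALIGNED mode.
inline void _v256_example_bgr2rgb(const uchar* bgr, uchar* rgb)
{
    v_uint8x32 b, g, r;
    v_load_deinterleave(bgr, b, g, r);
    v_store_interleave(rgb, r, g, b);
}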
2417 inline void v256_cleanup() { _mm256_zeroupper(); }
2419 //! @name Check SIMD256 support
2420 //! @{
2421 //! @brief Check CPU capability of SIMD operation
2422 static inline bool hasSIMD256()
2423 {
2424 return (CV_CPU_HAS_SUPPORT_AVX2) ? true : false;
2425 }
2426 //! @}
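// Usage sketch (hypothetical helper, assuming the v256_setall_u8 and v_store helpers
// defined earlier in this header): code built for an AVX2 dispatch target would typically
// guard the 256-bit path with the runtime check above and fall back to a scalar loop.
inline void _v256_example_fill(uchar* dst, int len, uchar value)
{
    int i = 0;
    if( hasSIMD256() )
    {
        v_uint8x32 v = v256_setall_u8(value);
        for( ; i <= len - 32; i += 32 )   // 32 bytes per 256-bit store
            v_store(dst + i, v);
        v256_cleanup();                   // leave the AVX state clean for SSE callers
    }
    for( ; i < len; i++ )                 // scalar tail (or full fallback)
        dst[i] = value;
}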
2428 CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
2434 #endif // OPENCV_HAL_INTRIN_AVX_HPP